   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Virtio-mem device driver.
   4 *
   5 * Copyright Red Hat, Inc. 2020
   6 *
   7 * Author(s): David Hildenbrand <david@redhat.com>
   8 */
   9
  10#include <linux/virtio.h>
  11#include <linux/virtio_mem.h>
  12#include <linux/workqueue.h>
  13#include <linux/slab.h>
  14#include <linux/module.h>
  15#include <linux/mm.h>
  16#include <linux/memory_hotplug.h>
  17#include <linux/memory.h>
  18#include <linux/hrtimer.h>
  19#include <linux/crash_dump.h>
  20#include <linux/mutex.h>
  21#include <linux/bitmap.h>
  22#include <linux/lockdep.h>
  23
  24#include <acpi/acpi_numa.h>
  25
  26static bool unplug_online = true;
  27module_param(unplug_online, bool, 0644);
  28MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
  29
  30enum virtio_mem_mb_state {
  31	/* Unplugged, not added to Linux. Can be reused later. */
  32	VIRTIO_MEM_MB_STATE_UNUSED = 0,
  33	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
  34	VIRTIO_MEM_MB_STATE_PLUGGED,
  35	/* Fully plugged, fully added to Linux, offline. */
  36	VIRTIO_MEM_MB_STATE_OFFLINE,
  37	/* Partially plugged, fully added to Linux, offline. */
  38	VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
  39	/* Fully plugged, fully added to Linux, online (!ZONE_MOVABLE). */
  40	VIRTIO_MEM_MB_STATE_ONLINE,
  41	/* Partially plugged, fully added to Linux, online (!ZONE_MOVABLE). */
  42	VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
  43	/*
  44	 * Fully plugged, fully added to Linux, online (ZONE_MOVABLE).
  45	 * We are not allowed to allocate (unplug) parts of this block that
  46	 * are not movable (similar to gigantic pages). We will never allow
   47	 * onlining OFFLINE_PARTIAL blocks to ZONE_MOVABLE (as they would
   48	 * contain unmovable parts).
  49	 */
  50	VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE,
  51	VIRTIO_MEM_MB_STATE_COUNT
  52};
  53
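/*
 * Sketch of the typical life cycle of a memory block, as implemented by the
 * state transitions in this file (simplified):
 *
 *   UNUSED -> plug subblocks + add to Linux -> OFFLINE / OFFLINE_PARTIAL
 *          -> memory onlining               -> ONLINE*
 *          -> memory offlining              -> OFFLINE / OFFLINE_PARTIAL
 *          -> unplug all + remove from Linux -> UNUSED
 *
 * PLUGGED is an intermediate/error state: subblocks are plugged, but the
 * block could not be added to Linux (yet).
 */
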
  54struct virtio_mem {
  55	struct virtio_device *vdev;
  56
  57	/* We might first have to unplug all memory when starting up. */
  58	bool unplug_all_required;
  59
  60	/* Workqueue that processes the plug/unplug requests. */
  61	struct work_struct wq;
  62	atomic_t config_changed;
  63
  64	/* Virtqueue for guest->host requests. */
  65	struct virtqueue *vq;
  66
  67	/* Wait for a host response to a guest request. */
  68	wait_queue_head_t host_resp;
  69
  70	/* Space for one guest request and the host response. */
  71	struct virtio_mem_req req;
  72	struct virtio_mem_resp resp;
  73
  74	/* The current size of the device. */
  75	uint64_t plugged_size;
  76	/* The requested size of the device. */
  77	uint64_t requested_size;
  78
  79	/* The device block size (for communicating with the device). */
  80	uint64_t device_block_size;
  81	/* The translated node id. NUMA_NO_NODE in case not specified. */
  82	int nid;
  83	/* Physical start address of the memory region. */
  84	uint64_t addr;
  85	/* Maximum region size in bytes. */
  86	uint64_t region_size;
  87
  88	/* The subblock size. */
  89	uint64_t subblock_size;
  90	/* The number of subblocks per memory block. */
  91	uint32_t nb_sb_per_mb;
  92
  93	/* Id of the first memory block of this device. */
  94	unsigned long first_mb_id;
  95	/* Id of the last memory block of this device. */
  96	unsigned long last_mb_id;
  97	/* Id of the last usable memory block of this device. */
  98	unsigned long last_usable_mb_id;
   99	/* Id of the next memory block to prepare when needed. */
 100	unsigned long next_mb_id;
 101
 102	/* The parent resource for all memory added via this device. */
 103	struct resource *parent_resource;
 104	/*
 105	 * Copy of "System RAM (virtio_mem)" to be used for
 106	 * add_memory_driver_managed().
 107	 */
 108	const char *resource_name;
 109
 110	/* Summary of all memory block states. */
 111	unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
 112#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD		10
 113
 114	/*
 115	 * One byte state per memory block.
 116	 *
  117	 * Allocated via vmalloc(). When preparing new blocks, resized
  118	 * (alloc+copy+free) when needed - i.e., when the state for the next
  119	 * memory block would cross a page boundary.
 120	 *
 121	 * With 128MB memory blocks, we have states for 512GB of memory in one
 122	 * page.
 123	 */
 124	uint8_t *mb_state;
 125
 126	/*
  127	 * $nb_sb_per_mb bits per memory block. Handled similarly to mb_state.
 128	 *
 129	 * With 4MB subblocks, we manage 128GB of memory in one page.
 130	 */
 131	unsigned long *sb_bitmap;
 132
 133	/*
 134	 * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
 135	 *
 136	 * When this lock is held the pointers can't change, ONLINE and
 137	 * OFFLINE blocks can't change the state and no subblocks will get
 138	 * plugged/unplugged.
 139	 */
 140	struct mutex hotplug_mutex;
 141	bool hotplug_active;
 142
 143	/* An error occurred we cannot handle - stop processing requests. */
 144	bool broken;
 145
 146	/* The driver is being removed. */
 147	spinlock_t removal_lock;
 148	bool removing;
 149
 150	/* Timer for retrying to plug/unplug memory. */
 151	struct hrtimer retry_timer;
 152	unsigned int retry_timer_ms;
 153#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
 154#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000
 155
 156	/* Memory notifier (online/offline events). */
 157	struct notifier_block memory_notifier;
 158
 159	/* Next device in the list of virtio-mem devices. */
 160	struct list_head next;
 161};
 162
 163/*
 164 * We have to share a single online_page callback among all virtio-mem
 165 * devices. We use RCU to iterate the list in the callback.
 166 */
 167static DEFINE_MUTEX(virtio_mem_mutex);
 168static LIST_HEAD(virtio_mem_devices);
 169
 170static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
 171
 172/*
 173 * Register a virtio-mem device so it will be considered for the online_page
 174 * callback.
 175 */
 176static int register_virtio_mem_device(struct virtio_mem *vm)
 177{
 178	int rc = 0;
 179
 180	/* First device registers the callback. */
 181	mutex_lock(&virtio_mem_mutex);
 182	if (list_empty(&virtio_mem_devices))
 183		rc = set_online_page_callback(&virtio_mem_online_page_cb);
 184	if (!rc)
 185		list_add_rcu(&vm->next, &virtio_mem_devices);
 186	mutex_unlock(&virtio_mem_mutex);
 187
 188	return rc;
 189}
 190
 191/*
 192 * Unregister a virtio-mem device so it will no longer be considered for the
 193 * online_page callback.
 194 */
 195static void unregister_virtio_mem_device(struct virtio_mem *vm)
 196{
 197	/* Last device unregisters the callback. */
 198	mutex_lock(&virtio_mem_mutex);
 199	list_del_rcu(&vm->next);
 200	if (list_empty(&virtio_mem_devices))
 201		restore_online_page_callback(&virtio_mem_online_page_cb);
 202	mutex_unlock(&virtio_mem_mutex);
 203
 204	synchronize_rcu();
 205}
 206
 207/*
 208 * Calculate the memory block id of a given address.
 209 */
 210static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
 211{
 212	return addr / memory_block_size_bytes();
 213}
 214
 215/*
 216 * Calculate the physical start address of a given memory block id.
 217 */
 218static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
 219{
 220	return mb_id * memory_block_size_bytes();
 221}
 222
 223/*
 224 * Calculate the subblock id of a given address.
 225 */
 226static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
 227					      unsigned long addr)
 228{
 229	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
 230	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
 231
 232	return (addr - mb_addr) / vm->subblock_size;
 233}
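
/*
 * Example with illustrative values: with 128 MiB memory blocks and 4 MiB
 * subblocks, addr = 0x88400000 yields mb_id = 0x88400000 / 0x8000000 = 17,
 * mb_addr = 0x88000000 and sb_id = (0x88400000 - 0x88000000) / 0x400000 = 1.
 */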
 234
 235/*
 236 * Set the state of a memory block, taking care of the state counter.
 237 */
 238static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id,
 239				    enum virtio_mem_mb_state state)
 240{
 241	const unsigned long idx = mb_id - vm->first_mb_id;
 242	enum virtio_mem_mb_state old_state;
 243
 244	old_state = vm->mb_state[idx];
 245	vm->mb_state[idx] = state;
 246
 247	BUG_ON(vm->nb_mb_state[old_state] == 0);
 248	vm->nb_mb_state[old_state]--;
 249	vm->nb_mb_state[state]++;
 250}
 251
 252/*
 253 * Get the state of a memory block.
 254 */
 255static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm,
 256							unsigned long mb_id)
 257{
 258	const unsigned long idx = mb_id - vm->first_mb_id;
 259
 260	return vm->mb_state[idx];
 261}
 262
 263/*
 264 * Prepare the state array for the next memory block.
 265 */
 266static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
 267{
 268	unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
 269	unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
 270	int old_pages = PFN_UP(old_bytes);
 271	int new_pages = PFN_UP(new_bytes);
 272	uint8_t *new_mb_state;
 273
 274	if (vm->mb_state && old_pages == new_pages)
 275		return 0;
 276
 277	new_mb_state = vzalloc(new_pages * PAGE_SIZE);
 278	if (!new_mb_state)
 279		return -ENOMEM;
 280
 281	mutex_lock(&vm->hotplug_mutex);
 282	if (vm->mb_state)
 283		memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE);
 284	vfree(vm->mb_state);
 285	vm->mb_state = new_mb_state;
 286	mutex_unlock(&vm->hotplug_mutex);
 287
 288	return 0;
 289}
 290
 291#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \
 292	for (_mb_id = _vm->first_mb_id; \
 293	     _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \
 294	     _mb_id++) \
 295		if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
 296
 297#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \
 298	for (_mb_id = _vm->next_mb_id - 1; \
 299	     _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \
 300	     _mb_id--) \
 301		if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
 302
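/*
 * Usage sketch: iterate over all memory blocks of a device that are
 * currently in a given state, e.g., to unplug leftovers (see
 * virtio_mem_unplug_pending_mb()):
 *
 *	virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
 *		rc = virtio_mem_mb_unplug(vm, mb_id);
 *		...
 *	}
 *
 * The check against nb_mb_state[] terminates the loop early once no more
 * blocks in the requested state can exist.
 */
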
 303/*
 304 * Mark all selected subblocks plugged.
 305 *
 306 * Will not modify the state of the memory block.
 307 */
 308static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
 309					 unsigned long mb_id, int sb_id,
 310					 int count)
 311{
 312	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
 313
 314	__bitmap_set(vm->sb_bitmap, bit, count);
 315}
 316
 317/*
 318 * Mark all selected subblocks unplugged.
 319 *
 320 * Will not modify the state of the memory block.
 321 */
 322static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm,
 323					   unsigned long mb_id, int sb_id,
 324					   int count)
 325{
 326	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
 327
 328	__bitmap_clear(vm->sb_bitmap, bit, count);
 329}
 330
 331/*
 332 * Test if all selected subblocks are plugged.
 333 */
 334static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm,
 335					  unsigned long mb_id, int sb_id,
 336					  int count)
 337{
 338	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
 339
 340	if (count == 1)
 341		return test_bit(bit, vm->sb_bitmap);
 342
 343	/* TODO: Helper similar to bitmap_set() */
 344	return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >=
 345	       bit + count;
 346}
 347
 348/*
 349 * Test if all selected subblocks are unplugged.
 350 */
 351static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm,
 352					    unsigned long mb_id, int sb_id,
 353					    int count)
 354{
 355	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
 356
 357	/* TODO: Helper similar to bitmap_set() */
 358	return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count;
 359}
 360
 361/*
 362 * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is
 363 * none.
 364 */
 365static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm,
 366					    unsigned long mb_id)
 367{
 368	const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb;
 369
 370	return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) -
 371	       bit;
 372}
 373
 374/*
 375 * Prepare the subblock bitmap for the next memory block.
 376 */
 377static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm)
 378{
 379	const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id;
 380	const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb;
 381	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb;
 382	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
 383	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
 384	unsigned long *new_sb_bitmap, *old_sb_bitmap;
 385
 386	if (vm->sb_bitmap && old_pages == new_pages)
 387		return 0;
 388
 389	new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE);
 390	if (!new_sb_bitmap)
 391		return -ENOMEM;
 392
 393	mutex_lock(&vm->hotplug_mutex);
  394	if (vm->sb_bitmap)
 395		memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE);
 396
 397	old_sb_bitmap = vm->sb_bitmap;
 398	vm->sb_bitmap = new_sb_bitmap;
 399	mutex_unlock(&vm->hotplug_mutex);
 400
 401	vfree(old_sb_bitmap);
 402	return 0;
 403}
 404
 405/*
 406 * Try to add a memory block to Linux. This will usually only fail
 407 * if out of memory.
 408 *
 409 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 410 * onlining code).
 411 *
 412 * Will not modify the state of the memory block.
 413 */
 414static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
 415{
 416	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 417	int nid = vm->nid;
 418
 419	if (nid == NUMA_NO_NODE)
 420		nid = memory_add_physaddr_to_nid(addr);
 421
 422	/*
 423	 * When force-unloading the driver and we still have memory added to
 424	 * Linux, the resource name has to stay.
 425	 */
 426	if (!vm->resource_name) {
 427		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
 428						  GFP_KERNEL);
 429		if (!vm->resource_name)
 430			return -ENOMEM;
 431	}
 432
 433	dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
 434	return add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
 435					 vm->resource_name);
 436}
 437
 438/*
 439 * Try to remove a memory block from Linux. Will only fail if the memory block
 440 * is not offline.
 441 *
 442 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 443 * onlining code).
 444 *
 445 * Will not modify the state of the memory block.
 446 */
 447static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
 448{
 449	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 450	int nid = vm->nid;
 451
 452	if (nid == NUMA_NO_NODE)
 453		nid = memory_add_physaddr_to_nid(addr);
 454
 455	dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id);
 456	return remove_memory(nid, addr, memory_block_size_bytes());
 457}
 458
 459/*
 460 * Try to offline and remove a memory block from Linux.
 461 *
 462 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 463 * onlining code).
 464 *
 465 * Will not modify the state of the memory block.
 466 */
 467static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm,
 468					    unsigned long mb_id)
 469{
 470	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 471	int nid = vm->nid;
 472
 473	if (nid == NUMA_NO_NODE)
 474		nid = memory_add_physaddr_to_nid(addr);
 475
 476	dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n",
 477		mb_id);
 478	return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
 479}
 480
 481/*
 482 * Trigger the workqueue so the device can perform its magic.
 483 */
 484static void virtio_mem_retry(struct virtio_mem *vm)
 485{
 486	unsigned long flags;
 487
 488	spin_lock_irqsave(&vm->removal_lock, flags);
 489	if (!vm->removing)
 490		queue_work(system_freezable_wq, &vm->wq);
 491	spin_unlock_irqrestore(&vm->removal_lock, flags);
 492}
 493
 494static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
 495{
 496	int node = NUMA_NO_NODE;
 497
 498#if defined(CONFIG_ACPI_NUMA)
 499	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
 500		node = pxm_to_node(node_id);
 501#endif
 502	return node;
 503}
 504
 505/*
 506 * Test if a virtio-mem device overlaps with the given range. Can be called
 507 * from (notifier) callbacks lockless.
 508 */
 509static bool virtio_mem_overlaps_range(struct virtio_mem *vm,
 510				      unsigned long start, unsigned long size)
 511{
 512	unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id);
 513	unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) +
 514				memory_block_size_bytes();
 515
 516	return start < dev_end && dev_start < start + size;
 517}
 518
 519/*
 520 * Test if a virtio-mem device owns a memory block. Can be called from
 521 * (notifier) callbacks lockless.
 522 */
 523static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
 524{
 525	return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id;
 526}
 527
 528static int virtio_mem_notify_going_online(struct virtio_mem *vm,
 529					  unsigned long mb_id,
 530					  enum zone_type zone)
 531{
 532	switch (virtio_mem_mb_get_state(vm, mb_id)) {
 533	case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
 534		/*
  535		 * We won't allow onlining a partially plugged memory block
 536		 * to the MOVABLE zone - it would contain unmovable parts.
 537		 */
 538		if (zone == ZONE_MOVABLE) {
 539			dev_warn_ratelimited(&vm->vdev->dev,
 540					     "memory block has holes, MOVABLE not supported\n");
 541			return NOTIFY_BAD;
 542		}
 543		return NOTIFY_OK;
 544	case VIRTIO_MEM_MB_STATE_OFFLINE:
 545		return NOTIFY_OK;
 546	default:
 547		break;
 548	}
 549	dev_warn_ratelimited(&vm->vdev->dev,
 550			     "memory block onlining denied\n");
 551	return NOTIFY_BAD;
 552}
 553
 554static void virtio_mem_notify_offline(struct virtio_mem *vm,
 555				      unsigned long mb_id)
 556{
 557	switch (virtio_mem_mb_get_state(vm, mb_id)) {
 558	case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL:
 559		virtio_mem_mb_set_state(vm, mb_id,
 560					VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
 561		break;
 562	case VIRTIO_MEM_MB_STATE_ONLINE:
 563	case VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE:
 564		virtio_mem_mb_set_state(vm, mb_id,
 565					VIRTIO_MEM_MB_STATE_OFFLINE);
 566		break;
 567	default:
 568		BUG();
 569		break;
 570	}
 571
 572	/*
 573	 * Trigger the workqueue, maybe we can now unplug memory. Also,
 574	 * when we offline and remove a memory block, this will re-trigger
 575	 * us immediately - which is often nice because the removal of
 576	 * the memory block (e.g., memmap) might have freed up memory
 577	 * on other memory blocks we manage.
 578	 */
 579	virtio_mem_retry(vm);
 580}
 581
 582static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id,
 583				     enum zone_type zone)
 584{
 585	unsigned long nb_offline;
 586
 587	switch (virtio_mem_mb_get_state(vm, mb_id)) {
 588	case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
 589		BUG_ON(zone == ZONE_MOVABLE);
 590		virtio_mem_mb_set_state(vm, mb_id,
 591					VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
 592		break;
 593	case VIRTIO_MEM_MB_STATE_OFFLINE:
 594		if (zone == ZONE_MOVABLE)
 595			virtio_mem_mb_set_state(vm, mb_id,
 596					    VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE);
 597		else
 598			virtio_mem_mb_set_state(vm, mb_id,
 599						VIRTIO_MEM_MB_STATE_ONLINE);
 600		break;
 601	default:
 602		BUG();
 603		break;
 604	}
 605	nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
 606		     vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
 607
 608	/* see if we can add new blocks now that we onlined one block */
 609	if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1)
 610		virtio_mem_retry(vm);
 611}
 612
 613static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
 614					    unsigned long mb_id)
 615{
 616	const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
 617	struct page *page;
 618	unsigned long pfn;
 619	int sb_id, i;
 620
 621	for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
 622		if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
 623			continue;
 624		/*
 625		 * Drop our reference to the pages so the memory can get
 626		 * offlined and add the unplugged pages to the managed
 627		 * page counters (so offlining code can correctly subtract
 628		 * them again).
 629		 */
 630		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 631			       sb_id * vm->subblock_size);
 632		adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
 633		for (i = 0; i < nr_pages; i++) {
 634			page = pfn_to_page(pfn + i);
 635			if (WARN_ON(!page_ref_dec_and_test(page)))
 636				dump_page(page, "unplugged page referenced");
 637		}
 638	}
 639}
 640
 641static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm,
 642					     unsigned long mb_id)
 643{
 644	const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
 645	unsigned long pfn;
 646	int sb_id, i;
 647
 648	for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
 649		if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
 650			continue;
 651		/*
 652		 * Get the reference we dropped when going offline and
 653		 * subtract the unplugged pages from the managed page
 654		 * counters.
 655		 */
 656		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 657			       sb_id * vm->subblock_size);
 658		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
 659		for (i = 0; i < nr_pages; i++)
 660			page_ref_inc(pfn_to_page(pfn + i));
 661	}
 662}
 663
 664/*
 665 * This callback will either be called synchronously from add_memory() or
 666 * asynchronously (e.g., triggered via user space). We have to be careful
 667 * with locking when calling add_memory().
 668 */
 669static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
 670					 unsigned long action, void *arg)
 671{
 672	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
 673					     memory_notifier);
 674	struct memory_notify *mhp = arg;
 675	const unsigned long start = PFN_PHYS(mhp->start_pfn);
 676	const unsigned long size = PFN_PHYS(mhp->nr_pages);
 677	const unsigned long mb_id = virtio_mem_phys_to_mb_id(start);
 678	enum zone_type zone;
 679	int rc = NOTIFY_OK;
 680
 681	if (!virtio_mem_overlaps_range(vm, start, size))
 682		return NOTIFY_DONE;
 683
 684	/*
 685	 * Memory is onlined/offlined in memory block granularity. We cannot
 686	 * cross virtio-mem device boundaries and memory block boundaries. Bail
 687	 * out if this ever changes.
 688	 */
 689	if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
 690			 !IS_ALIGNED(start, memory_block_size_bytes())))
 691		return NOTIFY_BAD;
 692
 693	/*
 694	 * Avoid circular locking lockdep warnings. We lock the mutex
 695	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
  696	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
  697	 * between both notifier calls - resulting in a false positive.
 698	 */
 699	lockdep_off();
 700
 701	switch (action) {
 702	case MEM_GOING_OFFLINE:
 703		mutex_lock(&vm->hotplug_mutex);
 704		if (vm->removing) {
 705			rc = notifier_from_errno(-EBUSY);
 706			mutex_unlock(&vm->hotplug_mutex);
 707			break;
 708		}
 709		vm->hotplug_active = true;
 710		virtio_mem_notify_going_offline(vm, mb_id);
 711		break;
 712	case MEM_GOING_ONLINE:
 713		mutex_lock(&vm->hotplug_mutex);
 714		if (vm->removing) {
 715			rc = notifier_from_errno(-EBUSY);
 716			mutex_unlock(&vm->hotplug_mutex);
 717			break;
 718		}
 719		vm->hotplug_active = true;
 720		zone = page_zonenum(pfn_to_page(mhp->start_pfn));
 721		rc = virtio_mem_notify_going_online(vm, mb_id, zone);
 722		break;
 723	case MEM_OFFLINE:
 724		virtio_mem_notify_offline(vm, mb_id);
 725		vm->hotplug_active = false;
 726		mutex_unlock(&vm->hotplug_mutex);
 727		break;
 728	case MEM_ONLINE:
 729		zone = page_zonenum(pfn_to_page(mhp->start_pfn));
 730		virtio_mem_notify_online(vm, mb_id, zone);
 731		vm->hotplug_active = false;
 732		mutex_unlock(&vm->hotplug_mutex);
 733		break;
 734	case MEM_CANCEL_OFFLINE:
 735		if (!vm->hotplug_active)
 736			break;
 737		virtio_mem_notify_cancel_offline(vm, mb_id);
 738		vm->hotplug_active = false;
 739		mutex_unlock(&vm->hotplug_mutex);
 740		break;
 741	case MEM_CANCEL_ONLINE:
 742		if (!vm->hotplug_active)
 743			break;
 744		vm->hotplug_active = false;
 745		mutex_unlock(&vm->hotplug_mutex);
 746		break;
 747	default:
 748		break;
 749	}
 750
 751	lockdep_on();
 752
 753	return rc;
 754}
 755
 756/*
 757 * Set a range of pages PG_offline. Remember pages that were never onlined
 758 * (via generic_online_page()) using PageDirty().
 759 */
 760static void virtio_mem_set_fake_offline(unsigned long pfn,
 761					unsigned int nr_pages, bool onlined)
 762{
 763	for (; nr_pages--; pfn++) {
 764		struct page *page = pfn_to_page(pfn);
 765
 766		__SetPageOffline(page);
 767		if (!onlined) {
 768			SetPageDirty(page);
 769			/* FIXME: remove after cleanups */
 770			ClearPageReserved(page);
 771		}
 772	}
 773}
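
/*
 * Summary of the marking scheme: PageOffline() tags a page as logically
 * unplugged (fake offline); PageDirty() additionally records that the page
 * was never handed to the buddy via generic_online_page().
 * virtio_mem_fake_online() relies on this to decide between
 * generic_online_page() and free_contig_range().
 */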
 774
 775/*
 776 * Clear PG_offline from a range of pages. If the pages were never onlined,
 777 * (via generic_online_page()), clear PageDirty().
 778 */
 779static void virtio_mem_clear_fake_offline(unsigned long pfn,
 780					  unsigned int nr_pages, bool onlined)
 781{
 782	for (; nr_pages--; pfn++) {
 783		struct page *page = pfn_to_page(pfn);
 784
 785		__ClearPageOffline(page);
 786		if (!onlined)
 787			ClearPageDirty(page);
 788	}
 789}
 790
 791/*
 792 * Release a range of fake-offline pages to the buddy, effectively
 793 * fake-onlining them.
 794 */
 795static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
 796{
 797	const int order = MAX_ORDER - 1;
 798	int i;
 799
 800	/*
 801	 * We are always called with subblock granularity, which is at least
 802	 * aligned to MAX_ORDER - 1.
 803	 */
 804	for (i = 0; i < nr_pages; i += 1 << order) {
 805		struct page *page = pfn_to_page(pfn + i);
 806
 807		/*
 808		 * If the page is PageDirty(), it was kept fake-offline when
 809		 * onlining the memory block. Otherwise, it was allocated
 810		 * using alloc_contig_range(). All pages in a subblock are
 811		 * alike.
 812		 */
 813		if (PageDirty(page)) {
 814			virtio_mem_clear_fake_offline(pfn + i, 1 << order,
 815						      false);
 816			generic_online_page(page, order);
 817		} else {
 818			virtio_mem_clear_fake_offline(pfn + i, 1 << order,
 819						      true);
 820			free_contig_range(pfn + i, 1 << order);
 821			adjust_managed_page_count(page, 1 << order);
 822		}
 823	}
 824}
 825
 826static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
 827{
 828	const unsigned long addr = page_to_phys(page);
 829	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
 830	struct virtio_mem *vm;
 831	int sb_id;
 832
 833	/*
 834	 * We exploit here that subblocks have at least MAX_ORDER - 1
  835	 * size/alignment and that this callback is called with such a
 836	 * size/alignment. So we cannot cross subblocks and therefore
 837	 * also not memory blocks.
 838	 */
 839	rcu_read_lock();
 840	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
 841		if (!virtio_mem_owned_mb(vm, mb_id))
 842			continue;
 843
 844		sb_id = virtio_mem_phys_to_sb_id(vm, addr);
 845		/*
 846		 * If plugged, online the pages, otherwise, set them fake
 847		 * offline (PageOffline).
 848		 */
 849		if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
 850			generic_online_page(page, order);
 851		else
 852			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
 853						    false);
 854		rcu_read_unlock();
 855		return;
 856	}
 857	rcu_read_unlock();
 858
 859	/* not virtio-mem memory, but e.g., a DIMM. online it */
 860	generic_online_page(page, order);
 861}
 862
 863static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
 864					const struct virtio_mem_req *req)
 865{
 866	struct scatterlist *sgs[2], sg_req, sg_resp;
 867	unsigned int len;
 868	int rc;
 869
 870	/* don't use the request residing on the stack (vaddr) */
 871	vm->req = *req;
 872
 873	/* out: buffer for request */
 874	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
 875	sgs[0] = &sg_req;
 876
 877	/* in: buffer for response */
 878	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
 879	sgs[1] = &sg_resp;
 880
 881	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
 882	if (rc < 0)
 883		return rc;
 884
 885	virtqueue_kick(vm->vq);
 886
 887	/* wait for a response */
 888	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
 889
 890	return virtio16_to_cpu(vm->vdev, vm->resp.type);
 891}
 892
 893static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
 894					uint64_t size)
 895{
 896	const uint64_t nb_vm_blocks = size / vm->device_block_size;
 897	const struct virtio_mem_req req = {
 898		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
 899		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
 900		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
 901	};
 902
 903	if (atomic_read(&vm->config_changed))
 904		return -EAGAIN;
 905
 906	switch (virtio_mem_send_request(vm, &req)) {
 907	case VIRTIO_MEM_RESP_ACK:
 908		vm->plugged_size += size;
 909		return 0;
 910	case VIRTIO_MEM_RESP_NACK:
 911		return -EAGAIN;
 912	case VIRTIO_MEM_RESP_BUSY:
 913		return -ETXTBSY;
 914	case VIRTIO_MEM_RESP_ERROR:
 915		return -EINVAL;
 916	default:
 917		return -ENOMEM;
 918	}
 919}
 920
 921static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
 922					  uint64_t size)
 923{
 924	const uint64_t nb_vm_blocks = size / vm->device_block_size;
 925	const struct virtio_mem_req req = {
 926		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
 927		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
 928		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
 929	};
 930
 931	if (atomic_read(&vm->config_changed))
 932		return -EAGAIN;
 933
 934	switch (virtio_mem_send_request(vm, &req)) {
 935	case VIRTIO_MEM_RESP_ACK:
 936		vm->plugged_size -= size;
 937		return 0;
 938	case VIRTIO_MEM_RESP_BUSY:
 939		return -ETXTBSY;
 940	case VIRTIO_MEM_RESP_ERROR:
 941		return -EINVAL;
 942	default:
 943		return -ENOMEM;
 944	}
 945}
 946
 947static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
 948{
 949	const struct virtio_mem_req req = {
 950		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
 951	};
 952
 953	switch (virtio_mem_send_request(vm, &req)) {
 954	case VIRTIO_MEM_RESP_ACK:
 955		vm->unplug_all_required = false;
 956		vm->plugged_size = 0;
 957		/* usable region might have shrunk */
 958		atomic_set(&vm->config_changed, 1);
 959		return 0;
 960	case VIRTIO_MEM_RESP_BUSY:
 961		return -ETXTBSY;
 962	default:
 963		return -ENOMEM;
 964	}
 965}
 966
 967/*
 968 * Plug selected subblocks. Updates the plugged state, but not the state
 969 * of the memory block.
 970 */
 971static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
 972				 int sb_id, int count)
 973{
 974	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
 975			      sb_id * vm->subblock_size;
 976	const uint64_t size = count * vm->subblock_size;
 977	int rc;
 978
 979	dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id,
 980		sb_id, sb_id + count - 1);
 981
 982	rc = virtio_mem_send_plug_request(vm, addr, size);
 983	if (!rc)
 984		virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count);
 985	return rc;
 986}
 987
 988/*
 989 * Unplug selected subblocks. Updates the plugged state, but not the state
 990 * of the memory block.
 991 */
 992static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
 993				   int sb_id, int count)
 994{
 995	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
 996			      sb_id * vm->subblock_size;
 997	const uint64_t size = count * vm->subblock_size;
 998	int rc;
 999
1000	dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n",
1001		mb_id, sb_id, sb_id + count - 1);
1002
1003	rc = virtio_mem_send_unplug_request(vm, addr, size);
1004	if (!rc)
1005		virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count);
1006	return rc;
1007}
1008
1009/*
 1010 * Unplug the desired number of plugged subblocks of an offline or not-added
1011 * memory block. Will fail if any subblock cannot get unplugged (instead of
1012 * skipping it).
1013 *
1014 * Will not modify the state of the memory block.
1015 *
1016 * Note: can fail after some subblocks were unplugged.
1017 */
1018static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
1019				       unsigned long mb_id, uint64_t *nb_sb)
1020{
1021	int sb_id, count;
1022	int rc;
1023
1024	sb_id = vm->nb_sb_per_mb - 1;
1025	while (*nb_sb) {
1026		/* Find the next candidate subblock */
1027		while (sb_id >= 0 &&
1028		       virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1))
1029			sb_id--;
1030		if (sb_id < 0)
1031			break;
1032		/* Try to unplug multiple subblocks at a time */
1033		count = 1;
1034		while (count < *nb_sb && sb_id > 0 &&
1035		       virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
1036			count++;
1037			sb_id--;
1038		}
1039
1040		rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
1041		if (rc)
1042			return rc;
1043		*nb_sb -= count;
1044		sb_id--;
1045	}
1046
1047	return 0;
1048}
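
/*
 * Example with illustrative values: with nb_sb_per_mb = 8, *nb_sb = 4 and
 * plugged subblocks 2..5 (bitmap 0b00111100), the loop above skips the
 * unplugged subblocks 7 and 6, extends the candidate run down to sb_id = 2
 * (count = 4) and unplugs subblocks 2..5 in a single request.
 */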
1049
1050/*
1051 * Unplug all plugged subblocks of an offline or not-added memory block.
1052 *
1053 * Will not modify the state of the memory block.
1054 *
1055 * Note: can fail after some subblocks were unplugged.
1056 */
1057static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id)
1058{
1059	uint64_t nb_sb = vm->nb_sb_per_mb;
1060
1061	return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb);
1062}
1063
1064/*
1065 * Prepare tracking data for the next memory block.
1066 */
1067static int virtio_mem_prepare_next_mb(struct virtio_mem *vm,
1068				      unsigned long *mb_id)
1069{
1070	int rc;
1071
1072	if (vm->next_mb_id > vm->last_usable_mb_id)
1073		return -ENOSPC;
1074
1075	/* Resize the state array if required. */
1076	rc = virtio_mem_mb_state_prepare_next_mb(vm);
1077	if (rc)
1078		return rc;
1079
1080	/* Resize the subblock bitmap if required. */
1081	rc = virtio_mem_sb_bitmap_prepare_next_mb(vm);
1082	if (rc)
1083		return rc;
1084
1085	vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++;
1086	*mb_id = vm->next_mb_id++;
1087	return 0;
1088}
1089
1090/*
1091 * Don't add too many blocks that are not onlined yet to avoid running OOM.
1092 */
1093static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm)
1094{
1095	unsigned long nb_offline;
1096
1097	nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
1098		     vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
1099	return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD;
1100}
1101
1102/*
1103 * Try to plug the desired number of subblocks and add the memory block
1104 * to Linux.
1105 *
1106 * Will modify the state of the memory block.
1107 */
1108static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
1109				      unsigned long mb_id,
1110				      uint64_t *nb_sb)
1111{
1112	const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb);
1113	int rc, rc2;
1114
1115	if (WARN_ON_ONCE(!count))
1116		return -EINVAL;
1117
1118	/*
1119	 * Plug the requested number of subblocks before adding it to linux,
1120	 * so that onlining will directly online all plugged subblocks.
1121	 */
1122	rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count);
1123	if (rc)
1124		return rc;
1125
1126	/*
1127	 * Mark the block properly offline before adding it to Linux,
1128	 * so the memory notifiers will find the block in the right state.
1129	 */
1130	if (count == vm->nb_sb_per_mb)
1131		virtio_mem_mb_set_state(vm, mb_id,
1132					VIRTIO_MEM_MB_STATE_OFFLINE);
1133	else
1134		virtio_mem_mb_set_state(vm, mb_id,
1135					VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
1136
1137	/* Add the memory block to linux - if that fails, try to unplug. */
1138	rc = virtio_mem_mb_add(vm, mb_id);
1139	if (rc) {
1140		enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED;
1141
1142		dev_err(&vm->vdev->dev,
1143			"adding memory block %lu failed with %d\n", mb_id, rc);
1144		rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count);
1145
1146		/*
1147		 * TODO: Linux MM does not properly clean up yet in all cases
1148		 * where adding of memory failed - especially on -ENOMEM.
1149		 */
1150		if (rc2)
1151			new_state = VIRTIO_MEM_MB_STATE_PLUGGED;
1152		virtio_mem_mb_set_state(vm, mb_id, new_state);
1153		return rc;
1154	}
1155
1156	*nb_sb -= count;
1157	return 0;
1158}
1159
1160/*
1161 * Try to plug the desired number of subblocks of a memory block that
1162 * is already added to Linux.
1163 *
1164 * Will modify the state of the memory block.
1165 *
1166 * Note: Can fail after some subblocks were successfully plugged.
1167 */
1168static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
1169				     uint64_t *nb_sb, bool online)
1170{
1171	unsigned long pfn, nr_pages;
1172	int sb_id, count;
1173	int rc;
1174
1175	if (WARN_ON_ONCE(!*nb_sb))
1176		return -EINVAL;
1177
1178	while (*nb_sb) {
1179		sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id);
1180		if (sb_id >= vm->nb_sb_per_mb)
1181			break;
1182		count = 1;
1183		while (count < *nb_sb &&
1184		       sb_id + count < vm->nb_sb_per_mb &&
1185		       !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count,
1186						      1))
1187			count++;
1188
1189		rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count);
1190		if (rc)
1191			return rc;
1192		*nb_sb -= count;
1193		if (!online)
1194			continue;
1195
1196		/* fake-online the pages if the memory block is online */
1197		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1198			       sb_id * vm->subblock_size);
1199		nr_pages = PFN_DOWN(count * vm->subblock_size);
1200		virtio_mem_fake_online(pfn, nr_pages);
1201	}
1202
1203	if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
1204		if (online)
1205			virtio_mem_mb_set_state(vm, mb_id,
1206						VIRTIO_MEM_MB_STATE_ONLINE);
1207		else
1208			virtio_mem_mb_set_state(vm, mb_id,
1209						VIRTIO_MEM_MB_STATE_OFFLINE);
1210	}
1211
1212	return 0;
1213}
1214
1215/*
1216 * Try to plug the requested amount of memory.
1217 */
1218static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1219{
1220	uint64_t nb_sb = diff / vm->subblock_size;
1221	unsigned long mb_id;
1222	int rc;
1223
1224	if (!nb_sb)
1225		return 0;
1226
1227	/* Don't race with onlining/offlining */
1228	mutex_lock(&vm->hotplug_mutex);
1229
1230	/* Try to plug subblocks of partially plugged online blocks. */
1231	virtio_mem_for_each_mb_state(vm, mb_id,
1232				     VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
1233		rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true);
1234		if (rc || !nb_sb)
1235			goto out_unlock;
1236		cond_resched();
1237	}
1238
1239	/* Try to plug subblocks of partially plugged offline blocks. */
1240	virtio_mem_for_each_mb_state(vm, mb_id,
1241				     VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
1242		rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false);
1243		if (rc || !nb_sb)
1244			goto out_unlock;
1245		cond_resched();
1246	}
1247
1248	/*
1249	 * We won't be working on online/offline memory blocks from this point,
1250	 * so we can't race with memory onlining/offlining. Drop the mutex.
1251	 */
1252	mutex_unlock(&vm->hotplug_mutex);
1253
1254	/* Try to plug and add unused blocks */
1255	virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) {
1256		if (virtio_mem_too_many_mb_offline(vm))
1257			return -ENOSPC;
1258
1259		rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
1260		if (rc || !nb_sb)
1261			return rc;
1262		cond_resched();
1263	}
1264
1265	/* Try to prepare, plug and add new blocks */
1266	while (nb_sb) {
1267		if (virtio_mem_too_many_mb_offline(vm))
1268			return -ENOSPC;
1269
1270		rc = virtio_mem_prepare_next_mb(vm, &mb_id);
1271		if (rc)
1272			return rc;
1273		rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
1274		if (rc)
1275			return rc;
1276		cond_resched();
1277	}
1278
1279	return 0;
1280out_unlock:
1281	mutex_unlock(&vm->hotplug_mutex);
1282	return rc;
1283}
1284
1285/*
1286 * Unplug the desired number of plugged subblocks of an offline memory block.
1287 * Will fail if any subblock cannot get unplugged (instead of skipping it).
1288 *
1289 * Will modify the state of the memory block. Might temporarily drop the
1290 * hotplug_mutex.
1291 *
1292 * Note: Can fail after some subblocks were successfully unplugged.
1293 */
1294static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
1295					       unsigned long mb_id,
1296					       uint64_t *nb_sb)
1297{
1298	int rc;
1299
1300	rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb);
1301
1302	/* some subblocks might have been unplugged even on failure */
1303	if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb))
1304		virtio_mem_mb_set_state(vm, mb_id,
1305					VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
1306	if (rc)
1307		return rc;
1308
1309	if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
1310		/*
1311		 * Remove the block from Linux - this should never fail.
1312		 * Hinder the block from getting onlined by marking it
1313		 * unplugged. Temporarily drop the mutex, so
1314		 * any pending GOING_ONLINE requests can be serviced/rejected.
1315		 */
1316		virtio_mem_mb_set_state(vm, mb_id,
1317					VIRTIO_MEM_MB_STATE_UNUSED);
1318
1319		mutex_unlock(&vm->hotplug_mutex);
1320		rc = virtio_mem_mb_remove(vm, mb_id);
1321		BUG_ON(rc);
1322		mutex_lock(&vm->hotplug_mutex);
1323	}
1324	return 0;
1325}
1326
1327/*
1328 * Unplug the given plugged subblocks of an online memory block.
1329 *
1330 * Will modify the state of the memory block.
1331 */
1332static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm,
1333					  unsigned long mb_id, int sb_id,
1334					  int count)
1335{
1336	const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count;
1337	unsigned long start_pfn;
1338	int rc;
1339
1340	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1341			     sb_id * vm->subblock_size);
1342	rc = alloc_contig_range(start_pfn, start_pfn + nr_pages,
1343				MIGRATE_MOVABLE, GFP_KERNEL);
1344	if (rc == -ENOMEM)
1345		/* whoops, out of memory */
1346		return rc;
1347	if (rc)
1348		return -EBUSY;
1349
1350	/* Mark it as fake-offline before unplugging it */
1351	virtio_mem_set_fake_offline(start_pfn, nr_pages, true);
1352	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
1353
1354	/* Try to unplug the allocated memory */
1355	rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
1356	if (rc) {
1357		/* Return the memory to the buddy. */
1358		virtio_mem_fake_online(start_pfn, nr_pages);
1359		return rc;
1360	}
1361
1362	virtio_mem_mb_set_state(vm, mb_id,
1363				VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
1364	return 0;
1365}
1366
1367/*
1368 * Unplug the desired number of plugged subblocks of an online memory block.
 1369 * Will skip subblocks that are busy.
1370 *
1371 * Will modify the state of the memory block. Might temporarily drop the
1372 * hotplug_mutex.
1373 *
1374 * Note: Can fail after some subblocks were successfully unplugged. Can
1375 *       return 0 even if subblocks were busy and could not get unplugged.
1376 */
1377static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm,
1378					      unsigned long mb_id,
1379					      uint64_t *nb_sb)
1380{
1381	int rc, sb_id;
1382
1383	/* If possible, try to unplug the complete block in one shot. */
1384	if (*nb_sb >= vm->nb_sb_per_mb &&
1385	    virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
1386		rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0,
1387						    vm->nb_sb_per_mb);
1388		if (!rc) {
1389			*nb_sb -= vm->nb_sb_per_mb;
1390			goto unplugged;
1391		} else if (rc != -EBUSY)
1392			return rc;
1393	}
1394
1395	/* Fallback to single subblocks. */
1396	for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
1397		/* Find the next candidate subblock */
1398		while (sb_id >= 0 &&
1399		       !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
1400			sb_id--;
1401		if (sb_id < 0)
1402			break;
1403
1404		rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1);
1405		if (rc == -EBUSY)
1406			continue;
1407		else if (rc)
1408			return rc;
1409		*nb_sb -= 1;
1410	}
1411
1412unplugged:
1413	/*
1414	 * Once all subblocks of a memory block were unplugged, offline and
1415	 * remove it. This will usually not fail, as no memory is in use
1416	 * anymore - however some other notifiers might NACK the request.
1417	 */
1418	if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
1419		mutex_unlock(&vm->hotplug_mutex);
1420		rc = virtio_mem_mb_offline_and_remove(vm, mb_id);
1421		mutex_lock(&vm->hotplug_mutex);
1422		if (!rc)
1423			virtio_mem_mb_set_state(vm, mb_id,
1424						VIRTIO_MEM_MB_STATE_UNUSED);
1425	}
1426
1427	return 0;
1428}
1429
1430/*
1431 * Try to unplug the requested amount of memory.
1432 */
1433static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
1434{
1435	uint64_t nb_sb = diff / vm->subblock_size;
1436	unsigned long mb_id;
1437	int rc;
1438
1439	if (!nb_sb)
1440		return 0;
1441
1442	/*
1443	 * We'll drop the mutex a couple of times when it is safe to do so.
 1444	 * This might result in some blocks switching state (online/offline)
 1445	 * and we could miss them in this run - we will retry later.
1446	 */
1447	mutex_lock(&vm->hotplug_mutex);
1448
1449	/* Try to unplug subblocks of partially plugged offline blocks. */
1450	virtio_mem_for_each_mb_state_rev(vm, mb_id,
1451					 VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
1452		rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
1453							 &nb_sb);
1454		if (rc || !nb_sb)
1455			goto out_unlock;
1456		cond_resched();
1457	}
1458
1459	/* Try to unplug subblocks of plugged offline blocks. */
1460	virtio_mem_for_each_mb_state_rev(vm, mb_id,
1461					 VIRTIO_MEM_MB_STATE_OFFLINE) {
1462		rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
1463							 &nb_sb);
1464		if (rc || !nb_sb)
1465			goto out_unlock;
1466		cond_resched();
1467	}
1468
1469	if (!unplug_online) {
1470		mutex_unlock(&vm->hotplug_mutex);
1471		return 0;
1472	}
1473
1474	/* Try to unplug subblocks of partially plugged online blocks. */
1475	virtio_mem_for_each_mb_state_rev(vm, mb_id,
1476					 VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
1477		rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
1478							&nb_sb);
1479		if (rc || !nb_sb)
1480			goto out_unlock;
1481		mutex_unlock(&vm->hotplug_mutex);
1482		cond_resched();
1483		mutex_lock(&vm->hotplug_mutex);
1484	}
1485
1486	/* Try to unplug subblocks of plugged online blocks. */
1487	virtio_mem_for_each_mb_state_rev(vm, mb_id,
1488					 VIRTIO_MEM_MB_STATE_ONLINE) {
1489		rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
1490							&nb_sb);
1491		if (rc || !nb_sb)
1492			goto out_unlock;
1493		mutex_unlock(&vm->hotplug_mutex);
1494		cond_resched();
1495		mutex_lock(&vm->hotplug_mutex);
1496	}
1497
1498	mutex_unlock(&vm->hotplug_mutex);
1499	return nb_sb ? -EBUSY : 0;
1500out_unlock:
1501	mutex_unlock(&vm->hotplug_mutex);
1502	return rc;
1503}
1504
1505/*
1506 * Try to unplug all blocks that couldn't be unplugged before, for example,
1507 * because the hypervisor was busy.
1508 */
1509static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
1510{
1511	unsigned long mb_id;
1512	int rc;
1513
1514	virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
1515		rc = virtio_mem_mb_unplug(vm, mb_id);
1516		if (rc)
1517			return rc;
1518		virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
1519	}
1520
1521	return 0;
1522}
1523
1524/*
1525 * Update all parts of the config that could have changed.
1526 */
1527static void virtio_mem_refresh_config(struct virtio_mem *vm)
1528{
1529	const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
1530	uint64_t new_plugged_size, usable_region_size, end_addr;
1531
1532	/* the plugged_size is just a reflection of what _we_ did previously */
1533	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
1534			&new_plugged_size);
1535	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
1536		vm->plugged_size = new_plugged_size;
1537
1538	/* calculate the last usable memory block id */
1539	virtio_cread_le(vm->vdev, struct virtio_mem_config,
1540			usable_region_size, &usable_region_size);
1541	end_addr = vm->addr + usable_region_size;
1542	end_addr = min(end_addr, phys_limit);
1543	vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1;
1544
1545	/* see if there is a request to change the size */
1546	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
1547			&vm->requested_size);
1548
1549	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
1550	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
1551}
1552
1553/*
1554 * Workqueue function for handling plug/unplug requests and config updates.
1555 */
1556static void virtio_mem_run_wq(struct work_struct *work)
1557{
1558	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
1559	uint64_t diff;
1560	int rc;
1561
1562	hrtimer_cancel(&vm->retry_timer);
1563
1564	if (vm->broken)
1565		return;
1566
1567retry:
1568	rc = 0;
1569
1570	/* Make sure we start with a clean state if there are leftovers. */
1571	if (unlikely(vm->unplug_all_required))
1572		rc = virtio_mem_send_unplug_all_request(vm);
1573
1574	if (atomic_read(&vm->config_changed)) {
1575		atomic_set(&vm->config_changed, 0);
1576		virtio_mem_refresh_config(vm);
1577	}
1578
1579	/* Unplug any leftovers from previous runs */
1580	if (!rc)
1581		rc = virtio_mem_unplug_pending_mb(vm);
1582
1583	if (!rc && vm->requested_size != vm->plugged_size) {
1584		if (vm->requested_size > vm->plugged_size) {
1585			diff = vm->requested_size - vm->plugged_size;
1586			rc = virtio_mem_plug_request(vm, diff);
1587		} else {
1588			diff = vm->plugged_size - vm->requested_size;
1589			rc = virtio_mem_unplug_request(vm, diff);
1590		}
1591	}
1592
1593	switch (rc) {
1594	case 0:
1595		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
1596		break;
1597	case -ENOSPC:
1598		/*
1599		 * We cannot add any more memory (alignment, physical limit)
1600		 * or we have too many offline memory blocks.
1601		 */
1602		break;
1603	case -ETXTBSY:
1604		/*
1605		 * The hypervisor cannot process our request right now
 1606		 * (e.g., out of memory, migrating).
1607		 */
1608	case -EBUSY:
1609		/*
1610		 * We cannot free up any memory to unplug it (all plugged memory
1611		 * is busy).
1612		 */
1613	case -ENOMEM:
1614		/* Out of memory, try again later. */
1615		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
1616			      HRTIMER_MODE_REL);
1617		break;
1618	case -EAGAIN:
1619		/* Retry immediately (e.g., the config changed). */
1620		goto retry;
1621	default:
1622		/* Unknown error, mark as broken */
1623		dev_err(&vm->vdev->dev,
1624			"unknown error, marking device broken: %d\n", rc);
1625		vm->broken = true;
1626	}
1627}
1628
1629static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
1630{
1631	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
1632					     retry_timer);
1633
1634	virtio_mem_retry(vm);
1635	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
1636				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
1637	return HRTIMER_NORESTART;
1638}
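
/*
 * The retry interval thereby backs off exponentially between
 * VIRTIO_MEM_RETRY_TIMER_MIN_MS and VIRTIO_MEM_RETRY_TIMER_MAX_MS
 * (50s -> 100s -> 200s -> 300s, capped) and is reset to the minimum by
 * the next successful run of the workqueue.
 */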
1639
1640static void virtio_mem_handle_response(struct virtqueue *vq)
1641{
1642	struct virtio_mem *vm = vq->vdev->priv;
1643
1644	wake_up(&vm->host_resp);
1645}
1646
1647static int virtio_mem_init_vq(struct virtio_mem *vm)
1648{
1649	struct virtqueue *vq;
1650
1651	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
1652				   "guest-request");
1653	if (IS_ERR(vq))
1654		return PTR_ERR(vq);
1655	vm->vq = vq;
1656
1657	return 0;
1658}
1659
1660static int virtio_mem_init(struct virtio_mem *vm)
1661{
1662	const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
1663	uint16_t node_id;
1664
1665	if (!vm->vdev->config->get) {
1666		dev_err(&vm->vdev->dev, "config access disabled\n");
1667		return -EINVAL;
1668	}
1669
1670	/*
1671	 * We don't want to (un)plug or reuse any memory when in kdump. The
1672	 * memory is still accessible (but not mapped).
1673	 */
1674	if (is_kdump_kernel()) {
1675		dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
1676		return -EBUSY;
1677	}
1678
1679	/* Fetch all properties that can't change. */
1680	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
1681			&vm->plugged_size);
1682	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
1683			&vm->device_block_size);
1684	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
1685			&node_id);
1686	vm->nid = virtio_mem_translate_node_id(vm, node_id);
1687	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
1688	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
1689			&vm->region_size);
1690
1691	/*
1692	 * We always hotplug memory in memory block granularity. This way,
 1693	 * we have to wait for exactly one memory block to come online.
1694	 */
1695	if (vm->device_block_size > memory_block_size_bytes()) {
1696		dev_err(&vm->vdev->dev,
1697			"The block size is not supported (too big).\n");
1698		return -EINVAL;
1699	}
1700
1701	/* bad device setup - warn only */
1702	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
1703		dev_warn(&vm->vdev->dev,
1704			 "The alignment of the physical start address can make some memory unusable.\n");
1705	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
1706		dev_warn(&vm->vdev->dev,
1707			 "The alignment of the physical end address can make some memory unusable.\n");
1708	if (vm->addr + vm->region_size > phys_limit)
1709		dev_warn(&vm->vdev->dev,
1710			 "Some memory is not addressable. This can make some memory unusable.\n");
1711
1712	/*
1713	 * Calculate the subblock size:
1714	 * - At least MAX_ORDER - 1 / pageblock_order.
1715	 * - At least the device block size.
1716	 * In the worst case, a single subblock per memory block.
1717	 */
1718	vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1,
1719						     pageblock_order);
1720	vm->subblock_size = max_t(uint64_t, vm->device_block_size,
1721				  vm->subblock_size);
1722	vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size;
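
	/*
	 * Example with typical x86-64 values: PAGE_SIZE = 4 KiB and
	 * MAX_ORDER - 1 = 10 yield a minimum subblock size of 4 MiB; with a
	 * device block size of 2 MiB the subblock size stays 4 MiB, and a
	 * 128 MiB memory block consists of nb_sb_per_mb = 32 subblocks.
	 */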
1723
1724	/* Round up to the next full memory block */
1725	vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 +
1726						   memory_block_size_bytes());
1727	vm->next_mb_id = vm->first_mb_id;
1728	vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr +
1729			 vm->region_size) - 1;
1730
1731	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
1732	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
1733	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
1734		 (unsigned long long)vm->device_block_size);
1735	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
1736		 memory_block_size_bytes());
1737	dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
1738		 (unsigned long long)vm->subblock_size);
1739	if (vm->nid != NUMA_NO_NODE)
1740		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
1741
1742	return 0;
1743}
1744
1745static int virtio_mem_create_resource(struct virtio_mem *vm)
1746{
1747	/*
1748	 * When force-unloading the driver and removing the device, we
1749	 * could have a garbage pointer. Duplicate the string.
1750	 */
1751	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
1752
1753	if (!name)
1754		return -ENOMEM;
1755
1756	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
1757						   name, IORESOURCE_SYSTEM_RAM);
1758	if (!vm->parent_resource) {
1759		kfree(name);
1760		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
1761		dev_info(&vm->vdev->dev,
1762			 "reloading the driver is not supported\n");
1763		return -EBUSY;
1764	}
1765
1766	/* The memory is not actually busy - make add_memory() work. */
1767	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
1768	return 0;
1769}
1770
1771static void virtio_mem_delete_resource(struct virtio_mem *vm)
1772{
1773	const char *name;
1774
1775	if (!vm->parent_resource)
1776		return;
1777
1778	name = vm->parent_resource->name;
1779	release_resource(vm->parent_resource);
1780	kfree(vm->parent_resource);
1781	kfree(name);
1782	vm->parent_resource = NULL;
1783}
1784
1785static int virtio_mem_probe(struct virtio_device *vdev)
1786{
1787	struct virtio_mem *vm;
1788	int rc;
1789
1790	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
1791	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
1792
1793	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1794	if (!vm)
1795		return -ENOMEM;
1796
1797	init_waitqueue_head(&vm->host_resp);
1798	vm->vdev = vdev;
1799	INIT_WORK(&vm->wq, virtio_mem_run_wq);
1800	mutex_init(&vm->hotplug_mutex);
1801	INIT_LIST_HEAD(&vm->next);
1802	spin_lock_init(&vm->removal_lock);
1803	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1804	vm->retry_timer.function = virtio_mem_timer_expired;
1805	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
1806
1807	/* register the virtqueue */
1808	rc = virtio_mem_init_vq(vm);
1809	if (rc)
1810		goto out_free_vm;
1811
1812	/* initialize the device by querying the config */
1813	rc = virtio_mem_init(vm);
1814	if (rc)
1815		goto out_del_vq;
1816
1817	/* create the parent resource for all memory */
1818	rc = virtio_mem_create_resource(vm);
1819	if (rc)
1820		goto out_del_vq;
1821
1822	/*
1823	 * If we still have memory plugged, we have to unplug all memory first.
1824	 * Registering our parent resource makes sure that this memory isn't
1825	 * actually in use (e.g., trying to reload the driver).
1826	 */
1827	if (vm->plugged_size) {
 1828		vm->unplug_all_required = true;
1829		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
1830	}
1831
1832	/* register callbacks */
1833	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
1834	rc = register_memory_notifier(&vm->memory_notifier);
1835	if (rc)
1836		goto out_del_resource;
1837	rc = register_virtio_mem_device(vm);
1838	if (rc)
1839		goto out_unreg_mem;
1840
1841	virtio_device_ready(vdev);
1842
1843	/* trigger a config update to start processing the requested_size */
1844	atomic_set(&vm->config_changed, 1);
1845	queue_work(system_freezable_wq, &vm->wq);
1846
1847	return 0;
1848out_unreg_mem:
1849	unregister_memory_notifier(&vm->memory_notifier);
1850out_del_resource:
1851	virtio_mem_delete_resource(vm);
1852out_del_vq:
1853	vdev->config->del_vqs(vdev);
1854out_free_vm:
1855	kfree(vm);
1856	vdev->priv = NULL;
1857
1858	return rc;
1859}
1860
1861static void virtio_mem_remove(struct virtio_device *vdev)
1862{
1863	struct virtio_mem *vm = vdev->priv;
1864	unsigned long mb_id;
1865	int rc;
1866
1867	/*
1868	 * Make sure the workqueue won't be triggered anymore and no memory
1869	 * blocks can be onlined/offlined until we're finished here.
1870	 */
1871	mutex_lock(&vm->hotplug_mutex);
1872	spin_lock_irq(&vm->removal_lock);
1873	vm->removing = true;
1874	spin_unlock_irq(&vm->removal_lock);
1875	mutex_unlock(&vm->hotplug_mutex);
1876
1877	/* wait until the workqueue stopped */
1878	cancel_work_sync(&vm->wq);
1879	hrtimer_cancel(&vm->retry_timer);
1880
1881	/*
 1882	 * Once we unregister our callbacks below, user space could online
 1883	 * partially plugged offline blocks. Remove them first.
1884	 */
1885	virtio_mem_for_each_mb_state(vm, mb_id,
1886				     VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
1887		rc = virtio_mem_mb_remove(vm, mb_id);
1888		BUG_ON(rc);
1889		virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
1890	}
1891	/*
 1892	 * Once we unregister our callbacks below, user space can no longer
1893	 * offline partially plugged online memory blocks. No need to worry
1894	 * about them.
1895	 */
1896
1897	/* unregister callbacks */
1898	unregister_virtio_mem_device(vm);
1899	unregister_memory_notifier(&vm->memory_notifier);
1900
1901	/*
1902	 * There is no way we could reliably remove all memory we have added to
1903	 * the system. And there is no way to stop the driver/device from going
1904	 * away. Warn at least.
1905	 */
1906	if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
1907	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
1908	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
1909	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL] ||
1910	    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE]) {
1911		dev_warn(&vdev->dev, "device still has system memory added\n");
1912	} else {
1913		virtio_mem_delete_resource(vm);
1914		kfree_const(vm->resource_name);
1915	}
1916
1917	/* remove all tracking data - no locking needed */
1918	vfree(vm->mb_state);
1919	vfree(vm->sb_bitmap);
1920
1921	/* reset the device and cleanup the queues */
1922	vdev->config->reset(vdev);
1923	vdev->config->del_vqs(vdev);
1924
1925	kfree(vm);
1926	vdev->priv = NULL;
1927}
1928
1929static void virtio_mem_config_changed(struct virtio_device *vdev)
1930{
1931	struct virtio_mem *vm = vdev->priv;
1932
1933	atomic_set(&vm->config_changed, 1);
1934	virtio_mem_retry(vm);
1935}
1936
1937#ifdef CONFIG_PM_SLEEP
1938static int virtio_mem_freeze(struct virtio_device *vdev)
1939{
1940	/*
1941	 * When restarting the VM, all memory is usually unplugged. Don't
 1942	 * allow suspending/hibernating.
1943	 */
1944	dev_err(&vdev->dev, "save/restore not supported.\n");
1945	return -EPERM;
1946}
1947
1948static int virtio_mem_restore(struct virtio_device *vdev)
1949{
1950	return -EPERM;
1951}
1952#endif
1953
1954static unsigned int virtio_mem_features[] = {
1955#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
1956	VIRTIO_MEM_F_ACPI_PXM,
1957#endif
1958};
1959
1960static struct virtio_device_id virtio_mem_id_table[] = {
1961	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
1962	{ 0 },
1963};
1964
1965static struct virtio_driver virtio_mem_driver = {
1966	.feature_table = virtio_mem_features,
1967	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
1968	.driver.name = KBUILD_MODNAME,
1969	.driver.owner = THIS_MODULE,
1970	.id_table = virtio_mem_id_table,
1971	.probe = virtio_mem_probe,
1972	.remove = virtio_mem_remove,
1973	.config_changed = virtio_mem_config_changed,
1974#ifdef CONFIG_PM_SLEEP
1975	.freeze	=	virtio_mem_freeze,
1976	.restore =	virtio_mem_restore,
1977#endif
1978};
1979
1980module_virtio_driver(virtio_mem_driver);
1981MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
1982MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
1983MODULE_DESCRIPTION("Virtio-mem driver");
1984MODULE_LICENSE("GPL");