   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Virtio-mem device driver.
   4 *
   5 * Copyright Red Hat, Inc. 2020
   6 *
   7 * Author(s): David Hildenbrand <david@redhat.com>
   8 */
   9
  10#include <linux/virtio.h>
  11#include <linux/virtio_mem.h>
  12#include <linux/workqueue.h>
  13#include <linux/slab.h>
  14#include <linux/module.h>
  15#include <linux/mm.h>
  16#include <linux/memory_hotplug.h>
  17#include <linux/memory.h>
  18#include <linux/hrtimer.h>
  19#include <linux/crash_dump.h>
  20#include <linux/mutex.h>
  21#include <linux/bitmap.h>
  22#include <linux/lockdep.h>
  23#include <linux/log2.h>
  24#include <linux/vmalloc.h>
  25#include <linux/suspend.h>
  26
  27#include <acpi/acpi_numa.h>
  28
  29static bool unplug_online = true;
  30module_param(unplug_online, bool, 0644);
  31MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
  32
  33static bool force_bbm;
  34module_param(force_bbm, bool, 0444);
  35MODULE_PARM_DESC(force_bbm,
   36		"Force Big Block Mode. Default is 0 (auto-selection).");
  37
  38static unsigned long bbm_block_size;
  39module_param(bbm_block_size, ulong, 0444);
  40MODULE_PARM_DESC(bbm_block_size,
  41		 "Big Block size in bytes. Default is 0 (auto-detection).");
  42
  43/*
  44 * virtio-mem currently supports the following modes of operation:
  45 *
  46 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
  47 *   size of a Sub Block (SB) is determined based on the device block size, the
  48 *   pageblock size, and the maximum allocation granularity of the buddy.
  49 *   Subblocks within a Linux memory block might either be plugged or unplugged.
  50 *   Memory is added/removed to Linux MM in Linux memory block granularity.
  51 *
  52 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
  53 *   Memory is added/removed to Linux MM in Big Block granularity.
  54 *
  55 * The mode is determined automatically based on the Linux memory block size
  56 * and the device block size.
  57 *
  58 * User space / core MM (auto onlining) is responsible for onlining added
   59 * Linux memory blocks - and for selecting a zone. Linux memory blocks are
  60 * always onlined separately, and all memory within a Linux memory block is
  61 * onlined to the same zone - virtio-mem relies on this behavior.
  62 */
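/*
 * Illustration (editor's sketch, not driver code): the mode selection
 * described above, under the simplifying assumption that SBM is usable
 * whenever a Linux memory block can span at least two subblocks, i.e.,
 * the device block size is smaller than the Linux memory block size.
 * The real driver additionally honors force_bbm and bbm_block_size;
 * all demo_* names below are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool demo_use_sbm(uint64_t device_block_size, uint64_t memory_block_size)
{
	/* SBM subdivides a Linux memory block into 2..X subblocks. */
	return device_block_size < memory_block_size;
}

int main(void)
{
	const uint64_t mb_size = 128ULL << 20;	/* assume 128 MiB memory blocks */

	printf("2 MiB device blocks -> %s\n",
	       demo_use_sbm(2ULL << 20, mb_size) ? "SBM" : "BBM");
	printf("1 GiB device blocks -> %s\n",
	       demo_use_sbm(1ULL << 30, mb_size) ? "SBM" : "BBM");
	return 0;
}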
  63
  64/*
  65 * State of a Linux memory block in SBM.
  66 */
  67enum virtio_mem_sbm_mb_state {
  68	/* Unplugged, not added to Linux. Can be reused later. */
  69	VIRTIO_MEM_SBM_MB_UNUSED = 0,
  70	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
  71	VIRTIO_MEM_SBM_MB_PLUGGED,
  72	/* Fully plugged, fully added to Linux, offline. */
  73	VIRTIO_MEM_SBM_MB_OFFLINE,
  74	/* Partially plugged, fully added to Linux, offline. */
  75	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
  76	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
  77	VIRTIO_MEM_SBM_MB_KERNEL,
   78	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
  79	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
  80	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
  81	VIRTIO_MEM_SBM_MB_MOVABLE,
  82	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
  83	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
  84	VIRTIO_MEM_SBM_MB_COUNT
  85};
  86
  87/*
  88 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
  89 */
  90enum virtio_mem_bbm_bb_state {
  91	/* Unplugged, not added to Linux. Can be reused later. */
  92	VIRTIO_MEM_BBM_BB_UNUSED = 0,
  93	/* Plugged, not added to Linux. Error on add_memory(). */
  94	VIRTIO_MEM_BBM_BB_PLUGGED,
  95	/* Plugged and added to Linux. */
  96	VIRTIO_MEM_BBM_BB_ADDED,
  97	/* All online parts are fake-offline, ready to remove. */
  98	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
  99	VIRTIO_MEM_BBM_BB_COUNT
 100};
 101
 102struct virtio_mem {
 103	struct virtio_device *vdev;
 104
 105	/* We might first have to unplug all memory when starting up. */
 106	bool unplug_all_required;
 107
 108	/* Workqueue that processes the plug/unplug requests. */
 109	struct work_struct wq;
 110	atomic_t wq_active;
 111	atomic_t config_changed;
 112
 113	/* Virtqueue for guest->host requests. */
 114	struct virtqueue *vq;
 115
 116	/* Wait for a host response to a guest request. */
 117	wait_queue_head_t host_resp;
 118
 119	/* Space for one guest request and the host response. */
 120	struct virtio_mem_req req;
 121	struct virtio_mem_resp resp;
 122
 123	/* The current size of the device. */
 124	uint64_t plugged_size;
 125	/* The requested size of the device. */
 126	uint64_t requested_size;
 127
 128	/* The device block size (for communicating with the device). */
 129	uint64_t device_block_size;
 130	/* The determined node id for all memory of the device. */
 131	int nid;
 132	/* Physical start address of the memory region. */
 133	uint64_t addr;
 134	/* Maximum region size in bytes. */
 135	uint64_t region_size;
 136
 137	/* The parent resource for all memory added via this device. */
 138	struct resource *parent_resource;
 139	/*
 140	 * Copy of "System RAM (virtio_mem)" to be used for
 141	 * add_memory_driver_managed().
 142	 */
 143	const char *resource_name;
 144	/* Memory group identification. */
 145	int mgid;
 146
 147	/*
 148	 * We don't want to add too much memory if it's not getting onlined,
  149	 * to avoid running OOM. Besides this threshold, we allow having at
  150	 * least two offline blocks at a time (whichever is bigger).
 151	 */
 152#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
 153	atomic64_t offline_size;
 154	uint64_t offline_threshold;
 155
 156	/* If set, the driver is in SBM, otherwise in BBM. */
 157	bool in_sbm;
 158
 159	union {
 160		struct {
 161			/* Id of the first memory block of this device. */
 162			unsigned long first_mb_id;
 163			/* Id of the last usable memory block of this device. */
 164			unsigned long last_usable_mb_id;
  165			/* Id of the next memory block to prepare when needed. */
 166			unsigned long next_mb_id;
 167
 168			/* The subblock size. */
 169			uint64_t sb_size;
 170			/* The number of subblocks per Linux memory block. */
 171			uint32_t sbs_per_mb;
 172
 173			/*
 174			 * Some of the Linux memory blocks tracked as "partially
 175			 * plugged" are completely unplugged and can be offlined
 176			 * and removed -- which previously failed.
 177			 */
 178			bool have_unplugged_mb;
 179
 180			/* Summary of all memory block states. */
 181			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
 182
 183			/*
 184			 * One byte state per memory block. Allocated via
 185			 * vmalloc(). Resized (alloc+copy+free) on demand.
 186			 *
 187			 * With 128 MiB memory blocks, we have states for 512
 188			 * GiB of memory in one 4 KiB page.
 189			 */
 190			uint8_t *mb_states;
 191
 192			/*
 193			 * Bitmap: one bit per subblock. Allocated similar to
 194			 * sbm.mb_states.
 195			 *
 196			 * A set bit means the corresponding subblock is
  197			 * plugged, otherwise it's unplugged.
 198			 *
 199			 * With 4 MiB subblocks, we manage 128 GiB of memory
 200			 * in one 4 KiB page.
 201			 */
 202			unsigned long *sb_states;
 203		} sbm;
 204
 205		struct {
 206			/* Id of the first big block of this device. */
 207			unsigned long first_bb_id;
 208			/* Id of the last usable big block of this device. */
 209			unsigned long last_usable_bb_id;
  210			/* Id of the next big block to prepare when needed. */
 211			unsigned long next_bb_id;
 212
 213			/* Summary of all big block states. */
 214			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
 215
 216			/* One byte state per big block. See sbm.mb_states. */
 217			uint8_t *bb_states;
 218
 219			/* The block size used for plugging/adding/removing. */
 220			uint64_t bb_size;
 221		} bbm;
 222	};
 223
 224	/*
 225	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
  226	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states.
 227	 *
 228	 * When this lock is held the pointers can't change, ONLINE and
 229	 * OFFLINE blocks can't change the state and no subblocks will get
 230	 * plugged/unplugged.
 231	 *
 232	 * In kdump mode, used to serialize requests, last_block_addr and
 233	 * last_block_plugged.
 234	 */
 235	struct mutex hotplug_mutex;
 236	bool hotplug_active;
 237
 238	/* An error occurred we cannot handle - stop processing requests. */
 239	bool broken;
 240
  241	/* Cached value of is_kdump_kernel() when the device was probed. */
 242	bool in_kdump;
 243
 244	/* The driver is being removed. */
 245	spinlock_t removal_lock;
 246	bool removing;
 247
 248	/* Timer for retrying to plug/unplug memory. */
 249	struct hrtimer retry_timer;
 250	unsigned int retry_timer_ms;
 251#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
 252#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000
 253
 254	/* Memory notifier (online/offline events). */
 255	struct notifier_block memory_notifier;
 256
 257	/* Notifier to block hibernation image storing/reloading. */
 258	struct notifier_block pm_notifier;
 259
 260#ifdef CONFIG_PROC_VMCORE
 261	/* vmcore callback for /proc/vmcore handling in kdump mode */
 262	struct vmcore_cb vmcore_cb;
 263	uint64_t last_block_addr;
 264	bool last_block_plugged;
 265#endif /* CONFIG_PROC_VMCORE */
 266
 267	/* Next device in the list of virtio-mem devices. */
 268	struct list_head next;
 269};
 270
 271/*
 272 * We have to share a single online_page callback among all virtio-mem
 273 * devices. We use RCU to iterate the list in the callback.
 274 */
 275static DEFINE_MUTEX(virtio_mem_mutex);
 276static LIST_HEAD(virtio_mem_devices);
 277
 278static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
 279static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
 280						  unsigned long nr_pages);
 281static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
 282						   unsigned long nr_pages);
 283static void virtio_mem_retry(struct virtio_mem *vm);
 284static int virtio_mem_create_resource(struct virtio_mem *vm);
 285static void virtio_mem_delete_resource(struct virtio_mem *vm);
 286
 287/*
 288 * Register a virtio-mem device so it will be considered for the online_page
 289 * callback.
 290 */
 291static int register_virtio_mem_device(struct virtio_mem *vm)
 292{
 293	int rc = 0;
 294
 295	/* First device registers the callback. */
 296	mutex_lock(&virtio_mem_mutex);
 297	if (list_empty(&virtio_mem_devices))
 298		rc = set_online_page_callback(&virtio_mem_online_page_cb);
 299	if (!rc)
 300		list_add_rcu(&vm->next, &virtio_mem_devices);
 301	mutex_unlock(&virtio_mem_mutex);
 302
 303	return rc;
 304}
 305
 306/*
 307 * Unregister a virtio-mem device so it will no longer be considered for the
 308 * online_page callback.
 309 */
 310static void unregister_virtio_mem_device(struct virtio_mem *vm)
 311{
 312	/* Last device unregisters the callback. */
 313	mutex_lock(&virtio_mem_mutex);
 314	list_del_rcu(&vm->next);
 315	if (list_empty(&virtio_mem_devices))
 316		restore_online_page_callback(&virtio_mem_online_page_cb);
 317	mutex_unlock(&virtio_mem_mutex);
 318
 319	synchronize_rcu();
 320}
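/*
 * Note: the synchronize_rcu() above guarantees that every concurrent
 * virtio_mem_online_page_cb() walker that might still see this device on
 * the RCU-protected list has left its read-side critical section before
 * the caller continues tearing the device down.
 */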
 321
 322/*
 323 * Calculate the memory block id of a given address.
 324 */
 325static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
 326{
 327	return addr / memory_block_size_bytes();
 328}
 329
 330/*
 331 * Calculate the physical start address of a given memory block id.
 332 */
 333static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
 334{
 335	return mb_id * memory_block_size_bytes();
 336}
 337
 338/*
 339 * Calculate the big block id of a given address.
 340 */
 341static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
 342					      uint64_t addr)
 343{
 344	return addr / vm->bbm.bb_size;
 345}
 346
 347/*
 348 * Calculate the physical start address of a given big block id.
 349 */
 350static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
 351					 unsigned long bb_id)
 352{
 353	return bb_id * vm->bbm.bb_size;
 354}
 355
 356/*
 357 * Calculate the subblock id of a given address.
 358 */
 359static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
 360					      unsigned long addr)
 361{
 362	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
 363	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
 364
 365	return (addr - mb_addr) / vm->sbm.sb_size;
 366}
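/*
 * Worked example for the helpers above (hypothetical sizes): with 128 MiB
 * Linux memory blocks and 4 MiB subblocks, addr = 0x41000000
 * (1 GiB + 16 MiB) resolves to
 *
 *	mb_id   = 0x41000000 / (128 MiB)              = 8
 *	mb_addr = 8 * (128 MiB)                       = 0x40000000
 *	sb_id   = (0x41000000 - 0x40000000) / (4 MiB) = 4
 */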
 367
 368/*
 369 * Set the state of a big block, taking care of the state counter.
 370 */
 371static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
 372					unsigned long bb_id,
 373					enum virtio_mem_bbm_bb_state state)
 374{
 375	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
 376	enum virtio_mem_bbm_bb_state old_state;
 377
 378	old_state = vm->bbm.bb_states[idx];
 379	vm->bbm.bb_states[idx] = state;
 380
 381	BUG_ON(vm->bbm.bb_count[old_state] == 0);
 382	vm->bbm.bb_count[old_state]--;
 383	vm->bbm.bb_count[state]++;
 384}
 385
 386/*
 387 * Get the state of a big block.
 388 */
 389static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
 390								unsigned long bb_id)
 391{
 392	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
 393}
 394
 395/*
 396 * Prepare the big block state array for the next big block.
 397 */
 398static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
 399{
 400	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
 401	unsigned long new_bytes = old_bytes + 1;
 402	int old_pages = PFN_UP(old_bytes);
 403	int new_pages = PFN_UP(new_bytes);
 404	uint8_t *new_array;
 405
 406	if (vm->bbm.bb_states && old_pages == new_pages)
 407		return 0;
 408
 409	new_array = vzalloc(new_pages * PAGE_SIZE);
 410	if (!new_array)
 411		return -ENOMEM;
 412
 413	mutex_lock(&vm->hotplug_mutex);
 414	if (vm->bbm.bb_states)
 415		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
 416	vfree(vm->bbm.bb_states);
 417	vm->bbm.bb_states = new_array;
 418	mutex_unlock(&vm->hotplug_mutex);
 419
 420	return 0;
 421}
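/*
 * Editor's sketch (userspace stand-in, with malloc()/free() replacing
 * vzalloc()/vfree() and the hotplug_mutex omitted): the grow-on-demand
 * pattern used above only reallocates when the number of backing pages
 * changes, and copies the old states over before publishing the new
 * array. All demo_* names are hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_PAGE_SIZE	4096UL

static int demo_states_grow(uint8_t **states, unsigned long old_bytes,
			    unsigned long new_bytes)
{
	unsigned long old_pages = (old_bytes + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
	unsigned long new_pages = (new_bytes + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
	uint8_t *new_array;

	/* Still fits into the pages we already have allocated. */
	if (*states && old_pages == new_pages)
		return 0;

	new_array = calloc(new_pages, DEMO_PAGE_SIZE);
	if (!new_array)
		return -1;	/* the driver would return -ENOMEM */

	if (*states)
		memcpy(new_array, *states, old_pages * DEMO_PAGE_SIZE);
	free(*states);
	*states = new_array;
	return 0;
}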
 422
 423#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
  424	for (_bb_id = _vm->bbm.first_bb_id; \
  425	     _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
 426	     _bb_id++) \
 427		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
 428
 429#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
  430	for (_bb_id = _vm->bbm.next_bb_id - 1; \
  431	     _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
 432	     _bb_id--) \
 433		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
 434
 435/*
 436 * Set the state of a memory block, taking care of the state counter.
 437 */
 438static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
 439					unsigned long mb_id, uint8_t state)
 440{
 441	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
 442	uint8_t old_state;
 443
 444	old_state = vm->sbm.mb_states[idx];
 445	vm->sbm.mb_states[idx] = state;
 446
 447	BUG_ON(vm->sbm.mb_count[old_state] == 0);
 448	vm->sbm.mb_count[old_state]--;
 449	vm->sbm.mb_count[state]++;
 450}
 451
 452/*
 453 * Get the state of a memory block.
 454 */
 455static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
 456					   unsigned long mb_id)
 457{
 458	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
 459
 460	return vm->sbm.mb_states[idx];
 461}
 462
 463/*
 464 * Prepare the state array for the next memory block.
 465 */
 466static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
 467{
 468	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
 469	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
 470	uint8_t *new_array;
 471
 472	if (vm->sbm.mb_states && old_pages == new_pages)
 473		return 0;
 474
 475	new_array = vzalloc(new_pages * PAGE_SIZE);
 476	if (!new_array)
 477		return -ENOMEM;
 478
 479	mutex_lock(&vm->hotplug_mutex);
 480	if (vm->sbm.mb_states)
 481		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
 482	vfree(vm->sbm.mb_states);
 483	vm->sbm.mb_states = new_array;
 484	mutex_unlock(&vm->hotplug_mutex);
 485
 486	return 0;
 487}
 488
 489#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
 490	for (_mb_id = _vm->sbm.first_mb_id; \
 491	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
 492	     _mb_id++) \
 493		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
 494
 495#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
 496	for (_mb_id = _vm->sbm.next_mb_id - 1; \
 497	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
 498	     _mb_id--) \
 499		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
 500
 501/*
 502 * Calculate the bit number in the subblock bitmap for the given subblock
 503 * inside the given memory block.
 504 */
 505static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
 506					  unsigned long mb_id, int sb_id)
 507{
 508	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
 509}
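/*
 * Worked example (hypothetical values): with sbs_per_mb = 32 and
 * first_mb_id = 8, the subblock (mb_id = 10, sb_id = 5) maps to bit
 * (10 - 8) * 32 + 5 = 69 in the shared sb_states bitmap.
 */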
 510
 511/*
 512 * Mark all selected subblocks plugged.
 513 *
 514 * Will not modify the state of the memory block.
 515 */
 516static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
 517					  unsigned long mb_id, int sb_id,
 518					  int count)
 519{
 520	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 521
 522	__bitmap_set(vm->sbm.sb_states, bit, count);
 523}
 524
 525/*
 526 * Mark all selected subblocks unplugged.
 527 *
 528 * Will not modify the state of the memory block.
 529 */
 530static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
 531					    unsigned long mb_id, int sb_id,
 532					    int count)
 533{
 534	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 535
 536	__bitmap_clear(vm->sbm.sb_states, bit, count);
 537}
 538
 539/*
 540 * Test if all selected subblocks are plugged.
 541 */
 542static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
 543					   unsigned long mb_id, int sb_id,
 544					   int count)
 545{
 546	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 547
 548	if (count == 1)
 549		return test_bit(bit, vm->sbm.sb_states);
 550
 551	/* TODO: Helper similar to bitmap_set() */
 552	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
 553	       bit + count;
 554}
 555
 556/*
 557 * Test if all selected subblocks are unplugged.
 558 */
 559static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
 560					     unsigned long mb_id, int sb_id,
 561					     int count)
 562{
 563	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 564
 565	/* TODO: Helper similar to bitmap_set() */
 566	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
 567	       bit + count;
 568}
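/*
 * Editor's sketch (userspace stand-in for the kernel bitmap helpers; all
 * demo_* names are hypothetical): "all bits set in [bit, bit + count)" is
 * equivalent to "the first zero bit at or after 'bit' lies at or beyond
 * 'bit + count'", which is exactly what the find_next_zero_bit() call
 * above tests -- and symmetrically for find_next_bit().
 */
#include <stdbool.h>

#define DEMO_BITS_PER_LONG	(8 * sizeof(unsigned long))

static unsigned long demo_find_next_zero_bit(const unsigned long *map,
					     unsigned long size,
					     unsigned long start)
{
	for (; start < size; start++)
		if (!(map[start / DEMO_BITS_PER_LONG] &
		      (1UL << (start % DEMO_BITS_PER_LONG))))
			return start;
	return size;
}

static bool demo_all_set(const unsigned long *map, unsigned long bit,
			 unsigned long count)
{
	return demo_find_next_zero_bit(map, bit + count, bit) >= bit + count;
}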
 569
 570/*
 571 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
 572 * none.
 573 */
 574static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
 575					    unsigned long mb_id)
 576{
 577	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
 578
 579	return find_next_zero_bit(vm->sbm.sb_states,
 580				  bit + vm->sbm.sbs_per_mb, bit) - bit;
 581}
 582
 583/*
 584 * Prepare the subblock bitmap for the next memory block.
 585 */
 586static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
 587{
 588	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
 589	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
 590	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
 591	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
 592	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
 593	unsigned long *new_bitmap, *old_bitmap;
 594
 595	if (vm->sbm.sb_states && old_pages == new_pages)
 596		return 0;
 597
 598	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
 599	if (!new_bitmap)
 600		return -ENOMEM;
 601
 602	mutex_lock(&vm->hotplug_mutex);
 603	if (vm->sbm.sb_states)
 604		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
 605
 606	old_bitmap = vm->sbm.sb_states;
 607	vm->sbm.sb_states = new_bitmap;
 608	mutex_unlock(&vm->hotplug_mutex);
 609
 610	vfree(old_bitmap);
 611	return 0;
 612}
 613
 614/*
 615 * Test if we could add memory without creating too much offline memory -
 616 * to avoid running OOM if memory is getting onlined deferred.
 617 */
 618static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
 619{
 620	if (WARN_ON_ONCE(size > vm->offline_threshold))
 621		return false;
 622
 623	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
 624}
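/*
 * Worked example (hypothetical numbers): with the default 1 GiB
 * offline_threshold, a device currently holding 768 MiB of offline memory
 * may still add a 128 MiB memory block (768 MiB + 128 MiB <= 1 GiB), but
 * not a 512 MiB big block (768 MiB + 512 MiB > 1 GiB).
 */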
 625
 626/*
 627 * Try adding memory to Linux. Will usually only fail if out of memory.
 628 *
 629 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 630 * onlining code).
 631 *
 632 * Will not modify the state of memory blocks in virtio-mem.
 633 */
 634static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
 635				 uint64_t size)
 636{
 637	int rc;
 638
 639	/*
 640	 * When force-unloading the driver and we still have memory added to
 641	 * Linux, the resource name has to stay.
 642	 */
 643	if (!vm->resource_name) {
 644		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
 645						  GFP_KERNEL);
 646		if (!vm->resource_name)
 647			return -ENOMEM;
 648	}
 649
 650	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
 651		addr + size - 1);
 652	/* Memory might get onlined immediately. */
 653	atomic64_add(size, &vm->offline_size);
 654	rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
 655				       MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
 656	if (rc) {
 657		atomic64_sub(size, &vm->offline_size);
 658		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
 659		/*
 660		 * TODO: Linux MM does not properly clean up yet in all cases
 661		 * where adding of memory failed - especially on -ENOMEM.
 662		 */
 663	}
 664	return rc;
 665}
 666
 667/*
 668 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
 669 */
 670static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
 671{
 672	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 673	const uint64_t size = memory_block_size_bytes();
 674
 675	return virtio_mem_add_memory(vm, addr, size);
 676}
 677
 678/*
 679 * See virtio_mem_add_memory(): Try adding a big block.
 680 */
 681static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
 682{
 683	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
 684	const uint64_t size = vm->bbm.bb_size;
 685
 686	return virtio_mem_add_memory(vm, addr, size);
 687}
 688
 689/*
 690 * Try removing memory from Linux. Will only fail if memory blocks aren't
 691 * offline.
 692 *
 693 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 694 * onlining code).
 695 *
 696 * Will not modify the state of memory blocks in virtio-mem.
 697 */
 698static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
 699				    uint64_t size)
 700{
 701	int rc;
 702
 703	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
 704		addr + size - 1);
 705	rc = remove_memory(addr, size);
 706	if (!rc) {
 707		atomic64_sub(size, &vm->offline_size);
 708		/*
 709		 * We might have freed up memory we can now unplug, retry
 710		 * immediately instead of waiting.
 711		 */
 712		virtio_mem_retry(vm);
 713	} else {
 714		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
 715	}
 716	return rc;
 717}
 718
 719/*
 720 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
 721 */
 722static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
 723{
 724	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 725	const uint64_t size = memory_block_size_bytes();
 726
 727	return virtio_mem_remove_memory(vm, addr, size);
 728}
 729
 730/*
 731 * Try offlining and removing memory from Linux.
 732 *
 733 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 734 * onlining code).
 735 *
 736 * Will not modify the state of memory blocks in virtio-mem.
 737 */
 738static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
 739						uint64_t addr,
 740						uint64_t size)
 741{
 742	int rc;
 743
 744	dev_dbg(&vm->vdev->dev,
 745		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
 746		addr + size - 1);
 747
 748	rc = offline_and_remove_memory(addr, size);
 749	if (!rc) {
 750		atomic64_sub(size, &vm->offline_size);
 751		/*
 752		 * We might have freed up memory we can now unplug, retry
 753		 * immediately instead of waiting.
 754		 */
 755		virtio_mem_retry(vm);
 756		return 0;
 757	}
 758	dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc);
 759	/*
 760	 * We don't really expect this to fail, because we fake-offlined all
 761	 * memory already. But it could fail in corner cases.
 762	 */
 763	WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY);
 764	return rc == -ENOMEM ? -ENOMEM : -EBUSY;
 765}
 766
 767/*
 768 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 769 * a single Linux memory block.
 770 */
 771static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
 772						unsigned long mb_id)
 773{
 774	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 775	const uint64_t size = memory_block_size_bytes();
 776
 777	return virtio_mem_offline_and_remove_memory(vm, addr, size);
 778}
 779
 780/*
 781 * Try (offlining and) removing memory from Linux in case all subblocks are
 782 * unplugged. Can be called on online and offline memory blocks.
 783 *
 784 * May modify the state of memory blocks in virtio-mem.
 785 */
 786static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm,
 787						  unsigned long mb_id)
 788{
 789	int rc;
 790
 791	/*
 792	 * Once all subblocks of a memory block were unplugged, offline and
 793	 * remove it.
 794	 */
 795	if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
 796		return 0;
 797
 798	/* offline_and_remove_memory() works for online and offline memory. */
 799	mutex_unlock(&vm->hotplug_mutex);
 800	rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
 801	mutex_lock(&vm->hotplug_mutex);
 802	if (!rc)
 803		virtio_mem_sbm_set_mb_state(vm, mb_id,
 804					    VIRTIO_MEM_SBM_MB_UNUSED);
 805	return rc;
 806}
 807
 808/*
  809 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
  810 * all Linux memory blocks covered by the big block.
 811 */
 812static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
 813						unsigned long bb_id)
 814{
 815	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
 816	const uint64_t size = vm->bbm.bb_size;
 817
 818	return virtio_mem_offline_and_remove_memory(vm, addr, size);
 819}
 820
 821/*
 822 * Trigger the workqueue so the device can perform its magic.
 823 */
 824static void virtio_mem_retry(struct virtio_mem *vm)
 825{
 826	unsigned long flags;
 827
 828	spin_lock_irqsave(&vm->removal_lock, flags);
 829	if (!vm->removing)
 830		queue_work(system_freezable_wq, &vm->wq);
 831	spin_unlock_irqrestore(&vm->removal_lock, flags);
 832}
 833
 834static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
 835{
 836	int node = NUMA_NO_NODE;
 837
 838#if defined(CONFIG_ACPI_NUMA)
 839	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
 840		node = pxm_to_node(node_id);
 841#endif
 842	return node;
 843}
 844
 845/*
 846 * Test if a virtio-mem device overlaps with the given range. Can be called
 847 * from (notifier) callbacks lockless.
 848 */
 849static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
 850				      uint64_t size)
 851{
 852	return start < vm->addr + vm->region_size && vm->addr < start + size;
 853}
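/*
 * Note: this is the standard half-open interval intersection test -- the
 * ranges [start, start + size) and [vm->addr, vm->addr + vm->region_size)
 * overlap iff each range begins before the other one ends.
 */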
 854
 855/*
 856 * Test if a virtio-mem device contains a given range. Can be called from
 857 * (notifier) callbacks lockless.
 858 */
 859static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
 860				      uint64_t size)
 861{
 862	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
 863}
 864
 865static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
 866					      unsigned long mb_id)
 867{
 868	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 869	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
 870	case VIRTIO_MEM_SBM_MB_OFFLINE:
 871		return NOTIFY_OK;
 872	default:
 873		break;
 874	}
 875	dev_warn_ratelimited(&vm->vdev->dev,
 876			     "memory block onlining denied\n");
 877	return NOTIFY_BAD;
 878}
 879
 880static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
 881					  unsigned long mb_id)
 882{
 883	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 884	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
 885	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
 886		virtio_mem_sbm_set_mb_state(vm, mb_id,
 887					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
 888		break;
 889	case VIRTIO_MEM_SBM_MB_KERNEL:
 890	case VIRTIO_MEM_SBM_MB_MOVABLE:
 891		virtio_mem_sbm_set_mb_state(vm, mb_id,
 892					    VIRTIO_MEM_SBM_MB_OFFLINE);
 893		break;
 894	default:
 895		BUG();
 896		break;
 897	}
 898}
 899
 900static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
 901					 unsigned long mb_id,
 902					 unsigned long start_pfn)
 903{
 904	const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn));
 905	int new_state;
 906
 907	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 908	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
 909		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
 910		if (is_movable)
 911			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
 912		break;
 913	case VIRTIO_MEM_SBM_MB_OFFLINE:
 914		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
 915		if (is_movable)
 916			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
 917		break;
 918	default:
 919		BUG();
 920		break;
 921	}
 922	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
 923}
 924
 925static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
 926						unsigned long mb_id)
 927{
 928	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
 929	unsigned long pfn;
 930	int sb_id;
 931
 932	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
 933		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
 934			continue;
 935		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 936			       sb_id * vm->sbm.sb_size);
 937		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
 938	}
 939}
 940
 941static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
 942						 unsigned long mb_id)
 943{
 944	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
 945	unsigned long pfn;
 946	int sb_id;
 947
 948	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
 949		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
 950			continue;
 951		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 952			       sb_id * vm->sbm.sb_size);
 953		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
 954	}
 955}
 956
 957static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
 958						unsigned long bb_id,
 959						unsigned long pfn,
 960						unsigned long nr_pages)
 961{
 962	/*
 963	 * When marked as "fake-offline", all online memory of this device block
 964	 * is allocated by us. Otherwise, we don't have any memory allocated.
 965	 */
 966	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
 967	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
 968		return;
 969	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
 970}
 971
 972static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
 973						 unsigned long bb_id,
 974						 unsigned long pfn,
 975						 unsigned long nr_pages)
 976{
 977	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
 978	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
 979		return;
 980	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
 981}
 982
 983/*
 984 * This callback will either be called synchronously from add_memory() or
 985 * asynchronously (e.g., triggered via user space). We have to be careful
 986 * with locking when calling add_memory().
 987 */
 988static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
 989					 unsigned long action, void *arg)
 990{
 991	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
 992					     memory_notifier);
 993	struct memory_notify *mhp = arg;
 994	const unsigned long start = PFN_PHYS(mhp->start_pfn);
 995	const unsigned long size = PFN_PHYS(mhp->nr_pages);
 996	int rc = NOTIFY_OK;
 997	unsigned long id;
 998
 999	if (!virtio_mem_overlaps_range(vm, start, size))
1000		return NOTIFY_DONE;
1001
1002	if (vm->in_sbm) {
1003		id = virtio_mem_phys_to_mb_id(start);
1004		/*
1005		 * In SBM, we add memory in separate memory blocks - we expect
1006		 * it to be onlined/offlined in the same granularity. Bail out
1007		 * if this ever changes.
1008		 */
1009		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
1010				 !IS_ALIGNED(start, memory_block_size_bytes())))
1011			return NOTIFY_BAD;
1012	} else {
1013		id = virtio_mem_phys_to_bb_id(vm, start);
1014		/*
1015		 * In BBM, we only care about onlining/offlining happening
1016		 * within a single big block, we don't care about the
1017		 * actual granularity as we don't track individual Linux
1018		 * memory blocks.
1019		 */
1020		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
1021			return NOTIFY_BAD;
1022	}
1023
1024	/*
1025	 * Avoid circular locking lockdep warnings. We lock the mutex
1026	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
 1027	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
1028	 * between both notifier calls and will bail out. False positive.
1029	 */
1030	lockdep_off();
1031
1032	switch (action) {
1033	case MEM_GOING_OFFLINE:
1034		mutex_lock(&vm->hotplug_mutex);
1035		if (vm->removing) {
1036			rc = notifier_from_errno(-EBUSY);
1037			mutex_unlock(&vm->hotplug_mutex);
1038			break;
1039		}
1040		vm->hotplug_active = true;
1041		if (vm->in_sbm)
1042			virtio_mem_sbm_notify_going_offline(vm, id);
1043		else
1044			virtio_mem_bbm_notify_going_offline(vm, id,
1045							    mhp->start_pfn,
1046							    mhp->nr_pages);
1047		break;
1048	case MEM_GOING_ONLINE:
1049		mutex_lock(&vm->hotplug_mutex);
1050		if (vm->removing) {
1051			rc = notifier_from_errno(-EBUSY);
1052			mutex_unlock(&vm->hotplug_mutex);
1053			break;
1054		}
1055		vm->hotplug_active = true;
1056		if (vm->in_sbm)
1057			rc = virtio_mem_sbm_notify_going_online(vm, id);
1058		break;
1059	case MEM_OFFLINE:
1060		if (vm->in_sbm)
1061			virtio_mem_sbm_notify_offline(vm, id);
1062
1063		atomic64_add(size, &vm->offline_size);
1064		/*
1065		 * Trigger the workqueue. Now that we have some offline memory,
1066		 * maybe we can handle pending unplug requests.
1067		 */
1068		if (!unplug_online)
1069			virtio_mem_retry(vm);
1070
1071		vm->hotplug_active = false;
1072		mutex_unlock(&vm->hotplug_mutex);
1073		break;
1074	case MEM_ONLINE:
1075		if (vm->in_sbm)
1076			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
1077
1078		atomic64_sub(size, &vm->offline_size);
1079		/*
1080		 * Start adding more memory once we onlined half of our
 1081		 * threshold. Don't trigger if it's possibly due to our action
1082		 * (e.g., us adding memory which gets onlined immediately from
1083		 * the core).
1084		 */
1085		if (!atomic_read(&vm->wq_active) &&
1086		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
1087			virtio_mem_retry(vm);
1088
1089		vm->hotplug_active = false;
1090		mutex_unlock(&vm->hotplug_mutex);
1091		break;
1092	case MEM_CANCEL_OFFLINE:
1093		if (!vm->hotplug_active)
1094			break;
1095		if (vm->in_sbm)
1096			virtio_mem_sbm_notify_cancel_offline(vm, id);
1097		else
1098			virtio_mem_bbm_notify_cancel_offline(vm, id,
1099							     mhp->start_pfn,
1100							     mhp->nr_pages);
1101		vm->hotplug_active = false;
1102		mutex_unlock(&vm->hotplug_mutex);
1103		break;
1104	case MEM_CANCEL_ONLINE:
1105		if (!vm->hotplug_active)
1106			break;
1107		vm->hotplug_active = false;
1108		mutex_unlock(&vm->hotplug_mutex);
1109		break;
1110	default:
1111		break;
1112	}
1113
1114	lockdep_on();
1115
1116	return rc;
1117}
1118
1119static int virtio_mem_pm_notifier_cb(struct notifier_block *nb,
1120				     unsigned long action, void *arg)
1121{
1122	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
1123					     pm_notifier);
1124	switch (action) {
1125	case PM_HIBERNATION_PREPARE:
1126	case PM_RESTORE_PREPARE:
1127		/*
1128		 * When restarting the VM, all memory is unplugged. Don't
 1129		 * allow hibernating and restoring from an image.
1130		 */
1131		dev_err(&vm->vdev->dev, "hibernation is not supported.\n");
1132		return NOTIFY_BAD;
1133	default:
1134		return NOTIFY_OK;
1135	}
1136}
1137
1138/*
1139 * Set a range of pages PG_offline. Remember pages that were never onlined
1140 * (via generic_online_page()) using PageDirty().
1141 */
1142static void virtio_mem_set_fake_offline(unsigned long pfn,
1143					unsigned long nr_pages, bool onlined)
1144{
1145	page_offline_begin();
1146	for (; nr_pages--; pfn++) {
1147		struct page *page = pfn_to_page(pfn);
1148
1149		if (!onlined)
1150			/*
1151			 * Pages that have not been onlined yet were initialized
1152			 * to PageOffline(). Remember that we have to route them
1153			 * through generic_online_page().
1154			 */
1155			SetPageDirty(page);
1156		else
1157			__SetPageOffline(page);
1158		VM_WARN_ON_ONCE(!PageOffline(page));
1159	}
1160	page_offline_end();
1161}
1162
1163/*
 1164 * Clear PG_offline from a range of pages. If the pages were never onlined
 1165 * (via generic_online_page()), clear PageDirty().
1166 */
1167static void virtio_mem_clear_fake_offline(unsigned long pfn,
1168					  unsigned long nr_pages, bool onlined)
1169{
1170	for (; nr_pages--; pfn++) {
1171		struct page *page = pfn_to_page(pfn);
1172
1173		if (!onlined)
1174			/* generic_online_page() will clear PageOffline(). */
1175			ClearPageDirty(page);
1176		else
1177			__ClearPageOffline(page);
1178	}
1179}
1180
1181/*
1182 * Release a range of fake-offline pages to the buddy, effectively
1183 * fake-onlining them.
1184 */
1185static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
1186{
1187	unsigned long order = MAX_PAGE_ORDER;
1188	unsigned long i;
1189
1190	/*
1191	 * We might get called for ranges that don't cover properly aligned
1192	 * MAX_PAGE_ORDER pages; however, we can only online properly aligned
1193	 * pages with an order of MAX_PAGE_ORDER at maximum.
1194	 */
1195	while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
1196		order--;
1197
1198	for (i = 0; i < nr_pages; i += 1 << order) {
1199		struct page *page = pfn_to_page(pfn + i);
1200
1201		/*
1202		 * If the page is PageDirty(), it was kept fake-offline when
1203		 * onlining the memory block. Otherwise, it was allocated
1204		 * using alloc_contig_range(). All pages in a subblock are
1205		 * alike.
1206		 */
1207		if (PageDirty(page)) {
1208			virtio_mem_clear_fake_offline(pfn + i, 1 << order, false);
1209			generic_online_page(page, order);
1210		} else {
1211			virtio_mem_clear_fake_offline(pfn + i, 1 << order, true);
1212			free_contig_range(pfn + i, 1 << order);
1213			adjust_managed_page_count(page, 1 << order);
1214		}
1215	}
1216}
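/*
 * Worked example for the alignment loop above (hypothetical values):
 * IS_ALIGNED(pfn | nr_pages, 1 << order) demands that both the start pfn
 * and the range length are multiples of 2^order. For pfn = 0x2200 and
 * nr_pages = 0x400, pfn | nr_pages = 0x2600, so starting from
 * MAX_PAGE_ORDER (e.g., 10) the loop settles on order = 9, and the range
 * is released as two order-9 chunks.
 */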
1217
1218/*
1219 * Try to allocate a range, marking pages fake-offline, effectively
1220 * fake-offlining them.
1221 */
1222static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn,
1223				   unsigned long nr_pages)
1224{
1225	const bool is_movable = is_zone_movable_page(pfn_to_page(pfn));
1226	int rc, retry_count;
1227
1228	/*
1229	 * TODO: We want an alloc_contig_range() mode that tries to allocate
1230	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
1231	 * with ZONE_MOVABLE. So for now, retry a couple of times with
1232	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
1233	 * some guarantees.
1234	 */
1235	for (retry_count = 0; retry_count < 5; retry_count++) {
1236		/*
1237		 * If the config changed, stop immediately and go back to the
1238		 * main loop: avoid trying to keep unplugging if the device
1239		 * might have decided to not remove any more memory.
1240		 */
1241		if (atomic_read(&vm->config_changed))
1242			return -EAGAIN;
1243
1244		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
1245					GFP_KERNEL);
1246		if (rc == -ENOMEM)
1247			/* whoops, out of memory */
1248			return rc;
1249		else if (rc && !is_movable)
1250			break;
1251		else if (rc)
1252			continue;
1253
1254		virtio_mem_set_fake_offline(pfn, nr_pages, true);
1255		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1256		return 0;
1257	}
1258
1259	return -EBUSY;
1260}
1261
1262/*
1263 * Handle fake-offline pages when memory is going offline - such that the
1264 * pages can be skipped by mm-core when offlining.
1265 */
1266static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
1267						  unsigned long nr_pages)
1268{
1269	struct page *page;
1270	unsigned long i;
1271
1272	/* Drop our reference to the pages so the memory can get offlined. */
1273	for (i = 0; i < nr_pages; i++) {
1274		page = pfn_to_page(pfn + i);
1275		if (WARN_ON(!page_ref_dec_and_test(page)))
1276			dump_page(page, "fake-offline page referenced");
1277	}
1278}
1279
1280/*
1281 * Handle fake-offline pages when memory offlining is canceled - to undo
1282 * what we did in virtio_mem_fake_offline_going_offline().
1283 */
1284static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
1285						   unsigned long nr_pages)
1286{
1287	unsigned long i;
1288
1289	/*
1290	 * Get the reference again that we dropped via page_ref_dec_and_test()
1291	 * when going offline.
1292	 */
1293	for (i = 0; i < nr_pages; i++)
1294		page_ref_inc(pfn_to_page(pfn + i));
1295}
1296
1297static void virtio_mem_online_page(struct virtio_mem *vm,
1298				   struct page *page, unsigned int order)
1299{
1300	const unsigned long start = page_to_phys(page);
1301	const unsigned long end = start + PFN_PHYS(1 << order);
1302	unsigned long addr, next, id, sb_id, count;
1303	bool do_online;
1304
1305	/*
1306	 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock
1307	 * size is smaller than that and we have a mixture of plugged and
1308	 * unplugged subblocks within such a page, we have to process in
1309	 * smaller granularity. In that case we'll adjust the order exactly once
1310	 * within the loop.
1311	 */
1312	for (addr = start; addr < end; ) {
1313		next = addr + PFN_PHYS(1 << order);
1314
1315		if (vm->in_sbm) {
1316			id = virtio_mem_phys_to_mb_id(addr);
1317			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
1318			count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1;
1319
1320			if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) {
1321				/* Fully plugged. */
1322				do_online = true;
1323			} else if (count == 1 ||
1324				   virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) {
1325				/* Fully unplugged. */
1326				do_online = false;
1327			} else {
1328				/*
1329				 * Mixture, process sub-blocks instead. This
1330				 * will be at least the size of a pageblock.
1331				 * We'll run into this case exactly once.
1332				 */
1333				order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT;
1334				do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1);
1335				continue;
1336			}
1337		} else {
1338			/*
1339			 * If the whole block is marked fake offline, keep
1340			 * everything that way.
1341			 */
1342			id = virtio_mem_phys_to_bb_id(vm, addr);
1343			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
1344				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
1345		}
1346
1347		if (do_online)
1348			generic_online_page(pfn_to_page(PFN_DOWN(addr)), order);
1349		else
1350			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
1351						    false);
1352		addr = next;
1353	}
1354}
1355
1356static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
1357{
1358	const unsigned long addr = page_to_phys(page);
1359	struct virtio_mem *vm;
1360
1361	rcu_read_lock();
1362	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
1363		/*
1364		 * Pages we're onlining will never cross memory blocks and,
1365		 * therefore, not virtio-mem devices.
1366		 */
1367		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
1368			continue;
1369
1370		/*
1371		 * virtio_mem_set_fake_offline() might sleep. We can safely
1372		 * drop the RCU lock at this point because the device
1373		 * cannot go away. See virtio_mem_remove() how races
1374		 * between memory onlining and device removal are handled.
1375		 */
1376		rcu_read_unlock();
1377
1378		virtio_mem_online_page(vm, page, order);
1379		return;
1380	}
1381	rcu_read_unlock();
1382
1383	/* not virtio-mem memory, but e.g., a DIMM. online it */
1384	generic_online_page(page, order);
1385}
1386
1387static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
1388					const struct virtio_mem_req *req)
1389{
1390	struct scatterlist *sgs[2], sg_req, sg_resp;
1391	unsigned int len;
1392	int rc;
1393
1394	/* don't use the request residing on the stack (vaddr) */
1395	vm->req = *req;
1396
1397	/* out: buffer for request */
1398	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
1399	sgs[0] = &sg_req;
1400
1401	/* in: buffer for response */
1402	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
1403	sgs[1] = &sg_resp;
1404
1405	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
1406	if (rc < 0)
1407		return rc;
1408
1409	virtqueue_kick(vm->vq);
1410
1411	/* wait for a response */
1412	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
1413
1414	return virtio16_to_cpu(vm->vdev, vm->resp.type);
1415}
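/*
 * Note: the helper above implements a strictly synchronous
 * request/response protocol -- one request buffer (out) and one response
 * buffer (in) are posted together, the virtqueue is kicked, and the
 * caller sleeps until the device has consumed the pair. E.g., a plug
 * request for two device blocks would travel as (sketch, field values
 * abbreviated):
 *
 *	req  = { .type = VIRTIO_MEM_REQ_PLUG,
 *		 .u.plug = { .addr = addr, .nb_blocks = 2 } }
 *	resp = { .type = VIRTIO_MEM_RESP_ACK }		(on success)
 */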
1416
1417static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
1418					uint64_t size)
1419{
1420	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1421	const struct virtio_mem_req req = {
1422		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
1423		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
1424		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1425	};
1426	int rc = -ENOMEM;
1427
1428	if (atomic_read(&vm->config_changed))
1429		return -EAGAIN;
1430
1431	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
1432		addr + size - 1);
1433
1434	switch (virtio_mem_send_request(vm, &req)) {
1435	case VIRTIO_MEM_RESP_ACK:
1436		vm->plugged_size += size;
1437		return 0;
1438	case VIRTIO_MEM_RESP_NACK:
1439		rc = -EAGAIN;
1440		break;
1441	case VIRTIO_MEM_RESP_BUSY:
1442		rc = -ETXTBSY;
1443		break;
1444	case VIRTIO_MEM_RESP_ERROR:
1445		rc = -EINVAL;
1446		break;
1447	default:
1448		break;
1449	}
1450
1451	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
1452	return rc;
1453}
1454
1455static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
1456					  uint64_t size)
1457{
1458	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1459	const struct virtio_mem_req req = {
1460		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
1461		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
1462		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1463	};
1464	int rc = -ENOMEM;
1465
1466	if (atomic_read(&vm->config_changed))
1467		return -EAGAIN;
1468
1469	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
1470		addr + size - 1);
1471
1472	switch (virtio_mem_send_request(vm, &req)) {
1473	case VIRTIO_MEM_RESP_ACK:
1474		vm->plugged_size -= size;
1475		return 0;
1476	case VIRTIO_MEM_RESP_BUSY:
1477		rc = -ETXTBSY;
1478		break;
1479	case VIRTIO_MEM_RESP_ERROR:
1480		rc = -EINVAL;
1481		break;
1482	default:
1483		break;
1484	}
1485
1486	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
1487	return rc;
1488}
1489
1490static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
1491{
1492	const struct virtio_mem_req req = {
1493		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
1494	};
1495	int rc = -ENOMEM;
1496
 1497	dev_dbg(&vm->vdev->dev, "unplugging all memory\n");
1498
1499	switch (virtio_mem_send_request(vm, &req)) {
1500	case VIRTIO_MEM_RESP_ACK:
1501		vm->unplug_all_required = false;
1502		vm->plugged_size = 0;
1503		/* usable region might have shrunk */
1504		atomic_set(&vm->config_changed, 1);
1505		return 0;
1506	case VIRTIO_MEM_RESP_BUSY:
1507		rc = -ETXTBSY;
1508		break;
1509	default:
1510		break;
1511	}
1512
1513	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
1514	return rc;
1515}
1516
1517/*
1518 * Plug selected subblocks. Updates the plugged state, but not the state
1519 * of the memory block.
1520 */
1521static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
1522				  int sb_id, int count)
1523{
1524	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1525			      sb_id * vm->sbm.sb_size;
1526	const uint64_t size = count * vm->sbm.sb_size;
1527	int rc;
1528
1529	rc = virtio_mem_send_plug_request(vm, addr, size);
1530	if (!rc)
1531		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
1532	return rc;
1533}
1534
1535/*
1536 * Unplug selected subblocks. Updates the plugged state, but not the state
1537 * of the memory block.
1538 */
1539static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
1540				    int sb_id, int count)
1541{
1542	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1543			      sb_id * vm->sbm.sb_size;
1544	const uint64_t size = count * vm->sbm.sb_size;
1545	int rc;
1546
1547	rc = virtio_mem_send_unplug_request(vm, addr, size);
1548	if (!rc)
1549		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
1550	return rc;
1551}
1552
1553/*
1554 * Request to unplug a big block.
1555 *
1556 * Will not modify the state of the big block.
1557 */
1558static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
1559{
1560	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1561	const uint64_t size = vm->bbm.bb_size;
1562
1563	return virtio_mem_send_unplug_request(vm, addr, size);
1564}
1565
1566/*
1567 * Request to plug a big block.
1568 *
1569 * Will not modify the state of the big block.
1570 */
1571static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
1572{
1573	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1574	const uint64_t size = vm->bbm.bb_size;
1575
1576	return virtio_mem_send_plug_request(vm, addr, size);
1577}
1578
1579/*
 1580 * Unplug the desired number of plugged subblocks of an offline or not-added
1581 * memory block. Will fail if any subblock cannot get unplugged (instead of
1582 * skipping it).
1583 *
1584 * Will not modify the state of the memory block.
1585 *
1586 * Note: can fail after some subblocks were unplugged.
1587 */
1588static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
1589					    unsigned long mb_id, uint64_t *nb_sb)
1590{
1591	int sb_id, count;
1592	int rc;
1593
1594	sb_id = vm->sbm.sbs_per_mb - 1;
1595	while (*nb_sb) {
1596		/* Find the next candidate subblock */
1597		while (sb_id >= 0 &&
1598		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
1599			sb_id--;
1600		if (sb_id < 0)
1601			break;
1602		/* Try to unplug multiple subblocks at a time */
1603		count = 1;
1604		while (count < *nb_sb && sb_id > 0 &&
1605		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
1606			count++;
1607			sb_id--;
1608		}
1609
1610		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1611		if (rc)
1612			return rc;
1613		*nb_sb -= count;
1614		sb_id--;
1615	}
1616
1617	return 0;
1618}
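/*
 * Worked example (hypothetical bitmap): with sbs_per_mb = 8, plugged bits
 * 0b01101110 (sb 1, 2, 3, 5 and 6 plugged) and *nb_sb = 4, the top-down
 * scan above skips the unplugged sb 7, unplugs the run sb 5..6
 * (count = 2), skips sb 4, then unplugs sb 2..3 (count = 2). *nb_sb is
 * now 0, so the loop stops with sb 1 still plugged.
 */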
1619
1620/*
1621 * Unplug all plugged subblocks of an offline or not-added memory block.
1622 *
1623 * Will not modify the state of the memory block.
1624 *
1625 * Note: can fail after some subblocks were unplugged.
1626 */
1627static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
1628{
1629	uint64_t nb_sb = vm->sbm.sbs_per_mb;
1630
1631	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
1632}
1633
1634/*
1635 * Prepare tracking data for the next memory block.
1636 */
1637static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
1638					  unsigned long *mb_id)
1639{
1640	int rc;
1641
1642	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
1643		return -ENOSPC;
1644
1645	/* Resize the state array if required. */
1646	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
1647	if (rc)
1648		return rc;
1649
1650	/* Resize the subblock bitmap if required. */
1651	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
1652	if (rc)
1653		return rc;
1654
1655	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
1656	*mb_id = vm->sbm.next_mb_id++;
1657	return 0;
1658}
1659
1660/*
1661 * Try to plug the desired number of subblocks and add the memory block
1662 * to Linux.
1663 *
1664 * Will modify the state of the memory block.
1665 */
1666static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
1667					  unsigned long mb_id, uint64_t *nb_sb)
1668{
1669	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
1670	int rc;
1671
1672	if (WARN_ON_ONCE(!count))
1673		return -EINVAL;
1674
1675	/*
 1676	 * Plug the requested number of subblocks before adding it to Linux,
1677	 * so that onlining will directly online all plugged subblocks.
1678	 */
1679	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
1680	if (rc)
1681		return rc;
1682
1683	/*
1684	 * Mark the block properly offline before adding it to Linux,
1685	 * so the memory notifiers will find the block in the right state.
1686	 */
1687	if (count == vm->sbm.sbs_per_mb)
1688		virtio_mem_sbm_set_mb_state(vm, mb_id,
1689					    VIRTIO_MEM_SBM_MB_OFFLINE);
1690	else
1691		virtio_mem_sbm_set_mb_state(vm, mb_id,
1692					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1693
 1694	/* Add the memory block to Linux - if that fails, try to unplug. */
1695	rc = virtio_mem_sbm_add_mb(vm, mb_id);
1696	if (rc) {
1697		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
1698
1699		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
1700			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
1701		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
1702		return rc;
1703	}
1704
1705	*nb_sb -= count;
1706	return 0;
1707}
1708
1709/*
1710 * Try to plug the desired number of subblocks of a memory block that
1711 * is already added to Linux.
1712 *
1713 * Will modify the state of the memory block.
1714 *
1715 * Note: Can fail after some subblocks were successfully plugged.
1716 */
1717static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
1718				      unsigned long mb_id, uint64_t *nb_sb)
1719{
1720	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1721	unsigned long pfn, nr_pages;
1722	int sb_id, count;
1723	int rc;
1724
1725	if (WARN_ON_ONCE(!*nb_sb))
1726		return -EINVAL;
1727
1728	while (*nb_sb) {
1729		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
1730		if (sb_id >= vm->sbm.sbs_per_mb)
1731			break;
1732		count = 1;
1733		while (count < *nb_sb &&
1734		       sb_id + count < vm->sbm.sbs_per_mb &&
1735		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
1736			count++;
1737
1738		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
1739		if (rc)
1740			return rc;
1741		*nb_sb -= count;
1742		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
1743			continue;
1744
1745		/* fake-online the pages if the memory block is online */
1746		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1747			       sb_id * vm->sbm.sb_size);
1748		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
1749		virtio_mem_fake_online(pfn, nr_pages);
1750	}
1751
1752	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1753		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1754
1755	return 0;
1756}
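/*
 * Note: the "old_state - 1" above relies on the enum layout -- each
 * *_PARTIAL state is declared directly after its fully-plugged
 * counterpart (OFFLINE/OFFLINE_PARTIAL, KERNEL/KERNEL_PARTIAL,
 * MOVABLE/MOVABLE_PARTIAL), so subtracting one turns "partially plugged"
 * into the matching "fully plugged" state.
 */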
1757
1758static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1759{
1760	const int mb_states[] = {
1761		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1762		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1763		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1764	};
1765	uint64_t nb_sb = diff / vm->sbm.sb_size;
1766	unsigned long mb_id;
1767	int rc, i;
1768
1769	if (!nb_sb)
1770		return 0;
1771
1772	/* Don't race with onlining/offlining */
1773	mutex_lock(&vm->hotplug_mutex);
1774
1775	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
1776		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
1777			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
1778			if (rc || !nb_sb)
1779				goto out_unlock;
1780			cond_resched();
1781		}
1782	}
1783
1784	/*
1785	 * We won't be working on online/offline memory blocks from this point,
1786	 * so we can't race with memory onlining/offlining. Drop the mutex.
1787	 */
1788	mutex_unlock(&vm->hotplug_mutex);
1789
1790	/* Try to plug and add unused blocks */
1791	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
1792		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1793			return -ENOSPC;
1794
1795		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1796		if (rc || !nb_sb)
1797			return rc;
1798		cond_resched();
1799	}
1800
1801	/* Try to prepare, plug and add new blocks */
1802	while (nb_sb) {
1803		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1804			return -ENOSPC;
1805
1806		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
1807		if (rc)
1808			return rc;
1809		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1810		if (rc)
1811			return rc;
1812		cond_resched();
1813	}
1814
1815	return 0;
1816out_unlock:
1817	mutex_unlock(&vm->hotplug_mutex);
1818	return rc;
1819}
1820
1821/*
1822 * Plug a big block and add it to Linux.
1823 *
1824 * Will modify the state of the big block.
1825 */
1826static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1827					  unsigned long bb_id)
1828{
1829	int rc;
1830
1831	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1832			 VIRTIO_MEM_BBM_BB_UNUSED))
1833		return -EINVAL;
1834
1835	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1836	if (rc)
1837		return rc;
1838	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1839
1840	rc = virtio_mem_bbm_add_bb(vm, bb_id);
1841	if (rc) {
1842		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1843			virtio_mem_bbm_set_bb_state(vm, bb_id,
1844						    VIRTIO_MEM_BBM_BB_UNUSED);
1845		else
1846			/* Retry from the main loop. */
1847			virtio_mem_bbm_set_bb_state(vm, bb_id,
1848						    VIRTIO_MEM_BBM_BB_PLUGGED);
1849		return rc;
1850	}
1851	return 0;
1852}
1853
1854/*
1855 * Prepare tracking data for the next big block.
1856 */
1857static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1858					  unsigned long *bb_id)
1859{
1860	int rc;
1861
1862	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1863		return -ENOSPC;
1864
1865	/* Resize the big block state array if required. */
1866	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1867	if (rc)
1868		return rc;
1869
1870	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1871	*bb_id = vm->bbm.next_bb_id;
1872	vm->bbm.next_bb_id++;
1873	return 0;
1874}
1875
1876static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1877{
1878	uint64_t nb_bb = diff / vm->bbm.bb_size;
1879	unsigned long bb_id;
1880	int rc;
1881
1882	if (!nb_bb)
1883		return 0;
1884
1885	/* Try to plug and add unused big blocks */
1886	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1887		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1888			return -ENOSPC;
1889
1890		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1891		if (!rc)
1892			nb_bb--;
1893		if (rc || !nb_bb)
1894			return rc;
1895		cond_resched();
1896	}
1897
1898	/* Try to prepare, plug and add new big blocks */
1899	while (nb_bb) {
1900		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1901			return -ENOSPC;
1902
1903		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1904		if (rc)
1905			return rc;
1906		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1907		if (!rc)
1908			nb_bb--;
1909		if (rc)
1910			return rc;
1911		cond_resched();
1912	}
1913
1914	return 0;
1915}
1916
1917/*
1918 * Try to plug the requested amount of memory.
1919 */
1920static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1921{
1922	if (vm->in_sbm)
1923		return virtio_mem_sbm_plug_request(vm, diff);
1924	return virtio_mem_bbm_plug_request(vm, diff);
1925}
1926
1927/*
1928 * Unplug the desired number of plugged subblocks of an offline memory block.
1929 * Will fail if any subblock cannot get unplugged (instead of skipping it).
1930 *
1931 * Will modify the state of the memory block. Might temporarily drop the
1932 * hotplug_mutex.
1933 *
1934 * Note: Can fail after some subblocks were successfully unplugged.
1935 */
1936static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
1937						unsigned long mb_id,
1938						uint64_t *nb_sb)
1939{
1940	int rc;
1941
1942	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
1943
1944	/* some subblocks might have been unplugged even on failure */
1945	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1946		virtio_mem_sbm_set_mb_state(vm, mb_id,
1947					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1948	if (rc)
1949		return rc;
1950
1951	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1952		/*
1953		 * Remove the block from Linux - this should never fail.
1954		 * Hinder the block from getting onlined by marking it
1955		 * unplugged. Temporarily drop the mutex, so
1956		 * any pending GOING_ONLINE requests can be serviced/rejected.
1957		 */
1958		virtio_mem_sbm_set_mb_state(vm, mb_id,
1959					    VIRTIO_MEM_SBM_MB_UNUSED);
1960
1961		mutex_unlock(&vm->hotplug_mutex);
1962		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
1963		BUG_ON(rc);
1964		mutex_lock(&vm->hotplug_mutex);
1965	}
1966	return 0;
1967}
1968
1969/*
1970 * Unplug the given plugged subblocks of an online memory block.
1971 *
1972 * Will modify the state of the memory block.
1973 */
1974static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1975					   unsigned long mb_id, int sb_id,
1976					   int count)
1977{
1978	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1979	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1980	unsigned long start_pfn;
1981	int rc;
1982
1983	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1984			     sb_id * vm->sbm.sb_size);
1985
1986	rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages);
1987	if (rc)
1988		return rc;
1989
1990	/* Try to unplug the allocated memory */
1991	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1992	if (rc) {
1993		/* Return the memory to the buddy. */
1994		virtio_mem_fake_online(start_pfn, nr_pages);
1995		return rc;
1996	}
1997
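	/*
	 * If the block was fully plugged, it is now only partially
	 * plugged; reflect that in its (per-zone) state.
	 */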
1998	switch (old_state) {
1999	case VIRTIO_MEM_SBM_MB_KERNEL:
2000		virtio_mem_sbm_set_mb_state(vm, mb_id,
2001					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
2002		break;
2003	case VIRTIO_MEM_SBM_MB_MOVABLE:
2004		virtio_mem_sbm_set_mb_state(vm, mb_id,
2005					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
2006		break;
2007	}
2008
2009	return 0;
2010}
2011
2012/*
2013 * Unplug the desired number of plugged subblocks of an online memory block.
2014 * Will skip subblocks that are busy.
2015 *
2016 * Will modify the state of the memory block. Might temporarily drop the
2017 * hotplug_mutex.
2018 *
2019 * Note: Can fail after some subblocks were successfully unplugged. Can
2020 *       return 0 even if subblocks were busy and could not get unplugged.
2021 */
2022static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
2023					       unsigned long mb_id,
2024					       uint64_t *nb_sb)
2025{
2026	int rc, sb_id;
2027
2028	/* If possible, try to unplug the complete block in one shot. */
2029	if (*nb_sb >= vm->sbm.sbs_per_mb &&
2030	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
2031		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
2032						     vm->sbm.sbs_per_mb);
2033		if (!rc) {
2034			*nb_sb -= vm->sbm.sbs_per_mb;
2035			goto unplugged;
2036		} else if (rc != -EBUSY)
2037			return rc;
2038	}
2039
2040	/* Fallback to single subblocks. */
2041	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
2042		/* Find the next candidate subblock */
2043		while (sb_id >= 0 &&
2044		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
2045			sb_id--;
2046		if (sb_id < 0)
2047			break;
2048
2049		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
2050		if (rc == -EBUSY)
2051			continue;
2052		else if (rc)
2053			return rc;
2054		*nb_sb -= 1;
2055	}
2056
2057unplugged:
2058	rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id);
2059	if (rc)
2060		vm->sbm.have_unplugged_mb = 1;
2061	/* Ignore errors, this is not critical. We'll retry later. */
2062	return 0;
2063}
2064
2065/*
2066 * Unplug the desired number of plugged subblocks of a memory block that is
2067 * already added to Linux. Will skip subblocks of online memory blocks that are
2068 * busy (by the OS). Will fail if any subblock that's not busy cannot get
2069 * unplugged.
2070 *
2071 * Will modify the state of the memory block. Might temporarily drop the
2072 * hotplug_mutex.
2073 *
2074 * Note: Can fail after some subblocks were successfully unplugged. Can
2075 *       return 0 even if subblocks were busy and could not get unplugged.
2076 */
2077static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
2078					unsigned long mb_id,
2079					uint64_t *nb_sb)
2080{
2081	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
2082
2083	switch (old_state) {
2084	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
2085	case VIRTIO_MEM_SBM_MB_KERNEL:
2086	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
2087	case VIRTIO_MEM_SBM_MB_MOVABLE:
2088		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
2089	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
2090	case VIRTIO_MEM_SBM_MB_OFFLINE:
2091		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
2092	}
2093	return -EINVAL;
2094}
2095
2096static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2097{
2098	const int mb_states[] = {
2099		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
2100		VIRTIO_MEM_SBM_MB_OFFLINE,
2101		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
2102		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
2103		VIRTIO_MEM_SBM_MB_MOVABLE,
2104		VIRTIO_MEM_SBM_MB_KERNEL,
2105	};
2106	uint64_t nb_sb = diff / vm->sbm.sb_size;
2107	unsigned long mb_id;
2108	int rc, i;
2109
2110	if (!nb_sb)
2111		return 0;
2112
2113	/*
2114	 * We'll drop the mutex a couple of times when it is safe to do so.
2115	 * This might result in some blocks switching state (online/offline),
2116	 * and we could miss them in this run - we will retry later.
2117	 */
2118	mutex_lock(&vm->hotplug_mutex);
2119
2120	/*
2121	 * We try to unplug from partially plugged blocks first, to try removing
2122	 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
2123	 * as it's more reliable to unplug memory and remove whole memory
2124	 * blocks, and we don't want to trigger zone imbalances by
2125	 * accidentally removing too much kernel memory.
2126	 */
2127	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
2128		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
2129			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
2130			if (rc || !nb_sb)
2131				goto out_unlock;
2132			mutex_unlock(&vm->hotplug_mutex);
2133			cond_resched();
2134			mutex_lock(&vm->hotplug_mutex);
2135		}
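		/*
		 * mb_states[] lists the offline states first; after i == 1
		 * only online blocks remain, so stop here if unplugging
		 * online memory is disabled.
		 */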
2136		if (!unplug_online && i == 1) {
2137			mutex_unlock(&vm->hotplug_mutex);
2138			return 0;
2139		}
2140	}
2141
2142	mutex_unlock(&vm->hotplug_mutex);
2143	return nb_sb ? -EBUSY : 0;
2144out_unlock:
2145	mutex_unlock(&vm->hotplug_mutex);
2146	return rc;
2147}
2148
2149/*
2150 * Try to offline and remove a big block from Linux and unplug it. Will fail
2151 * with -EBUSY if some memory is busy and cannot get unplugged.
2152 *
2153 * Will modify the state of the memory block. Might temporarily drop the
2154 * hotplug_mutex.
2155 */
2156static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2157						       unsigned long bb_id)
2158{
2159	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2160	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2161	unsigned long end_pfn = start_pfn + nr_pages;
2162	unsigned long pfn;
2163	struct page *page;
2164	int rc;
2165
2166	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2167			 VIRTIO_MEM_BBM_BB_ADDED))
2168		return -EINVAL;
2169
2170	/*
2171	 * Start by fake-offlining all memory. Once we have marked the device
2172	 * block as fake-offline, all newly onlined memory will
2173	 * automatically be kept fake-offline. Protect from concurrent
2174	 * onlining/offlining until we have a consistent state.
2175	 */
2176	mutex_lock(&vm->hotplug_mutex);
2177	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2178
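	/*
	 * Iterating in section granularity is sufficient: this memory is
	 * onlined/offlined in at least section granularity, so probing the
	 * first pfn of each section covers the whole big block.
	 */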
2179	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2180		page = pfn_to_online_page(pfn);
2181		if (!page)
2182			continue;
2183
2184		rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION);
2185		if (rc) {
2186			end_pfn = pfn;
2187			goto rollback;
2188		}
2189	}
2190	mutex_unlock(&vm->hotplug_mutex);
2191
2192	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2193	if (rc) {
2194		mutex_lock(&vm->hotplug_mutex);
2195		goto rollback;
2196	}
2197
2198	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2199	if (rc)
2200		virtio_mem_bbm_set_bb_state(vm, bb_id,
2201					    VIRTIO_MEM_BBM_BB_PLUGGED);
2202	else
2203		virtio_mem_bbm_set_bb_state(vm, bb_id,
2204					    VIRTIO_MEM_BBM_BB_UNUSED);
2205	return rc;
2206
2207rollback:
2208	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2209		page = pfn_to_online_page(pfn);
2210		if (!page)
2211			continue;
2212		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2213	}
2214	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2215	mutex_unlock(&vm->hotplug_mutex);
2216	return rc;
2217}
2218
2219/*
2220 * Test if a big block is completely offline.
2221 */
2222static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2223					 unsigned long bb_id)
2224{
2225	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2226	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2227	unsigned long pfn;
2228
2229	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2230	     pfn += PAGES_PER_SECTION) {
2231		if (pfn_to_online_page(pfn))
2232			return false;
2233	}
2234
2235	return true;
2236}
2237
2238/*
2239 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2240 */
2241static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2242					 unsigned long bb_id)
2243{
2244	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2245	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2246	struct page *page;
2247	unsigned long pfn;
2248
2249	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2250	     pfn += PAGES_PER_SECTION) {
2251		page = pfn_to_online_page(pfn);
2252		if (!page)
2253			continue;
2254		if (page_zonenum(page) != ZONE_MOVABLE)
2255			return false;
2256	}
2257
2258	return true;
2259}
2260
2261static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2262{
2263	uint64_t nb_bb = diff / vm->bbm.bb_size;
2264	uint64_t bb_id;
2265	int rc, i;
2266
2267	if (!nb_bb)
2268		return 0;
2269
2270	/*
2271	 * Try to unplug big blocks. Similar to SBM, start with offline
2272	 * big blocks.
2273	 */
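	/* Pass 0: offline BBs only; pass 1: ZONE_MOVABLE BBs; pass 2: any. */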
2274	for (i = 0; i < 3; i++) {
2275		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2276			cond_resched();
2277
2278			/*
2279			 * As we're holding no locks, these checks are racy,
2280			 * but we don't care.
2281			 */
2282			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2283				continue;
2284			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2285				continue;
2286			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2287			if (rc == -EBUSY)
2288				continue;
2289			if (!rc)
2290				nb_bb--;
2291			if (rc || !nb_bb)
2292				return rc;
2293		}
2294		if (i == 0 && !unplug_online)
2295			return 0;
2296	}
2297
2298	return nb_bb ? -EBUSY : 0;
2299}
2300
2301/*
2302 * Try to unplug the requested amount of memory.
2303 */
2304static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2305{
2306	if (vm->in_sbm)
2307		return virtio_mem_sbm_unplug_request(vm, diff);
2308	return virtio_mem_bbm_unplug_request(vm, diff);
2309}
2310
2311/*
2312 * Try to unplug all blocks that couldn't be unplugged before, for example,
2313 * because the hypervisor was busy. Further, offline and remove any memory
2314 * blocks where we previously failed.
2315 */
2316static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm)
2317{
2318	unsigned long id;
2319	int rc = 0;
2320
2321	if (!vm->in_sbm) {
2322		virtio_mem_bbm_for_each_bb(vm, id,
2323					   VIRTIO_MEM_BBM_BB_PLUGGED) {
2324			rc = virtio_mem_bbm_unplug_bb(vm, id);
2325			if (rc)
2326				return rc;
2327			virtio_mem_bbm_set_bb_state(vm, id,
2328						    VIRTIO_MEM_BBM_BB_UNUSED);
2329		}
2330		return 0;
2331	}
2332
2333	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2334		rc = virtio_mem_sbm_unplug_mb(vm, id);
2335		if (rc)
2336			return rc;
2337		virtio_mem_sbm_set_mb_state(vm, id,
2338					    VIRTIO_MEM_SBM_MB_UNUSED);
2339	}
2340
2341	if (!vm->sbm.have_unplugged_mb)
2342		return 0;
2343
2344	/*
2345	 * Let's retry (offlining and) removing completely unplugged Linux
2346	 * memory blocks.
2347	 */
2348	vm->sbm.have_unplugged_mb = false;
2349
2350	mutex_lock(&vm->hotplug_mutex);
2351	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL)
2352		rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2353	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL)
2354		rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2355	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
2356		rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2357	mutex_unlock(&vm->hotplug_mutex);
2358
2359	if (rc)
2360		vm->sbm.have_unplugged_mb = true;
2361	/* Ignore errors, this is not critical. We'll retry later. */
2362	return 0;
2363}
2364
2365/*
2366 * Update all parts of the config that could have changed.
2367 */
2368static void virtio_mem_refresh_config(struct virtio_mem *vm)
2369{
2370	const struct range pluggable_range = mhp_get_pluggable_range(true);
2371	uint64_t new_plugged_size, usable_region_size, end_addr;
2372
2373	/* the plugged_size is just a reflection of what _we_ did previously */
2374	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2375			&new_plugged_size);
2376	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2377		vm->plugged_size = new_plugged_size;
2378
2379	/* calculate the last usable memory block id */
2380	virtio_cread_le(vm->vdev, struct virtio_mem_config,
2381			usable_region_size, &usable_region_size);
2382	end_addr = min(vm->addr + usable_region_size - 1,
2383		       pluggable_range.end);
2384
2385	if (vm->in_sbm) {
2386		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2387		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2388			vm->sbm.last_usable_mb_id--;
2389	} else {
2390		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2391								     end_addr);
2392		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2393			vm->bbm.last_usable_bb_id--;
2394	}
2395	/*
2396	 * If we cannot plug any of our device memory (e.g., nothing in the
2397	 * usable region is addressable), the last usable memory block id will
2398	 * be smaller than the first usable memory block id. We'll stop
2399	 * attempting to add memory with -ENOSPC from our main loop.
2400	 */
2401
2402	/* see if there is a request to change the size */
2403	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2404			&vm->requested_size);
2405
2406	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2407	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2408}
2409
2410/*
2411 * Workqueue function for handling plug/unplug requests and config updates.
2412 */
2413static void virtio_mem_run_wq(struct work_struct *work)
2414{
2415	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2416	uint64_t diff;
2417	int rc;
2418
2419	if (unlikely(vm->in_kdump)) {
2420		dev_warn_once(&vm->vdev->dev,
2421			     "unexpected workqueue run in kdump kernel\n");
2422		return;
2423	}
2424
2425	hrtimer_cancel(&vm->retry_timer);
2426
2427	if (vm->broken)
2428		return;
2429
2430	atomic_set(&vm->wq_active, 1);
2431retry:
2432	rc = 0;
2433
2434	/* Make sure we start with a clean state if there are leftovers. */
2435	if (unlikely(vm->unplug_all_required))
2436		rc = virtio_mem_send_unplug_all_request(vm);
2437
2438	if (atomic_read(&vm->config_changed)) {
2439		atomic_set(&vm->config_changed, 0);
2440		virtio_mem_refresh_config(vm);
2441	}
2442
2443	/* Cleanup any leftovers from previous runs */
2444	if (!rc)
2445		rc = virtio_mem_cleanup_pending_mb(vm);
2446
2447	if (!rc && vm->requested_size != vm->plugged_size) {
2448		if (vm->requested_size > vm->plugged_size) {
2449			diff = vm->requested_size - vm->plugged_size;
2450			rc = virtio_mem_plug_request(vm, diff);
2451		} else {
2452			diff = vm->plugged_size - vm->requested_size;
2453			rc = virtio_mem_unplug_request(vm, diff);
2454		}
2455	}
2456
2457	/*
2458	 * Keep retrying to offline and remove completely unplugged Linux
2459	 * memory blocks.
2460	 */
2461	if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb)
2462		rc = -EBUSY;
2463
2464	switch (rc) {
2465	case 0:
2466		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2467		break;
2468	case -ENOSPC:
2469		/*
2470		 * We cannot add any more memory (alignment, physical limit)
2471		 * or we have too many offline memory blocks.
2472		 */
2473		break;
2474	case -ETXTBSY:
2475		/*
2476		 * The hypervisor cannot process our request right now
2477		 * (e.g., out of memory, migrating).
2478		 */
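		/* fall through - start the retry timer, as for -ENOMEM */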
2479	case -EBUSY:
2480		/*
2481		 * We cannot free up any memory to unplug it (all plugged memory
2482		 * is busy).
2483		 */
2484	case -ENOMEM:
2485		/* Out of memory, try again later. */
2486		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2487			      HRTIMER_MODE_REL);
2488		break;
2489	case -EAGAIN:
2490		/* Retry immediately (e.g., the config changed). */
2491		goto retry;
2492	default:
2493		/* Unknown error, mark as broken */
2494		dev_err(&vm->vdev->dev,
2495			"unknown error, marking device broken: %d\n", rc);
2496		vm->broken = true;
2497	}
2498
2499	atomic_set(&vm->wq_active, 0);
2500}
2501
2502static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2503{
2504	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2505					     retry_timer);
2506
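	/* Trigger a retry now and back off exponentially, up to the max. */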
2507	virtio_mem_retry(vm);
2508	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2509				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2510	return HRTIMER_NORESTART;
2511}
2512
2513static void virtio_mem_handle_response(struct virtqueue *vq)
2514{
2515	struct virtio_mem *vm = vq->vdev->priv;
2516
2517	wake_up(&vm->host_resp);
2518}
2519
2520static int virtio_mem_init_vq(struct virtio_mem *vm)
2521{
2522	struct virtqueue *vq;
2523
2524	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2525				   "guest-request");
2526	if (IS_ERR(vq))
2527		return PTR_ERR(vq);
2528	vm->vq = vq;
2529
2530	return 0;
2531}
2532
2533static int virtio_mem_init_hotplug(struct virtio_mem *vm)
2534{
2535	const struct range pluggable_range = mhp_get_pluggable_range(true);
2536	uint64_t unit_pages, sb_size, addr;
2537	int rc;
2538
2539	/* bad device setup - warn only */
2540	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2541		dev_warn(&vm->vdev->dev,
2542			 "The alignment of the physical start address can make some memory unusable.\n");
2543	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2544		dev_warn(&vm->vdev->dev,
2545			 "The alignment of the physical end address can make some memory unusable.\n");
2546	if (vm->addr < pluggable_range.start ||
2547	    vm->addr + vm->region_size - 1 > pluggable_range.end)
2548		dev_warn(&vm->vdev->dev,
2549			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2550
2551	/* Prepare the offline threshold - make sure we can add two blocks. */
2552	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2553				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2554
2555	/*
2556	 * alloc_contig_range() works reliably with pageblock
2557	 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
2558	 */
2559	sb_size = PAGE_SIZE * pageblock_nr_pages;
2560	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
2561
2562	if (sb_size < memory_block_size_bytes() && !force_bbm) {
2563		/* SBM: At least two subblocks per Linux memory block. */
2564		vm->in_sbm = true;
2565		vm->sbm.sb_size = sb_size;
2566		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2567				     vm->sbm.sb_size;
2568
2569		/* Round up to the next full memory block */
2570		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2571		       memory_block_size_bytes() - 1;
2572		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2573		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2574	} else {
2575		/* BBM: At least one Linux memory block. */
2576		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2577					memory_block_size_bytes());
2578
2579		if (bbm_block_size) {
2580			if (!is_power_of_2(bbm_block_size)) {
2581				dev_warn(&vm->vdev->dev,
2582					 "bbm_block_size is not a power of 2");
2583			} else if (bbm_block_size < vm->bbm.bb_size) {
2584				dev_warn(&vm->vdev->dev,
2585					 "bbm_block_size is too small");
2586			} else {
2587				vm->bbm.bb_size = bbm_block_size;
2588			}
2589		}
2590
2591		/* Round up to the next aligned big block */
2592		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2593		       vm->bbm.bb_size - 1;
2594		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2595		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2596
2597		/* Make sure we can add two big blocks. */
2598		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2599					      vm->offline_threshold);
2600	}
2601
2602	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2603		 memory_block_size_bytes());
2604	if (vm->in_sbm)
2605		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2606			 (unsigned long long)vm->sbm.sb_size);
2607	else
2608		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2609			 (unsigned long long)vm->bbm.bb_size);
2610
2611	/* create the parent resource for all memory */
2612	rc = virtio_mem_create_resource(vm);
2613	if (rc)
2614		return rc;
2615
2616	/* use a single dynamic memory group to cover the whole memory device */
2617	if (vm->in_sbm)
2618		unit_pages = PHYS_PFN(memory_block_size_bytes());
2619	else
2620		unit_pages = PHYS_PFN(vm->bbm.bb_size);
2621	rc = memory_group_register_dynamic(vm->nid, unit_pages);
2622	if (rc < 0)
2623		goto out_del_resource;
2624	vm->mgid = rc;
2625
2626	/*
2627	 * If we still have memory plugged, we have to unplug all memory first.
2628	 * Registering our parent resource makes sure that this memory isn't
2629	 * actually in use (e.g., trying to reload the driver).
2630	 */
2631	if (vm->plugged_size) {
2632		vm->unplug_all_required = true;
2633		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2634	}
2635
2636	/* register callbacks */
2637	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2638	rc = register_memory_notifier(&vm->memory_notifier);
2639	if (rc)
2640		goto out_unreg_group;
2641	/* Block hibernation as early as possible. */
2642	vm->pm_notifier.priority = INT_MAX;
2643	vm->pm_notifier.notifier_call = virtio_mem_pm_notifier_cb;
2644	rc = register_pm_notifier(&vm->pm_notifier);
2645	if (rc)
2646		goto out_unreg_mem;
2647	rc = register_virtio_mem_device(vm);
2648	if (rc)
2649		goto out_unreg_pm;
2650
2651	return 0;
2652out_unreg_pm:
2653	unregister_pm_notifier(&vm->pm_notifier);
2654out_unreg_mem:
2655	unregister_memory_notifier(&vm->memory_notifier);
2656out_unreg_group:
2657	memory_group_unregister(vm->mgid);
2658out_del_resource:
2659	virtio_mem_delete_resource(vm);
2660	return rc;
2661}
2662
2663#ifdef CONFIG_PROC_VMCORE
2664static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
2665					 uint64_t size)
2666{
2667	const uint64_t nb_vm_blocks = size / vm->device_block_size;
2668	const struct virtio_mem_req req = {
2669		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
2670		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
2671		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
2672	};
2673	int rc = -ENOMEM;
2674
2675	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
2676		addr + size - 1);
2677
2678	switch (virtio_mem_send_request(vm, &req)) {
2679	case VIRTIO_MEM_RESP_ACK:
2680		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
2681	case VIRTIO_MEM_RESP_ERROR:
2682		rc = -EINVAL;
2683		break;
2684	default:
2685		break;
2686	}
2687
2688	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
2689	return rc;
2690}
2691
2692static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
2693					 unsigned long pfn)
2694{
2695	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
2696					     vmcore_cb);
2697	uint64_t addr = PFN_PHYS(pfn);
2698	bool is_ram;
2699	int rc;
2700
2701	if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
2702		return true;
2703	if (!vm->plugged_size)
2704		return false;
2705
2706	/*
2707	 * We have to serialize device requests and access to the information
2708	 * about the block queried last.
2709	 */
2710	mutex_lock(&vm->hotplug_mutex);
2711
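	/*
	 * Cache the state of the device block queried last: dumps tend to
	 * read pfns sequentially, so most queries hit the same device
	 * block and need no new request to the device.
	 */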
2712	addr = ALIGN_DOWN(addr, vm->device_block_size);
2713	if (addr != vm->last_block_addr) {
2714		rc = virtio_mem_send_state_request(vm, addr,
2715						   vm->device_block_size);
2716		/* On any kind of error, we're going to signal !ram. */
2717		if (rc == VIRTIO_MEM_STATE_PLUGGED)
2718			vm->last_block_plugged = true;
2719		else
2720			vm->last_block_plugged = false;
2721		vm->last_block_addr = addr;
2722	}
2723
2724	is_ram = vm->last_block_plugged;
2725	mutex_unlock(&vm->hotplug_mutex);
2726	return is_ram;
2727}
2728#endif /* CONFIG_PROC_VMCORE */
2729
2730static int virtio_mem_init_kdump(struct virtio_mem *vm)
2731{
2732#ifdef CONFIG_PROC_VMCORE
2733	dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
2734	vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
2735	register_vmcore_cb(&vm->vmcore_cb);
2736	return 0;
2737#else /* CONFIG_PROC_VMCORE */
2738	dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
2739	return -EBUSY;
2740#endif /* CONFIG_PROC_VMCORE */
2741}
2742
2743static int virtio_mem_init(struct virtio_mem *vm)
2744{
2745	uint16_t node_id;
2746
2747	if (!vm->vdev->config->get) {
2748		dev_err(&vm->vdev->dev, "config access disabled\n");
2749		return -EINVAL;
2750	}
2751
2752	/* Fetch all properties that can't change. */
2753	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2754			&vm->plugged_size);
2755	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2756			&vm->device_block_size);
2757	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2758			&node_id);
2759	vm->nid = virtio_mem_translate_node_id(vm, node_id);
2760	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2761	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2762			&vm->region_size);
2763
2764	/* Determine the nid for the device based on the lowest address. */
2765	if (vm->nid == NUMA_NO_NODE)
2766		vm->nid = memory_add_physaddr_to_nid(vm->addr);
2767
2768	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2769	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2770	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2771		 (unsigned long long)vm->device_block_size);
2772	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2773		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2774
2775	/*
2776	 * We don't want to (un)plug or reuse any memory when in kdump. The
2777	 * memory is still accessible (but not exposed to Linux).
2778	 */
2779	if (vm->in_kdump)
2780		return virtio_mem_init_kdump(vm);
2781	return virtio_mem_init_hotplug(vm);
2782}
2783
2784static int virtio_mem_create_resource(struct virtio_mem *vm)
2785{
2786	/*
2787	 * When force-unloading the driver and removing the device, we
2788	 * could have a garbage pointer. Duplicate the string.
2789	 */
2790	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2791
2792	if (!name)
2793		return -ENOMEM;
2794
2795	/* Disallow mapping device memory via /dev/mem completely. */
2796	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2797						   name, IORESOURCE_SYSTEM_RAM |
2798						   IORESOURCE_EXCLUSIVE);
2799	if (!vm->parent_resource) {
2800		kfree(name);
2801		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2802		dev_info(&vm->vdev->dev,
2803			 "reloading the driver is not supported\n");
2804		return -EBUSY;
2805	}
2806
2807	/* The memory is not actually busy - make add_memory() work. */
2808	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2809	return 0;
2810}
2811
2812static void virtio_mem_delete_resource(struct virtio_mem *vm)
2813{
2814	const char *name;
2815
2816	if (!vm->parent_resource)
2817		return;
2818
2819	name = vm->parent_resource->name;
2820	release_resource(vm->parent_resource);
2821	kfree(vm->parent_resource);
2822	kfree(name);
2823	vm->parent_resource = NULL;
2824}
2825
2826static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
2827{
2828	return 1;
2829}
2830
2831static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2832{
2833	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2834
2835	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2836				   vm->addr + vm->region_size, NULL,
2837				   virtio_mem_range_has_system_ram) == 1;
2838}
2839
2840static int virtio_mem_probe(struct virtio_device *vdev)
2841{
2842	struct virtio_mem *vm;
2843	int rc;
2844
2845	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2846	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2847
2848	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2849	if (!vm)
2850		return -ENOMEM;
2851
2852	init_waitqueue_head(&vm->host_resp);
2853	vm->vdev = vdev;
2854	INIT_WORK(&vm->wq, virtio_mem_run_wq);
2855	mutex_init(&vm->hotplug_mutex);
2856	INIT_LIST_HEAD(&vm->next);
2857	spin_lock_init(&vm->removal_lock);
2858	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2859	vm->retry_timer.function = virtio_mem_timer_expired;
2860	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2861	vm->in_kdump = is_kdump_kernel();
2862
2863	/* register the virtqueue */
2864	rc = virtio_mem_init_vq(vm);
2865	if (rc)
2866		goto out_free_vm;
2867
2868	/* initialize the device by querying the config */
2869	rc = virtio_mem_init(vm);
2870	if (rc)
2871		goto out_del_vq;
2872
2873	virtio_device_ready(vdev);
2874
2875	/* trigger a config update to start processing the requested_size */
2876	if (!vm->in_kdump) {
2877		atomic_set(&vm->config_changed, 1);
2878		queue_work(system_freezable_wq, &vm->wq);
2879	}
2880
2881	return 0;
2882out_del_vq:
2883	vdev->config->del_vqs(vdev);
2884out_free_vm:
2885	kfree(vm);
2886	vdev->priv = NULL;
2887
2888	return rc;
2889}
2890
2891static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
2892{
2893	unsigned long mb_id;
2894	int rc;
2895
2896	/*
2897	 * Make sure the workqueue won't be triggered anymore and no memory
2898	 * blocks can be onlined/offlined until we're finished here.
2899	 */
2900	mutex_lock(&vm->hotplug_mutex);
2901	spin_lock_irq(&vm->removal_lock);
2902	vm->removing = true;
2903	spin_unlock_irq(&vm->removal_lock);
2904	mutex_unlock(&vm->hotplug_mutex);
2905
2906	/* wait until the workqueue has stopped */
2907	cancel_work_sync(&vm->wq);
2908	hrtimer_cancel(&vm->retry_timer);
2909
2910	if (vm->in_sbm) {
2911		/*
2912		 * After we unregistered our callbacks, user space can online
2913		 * partially plugged offline blocks. Make sure to remove them.
2914		 */
2915		virtio_mem_sbm_for_each_mb(vm, mb_id,
2916					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
2917			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
2918			BUG_ON(rc);
2919			virtio_mem_sbm_set_mb_state(vm, mb_id,
2920						    VIRTIO_MEM_SBM_MB_UNUSED);
2921		}
2922		/*
2923		 * After we unregistered our callbacks, user space can no longer
2924		 * offline partially plugged online memory blocks. No need to
2925		 * worry about them.
2926		 */
2927	}
2928
2929	/* unregister callbacks */
2930	unregister_virtio_mem_device(vm);
2931	unregister_pm_notifier(&vm->pm_notifier);
2932	unregister_memory_notifier(&vm->memory_notifier);
2933
2934	/*
2935	 * There is no way we could reliably remove all memory we have added to
2936	 * the system. And there is no way to stop the driver/device from going
2937	 * away. Warn at least.
2938	 */
2939	if (virtio_mem_has_memory_added(vm)) {
2940		dev_warn(&vm->vdev->dev,
2941			 "device still has system memory added\n");
2942	} else {
2943		virtio_mem_delete_resource(vm);
2944		kfree_const(vm->resource_name);
2945		memory_group_unregister(vm->mgid);
2946	}
2947
2948	/* remove all tracking data - no locking needed */
2949	if (vm->in_sbm) {
2950		vfree(vm->sbm.mb_states);
2951		vfree(vm->sbm.sb_states);
2952	} else {
2953		vfree(vm->bbm.bb_states);
2954	}
2955}
2956
2957static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
2958{
2959#ifdef CONFIG_PROC_VMCORE
2960	unregister_vmcore_cb(&vm->vmcore_cb);
2961#endif /* CONFIG_PROC_VMCORE */
2962}
2963
2964static void virtio_mem_remove(struct virtio_device *vdev)
2965{
2966	struct virtio_mem *vm = vdev->priv;
2967
2968	if (vm->in_kdump)
2969		virtio_mem_deinit_kdump(vm);
2970	else
2971		virtio_mem_deinit_hotplug(vm);
2972
2973	/* reset the device and cleanup the queues */
2974	virtio_reset_device(vdev);
2975	vdev->config->del_vqs(vdev);
2976
2977	kfree(vm);
2978	vdev->priv = NULL;
2979}
2980
2981static void virtio_mem_config_changed(struct virtio_device *vdev)
2982{
2983	struct virtio_mem *vm = vdev->priv;
2984
2985	if (unlikely(vm->in_kdump))
2986		return;
2987
2988	atomic_set(&vm->config_changed, 1);
2989	virtio_mem_retry(vm);
2990}
2991
2992#ifdef CONFIG_PM_SLEEP
2993static int virtio_mem_freeze(struct virtio_device *vdev)
2994{
2995	struct virtio_mem *vm = vdev->priv;
2996
2997	/*
2998	 * We block hibernation using the PM notifier completely. The workqueue
2999	 * is already frozen by the PM core at this point, so we simply
3000	 * reset the device and cleanup the queues.
3001	 */
3002	if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE &&
3003	    vm->plugged_size &&
3004	    !virtio_has_feature(vm->vdev, VIRTIO_MEM_F_PERSISTENT_SUSPEND)) {
3005		dev_err(&vm->vdev->dev,
3006			"suspending with plugged memory is not supported\n");
3007		return -EPERM;
3008	}
3009
3010	virtio_reset_device(vdev);
3011	vdev->config->del_vqs(vdev);
3012	vm->vq = NULL;
3013	return 0;
3014}
3015
3016static int virtio_mem_restore(struct virtio_device *vdev)
3017{
3018	struct virtio_mem *vm = vdev->priv;
3019	int ret;
3020
3021	ret = virtio_mem_init_vq(vm);
3022	if (ret)
3023		return ret;
3024	virtio_device_ready(vdev);
3025
3026	/* Let's check if anything changed. */
3027	virtio_mem_config_changed(vdev);
3028	return 0;
3029}
3030#endif
3031
3032static unsigned int virtio_mem_features[] = {
3033#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
3034	VIRTIO_MEM_F_ACPI_PXM,
3035#endif
3036	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
3037	VIRTIO_MEM_F_PERSISTENT_SUSPEND,
3038};
3039
3040static const struct virtio_device_id virtio_mem_id_table[] = {
3041	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
3042	{ 0 },
3043};
3044
3045static struct virtio_driver virtio_mem_driver = {
3046	.feature_table = virtio_mem_features,
3047	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
3048	.driver.name = KBUILD_MODNAME,
3049	.id_table = virtio_mem_id_table,
3050	.probe = virtio_mem_probe,
3051	.remove = virtio_mem_remove,
3052	.config_changed = virtio_mem_config_changed,
3053#ifdef CONFIG_PM_SLEEP
3054	.freeze	=	virtio_mem_freeze,
3055	.restore =	virtio_mem_restore,
3056#endif
3057};
3058
3059module_virtio_driver(virtio_mem_driver);
3060MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
3061MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
3062MODULE_DESCRIPTION("Virtio-mem driver");
3063MODULE_LICENSE("GPL");
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Virtio-mem device driver.
   4 *
   5 * Copyright Red Hat, Inc. 2020
   6 *
   7 * Author(s): David Hildenbrand <david@redhat.com>
   8 */
   9
  10#include <linux/virtio.h>
  11#include <linux/virtio_mem.h>
  12#include <linux/workqueue.h>
  13#include <linux/slab.h>
  14#include <linux/module.h>
  15#include <linux/mm.h>
  16#include <linux/memory_hotplug.h>
  17#include <linux/memory.h>
  18#include <linux/hrtimer.h>
  19#include <linux/crash_dump.h>
  20#include <linux/mutex.h>
  21#include <linux/bitmap.h>
  22#include <linux/lockdep.h>
 
 
 
  23
  24#include <acpi/acpi_numa.h>
  25
  26static bool unplug_online = true;
  27module_param(unplug_online, bool, 0644);
  28MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
  29
  30static bool force_bbm;
  31module_param(force_bbm, bool, 0444);
  32MODULE_PARM_DESC(force_bbm,
  33		"Force Big Block Mode. Default is 0 (auto-selection)");
  34
  35static unsigned long bbm_block_size;
  36module_param(bbm_block_size, ulong, 0444);
  37MODULE_PARM_DESC(bbm_block_size,
  38		 "Big Block size in bytes. Default is 0 (auto-detection).");
  39
  40static bool bbm_safe_unplug = true;
  41module_param(bbm_safe_unplug, bool, 0444);
  42MODULE_PARM_DESC(bbm_safe_unplug,
  43	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
  44
  45/*
  46 * virtio-mem currently supports the following modes of operation:
  47 *
  48 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
  49 *   size of a Sub Block (SB) is determined based on the device block size, the
  50 *   pageblock size, and the maximum allocation granularity of the buddy.
  51 *   Subblocks within a Linux memory block might either be plugged or unplugged.
  52 *   Memory is added/removed to Linux MM in Linux memory block granularity.
  53 *
  54 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
  55 *   Memory is added/removed to Linux MM in Big Block granularity.
  56 *
  57 * The mode is determined automatically based on the Linux memory block size
  58 * and the device block size.
  59 *
  60 * User space / core MM (auto onlining) is responsible for onlining added
  61 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
  62 * always onlined separately, and all memory within a Linux memory block is
  63 * onlined to the same zone - virtio-mem relies on this behavior.
  64 */
  65
  66/*
  67 * State of a Linux memory block in SBM.
  68 */
  69enum virtio_mem_sbm_mb_state {
  70	/* Unplugged, not added to Linux. Can be reused later. */
  71	VIRTIO_MEM_SBM_MB_UNUSED = 0,
  72	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
  73	VIRTIO_MEM_SBM_MB_PLUGGED,
  74	/* Fully plugged, fully added to Linux, offline. */
  75	VIRTIO_MEM_SBM_MB_OFFLINE,
  76	/* Partially plugged, fully added to Linux, offline. */
  77	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
  78	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
  79	VIRTIO_MEM_SBM_MB_KERNEL,
  80	/* Partially plugged, fully added to Linux, online to a kernel zone */
  81	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
  82	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
  83	VIRTIO_MEM_SBM_MB_MOVABLE,
  84	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
  85	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
  86	VIRTIO_MEM_SBM_MB_COUNT
  87};
  88
  89/*
  90 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
  91 */
  92enum virtio_mem_bbm_bb_state {
  93	/* Unplugged, not added to Linux. Can be reused later. */
  94	VIRTIO_MEM_BBM_BB_UNUSED = 0,
  95	/* Plugged, not added to Linux. Error on add_memory(). */
  96	VIRTIO_MEM_BBM_BB_PLUGGED,
  97	/* Plugged and added to Linux. */
  98	VIRTIO_MEM_BBM_BB_ADDED,
  99	/* All online parts are fake-offline, ready to remove. */
 100	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
 101	VIRTIO_MEM_BBM_BB_COUNT
 102};
 103
 104struct virtio_mem {
 105	struct virtio_device *vdev;
 106
 107	/* We might first have to unplug all memory when starting up. */
 108	bool unplug_all_required;
 109
 110	/* Workqueue that processes the plug/unplug requests. */
 111	struct work_struct wq;
 112	atomic_t wq_active;
 113	atomic_t config_changed;
 114
 115	/* Virtqueue for guest->host requests. */
 116	struct virtqueue *vq;
 117
 118	/* Wait for a host response to a guest request. */
 119	wait_queue_head_t host_resp;
 120
 121	/* Space for one guest request and the host response. */
 122	struct virtio_mem_req req;
 123	struct virtio_mem_resp resp;
 124
 125	/* The current size of the device. */
 126	uint64_t plugged_size;
 127	/* The requested size of the device. */
 128	uint64_t requested_size;
 129
 130	/* The device block size (for communicating with the device). */
 131	uint64_t device_block_size;
 132	/* The determined node id for all memory of the device. */
 133	int nid;
 134	/* Physical start address of the memory region. */
 135	uint64_t addr;
 136	/* Maximum region size in bytes. */
 137	uint64_t region_size;
 138
 139	/* The parent resource for all memory added via this device. */
 140	struct resource *parent_resource;
 141	/*
 142	 * Copy of "System RAM (virtio_mem)" to be used for
 143	 * add_memory_driver_managed().
 144	 */
 145	const char *resource_name;
 
 
 146
 147	/*
 148	 * We don't want to add too much memory if it's not getting onlined,
 149	 * to avoid running OOM. Besides this threshold, we allow to have at
 150	 * least two offline blocks at a time (whatever is bigger).
 151	 */
 152#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
 153	atomic64_t offline_size;
 154	uint64_t offline_threshold;
 155
 156	/* If set, the driver is in SBM, otherwise in BBM. */
 157	bool in_sbm;
 158
 159	union {
 160		struct {
 161			/* Id of the first memory block of this device. */
 162			unsigned long first_mb_id;
 163			/* Id of the last usable memory block of this device. */
 164			unsigned long last_usable_mb_id;
 165			/* Id of the next memory bock to prepare when needed. */
 166			unsigned long next_mb_id;
 167
 168			/* The subblock size. */
 169			uint64_t sb_size;
 170			/* The number of subblocks per Linux memory block. */
 171			uint32_t sbs_per_mb;
 172
 
 
 
 
 
 
 
 173			/* Summary of all memory block states. */
 174			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
 175
 176			/*
 177			 * One byte state per memory block. Allocated via
 178			 * vmalloc(). Resized (alloc+copy+free) on demand.
 179			 *
 180			 * With 128 MiB memory blocks, we have states for 512
 181			 * GiB of memory in one 4 KiB page.
 182			 */
 183			uint8_t *mb_states;
 184
 185			/*
 186			 * Bitmap: one bit per subblock. Allocated similar to
 187			 * sbm.mb_states.
 188			 *
 189			 * A set bit means the corresponding subblock is
 190			 * plugged, otherwise it's unblocked.
 191			 *
 192			 * With 4 MiB subblocks, we manage 128 GiB of memory
 193			 * in one 4 KiB page.
 194			 */
 195			unsigned long *sb_states;
 196		} sbm;
 197
 198		struct {
 199			/* Id of the first big block of this device. */
 200			unsigned long first_bb_id;
 201			/* Id of the last usable big block of this device. */
 202			unsigned long last_usable_bb_id;
 203			/* Id of the next device bock to prepare when needed. */
 204			unsigned long next_bb_id;
 205
 206			/* Summary of all big block states. */
 207			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
 208
 209			/* One byte state per big block. See sbm.mb_states. */
 210			uint8_t *bb_states;
 211
 212			/* The block size used for plugging/adding/removing. */
 213			uint64_t bb_size;
 214		} bbm;
 215	};
 216
 217	/*
 218	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
 219	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
 220	 *
 221	 * When this lock is held the pointers can't change, ONLINE and
 222	 * OFFLINE blocks can't change the state and no subblocks will get
 223	 * plugged/unplugged.
 
 
 
 224	 */
 225	struct mutex hotplug_mutex;
 226	bool hotplug_active;
 227
 228	/* An error occurred we cannot handle - stop processing requests. */
 229	bool broken;
 230
 
 
 
 231	/* The driver is being removed. */
 232	spinlock_t removal_lock;
 233	bool removing;
 234
 235	/* Timer for retrying to plug/unplug memory. */
 236	struct hrtimer retry_timer;
 237	unsigned int retry_timer_ms;
 238#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
 239#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000
 240
 241	/* Memory notifier (online/offline events). */
 242	struct notifier_block memory_notifier;
 243
 
 
 
 
 
 
 
 
 
 
 244	/* Next device in the list of virtio-mem devices. */
 245	struct list_head next;
 246};
 247
 248/*
 249 * We have to share a single online_page callback among all virtio-mem
 250 * devices. We use RCU to iterate the list in the callback.
 251 */
 252static DEFINE_MUTEX(virtio_mem_mutex);
 253static LIST_HEAD(virtio_mem_devices);
 254
 255static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
 256static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
 257						  unsigned long nr_pages);
 258static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
 259						   unsigned long nr_pages);
 260static void virtio_mem_retry(struct virtio_mem *vm);
 
 
 261
 262/*
 263 * Register a virtio-mem device so it will be considered for the online_page
 264 * callback.
 265 */
 266static int register_virtio_mem_device(struct virtio_mem *vm)
 267{
 268	int rc = 0;
 269
 270	/* First device registers the callback. */
 271	mutex_lock(&virtio_mem_mutex);
 272	if (list_empty(&virtio_mem_devices))
 273		rc = set_online_page_callback(&virtio_mem_online_page_cb);
 274	if (!rc)
 275		list_add_rcu(&vm->next, &virtio_mem_devices);
 276	mutex_unlock(&virtio_mem_mutex);
 277
 278	return rc;
 279}
 280
 281/*
 282 * Unregister a virtio-mem device so it will no longer be considered for the
 283 * online_page callback.
 284 */
 285static void unregister_virtio_mem_device(struct virtio_mem *vm)
 286{
 287	/* Last device unregisters the callback. */
 288	mutex_lock(&virtio_mem_mutex);
 289	list_del_rcu(&vm->next);
 290	if (list_empty(&virtio_mem_devices))
 291		restore_online_page_callback(&virtio_mem_online_page_cb);
 292	mutex_unlock(&virtio_mem_mutex);
 293
 294	synchronize_rcu();
 295}
 296
 297/*
 298 * Calculate the memory block id of a given address.
 299 */
 300static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
 301{
 302	return addr / memory_block_size_bytes();
 303}
 304
 305/*
 306 * Calculate the physical start address of a given memory block id.
 307 */
 308static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
 309{
 310	return mb_id * memory_block_size_bytes();
 311}
 312
 313/*
 314 * Calculate the big block id of a given address.
 315 */
 316static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
 317					      uint64_t addr)
 318{
 319	return addr / vm->bbm.bb_size;
 320}
 321
 322/*
 323 * Calculate the physical start address of a given big block id.
 324 */
 325static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
 326					 unsigned long bb_id)
 327{
 328	return bb_id * vm->bbm.bb_size;
 329}
 330
 331/*
 332 * Calculate the subblock id of a given address.
 333 */
 334static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
 335					      unsigned long addr)
 336{
 337	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
 338	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
 339
 340	return (addr - mb_addr) / vm->sbm.sb_size;
 341}
 342
 343/*
 344 * Set the state of a big block, taking care of the state counter.
 345 */
 346static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
 347					unsigned long bb_id,
 348					enum virtio_mem_bbm_bb_state state)
 349{
 350	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
 351	enum virtio_mem_bbm_bb_state old_state;
 352
 353	old_state = vm->bbm.bb_states[idx];
 354	vm->bbm.bb_states[idx] = state;
 355
 356	BUG_ON(vm->bbm.bb_count[old_state] == 0);
 357	vm->bbm.bb_count[old_state]--;
 358	vm->bbm.bb_count[state]++;
 359}
 360
 361/*
 362 * Get the state of a big block.
 363 */
 364static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
 365								unsigned long bb_id)
 366{
 367	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
 368}
 369
 370/*
 371 * Prepare the big block state array for the next big block.
 372 */
 373static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
 374{
 375	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
 376	unsigned long new_bytes = old_bytes + 1;
 377	int old_pages = PFN_UP(old_bytes);
 378	int new_pages = PFN_UP(new_bytes);
 379	uint8_t *new_array;
 380
 381	if (vm->bbm.bb_states && old_pages == new_pages)
 382		return 0;
 383
 384	new_array = vzalloc(new_pages * PAGE_SIZE);
 385	if (!new_array)
 386		return -ENOMEM;
 387
 388	mutex_lock(&vm->hotplug_mutex);
 389	if (vm->bbm.bb_states)
 390		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
 391	vfree(vm->bbm.bb_states);
 392	vm->bbm.bb_states = new_array;
 393	mutex_unlock(&vm->hotplug_mutex);
 394
 395	return 0;
 396}
 397
 398#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
 399	for (_bb_id = vm->bbm.first_bb_id; \
 400	     _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
 401	     _bb_id++) \
 402		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
 403
 404#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
 405	for (_bb_id = vm->bbm.next_bb_id - 1; \
 406	     _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
 407	     _bb_id--) \
 408		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
 409
 410/*
 411 * Set the state of a memory block, taking care of the state counter.
 412 */
 413static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
 414					unsigned long mb_id, uint8_t state)
 415{
 416	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
 417	uint8_t old_state;
 418
 419	old_state = vm->sbm.mb_states[idx];
 420	vm->sbm.mb_states[idx] = state;
 421
 422	BUG_ON(vm->sbm.mb_count[old_state] == 0);
 423	vm->sbm.mb_count[old_state]--;
 424	vm->sbm.mb_count[state]++;
 425}
 426
 427/*
 428 * Get the state of a memory block.
 429 */
 430static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
 431					   unsigned long mb_id)
 432{
 433	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
 434
 435	return vm->sbm.mb_states[idx];
 436}
 437
 438/*
 439 * Prepare the state array for the next memory block.
 440 */
 441static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
 442{
 443	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
 444	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
 445	uint8_t *new_array;
 446
 447	if (vm->sbm.mb_states && old_pages == new_pages)
 448		return 0;
 449
 450	new_array = vzalloc(new_pages * PAGE_SIZE);
 451	if (!new_array)
 452		return -ENOMEM;
 453
 454	mutex_lock(&vm->hotplug_mutex);
 455	if (vm->sbm.mb_states)
 456		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
 457	vfree(vm->sbm.mb_states);
 458	vm->sbm.mb_states = new_array;
 459	mutex_unlock(&vm->hotplug_mutex);
 460
 461	return 0;
 462}
 463
 464#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
 465	for (_mb_id = _vm->sbm.first_mb_id; \
 466	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
 467	     _mb_id++) \
 468		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
 469
 470#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
 471	for (_mb_id = _vm->sbm.next_mb_id - 1; \
 472	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
 473	     _mb_id--) \
 474		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
 475
 476/*
 477 * Calculate the bit number in the subblock bitmap for the given subblock
 478 * inside the given memory block.
 479 */
 480static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
 481					  unsigned long mb_id, int sb_id)
 482{
 483	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
 484}
 485
 486/*
 487 * Mark all selected subblocks plugged.
 488 *
 489 * Will not modify the state of the memory block.
 490 */
 491static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
 492					  unsigned long mb_id, int sb_id,
 493					  int count)
 494{
 495	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 496
 497	__bitmap_set(vm->sbm.sb_states, bit, count);
 498}
 499
 500/*
 501 * Mark all selected subblocks unplugged.
 502 *
 503 * Will not modify the state of the memory block.
 504 */
 505static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
 506					    unsigned long mb_id, int sb_id,
 507					    int count)
 508{
 509	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 510
 511	__bitmap_clear(vm->sbm.sb_states, bit, count);
 512}
 513
 514/*
 515 * Test if all selected subblocks are plugged.
 516 */
 517static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
 518					   unsigned long mb_id, int sb_id,
 519					   int count)
 520{
 521	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 522
 523	if (count == 1)
 524		return test_bit(bit, vm->sbm.sb_states);
 525
 526	/* TODO: Helper similar to bitmap_set() */
 527	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
 528	       bit + count;
 529}
 530
 531/*
 532 * Test if all selected subblocks are unplugged.
 533 */
 534static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
 535					     unsigned long mb_id, int sb_id,
 536					     int count)
 537{
 538	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 539
 540	/* TODO: Helper similar to bitmap_set() */
 541	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
 542	       bit + count;
 543}
 544
 545/*
 546 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
 547 * none.
 548 */
 549static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
 550					    unsigned long mb_id)
 551{
 552	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
 553
 554	return find_next_zero_bit(vm->sbm.sb_states,
 555				  bit + vm->sbm.sbs_per_mb, bit) - bit;
 556}
 557
 558/*
 559 * Prepare the subblock bitmap for the next memory block.
 560 */
 561static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
 562{
 563	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
 564	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
 565	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
 566	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
 567	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
 568	unsigned long *new_bitmap, *old_bitmap;
 569
 570	if (vm->sbm.sb_states && old_pages == new_pages)
 571		return 0;
 572
 573	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
 574	if (!new_bitmap)
 575		return -ENOMEM;
 576
 577	mutex_lock(&vm->hotplug_mutex);
 578	if (vm->sbm.sb_states)
 579		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
 580
 581	old_bitmap = vm->sbm.sb_states;
 582	vm->sbm.sb_states = new_bitmap;
 583	mutex_unlock(&vm->hotplug_mutex);
 584
 585	vfree(old_bitmap);
 586	return 0;
 587}
 588
 589/*
 590 * Test if we could add memory without creating too much offline memory -
 591 * to avoid running OOM if memory is getting onlined deferred.
 592 */
 593static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
 594{
 595	if (WARN_ON_ONCE(size > vm->offline_threshold))
 596		return false;
 597
 598	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
 599}
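/*
 * Example (illustrative numbers): with a 1 GiB offline_threshold and
 * 768 MiB currently offline, adding a 128 MiB memory block is allowed
 * (896 MiB <= 1 GiB), while adding a 512 MiB big block is deferred until
 * more memory was onlined.
 */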
 600
 601/*
 602 * Try adding memory to Linux. Will usually only fail if out of memory.
 603 *
 604 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 605 * onlining code).
 606 *
 607 * Will not modify the state of memory blocks in virtio-mem.
 608 */
 609static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
 610				 uint64_t size)
 611{
 612	int rc;
 613
 614	/*
 615	 * When force-unloading the driver and we still have memory added to
 616	 * Linux, the resource name has to stay.
 617	 */
 618	if (!vm->resource_name) {
 619		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
 620						  GFP_KERNEL);
 621		if (!vm->resource_name)
 622			return -ENOMEM;
 623	}
 624
 625	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
 626		addr + size - 1);
 627	/* Memory might get onlined immediately. */
 628	atomic64_add(size, &vm->offline_size);
 629	rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
 630				       MHP_MERGE_RESOURCE);
 631	if (rc) {
 632		atomic64_sub(size, &vm->offline_size);
 633		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
 634		/*
 635		 * TODO: Linux MM does not properly clean up yet in all cases
 636		 * where adding of memory failed - especially on -ENOMEM.
 637		 */
 638	}
 639	return rc;
 640}
 641
 642/*
 643 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
 644 */
 645static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
 646{
 647	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 648	const uint64_t size = memory_block_size_bytes();
 649
 650	return virtio_mem_add_memory(vm, addr, size);
 651}
 652
 653/*
 654 * See virtio_mem_add_memory(): Try adding a big block.
 655 */
 656static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
 657{
 658	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
 659	const uint64_t size = vm->bbm.bb_size;
 660
 661	return virtio_mem_add_memory(vm, addr, size);
 662}
 663
 664/*
 665 * Try removing memory from Linux. Will only fail if memory blocks aren't
 666 * offline.
 667 *
 668 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 669 * onlining code).
 670 *
 671 * Will not modify the state of memory blocks in virtio-mem.
 672 */
 673static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
 674				    uint64_t size)
 675{
 676	int rc;
 677
 678	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
 679		addr + size - 1);
 680	rc = remove_memory(vm->nid, addr, size);
 681	if (!rc) {
 682		atomic64_sub(size, &vm->offline_size);
 683		/*
 684		 * We might have freed up memory we can now unplug, retry
 685		 * immediately instead of waiting.
 686		 */
 687		virtio_mem_retry(vm);
 688	} else {
 689		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
 690	}
 691	return rc;
 692}
 693
 694/*
 695 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
 696 */
 697static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
 698{
 699	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 700	const uint64_t size = memory_block_size_bytes();
 701
 702	return virtio_mem_remove_memory(vm, addr, size);
 703}
 704
 705/*
 706 * Try offlining and removing memory from Linux.
 707 *
 708 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 709 * onlining code).
 710 *
 711 * Will not modify the state of memory blocks in virtio-mem.
 712 */
 713static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
 714						uint64_t addr,
 715						uint64_t size)
 716{
 717	int rc;
 718
 719	dev_dbg(&vm->vdev->dev,
 720		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
 721		addr + size - 1);
 722
 723	rc = offline_and_remove_memory(vm->nid, addr, size);
 724	if (!rc) {
 725		atomic64_sub(size, &vm->offline_size);
 726		/*
 727		 * We might have freed up memory we can now unplug, retry
 728		 * immediately instead of waiting.
 729		 */
 730		virtio_mem_retry(vm);
 731	} else {
 732		dev_dbg(&vm->vdev->dev,
 733			"offlining and removing memory failed: %d\n", rc);
 734	}
 735	return rc;
 736}
 737
 738/*
 739 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 740 * a single Linux memory block.
 741 */
 742static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
 743						unsigned long mb_id)
 744{
 745	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 746	const uint64_t size = memory_block_size_bytes();
 747
 748	return virtio_mem_offline_and_remove_memory(vm, addr, size);
 749}
 750
 751/*
 752 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
 753 * all Linux memory blocks covered by the big block.
 754 */
 755static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
 756						unsigned long bb_id)
 757{
 758	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
 759	const uint64_t size = vm->bbm.bb_size;
 760
 761	return virtio_mem_offline_and_remove_memory(vm, addr, size);
 762}
 763
 764/*
 765 * Trigger the workqueue so the device can perform its magic.
 766 */
 767static void virtio_mem_retry(struct virtio_mem *vm)
 768{
 769	unsigned long flags;
 770
 771	spin_lock_irqsave(&vm->removal_lock, flags);
 772	if (!vm->removing)
 773		queue_work(system_freezable_wq, &vm->wq);
 774	spin_unlock_irqrestore(&vm->removal_lock, flags);
 775}
 776
 777static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
 778{
 779	int node = NUMA_NO_NODE;
 780
 781#if defined(CONFIG_ACPI_NUMA)
 782	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
 783		node = pxm_to_node(node_id);
 784#endif
 785	return node;
 786}
 787
 788/*
 789 * Test if a virtio-mem device overlaps with the given range. Can be called
 790 * from (notifier) callbacks lockless.
 791 */
 792static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
 793				      uint64_t size)
 794{
 795	return start < vm->addr + vm->region_size && vm->addr < start + size;
 796}
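/*
 * Note (added for clarity): this is the usual half-open interval
 * intersection test - [start, start + size) intersects
 * [vm->addr, vm->addr + vm->region_size) iff each range starts before the
 * other one ends.
 */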
 797
 798/*
 799 * Test if a virtio-mem device contains a given range. Can be called from
 800 * (notifier) callbacks lockless.
 801 */
 802static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
 803				      uint64_t size)
 804{
 805	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
 806}
 807
 808static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
 809					      unsigned long mb_id)
 810{
 811	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 812	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
 813	case VIRTIO_MEM_SBM_MB_OFFLINE:
 814		return NOTIFY_OK;
 815	default:
 816		break;
 817	}
 818	dev_warn_ratelimited(&vm->vdev->dev,
 819			     "memory block onlining denied\n");
 820	return NOTIFY_BAD;
 821}
 822
 823static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
 824					  unsigned long mb_id)
 825{
 826	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 827	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
 828	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
 829		virtio_mem_sbm_set_mb_state(vm, mb_id,
 830					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
 831		break;
 832	case VIRTIO_MEM_SBM_MB_KERNEL:
 833	case VIRTIO_MEM_SBM_MB_MOVABLE:
 834		virtio_mem_sbm_set_mb_state(vm, mb_id,
 835					    VIRTIO_MEM_SBM_MB_OFFLINE);
 836		break;
 837	default:
 838		BUG();
 839		break;
 840	}
 841}
 842
 843static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
 844					 unsigned long mb_id,
 845					 unsigned long start_pfn)
 846{
 847	const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) ==
 848				ZONE_MOVABLE;
 849	int new_state;
 850
 851	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 852	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
 853		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
 854		if (is_movable)
 855			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
 856		break;
 857	case VIRTIO_MEM_SBM_MB_OFFLINE:
 858		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
 859		if (is_movable)
 860			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
 861		break;
 862	default:
 863		BUG();
 864		break;
 865	}
 866	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
 867}
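/*
 * Summary of the transitions above: OFFLINE(_PARTIAL) moves to
 * KERNEL(_PARTIAL) or MOVABLE(_PARTIAL), depending only on the zone of
 * the memory block's first onlined page (start_pfn).
 */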
 868
 869static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
 870						unsigned long mb_id)
 871{
 872	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
 873	unsigned long pfn;
 874	int sb_id;
 875
 876	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
 877		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
 878			continue;
 879		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 880			       sb_id * vm->sbm.sb_size);
 881		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
 882	}
 883}
 884
 885static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
 886						 unsigned long mb_id)
 887{
 888	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
 889	unsigned long pfn;
 890	int sb_id;
 891
 892	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
 893		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
 894			continue;
 895		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 896			       sb_id * vm->sbm.sb_size);
 897		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
 898	}
 899}
 900
 901static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
 902						unsigned long bb_id,
 903						unsigned long pfn,
 904						unsigned long nr_pages)
 905{
 906	/*
 907	 * When marked as "fake-offline", all online memory of this device block
 908	 * is allocated by us. Otherwise, we don't have any memory allocated.
 909	 */
 910	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
 911	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
 912		return;
 913	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
 914}
 915
 916static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
 917						 unsigned long bb_id,
 918						 unsigned long pfn,
 919						 unsigned long nr_pages)
 920{
 921	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
 922	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
 923		return;
 924	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
 925}
 926
 927/*
 928 * This callback will either be called synchronously from add_memory() or
 929 * asynchronously (e.g., triggered via user space). We have to be careful
 930 * with locking when calling add_memory().
 931 */
 932static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
 933					 unsigned long action, void *arg)
 934{
 935	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
 936					     memory_notifier);
 937	struct memory_notify *mhp = arg;
 938	const unsigned long start = PFN_PHYS(mhp->start_pfn);
 939	const unsigned long size = PFN_PHYS(mhp->nr_pages);
 940	int rc = NOTIFY_OK;
 941	unsigned long id;
 942
 943	if (!virtio_mem_overlaps_range(vm, start, size))
 944		return NOTIFY_DONE;
 945
 946	if (vm->in_sbm) {
 947		id = virtio_mem_phys_to_mb_id(start);
 948		/*
 949		 * In SBM, we add memory in separate memory blocks - we expect
 950		 * it to be onlined/offlined in the same granularity. Bail out
 951		 * if this ever changes.
 952		 */
 953		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
 954				 !IS_ALIGNED(start, memory_block_size_bytes())))
 955			return NOTIFY_BAD;
 956	} else {
 957		id = virtio_mem_phys_to_bb_id(vm, start);
 958		/*
 959		 * In BBM, we only care about onlining/offlining happening
 960		 * within a single big block, we don't care about the
 961		 * actual granularity as we don't track individual Linux
 962		 * memory blocks.
 963		 */
 964		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
 965			return NOTIFY_BAD;
 966	}
 967
 968	/*
 969	 * Avoid circular locking lockdep warnings. We lock the mutex
 970	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
 971	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
 972	 * between both notifier calls and will bail out. False positive.
 973	 */
 974	lockdep_off();
 975
 976	switch (action) {
 977	case MEM_GOING_OFFLINE:
 978		mutex_lock(&vm->hotplug_mutex);
 979		if (vm->removing) {
 980			rc = notifier_from_errno(-EBUSY);
 981			mutex_unlock(&vm->hotplug_mutex);
 982			break;
 983		}
 984		vm->hotplug_active = true;
 985		if (vm->in_sbm)
 986			virtio_mem_sbm_notify_going_offline(vm, id);
 987		else
 988			virtio_mem_bbm_notify_going_offline(vm, id,
 989							    mhp->start_pfn,
 990							    mhp->nr_pages);
 991		break;
 992	case MEM_GOING_ONLINE:
 993		mutex_lock(&vm->hotplug_mutex);
 994		if (vm->removing) {
 995			rc = notifier_from_errno(-EBUSY);
 996			mutex_unlock(&vm->hotplug_mutex);
 997			break;
 998		}
 999		vm->hotplug_active = true;
1000		if (vm->in_sbm)
1001			rc = virtio_mem_sbm_notify_going_online(vm, id);
1002		break;
1003	case MEM_OFFLINE:
1004		if (vm->in_sbm)
1005			virtio_mem_sbm_notify_offline(vm, id);
1006
1007		atomic64_add(size, &vm->offline_size);
1008		/*
1009		 * Trigger the workqueue. Now that we have some offline memory,
1010		 * maybe we can handle pending unplug requests.
1011		 */
1012		if (!unplug_online)
1013			virtio_mem_retry(vm);
1014
1015		vm->hotplug_active = false;
1016		mutex_unlock(&vm->hotplug_mutex);
1017		break;
1018	case MEM_ONLINE:
1019		if (vm->in_sbm)
1020			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
1021
1022		atomic64_sub(size, &vm->offline_size);
1023		/*
1024		 * Start adding more memory once we onlined half of our
1025		 * threshold. Don't trigger if it's possibly due to our action
1026		 * (e.g., us adding memory which gets onlined immediately from
1027		 * the core).
1028		 */
1029		if (!atomic_read(&vm->wq_active) &&
1030		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
1031			virtio_mem_retry(vm);
1032
1033		vm->hotplug_active = false;
1034		mutex_unlock(&vm->hotplug_mutex);
1035		break;
1036	case MEM_CANCEL_OFFLINE:
1037		if (!vm->hotplug_active)
1038			break;
1039		if (vm->in_sbm)
1040			virtio_mem_sbm_notify_cancel_offline(vm, id);
1041		else
1042			virtio_mem_bbm_notify_cancel_offline(vm, id,
1043							     mhp->start_pfn,
1044							     mhp->nr_pages);
1045		vm->hotplug_active = false;
1046		mutex_unlock(&vm->hotplug_mutex);
1047		break;
1048	case MEM_CANCEL_ONLINE:
1049		if (!vm->hotplug_active)
1050			break;
1051		vm->hotplug_active = false;
1052		mutex_unlock(&vm->hotplug_mutex);
1053		break;
1054	default:
1055		break;
1056	}
1057
1058	lockdep_on();
1059
1060	return rc;
1061}
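/*
 * Note (added for clarity): the hotplug_mutex is taken in MEM_GOING_ONLINE /
 * MEM_GOING_OFFLINE and released again in the corresponding MEM_ONLINE /
 * MEM_OFFLINE / MEM_CANCEL_* events - which is why lockdep tracking is
 * temporarily disabled in this callback.
 */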
1062
1063/*
1064 * Set a range of pages PG_offline. Remember pages that were never onlined
1065 * (via generic_online_page()) using PageDirty().
1066 */
1067static void virtio_mem_set_fake_offline(unsigned long pfn,
1068					unsigned long nr_pages, bool onlined)
1069{
1070	page_offline_begin();
1071	for (; nr_pages--; pfn++) {
1072		struct page *page = pfn_to_page(pfn);
1073
1074		__SetPageOffline(page);
1075		if (!onlined) {
1076			SetPageDirty(page);
1077			/* FIXME: remove after cleanups */
1078			ClearPageReserved(page);
1079		}
1080	}
1081	page_offline_end();
1082}
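/*
 * Note (added for clarity): PageDirty() is (ab)used here purely as a marker
 * for "never handed to the buddy"; virtio_mem_fake_online() uses it to
 * decide between generic_online_page() and free_contig_range().
 */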
1083
1084/*
1085 * Clear PG_offline from a range of pages. If the pages were never onlined,
1086 * (via generic_online_page()), clear PageDirty().
1087 */
1088static void virtio_mem_clear_fake_offline(unsigned long pfn,
1089					  unsigned long nr_pages, bool onlined)
1090{
1091	for (; nr_pages--; pfn++) {
1092		struct page *page = pfn_to_page(pfn);
1093
1094		__ClearPageOffline(page);
1095		if (!onlined)
1096			ClearPageDirty(page);
1097	}
1098}
1099
1100/*
1101 * Release a range of fake-offline pages to the buddy, effectively
1102 * fake-onlining them.
1103 */
1104static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
1105{
1106	const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
1107	unsigned long i;
1108
1109	/*
1110	 * We are always called at least with MAX_ORDER_NR_PAGES
1111	 * granularity/alignment (e.g., the way subblocks work). All pages
1112	 * inside such a block are alike.
1113	 */
1114	for (i = 0; i < nr_pages; i += max_nr_pages) {
1115		struct page *page = pfn_to_page(pfn + i);
1116
1117		/*
1118		 * If the page is PageDirty(), it was kept fake-offline when
1119		 * onlining the memory block. Otherwise, it was allocated
1120		 * using alloc_contig_range(). All pages in a subblock are
1121		 * alike.
1122		 */
1123		if (PageDirty(page)) {
1124			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
1125						      false);
1126			generic_online_page(page, MAX_ORDER - 1);
1127		} else {
1128			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
1129						      true);
1130			free_contig_range(pfn + i, max_nr_pages);
1131			adjust_managed_page_count(page, max_nr_pages);
1132		}
1133	}
1134}
1135
1136/*
1137 * Try to allocate a range, marking pages fake-offline, effectively
1138 * fake-offlining them.
1139 */
1140static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
1141{
1142	const bool is_movable = page_zonenum(pfn_to_page(pfn)) ==
1143				ZONE_MOVABLE;
1144	int rc, retry_count;
1145
1146	/*
1147	 * TODO: We want an alloc_contig_range() mode that tries to allocate
1148	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
1149	 * with ZONE_MOVABLE. So for now, retry a couple of times with
1150	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
1151	 * some guarantees.
1152	 */
1153	for (retry_count = 0; retry_count < 5; retry_count++) {
1154		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
1155					GFP_KERNEL);
1156		if (rc == -ENOMEM)
1157			/* whoops, out of memory */
1158			return rc;
1159		else if (rc && !is_movable)
1160			break;
1161		else if (rc)
1162			continue;
1163
1164		virtio_mem_set_fake_offline(pfn, nr_pages, true);
1165		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1166		return 0;
1167	}
1168
1169	return -EBUSY;
1170}
1171
1172/*
1173 * Handle fake-offline pages when memory is going offline - such that the
1174 * pages can be skipped by mm-core when offlining.
1175 */
1176static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
1177						  unsigned long nr_pages)
1178{
1179	struct page *page;
1180	unsigned long i;
1181
1182	/*
1183	 * Drop our reference to the pages so the memory can get offlined
1184	 * and add the unplugged pages to the managed page counters (so
1185	 * offlining code can correctly subtract them again).
1186	 */
1187	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
1188	/* Drop our reference to the pages so the memory can get offlined. */
1189	for (i = 0; i < nr_pages; i++) {
1190		page = pfn_to_page(pfn + i);
1191		if (WARN_ON(!page_ref_dec_and_test(page)))
1192			dump_page(page, "fake-offline page referenced");
1193	}
1194}
1195
1196/*
1197 * Handle fake-offline pages when memory offlining is canceled - to undo
1198 * what we did in virtio_mem_fake_offline_going_offline().
1199 */
1200static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
1201						   unsigned long nr_pages)
1202{
1203	unsigned long i;
1204
1205	/*
1206	 * Get the reference we dropped when going offline and subtract the
1207	 * unplugged pages from the managed page counters.
1208	 */
1209	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1210	for (i = 0; i < nr_pages; i++)
1211		page_ref_inc(pfn_to_page(pfn + i));
1212}
1213
1214static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
1215{
1216	const unsigned long addr = page_to_phys(page);
1217	unsigned long id, sb_id;
1218	struct virtio_mem *vm;
1219	bool do_online;
1220
1221	rcu_read_lock();
1222	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
1223		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
1224			continue;
1225
1226		if (vm->in_sbm) {
1227			/*
1228			 * We exploit here that subblocks have at least
1229			 * MAX_ORDER_NR_PAGES size/alignment - so we cannot
1230			 * cross subblocks within one call.
1231			 */
1232			id = virtio_mem_phys_to_mb_id(addr);
1233			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
1234			do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
1235								   sb_id, 1);
1236		} else {
1237			/*
1238			 * If the whole block is marked fake offline, keep
1239			 * everything that way.
1240			 */
1241			id = virtio_mem_phys_to_bb_id(vm, addr);
1242			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
1243				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
1244		}
1245
1246		/*
1247		 * virtio_mem_set_fake_offline() might sleep, we don't need
1248		 * the device anymore. See virtio_mem_remove() how races
1249		 * between memory onlining and device removal are handled.
1250		 */
1251		rcu_read_unlock();
1252
1253		if (do_online)
1254			generic_online_page(page, order);
1255		else
1256			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
1257						    false);
1258		return;
1259	}
1260	rcu_read_unlock();
1261
1262	/* not virtio-mem memory, but e.g., a DIMM. online it */
1263	generic_online_page(page, order);
1264}
1265
1266static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
1267					const struct virtio_mem_req *req)
1268{
1269	struct scatterlist *sgs[2], sg_req, sg_resp;
1270	unsigned int len;
1271	int rc;
1272
1273	/* don't use the request residing on the stack (vaddr) */
1274	vm->req = *req;
1275
1276	/* out: buffer for request */
1277	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
1278	sgs[0] = &sg_req;
1279
1280	/* in: buffer for response */
1281	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
1282	sgs[1] = &sg_resp;
1283
1284	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
1285	if (rc < 0)
1286		return rc;
1287
1288	virtqueue_kick(vm->vq);
1289
1290	/* wait for a response */
1291	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
1292
1293	return virtio16_to_cpu(vm->vdev, vm->resp.type);
1294}
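/*
 * Note (added for clarity): there is only a single request/response buffer
 * pair (vm->req / vm->resp); requests are expected to be issued one at a
 * time from the workqueue or from the probe/remove path, never concurrently.
 */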
1295
1296static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
1297					uint64_t size)
1298{
1299	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1300	const struct virtio_mem_req req = {
1301		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
1302		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
1303		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1304	};
1305	int rc = -ENOMEM;
1306
1307	if (atomic_read(&vm->config_changed))
1308		return -EAGAIN;
1309
1310	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
1311		addr + size - 1);
1312
1313	switch (virtio_mem_send_request(vm, &req)) {
1314	case VIRTIO_MEM_RESP_ACK:
1315		vm->plugged_size += size;
1316		return 0;
1317	case VIRTIO_MEM_RESP_NACK:
1318		rc = -EAGAIN;
1319		break;
1320	case VIRTIO_MEM_RESP_BUSY:
1321		rc = -ETXTBSY;
1322		break;
1323	case VIRTIO_MEM_RESP_ERROR:
1324		rc = -EINVAL;
1325		break;
1326	default:
1327		break;
1328	}
1329
1330	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
1331	return rc;
1332}
1333
1334static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
1335					  uint64_t size)
1336{
1337	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1338	const struct virtio_mem_req req = {
1339		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
1340		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
1341		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1342	};
1343	int rc = -ENOMEM;
1344
1345	if (atomic_read(&vm->config_changed))
1346		return -EAGAIN;
1347
1348	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
1349		addr + size - 1);
1350
1351	switch (virtio_mem_send_request(vm, &req)) {
1352	case VIRTIO_MEM_RESP_ACK:
1353		vm->plugged_size -= size;
1354		return 0;
1355	case VIRTIO_MEM_RESP_BUSY:
1356		rc = -ETXTBSY;
1357		break;
1358	case VIRTIO_MEM_RESP_ERROR:
1359		rc = -EINVAL;
1360		break;
1361	default:
1362		break;
1363	}
1364
1365	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
1366	return rc;
1367}
1368
1369static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
1370{
1371	const struct virtio_mem_req req = {
1372		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
1373	};
1374	int rc = -ENOMEM;
1375
1376	dev_dbg(&vm->vdev->dev, "unplugging all memory");
1377
1378	switch (virtio_mem_send_request(vm, &req)) {
1379	case VIRTIO_MEM_RESP_ACK:
1380		vm->unplug_all_required = false;
1381		vm->plugged_size = 0;
1382		/* usable region might have shrunk */
1383		atomic_set(&vm->config_changed, 1);
1384		return 0;
1385	case VIRTIO_MEM_RESP_BUSY:
1386		rc = -ETXTBSY;
1387		break;
1388	default:
1389		break;
1390	}
1391
1392	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
1393	return rc;
1394}
1395
1396/*
1397 * Plug selected subblocks. Updates the plugged state, but not the state
1398 * of the memory block.
1399 */
1400static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
1401				  int sb_id, int count)
1402{
1403	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1404			      sb_id * vm->sbm.sb_size;
1405	const uint64_t size = count * vm->sbm.sb_size;
1406	int rc;
1407
1408	rc = virtio_mem_send_plug_request(vm, addr, size);
1409	if (!rc)
1410		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
1411	return rc;
1412}
1413
1414/*
1415 * Unplug selected subblocks. Updates the plugged state, but not the state
1416 * of the memory block.
1417 */
1418static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
1419				    int sb_id, int count)
1420{
1421	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1422			      sb_id * vm->sbm.sb_size;
1423	const uint64_t size = count * vm->sbm.sb_size;
1424	int rc;
1425
1426	rc = virtio_mem_send_unplug_request(vm, addr, size);
1427	if (!rc)
1428		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
1429	return rc;
1430}
1431
1432/*
1433 * Request to unplug a big block.
1434 *
1435 * Will not modify the state of the big block.
1436 */
1437static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
1438{
1439	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1440	const uint64_t size = vm->bbm.bb_size;
1441
1442	return virtio_mem_send_unplug_request(vm, addr, size);
1443}
1444
1445/*
1446 * Request to plug a big block.
1447 *
1448 * Will not modify the state of the big block.
1449 */
1450static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
1451{
1452	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1453	const uint64_t size = vm->bbm.bb_size;
1454
1455	return virtio_mem_send_plug_request(vm, addr, size);
1456}
1457
1458/*
1459 * Unplug the desired number of plugged subblocks of an offline or not-added
1460 * memory block. Will fail if any subblock cannot get unplugged (instead of
1461 * skipping it).
1462 *
1463 * Will not modify the state of the memory block.
1464 *
1465 * Note: can fail after some subblocks were unplugged.
1466 */
1467static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
1468					    unsigned long mb_id, uint64_t *nb_sb)
1469{
1470	int sb_id, count;
1471	int rc;
1472
1473	sb_id = vm->sbm.sbs_per_mb - 1;
1474	while (*nb_sb) {
1475		/* Find the next candidate subblock */
1476		while (sb_id >= 0 &&
1477		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
1478			sb_id--;
1479		if (sb_id < 0)
1480			break;
1481		/* Try to unplug multiple subblocks at a time */
1482		count = 1;
1483		while (count < *nb_sb && sb_id > 0 &&
1484		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
1485			count++;
1486			sb_id--;
1487		}
1488
1489		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1490		if (rc)
1491			return rc;
1492		*nb_sb -= count;
1493		sb_id--;
1494	}
1495
1496	return 0;
1497}
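/*
 * Worked example (illustrative): with sbs_per_mb == 8, plugged subblocks
 * {2,3,4,7} and *nb_sb == 3, the scan above first unplugs sb 7 alone, then
 * unplugs {3,4} as one batch, leaving sb 2 plugged and *nb_sb == 0.
 */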
1498
1499/*
1500 * Unplug all plugged subblocks of an offline or not-added memory block.
1501 *
1502 * Will not modify the state of the memory block.
1503 *
1504 * Note: can fail after some subblocks were unplugged.
1505 */
1506static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
1507{
1508	uint64_t nb_sb = vm->sbm.sbs_per_mb;
1509
1510	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
1511}
1512
1513/*
1514 * Prepare tracking data for the next memory block.
1515 */
1516static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
1517					  unsigned long *mb_id)
1518{
1519	int rc;
1520
1521	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
1522		return -ENOSPC;
1523
1524	/* Resize the state array if required. */
1525	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
1526	if (rc)
1527		return rc;
1528
1529	/* Resize the subblock bitmap if required. */
1530	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
1531	if (rc)
1532		return rc;
1533
1534	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
1535	*mb_id = vm->sbm.next_mb_id++;
1536	return 0;
1537}
1538
1539/*
1540 * Try to plug the desired number of subblocks and add the memory block
1541 * to Linux.
1542 *
1543 * Will modify the state of the memory block.
1544 */
1545static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
1546					  unsigned long mb_id, uint64_t *nb_sb)
1547{
1548	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
1549	int rc;
1550
1551	if (WARN_ON_ONCE(!count))
1552		return -EINVAL;
1553
1554	/*
1555	 * Plug the requested number of subblocks before adding it to Linux,
1556	 * so that onlining will directly online all plugged subblocks.
1557	 */
1558	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
1559	if (rc)
1560		return rc;
1561
1562	/*
1563	 * Mark the block properly offline before adding it to Linux,
1564	 * so the memory notifiers will find the block in the right state.
1565	 */
1566	if (count == vm->sbm.sbs_per_mb)
1567		virtio_mem_sbm_set_mb_state(vm, mb_id,
1568					    VIRTIO_MEM_SBM_MB_OFFLINE);
1569	else
1570		virtio_mem_sbm_set_mb_state(vm, mb_id,
1571					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1572
1573	/* Add the memory block to Linux - if that fails, try to unplug. */
1574	rc = virtio_mem_sbm_add_mb(vm, mb_id);
1575	if (rc) {
1576		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
1577
1578		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
1579			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
1580		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
1581		return rc;
1582	}
1583
1584	*nb_sb -= count;
1585	return 0;
1586}
1587
1588/*
1589 * Try to plug the desired number of subblocks of a memory block that
1590 * is already added to Linux.
1591 *
1592 * Will modify the state of the memory block.
1593 *
1594 * Note: Can fail after some subblocks were successfully plugged.
1595 */
1596static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
1597				      unsigned long mb_id, uint64_t *nb_sb)
1598{
1599	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1600	unsigned long pfn, nr_pages;
1601	int sb_id, count;
1602	int rc;
1603
1604	if (WARN_ON_ONCE(!*nb_sb))
1605		return -EINVAL;
1606
1607	while (*nb_sb) {
1608		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
1609		if (sb_id >= vm->sbm.sbs_per_mb)
1610			break;
1611		count = 1;
1612		while (count < *nb_sb &&
1613		       sb_id + count < vm->sbm.sbs_per_mb &&
1614		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
1615			count++;
1616
1617		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
1618		if (rc)
1619			return rc;
1620		*nb_sb -= count;
1621		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
1622			continue;
1623
1624		/* fake-online the pages if the memory block is online */
1625		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1626			       sb_id * vm->sbm.sb_size);
1627		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
1628		virtio_mem_fake_online(pfn, nr_pages);
1629	}
1630
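	/*
	 * Note (added for clarity): each *_PARTIAL state directly follows its
	 * fully-plugged counterpart in enum virtio_mem_sbm_mb_state, so
	 * "old_state - 1" below converts e.g. KERNEL_PARTIAL to KERNEL once
	 * all subblocks are plugged.
	 */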
1631	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1632		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1633
1634	return 0;
1635}
1636
1637static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1638{
1639	const int mb_states[] = {
1640		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1641		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1642		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1643	};
1644	uint64_t nb_sb = diff / vm->sbm.sb_size;
1645	unsigned long mb_id;
1646	int rc, i;
1647
1648	if (!nb_sb)
1649		return 0;
1650
1651	/* Don't race with onlining/offlining */
1652	mutex_lock(&vm->hotplug_mutex);
1653
1654	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
1655		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
1656			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
1657			if (rc || !nb_sb)
1658				goto out_unlock;
1659			cond_resched();
1660		}
1661	}
1662
1663	/*
1664	 * We won't be working on online/offline memory blocks from this point,
1665	 * so we can't race with memory onlining/offlining. Drop the mutex.
1666	 */
1667	mutex_unlock(&vm->hotplug_mutex);
1668
1669	/* Try to plug and add unused blocks */
1670	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
1671		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1672			return -ENOSPC;
1673
1674		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1675		if (rc || !nb_sb)
1676			return rc;
1677		cond_resched();
1678	}
1679
1680	/* Try to prepare, plug and add new blocks */
1681	while (nb_sb) {
1682		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1683			return -ENOSPC;
1684
1685		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
1686		if (rc)
1687			return rc;
1688		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1689		if (rc)
1690			return rc;
1691		cond_resched();
1692	}
1693
1694	return 0;
1695out_unlock:
1696	mutex_unlock(&vm->hotplug_mutex);
1697	return rc;
1698}
1699
1700/*
1701 * Plug a big block and add it to Linux.
1702 *
1703 * Will modify the state of the big block.
1704 */
1705static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1706					  unsigned long bb_id)
1707{
1708	int rc;
1709
1710	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1711			 VIRTIO_MEM_BBM_BB_UNUSED))
1712		return -EINVAL;
1713
1714	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1715	if (rc)
1716		return rc;
1717	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1718
1719	rc = virtio_mem_bbm_add_bb(vm, bb_id);
1720	if (rc) {
1721		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1722			virtio_mem_bbm_set_bb_state(vm, bb_id,
1723						    VIRTIO_MEM_BBM_BB_UNUSED);
1724		else
1725			/* Retry from the main loop. */
1726			virtio_mem_bbm_set_bb_state(vm, bb_id,
1727						    VIRTIO_MEM_BBM_BB_PLUGGED);
1728		return rc;
1729	}
1730	return 0;
1731}
1732
1733/*
1734 * Prepare tracking data for the next big block.
1735 */
1736static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1737					  unsigned long *bb_id)
1738{
1739	int rc;
1740
1741	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1742		return -ENOSPC;
1743
1744	/* Resize the big block state array if required. */
1745	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1746	if (rc)
1747		return rc;
1748
1749	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1750	*bb_id = vm->bbm.next_bb_id;
1751	vm->bbm.next_bb_id++;
1752	return 0;
1753}
1754
1755static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1756{
1757	uint64_t nb_bb = diff / vm->bbm.bb_size;
1758	unsigned long bb_id;
1759	int rc;
1760
1761	if (!nb_bb)
1762		return 0;
1763
1764	/* Try to plug and add unused big blocks */
1765	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1766		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1767			return -ENOSPC;
1768
1769		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1770		if (!rc)
1771			nb_bb--;
1772		if (rc || !nb_bb)
1773			return rc;
1774		cond_resched();
1775	}
1776
1777	/* Try to prepare, plug and add new big blocks */
1778	while (nb_bb) {
1779		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1780			return -ENOSPC;
1781
1782		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1783		if (rc)
1784			return rc;
1785		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1786		if (!rc)
1787			nb_bb--;
1788		if (rc)
1789			return rc;
1790		cond_resched();
1791	}
1792
1793	return 0;
1794}
1795
1796/*
1797 * Try to plug the requested amount of memory.
1798 */
1799static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1800{
1801	if (vm->in_sbm)
1802		return virtio_mem_sbm_plug_request(vm, diff);
1803	return virtio_mem_bbm_plug_request(vm, diff);
1804}
1805
1806/*
1807 * Unplug the desired number of plugged subblocks of an offline memory block.
1808 * Will fail if any subblock cannot get unplugged (instead of skipping it).
1809 *
1810 * Will modify the state of the memory block. Might temporarily drop the
1811 * hotplug_mutex.
1812 *
1813 * Note: Can fail after some subblocks were successfully unplugged.
1814 */
1815static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
1816						unsigned long mb_id,
1817						uint64_t *nb_sb)
1818{
1819	int rc;
1820
1821	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
1822
1823	/* some subblocks might have been unplugged even on failure */
1824	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1825		virtio_mem_sbm_set_mb_state(vm, mb_id,
1826					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1827	if (rc)
1828		return rc;
1829
1830	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1831		/*
1832		 * Remove the block from Linux - this should never fail.
1833		 * Hinder the block from getting onlined by marking it
1834		 * unplugged. Temporarily drop the mutex, so
1835		 * any pending GOING_ONLINE requests can be serviced/rejected.
1836		 */
1837		virtio_mem_sbm_set_mb_state(vm, mb_id,
1838					    VIRTIO_MEM_SBM_MB_UNUSED);
1839
1840		mutex_unlock(&vm->hotplug_mutex);
1841		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
1842		BUG_ON(rc);
1843		mutex_lock(&vm->hotplug_mutex);
1844	}
1845	return 0;
1846}
1847
1848/*
1849 * Unplug the given plugged subblocks of an online memory block.
1850 *
1851 * Will modify the state of the memory block.
1852 */
1853static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1854					   unsigned long mb_id, int sb_id,
1855					   int count)
1856{
1857	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1858	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1859	unsigned long start_pfn;
1860	int rc;
1861
1862	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1863			     sb_id * vm->sbm.sb_size);
1864
1865	rc = virtio_mem_fake_offline(start_pfn, nr_pages);
1866	if (rc)
1867		return rc;
1868
1869	/* Try to unplug the allocated memory */
1870	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1871	if (rc) {
1872		/* Return the memory to the buddy. */
1873		virtio_mem_fake_online(start_pfn, nr_pages);
1874		return rc;
1875	}
1876
1877	switch (old_state) {
1878	case VIRTIO_MEM_SBM_MB_KERNEL:
1879		virtio_mem_sbm_set_mb_state(vm, mb_id,
1880					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
1881		break;
1882	case VIRTIO_MEM_SBM_MB_MOVABLE:
1883		virtio_mem_sbm_set_mb_state(vm, mb_id,
1884					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
1885		break;
1886	}
1887
1888	return 0;
1889}
1890
1891/*
1892 * Unplug the desired number of plugged subblocks of an online memory block.
1893 * Will skip subblocks that are busy.
1894 *
1895 * Will modify the state of the memory block. Might temporarily drop the
1896 * hotplug_mutex.
1897 *
1898 * Note: Can fail after some subblocks were successfully unplugged. Can
1899 *       return 0 even if subblocks were busy and could not get unplugged.
1900 */
1901static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
1902					       unsigned long mb_id,
1903					       uint64_t *nb_sb)
1904{
1905	int rc, sb_id;
1906
1907	/* If possible, try to unplug the complete block in one shot. */
1908	if (*nb_sb >= vm->sbm.sbs_per_mb &&
1909	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1910		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
1911						     vm->sbm.sbs_per_mb);
1912		if (!rc) {
1913			*nb_sb -= vm->sbm.sbs_per_mb;
1914			goto unplugged;
1915		} else if (rc != -EBUSY)
1916			return rc;
1917	}
1918
1919	/* Fallback to single subblocks. */
1920	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
1921		/* Find the next candidate subblock */
1922		while (sb_id >= 0 &&
1923		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
1924			sb_id--;
1925		if (sb_id < 0)
1926			break;
1927
1928		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
1929		if (rc == -EBUSY)
1930			continue;
1931		else if (rc)
1932			return rc;
1933		*nb_sb -= 1;
1934	}
1935
1936unplugged:
1937	/*
1938	 * Once all subblocks of a memory block were unplugged, offline and
1939	 * remove it. This will usually not fail, as no memory is in use
1940	 * anymore - however some other notifiers might NACK the request.
1941	 */
1942	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1943		mutex_unlock(&vm->hotplug_mutex);
1944		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
1945		mutex_lock(&vm->hotplug_mutex);
1946		if (!rc)
1947			virtio_mem_sbm_set_mb_state(vm, mb_id,
1948						    VIRTIO_MEM_SBM_MB_UNUSED);
1949	}
1950
1951	return 0;
1952}
1953
1954/*
1955 * Unplug the desired number of plugged subblocks of a memory block that is
1956 * already added to Linux. Will skip subblocks of online memory blocks that are
1957 * busy (by the OS). Will fail if any subblock that's not busy cannot get
1958 * unplugged.
1959 *
1960 * Will modify the state of the memory block. Might temporarily drop the
1961 * hotplug_mutex.
1962 *
1963 * Note: Can fail after some subblocks were successfully unplugged. Can
1964 *       return 0 even if subblocks were busy and could not get unplugged.
1965 */
1966static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
1967					unsigned long mb_id,
1968					uint64_t *nb_sb)
1969{
1970	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1971
1972	switch (old_state) {
1973	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
1974	case VIRTIO_MEM_SBM_MB_KERNEL:
1975	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
1976	case VIRTIO_MEM_SBM_MB_MOVABLE:
1977		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
1978	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
1979	case VIRTIO_MEM_SBM_MB_OFFLINE:
1980		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
1981	}
1982	return -EINVAL;
1983}
1984
1985static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
1986{
1987	const int mb_states[] = {
1988		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1989		VIRTIO_MEM_SBM_MB_OFFLINE,
1990		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1991		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1992		VIRTIO_MEM_SBM_MB_MOVABLE,
1993		VIRTIO_MEM_SBM_MB_KERNEL,
1994	};
1995	uint64_t nb_sb = diff / vm->sbm.sb_size;
1996	unsigned long mb_id;
1997	int rc, i;
1998
1999	if (!nb_sb)
2000		return 0;
2001
2002	/*
2003	 * We'll drop the mutex a couple of times when it is safe to do so.
2004	 * This might result in some blocks switching the state (online/offline)
2005	 * and we could miss them in this run - we will retry again later.
2006	 */
2007	mutex_lock(&vm->hotplug_mutex);
2008
2009	/*
2010	 * We try unplug from partially plugged blocks first, to try removing
2011	 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
2012	 * as it's more reliable to unplug memory and remove whole memory
2013	 * blocks, and we don't want to trigger a zone imbalance by
2014	 * accidentally removing too much kernel memory.
2015	 */
2016	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
2017		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
2018			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
2019			if (rc || !nb_sb)
2020				goto out_unlock;
2021			mutex_unlock(&vm->hotplug_mutex);
2022			cond_resched();
2023			mutex_lock(&vm->hotplug_mutex);
2024		}
2025		if (!unplug_online && i == 1) {
2026			mutex_unlock(&vm->hotplug_mutex);
2027			return 0;
2028		}
2029	}
2030
2031	mutex_unlock(&vm->hotplug_mutex);
2032	return nb_sb ? -EBUSY : 0;
2033out_unlock:
2034	mutex_unlock(&vm->hotplug_mutex);
2035	return rc;
2036}
2037
2038/*
2039 * Try to offline and remove a big block from Linux and unplug it. Will fail
2040 * with -EBUSY if some memory is busy and cannot get unplugged.
2041 *
2042 * Will modify the state of the memory block. Might temporarily drop the
2043 * hotplug_mutex.
2044 */
2045static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2046						       unsigned long bb_id)
2047{
2048	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2049	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2050	unsigned long end_pfn = start_pfn + nr_pages;
2051	unsigned long pfn;
2052	struct page *page;
2053	int rc;
2054
2055	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2056			 VIRTIO_MEM_BBM_BB_ADDED))
2057		return -EINVAL;
2058
2059	if (bbm_safe_unplug) {
2060		/*
2061		 * Start by fake-offlining all memory. Once we marked the device
2062		 * block as fake-offline, all newly onlined memory will
2063		 * automatically be kept fake-offline. Protect from concurrent
2064		 * onlining/offlining until we have a consistent state.
2065		 */
2066		mutex_lock(&vm->hotplug_mutex);
2067		virtio_mem_bbm_set_bb_state(vm, bb_id,
2068					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2069
2070		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2071			page = pfn_to_online_page(pfn);
2072			if (!page)
2073				continue;
2074
2075			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
2076			if (rc) {
2077				end_pfn = pfn;
2078				goto rollback_safe_unplug;
2079			}
2080		}
2081		mutex_unlock(&vm->hotplug_mutex);
2082	}
2083
2084	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2085	if (rc) {
2086		if (bbm_safe_unplug) {
2087			mutex_lock(&vm->hotplug_mutex);
2088			goto rollback_safe_unplug;
2089		}
2090		return rc;
2091	}
2092
2093	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2094	if (rc)
2095		virtio_mem_bbm_set_bb_state(vm, bb_id,
2096					    VIRTIO_MEM_BBM_BB_PLUGGED);
2097	else
2098		virtio_mem_bbm_set_bb_state(vm, bb_id,
2099					    VIRTIO_MEM_BBM_BB_UNUSED);
2100	return rc;
2101
2102rollback_safe_unplug:
2103	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2104		page = pfn_to_online_page(pfn);
2105		if (!page)
2106			continue;
2107		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2108	}
2109	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2110	mutex_unlock(&vm->hotplug_mutex);
2111	return rc;
2112}
2113
2114/*
2115 * Test if a big block is completely offline.
2116 */
2117static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2118					 unsigned long bb_id)
2119{
2120	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2121	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2122	unsigned long pfn;
2123
2124	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2125	     pfn += PAGES_PER_SECTION) {
2126		if (pfn_to_online_page(pfn))
2127			return false;
2128	}
2129
2130	return true;
2131}
2132
2133/*
2134 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2135 */
2136static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2137					 unsigned long bb_id)
2138{
2139	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2140	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2141	struct page *page;
2142	unsigned long pfn;
2143
2144	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2145	     pfn += PAGES_PER_SECTION) {
2146		page = pfn_to_online_page(pfn);
2147		if (!page)
2148			continue;
2149		if (page_zonenum(page) != ZONE_MOVABLE)
2150			return false;
2151	}
2152
2153	return true;
2154}
2155
2156static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2157{
2158	uint64_t nb_bb = diff / vm->bbm.bb_size;
2159	uint64_t bb_id;
2160	int rc, i;
2161
2162	if (!nb_bb)
2163		return 0;
2164
2165	/*
2166	 * Try to unplug big blocks. Similar to SBM, start with offline
2167	 * big blocks.
2168	 */
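	/*
	 * Note (added for clarity): round 0 only considers fully offline big
	 * blocks, round 1 only ones fully onlined to ZONE_MOVABLE, and round
	 * 2 any remaining ADDED big blocks - cheapest candidates first.
	 */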
2169	for (i = 0; i < 3; i++) {
2170		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2171			cond_resched();
2172
2173			/*
2174			 * As we're holding no locks, these checks are racy,
2175			 * but we don't care.
2176			 */
2177			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2178				continue;
2179			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2180				continue;
2181			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2182			if (rc == -EBUSY)
2183				continue;
2184			if (!rc)
2185				nb_bb--;
2186			if (rc || !nb_bb)
2187				return rc;
2188		}
2189		if (i == 0 && !unplug_online)
2190			return 0;
2191	}
2192
2193	return nb_bb ? -EBUSY : 0;
2194}
2195
2196/*
2197 * Try to unplug the requested amount of memory.
2198 */
2199static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2200{
2201	if (vm->in_sbm)
2202		return virtio_mem_sbm_unplug_request(vm, diff);
2203	return virtio_mem_bbm_unplug_request(vm, diff);
2204}
2205
2206/*
2207 * Try to unplug all blocks that couldn't be unplugged before, for example,
2208 * because the hypervisor was busy.
2209 */
2210static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
2211{
2212	unsigned long id;
2213	int rc;
2214
2215	if (!vm->in_sbm) {
2216		virtio_mem_bbm_for_each_bb(vm, id,
2217					   VIRTIO_MEM_BBM_BB_PLUGGED) {
2218			rc = virtio_mem_bbm_unplug_bb(vm, id);
2219			if (rc)
2220				return rc;
2221			virtio_mem_bbm_set_bb_state(vm, id,
2222						    VIRTIO_MEM_BBM_BB_UNUSED);
2223		}
2224		return 0;
2225	}
2226
2227	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2228		rc = virtio_mem_sbm_unplug_mb(vm, id);
2229		if (rc)
2230			return rc;
2231		virtio_mem_sbm_set_mb_state(vm, id,
2232					    VIRTIO_MEM_SBM_MB_UNUSED);
2233	}
2234
2235	return 0;
2236}
2237
2238/*
2239 * Update all parts of the config that could have changed.
2240 */
2241static void virtio_mem_refresh_config(struct virtio_mem *vm)
2242{
2243	const struct range pluggable_range = mhp_get_pluggable_range(true);
2244	uint64_t new_plugged_size, usable_region_size, end_addr;
2245
2246	/* the plugged_size is just a reflection of what _we_ did previously */
2247	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2248			&new_plugged_size);
2249	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2250		vm->plugged_size = new_plugged_size;
2251
2252	/* calculate the last usable memory block id */
2253	virtio_cread_le(vm->vdev, struct virtio_mem_config,
2254			usable_region_size, &usable_region_size);
2255	end_addr = min(vm->addr + usable_region_size - 1,
2256		       pluggable_range.end);
2257
2258	if (vm->in_sbm) {
2259		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2260		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2261			vm->sbm.last_usable_mb_id--;
2262	} else {
2263		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2264								     end_addr);
2265		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2266			vm->bbm.last_usable_bb_id--;
2267	}
2268	/*
2269	 * If we cannot plug any of our device memory (e.g., nothing in the
2270	 * usable region is addressable), the last usable memory block id will
2271	 * be smaller than the first usable memory block id. We'll stop
2272	 * attempting to add memory with -ENOSPC from our main loop.
2273	 */
2274
2275	/* see if there is a request to change the size */
2276	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2277			&vm->requested_size);
2278
2279	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2280	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2281}
2282
2283/*
2284 * Workqueue function for handling plug/unplug requests and config updates.
2285 */
2286static void virtio_mem_run_wq(struct work_struct *work)
2287{
2288	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2289	uint64_t diff;
2290	int rc;
2291
2292	hrtimer_cancel(&vm->retry_timer);
2293
2294	if (vm->broken)
2295		return;
2296
2297	atomic_set(&vm->wq_active, 1);
2298retry:
2299	rc = 0;
2300
2301	/* Make sure we start with a clean state if there are leftovers. */
2302	if (unlikely(vm->unplug_all_required))
2303		rc = virtio_mem_send_unplug_all_request(vm);
2304
2305	if (atomic_read(&vm->config_changed)) {
2306		atomic_set(&vm->config_changed, 0);
2307		virtio_mem_refresh_config(vm);
2308	}
2309
2310	/* Unplug any leftovers from previous runs */
2311	if (!rc)
2312		rc = virtio_mem_unplug_pending_mb(vm);
2313
2314	if (!rc && vm->requested_size != vm->plugged_size) {
2315		if (vm->requested_size > vm->plugged_size) {
2316			diff = vm->requested_size - vm->plugged_size;
2317			rc = virtio_mem_plug_request(vm, diff);
2318		} else {
2319			diff = vm->plugged_size - vm->requested_size;
2320			rc = virtio_mem_unplug_request(vm, diff);
2321		}
2322	}
2323
2324	switch (rc) {
2325	case 0:
2326		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2327		break;
2328	case -ENOSPC:
2329		/*
2330		 * We cannot add any more memory (alignment, physical limit)
2331		 * or we have too many offline memory blocks.
2332		 */
2333		break;
2334	case -ETXTBSY:
2335		/*
2336		 * The hypervisor cannot process our request right now
2337		 * (e.g., out of memory, migrating).
2338		 */
2339	case -EBUSY:
2340		/*
2341		 * We cannot free up any memory to unplug it (all plugged memory
2342		 * is busy).
2343		 */
2344	case -ENOMEM:
2345		/* Out of memory, try again later. */
2346		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2347			      HRTIMER_MODE_REL);
2348		break;
2349	case -EAGAIN:
2350		/* Retry immediately (e.g., the config changed). */
2351		goto retry;
2352	default:
2353		/* Unknown error, mark as broken */
2354		dev_err(&vm->vdev->dev,
2355			"unknown error, marking device broken: %d\n", rc);
2356		vm->broken = true;
2357	}
2358
2359	atomic_set(&vm->wq_active, 0);
2360}
2361
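/*
 * Retry with exponential back-off: each expiry doubles retry_timer_ms,
 * capped at VIRTIO_MEM_RETRY_TIMER_MAX_MS; a successful workqueue run
 * resets it to VIRTIO_MEM_RETRY_TIMER_MIN_MS (see virtio_mem_run_wq()).
 */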
2362static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2363{
2364	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2365					     retry_timer);
2366
2367	virtio_mem_retry(vm);
2368	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2369				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2370	return HRTIMER_NORESTART;
2371}
2372
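/*
 * Virtqueue callback: requests are processed synchronously, with the
 * requester sleeping on vm->host_resp until the host has used the
 * buffer; all we have to do here is wake it up.
 */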
2373static void virtio_mem_handle_response(struct virtqueue *vq)
2374{
2375	struct virtio_mem *vm = vq->vdev->priv;
2376
2377	wake_up(&vm->host_resp);
2378}
2379
2380static int virtio_mem_init_vq(struct virtio_mem *vm)
2381{
2382	struct virtqueue *vq;
2383
2384	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2385				   "guest-request");
2386	if (IS_ERR(vq))
2387		return PTR_ERR(vq);
2388	vm->vq = vq;
2389
2390	return 0;
2391}
2392
2393static int virtio_mem_init(struct virtio_mem *vm)
2394{
2395	const struct range pluggable_range = mhp_get_pluggable_range(true);
2396	uint64_t sb_size, addr;
2397	uint16_t node_id;
2398
2399	if (!vm->vdev->config->get) {
2400		dev_err(&vm->vdev->dev, "config access disabled\n");
2401		return -EINVAL;
2402	}
2403
2404	/*
2405	 * We don't want to (un)plug or reuse any memory when in kdump. The
2406	 * memory is still accessible (but not mapped).
2407	 */
2408	if (is_kdump_kernel()) {
2409		dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
2410		return -EBUSY;
2411	}
2412
2413	/* Fetch all properties that can't change. */
2414	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2415			&vm->plugged_size);
2416	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2417			&vm->device_block_size);
2418	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2419			&node_id);
2420	vm->nid = virtio_mem_translate_node_id(vm, node_id);
2421	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2422	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2423			&vm->region_size);
2424
2425	/* Determine the nid for the device based on the lowest address. */
2426	if (vm->nid == NUMA_NO_NODE)
2427		vm->nid = memory_add_physaddr_to_nid(vm->addr);
2428
2429	/* bad device setup - warn only */
2430	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2431		dev_warn(&vm->vdev->dev,
2432			 "The alignment of the physical start address can make some memory unusable.\n");
2433	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2434		dev_warn(&vm->vdev->dev,
2435			 "The alignment of the physical end address can make some memory unusable.\n");
2436	if (vm->addr < pluggable_range.start ||
2437	    vm->addr + vm->region_size - 1 > pluggable_range.end)
2438		dev_warn(&vm->vdev->dev,
2439			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2440
2441	/* Prepare the offline threshold - make sure we can add two blocks. */
2442	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2443				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2444
2445	/*
2446	 * We want subblocks to span at least MAX_ORDER_NR_PAGES and
2447	 * pageblock_nr_pages pages. This:
2448	 * - Simplifies our page onlining code (virtio_mem_online_page_cb)
2449	 *   and fake page onlining code (virtio_mem_fake_online).
2450	 * - Is required for now for alloc_contig_range() to work reliably -
2451	 *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
2452	 */
2453	sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
2454			pageblock_nr_pages) * PAGE_SIZE;
2455	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
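	/*
	 * Worked example (typical x86-64 values assumed): with 4 KiB pages,
	 * MAX_ORDER_NR_PAGES corresponds to 4 MiB and pageblock_nr_pages to
	 * 2 MiB, so sb_size starts at 4 MiB; with a 2 MiB device block size
	 * it stays 4 MiB, yielding 32 subblocks per 128 MiB memory block.
	 */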
2456
2457	if (sb_size < memory_block_size_bytes() && !force_bbm) {
2458		/* SBM: At least two subblocks per Linux memory block. */
2459		vm->in_sbm = true;
2460		vm->sbm.sb_size = sb_size;
2461		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2462				     vm->sbm.sb_size;
2463
2464		/* Round up to the next full memory block */
2465		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2466		       memory_block_size_bytes() - 1;
2467		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2468		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2469	} else {
2470		/* BBM: At least one Linux memory block. */
2471		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2472					memory_block_size_bytes());
2473
2474		if (bbm_block_size) {
2475			if (!is_power_of_2(bbm_block_size)) {
2476				dev_warn(&vm->vdev->dev,
2477					 "bbm_block_size is not a power of 2");
2478			} else if (bbm_block_size < vm->bbm.bb_size) {
2479				dev_warn(&vm->vdev->dev,
2480					 "bbm_block_size is too small");
2481			} else {
2482				vm->bbm.bb_size = bbm_block_size;
2483			}
2484		}
2485
2486		/* Round up to the next aligned big block */
2487		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2488		       vm->bbm.bb_size - 1;
2489		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2490		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2491
2492		/* Make sure we can add two big blocks. */
2493		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2494					      vm->offline_threshold);
2495	}
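	/*
	 * Illustrative only: BBM with 1 GiB big blocks could be forced via
	 * module parameters, e.g.:
	 *   modprobe virtio_mem force_bbm=1 bbm_block_size=0x40000000
	 * Values that are not a power of 2 or smaller than the auto-detected
	 * minimum are ignored with a warning (see above).
	 */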
2496
2497	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2498	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2499	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2500		 (unsigned long long)vm->device_block_size);
2501	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2502		 memory_block_size_bytes());
2503	if (vm->in_sbm)
2504		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2505			 (unsigned long long)vm->sbm.sb_size);
2506	else
2507		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2508			 (unsigned long long)vm->bbm.bb_size);
2509	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2510		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2511
2512	return 0;
2513}
2514
2515static int virtio_mem_create_resource(struct virtio_mem *vm)
2516{
2517	/*
2518	 * When force-unloading the driver and removing the device, the
2519	 * device name could become a garbage pointer. Duplicate the string.
2520	 */
2521	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2522
2523	if (!name)
2524		return -ENOMEM;
2525
2526	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2527						   name, IORESOURCE_SYSTEM_RAM);
2528	if (!vm->parent_resource) {
2529		kfree(name);
2530		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2531		dev_info(&vm->vdev->dev,
2532			 "reloading the driver is not supported\n");
2533		return -EBUSY;
2534	}
2535
2536	/* The memory is not actually busy - clear IORESOURCE_BUSY so add_memory() works. */
2537	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2538	return 0;
2539}
2540
2541static void virtio_mem_delete_resource(struct virtio_mem *vm)
2542{
2543	const char *name;
2544
2545	if (!vm->parent_resource)
2546		return;
2547
2548	name = vm->parent_resource->name;
2549	release_resource(vm->parent_resource);
2550	kfree(vm->parent_resource);
2551	kfree(name);
2552	vm->parent_resource = NULL;
2553}
2554
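/*
 * Callback for walk_iomem_res_desc() below: returning 1 stops the walk
 * and becomes its return value, signaling that at least one busy
 * SYSTEM_RAM resource - i.e., memory we added - still intersects the
 * device region.
 */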
2555static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
2556{
2557	return 1;
2558}
2559
2560static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2561{
2562	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2563
2564	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2565				   vm->addr + vm->region_size, NULL,
2566				   virtio_mem_range_has_system_ram) == 1;
2567}
2568
2569static int virtio_mem_probe(struct virtio_device *vdev)
2570{
2571	struct virtio_mem *vm;
2572	int rc;
2573
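	/* Compile-time checks: the virtio-mem request/response wire formats
	 * have fixed sizes; catch accidental struct layout changes early.
	 */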
2574	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2575	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2576
2577	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2578	if (!vm)
2579		return -ENOMEM;
2580
2581	init_waitqueue_head(&vm->host_resp);
2582	vm->vdev = vdev;
2583	INIT_WORK(&vm->wq, virtio_mem_run_wq);
2584	mutex_init(&vm->hotplug_mutex);
2585	INIT_LIST_HEAD(&vm->next);
2586	spin_lock_init(&vm->removal_lock);
2587	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2588	vm->retry_timer.function = virtio_mem_timer_expired;
2589	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2590
2591	/* register the virtqueue */
2592	rc = virtio_mem_init_vq(vm);
2593	if (rc)
2594		goto out_free_vm;
2595
2596	/* initialize the device by querying the config */
2597	rc = virtio_mem_init(vm);
2598	if (rc)
2599		goto out_del_vq;
2600
2601	/* create the parent resource for all memory */
2602	rc = virtio_mem_create_resource(vm);
2603	if (rc)
2604		goto out_del_vq;
2605
2606	/*
2607	 * If we still have memory plugged, we have to unplug all memory first.
2608	 * Registering our parent resource makes sure that this memory isn't
2609	 * actually in use (e.g., trying to reload the driver).
2610	 */
2611	if (vm->plugged_size) {
2612		vm->unplug_all_required = true;
2613		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2614	}
2615
2616	/* register callbacks */
2617	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2618	rc = register_memory_notifier(&vm->memory_notifier);
2619	if (rc)
2620		goto out_del_resource;
2621	rc = register_virtio_mem_device(vm);
2622	if (rc)
2623		goto out_unreg_mem;
2624
2625	virtio_device_ready(vdev);
2626
2627	/* trigger a config update to start processing the requested_size */
2628	atomic_set(&vm->config_changed, 1);
2629	queue_work(system_freezable_wq, &vm->wq);
2630
2631	return 0;
2632out_unreg_mem:
2633	unregister_memory_notifier(&vm->memory_notifier);
2634out_del_resource:
2635	virtio_mem_delete_resource(vm);
2636out_del_vq:
2637	vdev->config->del_vqs(vdev);
2638out_free_vm:
2639	kfree(vm);
2640	vdev->priv = NULL;
2641
2642	return rc;
2643}
2644
2645static void virtio_mem_remove(struct virtio_device *vdev)
2646{
2647	struct virtio_mem *vm = vdev->priv;
2648	unsigned long mb_id;
2649	int rc;
2650
2651	/*
2652	 * Make sure the workqueue won't be triggered anymore and no memory
2653	 * blocks can be onlined/offlined until we're finished here.
2654	 */
2655	mutex_lock(&vm->hotplug_mutex);
2656	spin_lock_irq(&vm->removal_lock);
2657	vm->removing = true;
2658	spin_unlock_irq(&vm->removal_lock);
2659	mutex_unlock(&vm->hotplug_mutex);
2660
2661	/* wait until the workqueue has stopped */
2662	cancel_work_sync(&vm->wq);
2663	hrtimer_cancel(&vm->retry_timer);
2664
2665	if (vm->in_sbm) {
2666		/*
2667		 * After we unregistered our callbacks, user space can online
2668		 * partially plugged offline blocks. Make sure to remove them.
2669		 */
2670		virtio_mem_sbm_for_each_mb(vm, mb_id,
2671					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
2672			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
2673			BUG_ON(rc);
2674			virtio_mem_sbm_set_mb_state(vm, mb_id,
2675						    VIRTIO_MEM_SBM_MB_UNUSED);
2676		}
2677		/*
2678		 * After we unregistered our callbacks, user space can no longer
2679		 * offline partially plugged online memory blocks. No need to
2680		 * worry about them.
2681		 */
2682	}
2683
2684	/* unregister callbacks */
2685	unregister_virtio_mem_device(vm);
2686	unregister_memory_notifier(&vm->memory_notifier);
2687
2688	/*
2689	 * We cannot reliably remove all memory we have added to the system,
2690	 * and we cannot stop the driver/device from going away, so at least
2691	 * warn.
2692	 */
2693	if (virtio_mem_has_memory_added(vm)) {
2694		dev_warn(&vdev->dev, "device still has system memory added\n");
2695	} else {
2696		virtio_mem_delete_resource(vm);
2697		kfree_const(vm->resource_name);
2698	}
2699
2700	/* remove all tracking data - no locking needed */
2701	if (vm->in_sbm) {
2702		vfree(vm->sbm.mb_states);
2703		vfree(vm->sbm.sb_states);
2704	} else {
2705		vfree(vm->bbm.bb_states);
2706	}
2707
2708	/* reset the device and clean up the queues */
2709	vdev->config->reset(vdev);
2710	vdev->config->del_vqs(vdev);
2711
2712	kfree(vm);
2713	vdev->priv = NULL;
2714}
2715
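/*
 * Config-change interrupt handler (runs from the virtio core): just
 * flag the change and kick the workqueue via virtio_mem_retry().
 */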
2716static void virtio_mem_config_changed(struct virtio_device *vdev)
2717{
2718	struct virtio_mem *vm = vdev->priv;
2719
2720	atomic_set(&vm->config_changed, 1);
2721	virtio_mem_retry(vm);
2722}
2723
2724#ifdef CONFIG_PM_SLEEP
2725static int virtio_mem_freeze(struct virtio_device *vdev)
2726{
2727	/*
2728	 * When restarting the VM, all memory is usually unplugged. Don't
2729	 * allow suspending/hibernating.
2730	 */
2731	dev_err(&vdev->dev, "save/restore not supported.\n");
2732	return -EPERM;
2733}
2734
2735static int virtio_mem_restore(struct virtio_device *vdev)
2736{
2737	return -EPERM;
2738}
2739#endif
2740
2741static unsigned int virtio_mem_features[] = {
2742#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
2743	VIRTIO_MEM_F_ACPI_PXM,
2744#endif
2745};
2746
2747static const struct virtio_device_id virtio_mem_id_table[] = {
2748	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
2749	{ 0 },
2750};
2751
2752static struct virtio_driver virtio_mem_driver = {
2753	.feature_table = virtio_mem_features,
2754	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
2755	.driver.name = KBUILD_MODNAME,
2756	.driver.owner = THIS_MODULE,
2757	.id_table = virtio_mem_id_table,
2758	.probe = virtio_mem_probe,
2759	.remove = virtio_mem_remove,
2760	.config_changed = virtio_mem_config_changed,
2761#ifdef CONFIG_PM_SLEEP
2762	.freeze	=	virtio_mem_freeze,
2763	.restore =	virtio_mem_restore,
2764#endif
2765};
2766
2767module_virtio_driver(virtio_mem_driver);
2768MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
2769MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
2770MODULE_DESCRIPTION("Virtio-mem driver");
2771MODULE_LICENSE("GPL");