Linux Audio

Check our new training course

Loading...
v3.15
   1/*
   2 * Memory Migration functionality - linux/mm/migration.c
   3 *
   4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5 *
   6 * Page migration was first developed in the context of the memory hotplug
   7 * project. The main authors of the migration code are:
   8 *
   9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10 * Hirokazu Takahashi <taka@valinux.co.jp>
  11 * Dave Hansen <haveblue@us.ibm.com>
  12 * Christoph Lameter
  13 */
  14
  15#include <linux/migrate.h>
  16#include <linux/export.h>
  17#include <linux/swap.h>
  18#include <linux/swapops.h>
  19#include <linux/pagemap.h>
  20#include <linux/buffer_head.h>
  21#include <linux/mm_inline.h>
  22#include <linux/nsproxy.h>
  23#include <linux/pagevec.h>
  24#include <linux/ksm.h>
  25#include <linux/rmap.h>
  26#include <linux/topology.h>
  27#include <linux/cpu.h>
  28#include <linux/cpuset.h>
  29#include <linux/writeback.h>
  30#include <linux/mempolicy.h>
  31#include <linux/vmalloc.h>
  32#include <linux/security.h>
  33#include <linux/memcontrol.h>
  34#include <linux/syscalls.h>
  35#include <linux/hugetlb.h>
  36#include <linux/hugetlb_cgroup.h>
  37#include <linux/gfp.h>
  38#include <linux/balloon_compaction.h>
  39#include <linux/mmu_notifier.h>
  40
  41#include <asm/tlbflush.h>
  42
  43#define CREATE_TRACE_POINTS
  44#include <trace/events/migrate.h>
  45
  46#include "internal.h"
  47
 
 
  48/*
  49 * migrate_prep() needs to be called before we start compiling a list of pages
  50 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
  51 * undesirable, use migrate_prep_local()
  52 */
  53int migrate_prep(void)
  54{
  55	/*
  56	 * Clear the LRU lists so pages can be isolated.
  57	 * Note that pages may be moved off the LRU after we have
  58	 * drained them. Those pages will fail to migrate like other
  59	 * pages that may be busy.
  60	 */
  61	lru_add_drain_all();
  62
  63	return 0;
  64}
  65
  66/* Do the necessary work of migrate_prep but not if it involves other CPUs */
  67int migrate_prep_local(void)
  68{
  69	lru_add_drain();
  70
  71	return 0;
  72}
  73
  74/*
  75 * Put previously isolated pages back onto the appropriate lists
  76 * from where they were once taken off for compaction/migration.
  77 *
  78 * This function shall be used whenever the isolated pageset has been
  79 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
  80 * and isolate_huge_page().
  81 */
  82void putback_movable_pages(struct list_head *l)
  83{
  84	struct page *page;
  85	struct page *page2;
  86
  87	list_for_each_entry_safe(page, page2, l, lru) {
  88		if (unlikely(PageHuge(page))) {
  89			putback_active_hugepage(page);
  90			continue;
  91		}
  92		list_del(&page->lru);
  93		dec_zone_page_state(page, NR_ISOLATED_ANON +
  94				page_is_file_cache(page));
  95		if (unlikely(isolated_balloon_page(page)))
  96			balloon_page_putback(page);
  97		else
  98			putback_lru_page(page);
  99	}
 100}
 101
 102/*
 103 * Restore a potential migration pte to a working pte entry
 104 */
 105static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 106				 unsigned long addr, void *old)
 107{
 108	struct mm_struct *mm = vma->vm_mm;
 109	swp_entry_t entry;
 
 
 110 	pmd_t *pmd;
 111	pte_t *ptep, pte;
 112 	spinlock_t *ptl;
 113
 114	if (unlikely(PageHuge(new))) {
 115		ptep = huge_pte_offset(mm, addr);
 116		if (!ptep)
 117			goto out;
 118		ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
 119	} else {
 120		pmd = mm_find_pmd(mm, addr);
 121		if (!pmd)
 122			goto out;
 
 
 
 
 
 
 123		if (pmd_trans_huge(*pmd))
 124			goto out;
 
 
 125
 126		ptep = pte_offset_map(pmd, addr);
 127
 128		/*
 129		 * Peek to check is_swap_pte() before taking ptlock?  No, we
 130		 * can race mremap's move_ptes(), which skips anon_vma lock.
 131		 */
 132
 133		ptl = pte_lockptr(mm, pmd);
 134	}
 135
 136 	spin_lock(ptl);
 137	pte = *ptep;
 138	if (!is_swap_pte(pte))
 139		goto unlock;
 140
 141	entry = pte_to_swp_entry(pte);
 142
 143	if (!is_migration_entry(entry) ||
 144	    migration_entry_to_page(entry) != old)
 145		goto unlock;
 146
 147	get_page(new);
 148	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 149	if (pte_swp_soft_dirty(*ptep))
 150		pte = pte_mksoft_dirty(pte);
 151	if (is_write_migration_entry(entry))
 152		pte = pte_mkwrite(pte);
 153#ifdef CONFIG_HUGETLB_PAGE
 154	if (PageHuge(new)) {
 155		pte = pte_mkhuge(pte);
 156		pte = arch_make_huge_pte(pte, vma, new, 0);
 157	}
 158#endif
 159	flush_dcache_page(new);
 160	set_pte_at(mm, addr, ptep, pte);
 161
 162	if (PageHuge(new)) {
 163		if (PageAnon(new))
 164			hugepage_add_anon_rmap(new, vma, addr);
 165		else
 166			page_dup_rmap(new);
 167	} else if (PageAnon(new))
 168		page_add_anon_rmap(new, vma, addr);
 169	else
 170		page_add_file_rmap(new);
 171
 172	/* No need to invalidate - it was non-present before */
 173	update_mmu_cache(vma, addr, ptep);
 174unlock:
 175	pte_unmap_unlock(ptep, ptl);
 176out:
 177	return SWAP_AGAIN;
 178}
 179
 180/*
 181 * Congratulations to trinity for discovering this bug.
 182 * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
 183 * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
 184 * replace the specified range by file ptes throughout (maybe populated after).
 185 * If page migration finds a page within that range, while it's still located
 186 * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
 187 * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
 188 * But if the migrating page is in a part of the vma outside the range to be
 189 * remapped, then it will not be cleared, and remove_migration_ptes() needs to
 190 * deal with it.  Fortunately, this part of the vma is of course still linear,
 191 * so we just need to use linear location on the nonlinear list.
 192 */
 193static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
 194		struct address_space *mapping, void *arg)
 195{
 196	struct vm_area_struct *vma;
 197	/* hugetlbfs does not support remap_pages, so no huge pgoff worries */
 198	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 199	unsigned long addr;
 200
 201	list_for_each_entry(vma,
 202		&mapping->i_mmap_nonlinear, shared.nonlinear) {
 203
 204		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 205		if (addr >= vma->vm_start && addr < vma->vm_end)
 206			remove_migration_pte(page, vma, addr, arg);
 207	}
 208	return SWAP_AGAIN;
 209}
 210
 211/*
 212 * Get rid of all migration entries and replace them by
 213 * references to the indicated page.
 214 */
 215static void remove_migration_ptes(struct page *old, struct page *new)
 216{
 217	struct rmap_walk_control rwc = {
 218		.rmap_one = remove_migration_pte,
 219		.arg = old,
 220		.file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
 221	};
 222
 223	rmap_walk(new, &rwc);
 224}
 225
 226/*
 227 * Something used the pte of a page under migration. We need to
 228 * get to the page and wait until migration is finished.
 229 * When we return from this function the fault will be retried.
 
 
 230 */
 231static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 232				spinlock_t *ptl)
 233{
 234	pte_t pte;
 
 235	swp_entry_t entry;
 236	struct page *page;
 237
 238	spin_lock(ptl);
 239	pte = *ptep;
 240	if (!is_swap_pte(pte))
 241		goto out;
 242
 243	entry = pte_to_swp_entry(pte);
 244	if (!is_migration_entry(entry))
 245		goto out;
 246
 247	page = migration_entry_to_page(entry);
 248
 249	/*
 250	 * Once radix-tree replacement of page migration started, page_count
 251	 * *must* be zero. And, we don't want to call wait_on_page_locked()
 252	 * against a page without get_page().
 253	 * So, we use get_page_unless_zero(), here. Even failed, page fault
 254	 * will occur again.
 255	 */
 256	if (!get_page_unless_zero(page))
 257		goto out;
 258	pte_unmap_unlock(ptep, ptl);
 259	wait_on_page_locked(page);
 260	put_page(page);
 261	return;
 262out:
 263	pte_unmap_unlock(ptep, ptl);
 264}
 265
 266void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 267				unsigned long address)
 268{
 269	spinlock_t *ptl = pte_lockptr(mm, pmd);
 270	pte_t *ptep = pte_offset_map(pmd, address);
 271	__migration_entry_wait(mm, ptep, ptl);
 272}
 273
 274void migration_entry_wait_huge(struct vm_area_struct *vma,
 275		struct mm_struct *mm, pte_t *pte)
 276{
 277	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
 278	__migration_entry_wait(mm, pte, ptl);
 279}
 280
 281#ifdef CONFIG_BLOCK
 282/* Returns true if all buffers are successfully locked */
 283static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 284							enum migrate_mode mode)
 285{
 286	struct buffer_head *bh = head;
 287
 288	/* Simple case, sync compaction */
 289	if (mode != MIGRATE_ASYNC) {
 290		do {
 291			get_bh(bh);
 292			lock_buffer(bh);
 293			bh = bh->b_this_page;
 294
 295		} while (bh != head);
 296
 297		return true;
 298	}
 299
 300	/* async case, we cannot block on lock_buffer so use trylock_buffer */
 301	do {
 302		get_bh(bh);
 303		if (!trylock_buffer(bh)) {
 304			/*
 305			 * We failed to lock the buffer and cannot stall in
 306			 * async migration. Release the taken locks
 307			 */
 308			struct buffer_head *failed_bh = bh;
 309			put_bh(failed_bh);
 310			bh = head;
 311			while (bh != failed_bh) {
 312				unlock_buffer(bh);
 313				put_bh(bh);
 314				bh = bh->b_this_page;
 315			}
 316			return false;
 317		}
 318
 319		bh = bh->b_this_page;
 320	} while (bh != head);
 321	return true;
 322}
 323#else
 324static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
 325							enum migrate_mode mode)
 326{
 327	return true;
 328}
 329#endif /* CONFIG_BLOCK */
 330
 331/*
 332 * Replace the page in the mapping.
 333 *
 334 * The number of remaining references must be:
 335 * 1 for anonymous pages without a mapping
 336 * 2 for pages with a mapping
 337 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 338 */
 339int migrate_page_move_mapping(struct address_space *mapping,
 340		struct page *newpage, struct page *page,
 341		struct buffer_head *head, enum migrate_mode mode,
 342		int extra_count)
 343{
 344	int expected_count = 1 + extra_count;
 345	void **pslot;
 346
 347	if (!mapping) {
 348		/* Anonymous page without mapping */
 349		if (page_count(page) != expected_count)
 350			return -EAGAIN;
 351		return MIGRATEPAGE_SUCCESS;
 352	}
 353
 354	spin_lock_irq(&mapping->tree_lock);
 355
 356	pslot = radix_tree_lookup_slot(&mapping->page_tree,
 357 					page_index(page));
 358
 359	expected_count += 1 + page_has_private(page);
 360	if (page_count(page) != expected_count ||
 361		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 362		spin_unlock_irq(&mapping->tree_lock);
 363		return -EAGAIN;
 364	}
 365
 366	if (!page_freeze_refs(page, expected_count)) {
 367		spin_unlock_irq(&mapping->tree_lock);
 368		return -EAGAIN;
 369	}
 370
 371	/*
 372	 * In the async migration case of moving a page with buffers, lock the
 373	 * buffers using trylock before the mapping is moved. If the mapping
 374	 * was moved, we later failed to lock the buffers and could not move
 375	 * the mapping back due to an elevated page count, we would have to
 376	 * block waiting on other references to be dropped.
 377	 */
 378	if (mode == MIGRATE_ASYNC && head &&
 379			!buffer_migrate_lock_buffers(head, mode)) {
 380		page_unfreeze_refs(page, expected_count);
 381		spin_unlock_irq(&mapping->tree_lock);
 382		return -EAGAIN;
 383	}
 384
 385	/*
 386	 * Now we know that no one else is looking at the page.
 387	 */
 388	get_page(newpage);	/* add cache reference */
 389	if (PageSwapCache(page)) {
 390		SetPageSwapCache(newpage);
 391		set_page_private(newpage, page_private(page));
 392	}
 393
 394	radix_tree_replace_slot(pslot, newpage);
 395
 
 396	/*
 397	 * Drop cache reference from old page by unfreezing
 398	 * to one less reference.
 399	 * We know this isn't the last reference.
 400	 */
 401	page_unfreeze_refs(page, expected_count - 1);
 402
 403	/*
 404	 * If moved to a different zone then also account
 405	 * the page for that zone. Other VM counters will be
 406	 * taken care of when we establish references to the
 407	 * new page and drop references to the old page.
 408	 *
 409	 * Note that anonymous pages are accounted for
 410	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
 411	 * are mapped to swap space.
 412	 */
 413	__dec_zone_page_state(page, NR_FILE_PAGES);
 414	__inc_zone_page_state(newpage, NR_FILE_PAGES);
 415	if (!PageSwapCache(page) && PageSwapBacked(page)) {
 416		__dec_zone_page_state(page, NR_SHMEM);
 417		__inc_zone_page_state(newpage, NR_SHMEM);
 418	}
 419	spin_unlock_irq(&mapping->tree_lock);
 420
 421	return MIGRATEPAGE_SUCCESS;
 422}
 423
 424/*
 425 * The expected number of remaining references is the same as that
 426 * of migrate_page_move_mapping().
 427 */
 428int migrate_huge_page_move_mapping(struct address_space *mapping,
 429				   struct page *newpage, struct page *page)
 430{
 431	int expected_count;
 432	void **pslot;
 433
 434	if (!mapping) {
 435		if (page_count(page) != 1)
 436			return -EAGAIN;
 437		return MIGRATEPAGE_SUCCESS;
 438	}
 439
 440	spin_lock_irq(&mapping->tree_lock);
 441
 442	pslot = radix_tree_lookup_slot(&mapping->page_tree,
 443					page_index(page));
 444
 445	expected_count = 2 + page_has_private(page);
 446	if (page_count(page) != expected_count ||
 447		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 448		spin_unlock_irq(&mapping->tree_lock);
 449		return -EAGAIN;
 450	}
 451
 452	if (!page_freeze_refs(page, expected_count)) {
 453		spin_unlock_irq(&mapping->tree_lock);
 454		return -EAGAIN;
 455	}
 456
 457	get_page(newpage);
 458
 459	radix_tree_replace_slot(pslot, newpage);
 460
 461	page_unfreeze_refs(page, expected_count - 1);
 462
 463	spin_unlock_irq(&mapping->tree_lock);
 464	return MIGRATEPAGE_SUCCESS;
 465}
 466
 467/*
 468 * Gigantic pages are so large that we do not guarantee that page++ pointer
 469 * arithmetic will work across the entire page.  We need something more
 470 * specialized.
 471 */
 472static void __copy_gigantic_page(struct page *dst, struct page *src,
 473				int nr_pages)
 474{
 475	int i;
 476	struct page *dst_base = dst;
 477	struct page *src_base = src;
 478
 479	for (i = 0; i < nr_pages; ) {
 480		cond_resched();
 481		copy_highpage(dst, src);
 482
 483		i++;
 484		dst = mem_map_next(dst, dst_base, i);
 485		src = mem_map_next(src, src_base, i);
 486	}
 487}
 488
 489static void copy_huge_page(struct page *dst, struct page *src)
 490{
 491	int i;
 492	int nr_pages;
 493
 494	if (PageHuge(src)) {
 495		/* hugetlbfs page */
 496		struct hstate *h = page_hstate(src);
 497		nr_pages = pages_per_huge_page(h);
 498
 499		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
 500			__copy_gigantic_page(dst, src, nr_pages);
 501			return;
 502		}
 503	} else {
 504		/* thp page */
 505		BUG_ON(!PageTransHuge(src));
 506		nr_pages = hpage_nr_pages(src);
 507	}
 508
 509	for (i = 0; i < nr_pages; i++) {
 510		cond_resched();
 511		copy_highpage(dst + i, src + i);
 512	}
 513}
 514
 515/*
 516 * Copy the page to its new location
 517 */
 518void migrate_page_copy(struct page *newpage, struct page *page)
 519{
 520	int cpupid;
 521
 522	if (PageHuge(page) || PageTransHuge(page))
 523		copy_huge_page(newpage, page);
 524	else
 525		copy_highpage(newpage, page);
 526
 527	if (PageError(page))
 528		SetPageError(newpage);
 529	if (PageReferenced(page))
 530		SetPageReferenced(newpage);
 531	if (PageUptodate(page))
 532		SetPageUptodate(newpage);
 533	if (TestClearPageActive(page)) {
 534		VM_BUG_ON_PAGE(PageUnevictable(page), page);
 535		SetPageActive(newpage);
 536	} else if (TestClearPageUnevictable(page))
 537		SetPageUnevictable(newpage);
 538	if (PageChecked(page))
 539		SetPageChecked(newpage);
 540	if (PageMappedToDisk(page))
 541		SetPageMappedToDisk(newpage);
 542
 543	if (PageDirty(page)) {
 544		clear_page_dirty_for_io(page);
 545		/*
 546		 * Want to mark the page and the radix tree as dirty, and
 547		 * redo the accounting that clear_page_dirty_for_io undid,
 548		 * but we can't use set_page_dirty because that function
 549		 * is actually a signal that all of the page has become dirty.
 550		 * Whereas only part of our page may be dirty.
 551		 */
 552		if (PageSwapBacked(page))
 553			SetPageDirty(newpage);
 554		else
 555			__set_page_dirty_nobuffers(newpage);
 556 	}
 557
 558	/*
 559	 * Copy NUMA information to the new page, to prevent over-eager
 560	 * future migrations of this same page.
 561	 */
 562	cpupid = page_cpupid_xchg_last(page, -1);
 563	page_cpupid_xchg_last(newpage, cpupid);
 564
 565	mlock_migrate_page(newpage, page);
 566	ksm_migrate_page(newpage, page);
 567	/*
 568	 * Please do not reorder this without considering how mm/ksm.c's
 569	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
 570	 */
 571	ClearPageSwapCache(page);
 572	ClearPagePrivate(page);
 573	set_page_private(page, 0);
 
 574
 575	/*
 576	 * If any waiters have accumulated on the new page then
 577	 * wake them up.
 578	 */
 579	if (PageWriteback(newpage))
 580		end_page_writeback(newpage);
 581}
 582
 583/************************************************************
 584 *                    Migration functions
 585 ***********************************************************/
 586
 
 
 
 
 
 
 
 
 587/*
 588 * Common logic to directly migrate a single page suitable for
 589 * pages that do not use PagePrivate/PagePrivate2.
 590 *
 591 * Pages are locked upon entry and exit.
 592 */
 593int migrate_page(struct address_space *mapping,
 594		struct page *newpage, struct page *page,
 595		enum migrate_mode mode)
 596{
 597	int rc;
 598
 599	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 600
 601	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
 602
 603	if (rc != MIGRATEPAGE_SUCCESS)
 604		return rc;
 605
 606	migrate_page_copy(newpage, page);
 607	return MIGRATEPAGE_SUCCESS;
 608}
 609EXPORT_SYMBOL(migrate_page);
 610
 611#ifdef CONFIG_BLOCK
 612/*
 613 * Migration function for pages with buffers. This function can only be used
 614 * if the underlying filesystem guarantees that no other references to "page"
 615 * exist.
 616 */
 617int buffer_migrate_page(struct address_space *mapping,
 618		struct page *newpage, struct page *page, enum migrate_mode mode)
 619{
 620	struct buffer_head *bh, *head;
 621	int rc;
 622
 623	if (!page_has_buffers(page))
 624		return migrate_page(mapping, newpage, page, mode);
 625
 626	head = page_buffers(page);
 627
 628	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
 629
 630	if (rc != MIGRATEPAGE_SUCCESS)
 631		return rc;
 632
 633	/*
 634	 * In the async case, migrate_page_move_mapping locked the buffers
 635	 * with an IRQ-safe spinlock held. In the sync case, the buffers
 636	 * need to be locked now
 637	 */
 638	if (mode != MIGRATE_ASYNC)
 639		BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 640
 641	ClearPagePrivate(page);
 642	set_page_private(newpage, page_private(page));
 643	set_page_private(page, 0);
 644	put_page(page);
 645	get_page(newpage);
 646
 647	bh = head;
 648	do {
 649		set_bh_page(bh, newpage, bh_offset(bh));
 650		bh = bh->b_this_page;
 651
 652	} while (bh != head);
 653
 654	SetPagePrivate(newpage);
 655
 656	migrate_page_copy(newpage, page);
 657
 658	bh = head;
 659	do {
 660		unlock_buffer(bh);
 661 		put_bh(bh);
 662		bh = bh->b_this_page;
 663
 664	} while (bh != head);
 665
 666	return MIGRATEPAGE_SUCCESS;
 667}
 668EXPORT_SYMBOL(buffer_migrate_page);
 669#endif
 670
 671/*
 672 * Writeback a page to clean the dirty state
 673 */
 674static int writeout(struct address_space *mapping, struct page *page)
 675{
 676	struct writeback_control wbc = {
 677		.sync_mode = WB_SYNC_NONE,
 678		.nr_to_write = 1,
 679		.range_start = 0,
 680		.range_end = LLONG_MAX,
 681		.for_reclaim = 1
 682	};
 683	int rc;
 684
 685	if (!mapping->a_ops->writepage)
 686		/* No write method for the address space */
 687		return -EINVAL;
 688
 689	if (!clear_page_dirty_for_io(page))
 690		/* Someone else already triggered a write */
 691		return -EAGAIN;
 692
 693	/*
 694	 * A dirty page may imply that the underlying filesystem has
 695	 * the page on some queue. So the page must be clean for
 696	 * migration. Writeout may mean we loose the lock and the
 697	 * page state is no longer what we checked for earlier.
 698	 * At this point we know that the migration attempt cannot
 699	 * be successful.
 700	 */
 701	remove_migration_ptes(page, page);
 702
 703	rc = mapping->a_ops->writepage(page, &wbc);
 704
 705	if (rc != AOP_WRITEPAGE_ACTIVATE)
 706		/* unlocked. Relock */
 707		lock_page(page);
 708
 709	return (rc < 0) ? -EIO : -EAGAIN;
 710}
 711
 712/*
 713 * Default handling if a filesystem does not provide a migration function.
 714 */
 715static int fallback_migrate_page(struct address_space *mapping,
 716	struct page *newpage, struct page *page, enum migrate_mode mode)
 717{
 718	if (PageDirty(page)) {
 719		/* Only writeback pages in full synchronous migration */
 720		if (mode != MIGRATE_SYNC)
 721			return -EBUSY;
 722		return writeout(mapping, page);
 723	}
 724
 725	/*
 726	 * Buffers may be managed in a filesystem specific way.
 727	 * We must have no buffers or drop them.
 728	 */
 729	if (page_has_private(page) &&
 730	    !try_to_release_page(page, GFP_KERNEL))
 731		return -EAGAIN;
 732
 733	return migrate_page(mapping, newpage, page, mode);
 734}
 735
 736/*
 737 * Move a page to a newly allocated page
 738 * The page is locked and all ptes have been successfully removed.
 739 *
 740 * The new page will have replaced the old page if this function
 741 * is successful.
 742 *
 743 * Return value:
 744 *   < 0 - error code
 745 *  MIGRATEPAGE_SUCCESS - success
 746 */
 747static int move_to_new_page(struct page *newpage, struct page *page,
 748				int remap_swapcache, enum migrate_mode mode)
 749{
 750	struct address_space *mapping;
 751	int rc;
 752
 753	/*
 754	 * Block others from accessing the page when we get around to
 755	 * establishing additional references. We are the only one
 756	 * holding a reference to the new page at this point.
 757	 */
 758	if (!trylock_page(newpage))
 759		BUG();
 760
 761	/* Prepare mapping for the new page.*/
 762	newpage->index = page->index;
 763	newpage->mapping = page->mapping;
 764	if (PageSwapBacked(page))
 765		SetPageSwapBacked(newpage);
 766
 767	mapping = page_mapping(page);
 768	if (!mapping)
 769		rc = migrate_page(mapping, newpage, page, mode);
 770	else if (mapping->a_ops->migratepage)
 771		/*
 772		 * Most pages have a mapping and most filesystems provide a
 773		 * migratepage callback. Anonymous pages are part of swap
 774		 * space which also has its own migratepage callback. This
 775		 * is the most common path for page migration.
 776		 */
 777		rc = mapping->a_ops->migratepage(mapping,
 778						newpage, page, mode);
 779	else
 780		rc = fallback_migrate_page(mapping, newpage, page, mode);
 
 
 
 
 
 
 
 
 
 
 
 
 781
 782	if (rc != MIGRATEPAGE_SUCCESS) {
 783		newpage->mapping = NULL;
 784	} else {
 785		if (remap_swapcache)
 786			remove_migration_ptes(page, newpage);
 787		page->mapping = NULL;
 788	}
 789
 790	unlock_page(newpage);
 791
 792	return rc;
 793}
 794
 795static int __unmap_and_move(struct page *page, struct page *newpage,
 796				int force, enum migrate_mode mode)
 
 
 
 
 797{
 798	int rc = -EAGAIN;
 
 
 799	int remap_swapcache = 1;
 
 800	struct mem_cgroup *mem;
 801	struct anon_vma *anon_vma = NULL;
 802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 803	if (!trylock_page(page)) {
 804		if (!force || mode == MIGRATE_ASYNC)
 805			goto out;
 806
 807		/*
 808		 * It's not safe for direct compaction to call lock_page.
 809		 * For example, during page readahead pages are added locked
 810		 * to the LRU. Later, when the IO completes the pages are
 811		 * marked uptodate and unlocked. However, the queueing
 812		 * could be merging multiple pages for one bio (e.g.
 813		 * mpage_readpages). If an allocation happens for the
 814		 * second or third page, the process can end up locking
 815		 * the same page twice and deadlocking. Rather than
 816		 * trying to be clever about what pages can be locked,
 817		 * avoid the use of lock_page for direct compaction
 818		 * altogether.
 819		 */
 820		if (current->flags & PF_MEMALLOC)
 821			goto out;
 822
 823		lock_page(page);
 824	}
 825
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 826	/* charge against new page */
 827	mem_cgroup_prepare_migration(page, newpage, &mem);
 
 
 
 
 
 828
 829	if (PageWriteback(page)) {
 830		/*
 831		 * Only in the case of a full synchronous migration is it
 832		 * necessary to wait for PageWriteback. In the async case,
 833		 * the retry loop is too short and in the sync-light case,
 834		 * the overhead of stalling is too much
 835		 */
 836		if (mode != MIGRATE_SYNC) {
 837			rc = -EBUSY;
 838			goto uncharge;
 839		}
 840		if (!force)
 841			goto uncharge;
 842		wait_on_page_writeback(page);
 843	}
 844	/*
 845	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 846	 * we cannot notice that anon_vma is freed while we migrates a page.
 847	 * This get_anon_vma() delays freeing anon_vma pointer until the end
 848	 * of migration. File cache pages are no problem because of page_lock()
 849	 * File Caches may use write_page() or lock_page() in migration, then,
 850	 * just care Anon page here.
 851	 */
 852	if (PageAnon(page) && !PageKsm(page)) {
 853		/*
 854		 * Only page_lock_anon_vma_read() understands the subtleties of
 855		 * getting a hold on an anon_vma from outside one of its mms.
 856		 */
 857		anon_vma = page_get_anon_vma(page);
 858		if (anon_vma) {
 859			/*
 860			 * Anon page
 861			 */
 862		} else if (PageSwapCache(page)) {
 863			/*
 864			 * We cannot be sure that the anon_vma of an unmapped
 865			 * swapcache page is safe to use because we don't
 866			 * know in advance if the VMA that this page belonged
 867			 * to still exists. If the VMA and others sharing the
 868			 * data have been freed, then the anon_vma could
 869			 * already be invalid.
 870			 *
 871			 * To avoid this possibility, swapcache pages get
 872			 * migrated but are not remapped when migration
 873			 * completes
 874			 */
 875			remap_swapcache = 0;
 876		} else {
 877			goto uncharge;
 878		}
 879	}
 880
 881	if (unlikely(balloon_page_movable(page))) {
 882		/*
 883		 * A ballooned page does not need any special attention from
 884		 * physical to virtual reverse mapping procedures.
 885		 * Skip any attempt to unmap PTEs or to remap swap cache,
 886		 * in order to avoid burning cycles at rmap level, and perform
 887		 * the page migration right away (proteced by page lock).
 888		 */
 889		rc = balloon_page_migrate(newpage, page, mode);
 890		goto uncharge;
 891	}
 892
 893	/*
 894	 * Corner case handling:
 895	 * 1. When a new swap-cache page is read into, it is added to the LRU
 896	 * and treated as swapcache but it has no rmap yet.
 897	 * Calling try_to_unmap() against a page->mapping==NULL page will
 898	 * trigger a BUG.  So handle it here.
 899	 * 2. An orphaned page (see truncate_complete_page) might have
 900	 * fs-private metadata. The page can be picked up due to memory
 901	 * offlining.  Everywhere else except page reclaim, the page is
 902	 * invisible to the vm, so the page can not be migrated.  So try to
 903	 * free the metadata, so the page can be freed.
 904	 */
 905	if (!page->mapping) {
 906		VM_BUG_ON_PAGE(PageAnon(page), page);
 907		if (page_has_private(page)) {
 908			try_to_free_buffers(page);
 909			goto uncharge;
 910		}
 911		goto skip_unmap;
 912	}
 913
 914	/* Establish migration ptes or remove ptes */
 915	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 916
 917skip_unmap:
 918	if (!page_mapped(page))
 919		rc = move_to_new_page(newpage, page, remap_swapcache, mode);
 920
 921	if (rc && remap_swapcache)
 922		remove_migration_ptes(page, page);
 923
 924	/* Drop an anon_vma reference if we took one */
 925	if (anon_vma)
 926		put_anon_vma(anon_vma);
 927
 928uncharge:
 929	mem_cgroup_end_migration(mem, page, newpage,
 930				 (rc == MIGRATEPAGE_SUCCESS ||
 931				  rc == MIGRATEPAGE_BALLOON_SUCCESS));
 932	unlock_page(page);
 933out:
 934	return rc;
 935}
 936
 937/*
 938 * Obtain the lock on page, remove all ptes and migrate the page
 939 * to the newly allocated page in newpage.
 940 */
 941static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 942			struct page *page, int force, enum migrate_mode mode)
 943{
 944	int rc = 0;
 945	int *result = NULL;
 946	struct page *newpage = get_new_page(page, private, &result);
 947
 948	if (!newpage)
 949		return -ENOMEM;
 950
 951	if (page_count(page) == 1) {
 952		/* page was freed from under us. So we are done. */
 953		goto out;
 954	}
 955
 956	if (unlikely(PageTransHuge(page)))
 957		if (unlikely(split_huge_page(page)))
 958			goto out;
 959
 960	rc = __unmap_and_move(page, newpage, force, mode);
 961
 962	if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
 963		/*
 964		 * A ballooned page has been migrated already.
 965		 * Now, it's the time to wrap-up counters,
 966		 * handle the page back to Buddy and return.
 967		 */
 968		dec_zone_page_state(page, NR_ISOLATED_ANON +
 969				    page_is_file_cache(page));
 970		balloon_page_free(page);
 971		return MIGRATEPAGE_SUCCESS;
 972	}
 973out:
 974	if (rc != -EAGAIN) {
 975		/*
 976		 * A page that has been migrated has all references
 977		 * removed and will be freed. A page that has not been
 978		 * migrated will have kepts its references and be
 979		 * restored.
 980		 */
 981		list_del(&page->lru);
 982		dec_zone_page_state(page, NR_ISOLATED_ANON +
 983				page_is_file_cache(page));
 984		putback_lru_page(page);
 985	}
 
 986	/*
 987	 * Move the new page to the LRU. If migration was not successful
 988	 * then this will free the page.
 989	 */
 990	putback_lru_page(newpage);
 
 991	if (result) {
 992		if (rc)
 993			*result = rc;
 994		else
 995			*result = page_to_nid(newpage);
 996	}
 997	return rc;
 998}
 999
1000/*
1001 * Counterpart of unmap_and_move_page() for hugepage migration.
1002 *
1003 * This function doesn't wait the completion of hugepage I/O
1004 * because there is no race between I/O and migration for hugepage.
1005 * Note that currently hugepage I/O occurs only in direct I/O
1006 * where no lock is held and PG_writeback is irrelevant,
1007 * and writeback status of all subpages are counted in the reference
1008 * count of the head page (i.e. if all subpages of a 2MB hugepage are
1009 * under direct I/O, the reference of the head page is 512 and a bit more.)
1010 * This means that when we try to migrate hugepage whose subpages are
1011 * doing direct I/O, some references remain after try_to_unmap() and
1012 * hugepage migration fails without data corruption.
1013 *
1014 * There is also no race when direct I/O is issued on the page under migration,
1015 * because then pte is replaced with migration swap entry and direct I/O code
1016 * will wait in the page fault for migration to complete.
1017 */
1018static int unmap_and_move_huge_page(new_page_t get_new_page,
1019				unsigned long private, struct page *hpage,
1020				int force, enum migrate_mode mode)
1021{
1022	int rc = 0;
1023	int *result = NULL;
1024	struct page *new_hpage;
1025	struct anon_vma *anon_vma = NULL;
1026
1027	/*
1028	 * Movability of hugepages depends on architectures and hugepage size.
1029	 * This check is necessary because some callers of hugepage migration
1030	 * like soft offline and memory hotremove don't walk through page
1031	 * tables or check whether the hugepage is pmd-based or not before
1032	 * kicking migration.
1033	 */
1034	if (!hugepage_migration_support(page_hstate(hpage))) {
1035		putback_active_hugepage(hpage);
1036		return -ENOSYS;
1037	}
1038
1039	new_hpage = get_new_page(hpage, private, &result);
1040	if (!new_hpage)
1041		return -ENOMEM;
1042
1043	rc = -EAGAIN;
1044
1045	if (!trylock_page(hpage)) {
1046		if (!force || mode != MIGRATE_SYNC)
1047			goto out;
1048		lock_page(hpage);
1049	}
1050
1051	if (PageAnon(hpage))
1052		anon_vma = page_get_anon_vma(hpage);
1053
1054	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1055
1056	if (!page_mapped(hpage))
1057		rc = move_to_new_page(new_hpage, hpage, 1, mode);
1058
1059	if (rc)
1060		remove_migration_ptes(hpage, hpage);
1061
1062	if (anon_vma)
1063		put_anon_vma(anon_vma);
 
 
1064
1065	if (!rc)
1066		hugetlb_cgroup_migrate(hpage, new_hpage);
 
 
1067
1068	unlock_page(hpage);
1069out:
1070	if (rc != -EAGAIN)
1071		putback_active_hugepage(hpage);
1072	put_page(new_hpage);
 
1073	if (result) {
1074		if (rc)
1075			*result = rc;
1076		else
1077			*result = page_to_nid(new_hpage);
1078	}
1079	return rc;
1080}
1081
1082/*
1083 * migrate_pages - migrate the pages specified in a list, to the free pages
1084 *		   supplied as the target for the page migration
1085 *
1086 * @from:		The list of pages to be migrated.
1087 * @get_new_page:	The function used to allocate free pages to be used
1088 *			as the target of the page migration.
1089 * @private:		Private data to be passed on to get_new_page()
1090 * @mode:		The migration mode that specifies the constraints for
1091 *			page migration, if any.
1092 * @reason:		The reason for page migration.
1093 *
1094 * The function returns after 10 attempts or if no pages are movable any more
1095 * because the list has become empty or no retryable pages exist any more.
1096 * The caller should call putback_movable_pages() to return pages to the LRU
 
1097 * or free list only if ret != 0.
1098 *
1099 * Returns the number of pages that were not migrated, or an error code.
1100 */
1101int migrate_pages(struct list_head *from, new_page_t get_new_page,
1102		unsigned long private, enum migrate_mode mode, int reason)
 
1103{
1104	int retry = 1;
1105	int nr_failed = 0;
1106	int nr_succeeded = 0;
1107	int pass = 0;
1108	struct page *page;
1109	struct page *page2;
1110	int swapwrite = current->flags & PF_SWAPWRITE;
1111	int rc;
1112
1113	if (!swapwrite)
1114		current->flags |= PF_SWAPWRITE;
1115
1116	for(pass = 0; pass < 10 && retry; pass++) {
1117		retry = 0;
1118
1119		list_for_each_entry_safe(page, page2, from, lru) {
1120			cond_resched();
1121
1122			if (PageHuge(page))
1123				rc = unmap_and_move_huge_page(get_new_page,
1124						private, page, pass > 2, mode);
1125			else
1126				rc = unmap_and_move(get_new_page, private,
1127						page, pass > 2, mode);
1128
1129			switch(rc) {
1130			case -ENOMEM:
1131				goto out;
1132			case -EAGAIN:
1133				retry++;
1134				break;
1135			case MIGRATEPAGE_SUCCESS:
1136				nr_succeeded++;
1137				break;
1138			default:
1139				/*
1140				 * Permanent failure (-EBUSY, -ENOSYS, etc.):
1141				 * unlike -EAGAIN case, the failed page is
1142				 * removed from migration page list and not
1143				 * retried in the next outer loop.
1144				 */
1145				nr_failed++;
1146				break;
1147			}
1148		}
1149	}
1150	rc = nr_failed + retry;
1151out:
1152	if (nr_succeeded)
1153		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1154	if (nr_failed)
1155		count_vm_events(PGMIGRATE_FAIL, nr_failed);
1156	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1157
1158	if (!swapwrite)
1159		current->flags &= ~PF_SWAPWRITE;
1160
1161	return rc;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1162}
1163
1164#ifdef CONFIG_NUMA
1165/*
1166 * Move a list of individual pages
1167 */
struct page_to_node {
	unsigned long addr;	/* virtual address taken from user space */
	struct page *page;	/* page found at addr; key for new_page_node() */
	int node;		/* target node, MAX_NUMNODES marks the array end */
	int status;		/* per-page result copied back to user space */
};
1174
1175static struct page *new_page_node(struct page *p, unsigned long private,
1176		int **result)
1177{
1178	struct page_to_node *pm = (struct page_to_node *)private;
1179
1180	while (pm->node != MAX_NUMNODES && pm->page != p)
1181		pm++;
1182
1183	if (pm->node == MAX_NUMNODES)
1184		return NULL;
1185
1186	*result = &pm->status;
1187
1188	if (PageHuge(p))
1189		return alloc_huge_page_node(page_hstate(compound_head(p)),
1190					pm->node);
1191	else
1192		return alloc_pages_exact_node(pm->node,
1193				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1194}
1195
1196/*
1197 * Move a set of pages as indicated in the pm array. The addr
1198 * field must be set to the virtual address of the page to be moved
1199 * and the node number must contain a valid target node.
1200 * The pm array ends with node = MAX_NUMNODES.
1201 */
1202static int do_move_page_to_node_array(struct mm_struct *mm,
1203				      struct page_to_node *pm,
1204				      int migrate_all)
1205{
1206	int err;
1207	struct page_to_node *pp;
1208	LIST_HEAD(pagelist);
1209
1210	down_read(&mm->mmap_sem);
1211
1212	/*
1213	 * Build a list of pages to migrate
1214	 */
1215	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1216		struct vm_area_struct *vma;
1217		struct page *page;
1218
1219		err = -EFAULT;
1220		vma = find_vma(mm, pp->addr);
1221		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1222			goto set_status;
1223
1224		page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1225
1226		err = PTR_ERR(page);
1227		if (IS_ERR(page))
1228			goto set_status;
1229
1230		err = -ENOENT;
1231		if (!page)
1232			goto set_status;
1233
1234		/* Use PageReserved to check for zero page */
1235		if (PageReserved(page))
1236			goto put_and_set;
1237
1238		pp->page = page;
1239		err = page_to_nid(page);
1240
1241		if (err == pp->node)
1242			/*
1243			 * Node already in the right place
1244			 */
1245			goto put_and_set;
1246
1247		err = -EACCES;
1248		if (page_mapcount(page) > 1 &&
1249				!migrate_all)
1250			goto put_and_set;
1251
1252		if (PageHuge(page)) {
1253			isolate_huge_page(page, &pagelist);
1254			goto put_and_set;
1255		}
1256
1257		err = isolate_lru_page(page);
1258		if (!err) {
1259			list_add_tail(&page->lru, &pagelist);
1260			inc_zone_page_state(page, NR_ISOLATED_ANON +
1261					    page_is_file_cache(page));
1262		}
1263put_and_set:
1264		/*
1265		 * Either remove the duplicate refcount from
1266		 * isolate_lru_page() or drop the page ref if it was
1267		 * not isolated.
1268		 */
1269		put_page(page);
1270set_status:
1271		pp->status = err;
1272	}
1273
1274	err = 0;
1275	if (!list_empty(&pagelist)) {
1276		err = migrate_pages(&pagelist, new_page_node,
1277				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1278		if (err)
1279			putback_movable_pages(&pagelist);
1280	}
1281
1282	up_read(&mm->mmap_sem);
1283	return err;
1284}
1285
1286/*
1287 * Migrate an array of page address onto an array of nodes and fill
1288 * the corresponding array of status.
1289 */
1290static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1291			 unsigned long nr_pages,
1292			 const void __user * __user *pages,
1293			 const int __user *nodes,
1294			 int __user *status, int flags)
1295{
1296	struct page_to_node *pm;
 
1297	unsigned long chunk_nr_pages;
1298	unsigned long chunk_start;
1299	int err;
1300
 
 
1301	err = -ENOMEM;
1302	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1303	if (!pm)
1304		goto out;
1305
1306	migrate_prep();
1307
1308	/*
1309	 * Store a chunk of page_to_node array in a page,
1310	 * but keep the last one as a marker
1311	 */
1312	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
1313
1314	for (chunk_start = 0;
1315	     chunk_start < nr_pages;
1316	     chunk_start += chunk_nr_pages) {
1317		int j;
1318
1319		if (chunk_start + chunk_nr_pages > nr_pages)
1320			chunk_nr_pages = nr_pages - chunk_start;
1321
1322		/* fill the chunk pm with addrs and nodes from user-space */
1323		for (j = 0; j < chunk_nr_pages; j++) {
1324			const void __user *p;
1325			int node;
1326
1327			err = -EFAULT;
1328			if (get_user(p, pages + j + chunk_start))
1329				goto out_pm;
1330			pm[j].addr = (unsigned long) p;
1331
1332			if (get_user(node, nodes + j + chunk_start))
1333				goto out_pm;
1334
1335			err = -ENODEV;
1336			if (node < 0 || node >= MAX_NUMNODES)
1337				goto out_pm;
1338
1339			if (!node_state(node, N_MEMORY))
1340				goto out_pm;
1341
1342			err = -EACCES;
1343			if (!node_isset(node, task_nodes))
1344				goto out_pm;
1345
1346			pm[j].node = node;
1347		}
1348
1349		/* End marker for this chunk */
1350		pm[chunk_nr_pages].node = MAX_NUMNODES;
1351
1352		/* Migrate this chunk */
1353		err = do_move_page_to_node_array(mm, pm,
1354						 flags & MPOL_MF_MOVE_ALL);
1355		if (err < 0)
1356			goto out_pm;
1357
1358		/* Return status information */
1359		for (j = 0; j < chunk_nr_pages; j++)
1360			if (put_user(pm[j].status, status + j + chunk_start)) {
1361				err = -EFAULT;
1362				goto out_pm;
1363			}
1364	}
1365	err = 0;
1366
1367out_pm:
1368	free_page((unsigned long)pm);
1369out:
1370	return err;
1371}
1372
1373/*
1374 * Determine the nodes of an array of pages and store it in an array of status.
1375 */
1376static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1377				const void __user **pages, int *status)
1378{
1379	unsigned long i;
1380
1381	down_read(&mm->mmap_sem);
1382
1383	for (i = 0; i < nr_pages; i++) {
1384		unsigned long addr = (unsigned long)(*pages);
1385		struct vm_area_struct *vma;
1386		struct page *page;
1387		int err = -EFAULT;
1388
1389		vma = find_vma(mm, addr);
1390		if (!vma || addr < vma->vm_start)
1391			goto set_status;
1392
1393		page = follow_page(vma, addr, 0);
1394
1395		err = PTR_ERR(page);
1396		if (IS_ERR(page))
1397			goto set_status;
1398
1399		err = -ENOENT;
1400		/* Use PageReserved to check for zero page */
1401		if (!page || PageReserved(page))
1402			goto set_status;
1403
1404		err = page_to_nid(page);
1405set_status:
1406		*status = err;
1407
1408		pages++;
1409		status++;
1410	}
1411
1412	up_read(&mm->mmap_sem);
1413}
1414
1415/*
1416 * Determine the nodes of a user array of pages and store it in
1417 * a user array of status.
1418 */
1419static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1420			 const void __user * __user *pages,
1421			 int __user *status)
1422{
1423#define DO_PAGES_STAT_CHUNK_NR 16
1424	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1425	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1426
1427	while (nr_pages) {
1428		unsigned long chunk_nr;
1429
1430		chunk_nr = nr_pages;
1431		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1432			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1433
1434		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1435			break;
1436
1437		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1438
1439		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1440			break;
1441
1442		pages += chunk_nr;
1443		status += chunk_nr;
1444		nr_pages -= chunk_nr;
1445	}
1446	return nr_pages ? -EFAULT : 0;
1447}
1448
1449/*
1450 * Move a list of pages in the address space of the currently executing
1451 * process.
1452 */
1453SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1454		const void __user * __user *, pages,
1455		const int __user *, nodes,
1456		int __user *, status, int, flags)
1457{
1458	const struct cred *cred = current_cred(), *tcred;
1459	struct task_struct *task;
1460	struct mm_struct *mm;
1461	int err;
1462	nodemask_t task_nodes;
1463
1464	/* Check flags */
1465	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1466		return -EINVAL;
1467
1468	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1469		return -EPERM;
1470
1471	/* Find the mm_struct */
1472	rcu_read_lock();
1473	task = pid ? find_task_by_vpid(pid) : current;
1474	if (!task) {
1475		rcu_read_unlock();
1476		return -ESRCH;
1477	}
1478	get_task_struct(task);
 
 
 
 
1479
1480	/*
1481	 * Check if this process has the right to modify the specified
1482	 * process. The right exists if the process has administrative
1483	 * capabilities, superuser privileges or the same
1484	 * userid as the target process.
1485	 */
 
1486	tcred = __task_cred(task);
1487	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1488	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1489	    !capable(CAP_SYS_NICE)) {
1490		rcu_read_unlock();
1491		err = -EPERM;
1492		goto out;
1493	}
1494	rcu_read_unlock();
1495
1496 	err = security_task_movememory(task);
1497 	if (err)
1498		goto out;
1499
1500	task_nodes = cpuset_mems_allowed(task);
1501	mm = get_task_mm(task);
1502	put_task_struct(task);
1503
1504	if (!mm)
1505		return -EINVAL;
1506
1507	if (nodes)
1508		err = do_pages_move(mm, task_nodes, nr_pages, pages,
1509				    nodes, status, flags);
1510	else
1511		err = do_pages_stat(mm, nr_pages, pages, status);
1512
1513	mmput(mm);
1514	return err;
1515
1516out:
1517	put_task_struct(task);
1518	return err;
1519}
1520
1521/*
1522 * Call migration functions in the vma_ops that may prepare
1523 * memory in a vm for migration. migration functions may perform
1524 * the migration for vmas that do not have an underlying page struct.
1525 */
1526int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1527	const nodemask_t *from, unsigned long flags)
1528{
1529 	struct vm_area_struct *vma;
1530 	int err = 0;
1531
1532	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1533 		if (vma->vm_ops && vma->vm_ops->migrate) {
1534 			err = vma->vm_ops->migrate(vma, to, from, flags);
1535 			if (err)
1536 				break;
1537 		}
1538 	}
1539 	return err;
1540}
1541
1542#ifdef CONFIG_NUMA_BALANCING
1543/*
1544 * Returns true if this is a safe migration target node for misplaced NUMA
1545 * pages. Currently it only checks the watermarks, which is crude
1546 */
1547static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1548				   unsigned long nr_migrate_pages)
1549{
1550	int z;
1551	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1552		struct zone *zone = pgdat->node_zones + z;
1553
1554		if (!populated_zone(zone))
1555			continue;
1556
1557		if (!zone_reclaimable(zone))
1558			continue;
1559
1560		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
1561		if (!zone_watermark_ok(zone, 0,
1562				       high_wmark_pages(zone) +
1563				       nr_migrate_pages,
1564				       0, 0))
1565			continue;
1566		return true;
1567	}
1568	return false;
1569}
1570
1571static struct page *alloc_misplaced_dst_page(struct page *page,
1572					   unsigned long data,
1573					   int **result)
1574{
1575	int nid = (int) data;
1576	struct page *newpage;
1577
1578	newpage = alloc_pages_exact_node(nid,
1579					 (GFP_HIGHUSER_MOVABLE |
1580					  __GFP_THISNODE | __GFP_NOMEMALLOC |
1581					  __GFP_NORETRY | __GFP_NOWARN) &
1582					 ~GFP_IOFS, 0);
1583
1584	return newpage;
1585}
1586
1587/*
1588 * page migration rate limiting control.
1589 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1590 * window of time. Default here says do not migrate more than 1280M per second.
1591 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1592 * as it is faults that reset the window, pte updates will happen unconditionally
1593 * if there has not been a fault since @pteupdate_interval_millisecs after the
1594 * throttle window closed.
1595 */
1596static unsigned int migrate_interval_millisecs __read_mostly = 100;
1597static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1598static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1599
1600/* Returns true if NUMA migration is currently rate limited */
1601bool migrate_ratelimited(int node)
1602{
1603	pg_data_t *pgdat = NODE_DATA(node);
1604
1605	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1606				msecs_to_jiffies(pteupdate_interval_millisecs)))
1607		return false;
1608
1609	if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1610		return false;
1611
1612	return true;
1613}
1614
1615/* Returns true if the node is migrate rate-limited after the update */
1616static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1617					unsigned long nr_pages)
1618{
1619	/*
1620	 * Rate-limit the amount of data that is being migrated to a node.
1621	 * Optimal placement is no good if the memory bus is saturated and
1622	 * all the time is being spent migrating!
1623	 */
1624	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1625		spin_lock(&pgdat->numabalancing_migrate_lock);
1626		pgdat->numabalancing_migrate_nr_pages = 0;
1627		pgdat->numabalancing_migrate_next_window = jiffies +
1628			msecs_to_jiffies(migrate_interval_millisecs);
1629		spin_unlock(&pgdat->numabalancing_migrate_lock);
1630	}
1631	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1632		trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1633								nr_pages);
1634		return true;
1635	}
1636
1637	/*
1638	 * This is an unlocked non-atomic update so errors are possible.
1639	 * The consequences are failing to migrate when we potentiall should
1640	 * have which is not severe enough to warrant locking. If it is ever
1641	 * a problem, it can be converted to a per-cpu counter.
1642	 */
1643	pgdat->numabalancing_migrate_nr_pages += nr_pages;
1644	return false;
1645}
1646
1647static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1648{
1649	int page_lru;
1650
1651	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
1652
1653	/* Avoid migrating to a node that is nearly full */
1654	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1655		return 0;
1656
1657	if (isolate_lru_page(page))
1658		return 0;
1659
1660	/*
1661	 * migrate_misplaced_transhuge_page() skips page migration's usual
1662	 * check on page_count(), so we must do it here, now that the page
1663	 * has been isolated: a GUP pin, or any other pin, prevents migration.
1664	 * The expected page count is 3: 1 for page's mapcount and 1 for the
1665	 * caller's pin and 1 for the reference taken by isolate_lru_page().
1666	 */
1667	if (PageTransHuge(page) && page_count(page) != 3) {
1668		putback_lru_page(page);
1669		return 0;
1670	}
1671
1672	page_lru = page_is_file_cache(page);
1673	mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1674				hpage_nr_pages(page));
1675
1676	/*
1677	 * Isolating the page has taken another reference, so the
1678	 * caller's reference can be safely dropped without the page
1679	 * disappearing underneath us during migration.
1680	 */
1681	put_page(page);
1682	return 1;
1683}
1684
1685bool pmd_trans_migrating(pmd_t pmd)
1686{
1687	struct page *page = pmd_page(pmd);
1688	return PageLocked(page);
1689}
1690
1691void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
1692{
1693	struct page *page = pmd_page(*pmd);
1694	wait_on_page_locked(page);
1695}
1696
1697/*
1698 * Attempt to migrate a misplaced page to the specified destination
1699 * node. Caller is expected to have an elevated reference count on
1700 * the page that will be dropped by this function before returning.
1701 */
1702int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1703			   int node)
1704{
1705	pg_data_t *pgdat = NODE_DATA(node);
1706	int isolated;
1707	int nr_remaining;
1708	LIST_HEAD(migratepages);
1709
1710	/*
1711	 * Don't migrate file pages that are mapped in multiple processes
1712	 * with execute permissions as they are probably shared libraries.
1713	 */
1714	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1715	    (vma->vm_flags & VM_EXEC))
1716		goto out;
1717
1718	/*
1719	 * Rate-limit the amount of data that is being migrated to a node.
1720	 * Optimal placement is no good if the memory bus is saturated and
1721	 * all the time is being spent migrating!
1722	 */
1723	if (numamigrate_update_ratelimit(pgdat, 1))
1724		goto out;
1725
1726	isolated = numamigrate_isolate_page(pgdat, page);
1727	if (!isolated)
1728		goto out;
1729
1730	list_add(&page->lru, &migratepages);
1731	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1732				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1733	if (nr_remaining) {
1734		if (!list_empty(&migratepages)) {
1735			list_del(&page->lru);
1736			dec_zone_page_state(page, NR_ISOLATED_ANON +
1737					page_is_file_cache(page));
1738			putback_lru_page(page);
1739		}
1740		isolated = 0;
1741	} else
1742		count_vm_numa_event(NUMA_PAGE_MIGRATE);
1743	BUG_ON(!list_empty(&migratepages));
1744	return isolated;
1745
1746out:
1747	put_page(page);
1748	return 0;
1749}
1750#endif /* CONFIG_NUMA_BALANCING */
1751
1752#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1753/*
1754 * Migrates a THP to a given target node. page must be locked and is unlocked
1755 * before returning.
1756 */
1757int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1758				struct vm_area_struct *vma,
1759				pmd_t *pmd, pmd_t entry,
1760				unsigned long address,
1761				struct page *page, int node)
1762{
1763	spinlock_t *ptl;
1764	pg_data_t *pgdat = NODE_DATA(node);
1765	int isolated = 0;
1766	struct page *new_page = NULL;
1767	struct mem_cgroup *memcg = NULL;
1768	int page_lru = page_is_file_cache(page);
1769	unsigned long mmun_start = address & HPAGE_PMD_MASK;
1770	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
1771	pmd_t orig_entry;
1772
1773	/*
1774	 * Rate-limit the amount of data that is being migrated to a node.
1775	 * Optimal placement is no good if the memory bus is saturated and
1776	 * all the time is being spent migrating!
1777	 */
1778	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
1779		goto out_dropref;
1780
1781	new_page = alloc_pages_node(node,
1782		(GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
1783		HPAGE_PMD_ORDER);
1784	if (!new_page)
1785		goto out_fail;
1786
1787	isolated = numamigrate_isolate_page(pgdat, page);
1788	if (!isolated) {
1789		put_page(new_page);
1790		goto out_fail;
1791	}
1792
1793	if (mm_tlb_flush_pending(mm))
1794		flush_tlb_range(vma, mmun_start, mmun_end);
1795
1796	/* Prepare a page as a migration target */
1797	__set_page_locked(new_page);
1798	SetPageSwapBacked(new_page);
1799
1800	/* anon mapping, we can simply copy page->mapping to the new page: */
1801	new_page->mapping = page->mapping;
1802	new_page->index = page->index;
1803	migrate_page_copy(new_page, page);
1804	WARN_ON(PageLRU(new_page));
1805
1806	/* Recheck the target PMD */
1807	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1808	ptl = pmd_lock(mm, pmd);
1809	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
1810fail_putback:
1811		spin_unlock(ptl);
1812		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1813
1814		/* Reverse changes made by migrate_page_copy() */
1815		if (TestClearPageActive(new_page))
1816			SetPageActive(page);
1817		if (TestClearPageUnevictable(new_page))
1818			SetPageUnevictable(page);
1819		mlock_migrate_page(page, new_page);
1820
1821		unlock_page(new_page);
1822		put_page(new_page);		/* Free it */
1823
1824		/* Retake the callers reference and putback on LRU */
1825		get_page(page);
1826		putback_lru_page(page);
1827		mod_zone_page_state(page_zone(page),
1828			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
1829
1830		goto out_unlock;
1831	}
1832
1833	/*
1834	 * Traditional migration needs to prepare the memcg charge
1835	 * transaction early to prevent the old page from being
1836	 * uncharged when installing migration entries.  Here we can
1837	 * save the potential rollback and start the charge transfer
1838	 * only when migration is already known to end successfully.
1839	 */
1840	mem_cgroup_prepare_migration(page, new_page, &memcg);
1841
1842	orig_entry = *pmd;
1843	entry = mk_pmd(new_page, vma->vm_page_prot);
1844	entry = pmd_mkhuge(entry);
1845	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1846
1847	/*
1848	 * Clear the old entry under pagetable lock and establish the new PTE.
1849	 * Any parallel GUP will either observe the old page blocking on the
1850	 * page lock, block on the page table lock or observe the new page.
1851	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
1852	 * guarantee the copy is visible before the pagetable update.
1853	 */
1854	flush_cache_range(vma, mmun_start, mmun_end);
1855	page_add_new_anon_rmap(new_page, vma, mmun_start);
1856	pmdp_clear_flush(vma, mmun_start, pmd);
1857	set_pmd_at(mm, mmun_start, pmd, entry);
1858	flush_tlb_range(vma, mmun_start, mmun_end);
1859	update_mmu_cache_pmd(vma, address, &entry);
1860
1861	if (page_count(page) != 2) {
1862		set_pmd_at(mm, mmun_start, pmd, orig_entry);
1863		flush_tlb_range(vma, mmun_start, mmun_end);
1864		update_mmu_cache_pmd(vma, address, &entry);
1865		page_remove_rmap(new_page);
1866		goto fail_putback;
1867	}
1868
1869	page_remove_rmap(page);
1870
1871	/*
1872	 * Finish the charge transaction under the page table lock to
1873	 * prevent split_huge_page() from dividing up the charge
1874	 * before it's fully transferred to the new page.
1875	 */
1876	mem_cgroup_end_migration(memcg, page, new_page, true);
1877	spin_unlock(ptl);
1878	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1879
1880	unlock_page(new_page);
1881	unlock_page(page);
1882	put_page(page);			/* Drop the rmap reference */
1883	put_page(page);			/* Drop the LRU isolation reference */
1884
1885	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1886	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1887
1888	mod_zone_page_state(page_zone(page),
1889			NR_ISOLATED_ANON + page_lru,
1890			-HPAGE_PMD_NR);
1891	return isolated;
1892
1893out_fail:
1894	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1895out_dropref:
1896	ptl = pmd_lock(mm, pmd);
1897	if (pmd_same(*pmd, entry)) {
1898		entry = pmd_mknonnuma(entry);
1899		set_pmd_at(mm, mmun_start, pmd, entry);
1900		update_mmu_cache_pmd(vma, address, &entry);
1901	}
1902	spin_unlock(ptl);
1903
1904out_unlock:
1905	unlock_page(page);
1906	put_page(page);
1907	return 0;
1908}
1909#endif /* CONFIG_NUMA_BALANCING */
1910
1911#endif /* CONFIG_NUMA */
v3.1
   1/*
   2 * Memory Migration functionality - linux/mm/migration.c
   3 *
   4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5 *
   6 * Page migration was first developed in the context of the memory hotplug
   7 * project. The main authors of the migration code are:
   8 *
   9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10 * Hirokazu Takahashi <taka@valinux.co.jp>
  11 * Dave Hansen <haveblue@us.ibm.com>
  12 * Christoph Lameter
  13 */
  14
  15#include <linux/migrate.h>
  16#include <linux/module.h>
  17#include <linux/swap.h>
  18#include <linux/swapops.h>
  19#include <linux/pagemap.h>
  20#include <linux/buffer_head.h>
  21#include <linux/mm_inline.h>
  22#include <linux/nsproxy.h>
  23#include <linux/pagevec.h>
  24#include <linux/ksm.h>
  25#include <linux/rmap.h>
  26#include <linux/topology.h>
  27#include <linux/cpu.h>
  28#include <linux/cpuset.h>
  29#include <linux/writeback.h>
  30#include <linux/mempolicy.h>
  31#include <linux/vmalloc.h>
  32#include <linux/security.h>
  33#include <linux/memcontrol.h>
  34#include <linux/syscalls.h>
  35#include <linux/hugetlb.h>
 
  36#include <linux/gfp.h>
 
 
  37
  38#include <asm/tlbflush.h>
  39
 
 
 
  40#include "internal.h"
  41
  42#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  43
  44/*
  45 * migrate_prep() needs to be called before we start compiling a list of pages
  46 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
  47 * undesirable, use migrate_prep_local()
  48 */
  49int migrate_prep(void)
  50{
  51	/*
  52	 * Clear the LRU lists so pages can be isolated.
  53	 * Note that pages may be moved off the LRU after we have
  54	 * drained them. Those pages will fail to migrate like other
  55	 * pages that may be busy.
  56	 */
  57	lru_add_drain_all();
  58
  59	return 0;
  60}
  61
  62/* Do the necessary work of migrate_prep but not if it involves other CPUs */
  63int migrate_prep_local(void)
  64{
  65	lru_add_drain();
  66
  67	return 0;
  68}
  69
  70/*
  71 * Add isolated pages on the list back to the LRU under page lock
  72 * to avoid leaking evictable pages back onto unevictable list.
 
 
 
 
  73 */
  74void putback_lru_pages(struct list_head *l)
  75{
  76	struct page *page;
  77	struct page *page2;
  78
  79	list_for_each_entry_safe(page, page2, l, lru) {
 
 
 
 
  80		list_del(&page->lru);
  81		dec_zone_page_state(page, NR_ISOLATED_ANON +
  82				page_is_file_cache(page));
  83		putback_lru_page(page);
 
 
 
  84	}
  85}
  86
  87/*
  88 * Restore a potential migration pte to a working pte entry
  89 */
  90static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
  91				 unsigned long addr, void *old)
  92{
  93	struct mm_struct *mm = vma->vm_mm;
  94	swp_entry_t entry;
  95 	pgd_t *pgd;
  96 	pud_t *pud;
  97 	pmd_t *pmd;
  98	pte_t *ptep, pte;
  99 	spinlock_t *ptl;
 100
 101	if (unlikely(PageHuge(new))) {
 102		ptep = huge_pte_offset(mm, addr);
 103		if (!ptep)
 104			goto out;
 105		ptl = &mm->page_table_lock;
 106	} else {
 107		pgd = pgd_offset(mm, addr);
 108		if (!pgd_present(*pgd))
 109			goto out;
 110
 111		pud = pud_offset(pgd, addr);
 112		if (!pud_present(*pud))
 113			goto out;
 114
 115		pmd = pmd_offset(pud, addr);
 116		if (pmd_trans_huge(*pmd))
 117			goto out;
 118		if (!pmd_present(*pmd))
 119			goto out;
 120
 121		ptep = pte_offset_map(pmd, addr);
 122
 123		/*
 124		 * Peek to check is_swap_pte() before taking ptlock?  No, we
 125		 * can race mremap's move_ptes(), which skips anon_vma lock.
 126		 */
 127
 128		ptl = pte_lockptr(mm, pmd);
 129	}
 130
 131 	spin_lock(ptl);
 132	pte = *ptep;
 133	if (!is_swap_pte(pte))
 134		goto unlock;
 135
 136	entry = pte_to_swp_entry(pte);
 137
 138	if (!is_migration_entry(entry) ||
 139	    migration_entry_to_page(entry) != old)
 140		goto unlock;
 141
 142	get_page(new);
 143	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 
 
 144	if (is_write_migration_entry(entry))
 145		pte = pte_mkwrite(pte);
 146#ifdef CONFIG_HUGETLB_PAGE
 147	if (PageHuge(new))
 148		pte = pte_mkhuge(pte);
 
 
 149#endif
 150	flush_cache_page(vma, addr, pte_pfn(pte));
 151	set_pte_at(mm, addr, ptep, pte);
 152
 153	if (PageHuge(new)) {
 154		if (PageAnon(new))
 155			hugepage_add_anon_rmap(new, vma, addr);
 156		else
 157			page_dup_rmap(new);
 158	} else if (PageAnon(new))
 159		page_add_anon_rmap(new, vma, addr);
 160	else
 161		page_add_file_rmap(new);
 162
 163	/* No need to invalidate - it was non-present before */
 164	update_mmu_cache(vma, addr, ptep);
 165unlock:
 166	pte_unmap_unlock(ptep, ptl);
 167out:
 168	return SWAP_AGAIN;
 169}
 170
 171/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 172 * Get rid of all migration entries and replace them by
 173 * references to the indicated page.
 174 */
 175static void remove_migration_ptes(struct page *old, struct page *new)
 176{
 177	rmap_walk(new, remove_migration_pte, old);
 
 
 
 
 
 
 178}
 179
 180/*
 181 * Something used the pte of a page under migration. We need to
 182 * get to the page and wait until migration is finished.
 183 * When we return from this function the fault will be retried.
 184 *
 185 * This function is called from do_swap_page().
 186 */
 187void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 188				unsigned long address)
 189{
 190	pte_t *ptep, pte;
 191	spinlock_t *ptl;
 192	swp_entry_t entry;
 193	struct page *page;
 194
 195	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 196	pte = *ptep;
 197	if (!is_swap_pte(pte))
 198		goto out;
 199
 200	entry = pte_to_swp_entry(pte);
 201	if (!is_migration_entry(entry))
 202		goto out;
 203
 204	page = migration_entry_to_page(entry);
 205
 206	/*
 207	 * Once radix-tree replacement of page migration started, page_count
 208	 * *must* be zero. And, we don't want to call wait_on_page_locked()
 209	 * against a page without get_page().
 210	 * So, we use get_page_unless_zero(), here. Even failed, page fault
 211	 * will occur again.
 212	 */
 213	if (!get_page_unless_zero(page))
 214		goto out;
 215	pte_unmap_unlock(ptep, ptl);
 216	wait_on_page_locked(page);
 217	put_page(page);
 218	return;
 219out:
 220	pte_unmap_unlock(ptep, ptl);
 221}
 222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 223/*
 224 * Replace the page in the mapping.
 225 *
 226 * The number of remaining references must be:
 227 * 1 for anonymous pages without a mapping
 228 * 2 for pages with a mapping
 229 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 230 */
 231static int migrate_page_move_mapping(struct address_space *mapping,
 232		struct page *newpage, struct page *page)
 
 
 233{
 234	int expected_count;
 235	void **pslot;
 236
 237	if (!mapping) {
 238		/* Anonymous page without mapping */
 239		if (page_count(page) != 1)
 240			return -EAGAIN;
 241		return 0;
 242	}
 243
 244	spin_lock_irq(&mapping->tree_lock);
 245
 246	pslot = radix_tree_lookup_slot(&mapping->page_tree,
 247 					page_index(page));
 248
 249	expected_count = 2 + page_has_private(page);
 250	if (page_count(page) != expected_count ||
 251		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 252		spin_unlock_irq(&mapping->tree_lock);
 253		return -EAGAIN;
 254	}
 255
 256	if (!page_freeze_refs(page, expected_count)) {
 257		spin_unlock_irq(&mapping->tree_lock);
 258		return -EAGAIN;
 259	}
 260
 261	/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 262	 * Now we know that no one else is looking at the page.
 263	 */
 264	get_page(newpage);	/* add cache reference */
 265	if (PageSwapCache(page)) {
 266		SetPageSwapCache(newpage);
 267		set_page_private(newpage, page_private(page));
 268	}
 269
 270	radix_tree_replace_slot(pslot, newpage);
 271
 272	page_unfreeze_refs(page, expected_count);
 273	/*
 274	 * Drop cache reference from old page.
 
 275	 * We know this isn't the last reference.
 276	 */
 277	__put_page(page);
 278
 279	/*
 280	 * If moved to a different zone then also account
 281	 * the page for that zone. Other VM counters will be
 282	 * taken care of when we establish references to the
 283	 * new page and drop references to the old page.
 284	 *
 285	 * Note that anonymous pages are accounted for
 286	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
 287	 * are mapped to swap space.
 288	 */
 289	__dec_zone_page_state(page, NR_FILE_PAGES);
 290	__inc_zone_page_state(newpage, NR_FILE_PAGES);
 291	if (!PageSwapCache(page) && PageSwapBacked(page)) {
 292		__dec_zone_page_state(page, NR_SHMEM);
 293		__inc_zone_page_state(newpage, NR_SHMEM);
 294	}
 295	spin_unlock_irq(&mapping->tree_lock);
 296
 297	return 0;
 298}
 299
 300/*
 301 * The expected number of remaining references is the same as that
 302 * of migrate_page_move_mapping().
 303 */
 304int migrate_huge_page_move_mapping(struct address_space *mapping,
 305				   struct page *newpage, struct page *page)
 306{
 307	int expected_count;
 308	void **pslot;
 309
 310	if (!mapping) {
 311		if (page_count(page) != 1)
 312			return -EAGAIN;
 313		return 0;
 314	}
 315
 316	spin_lock_irq(&mapping->tree_lock);
 317
 318	pslot = radix_tree_lookup_slot(&mapping->page_tree,
 319					page_index(page));
 320
 321	expected_count = 2 + page_has_private(page);
 322	if (page_count(page) != expected_count ||
 323		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 324		spin_unlock_irq(&mapping->tree_lock);
 325		return -EAGAIN;
 326	}
 327
 328	if (!page_freeze_refs(page, expected_count)) {
 329		spin_unlock_irq(&mapping->tree_lock);
 330		return -EAGAIN;
 331	}
 332
 333	get_page(newpage);
 334
 335	radix_tree_replace_slot(pslot, newpage);
 336
 337	page_unfreeze_refs(page, expected_count);
 338
 339	__put_page(page);
 
 
 340
 341	spin_unlock_irq(&mapping->tree_lock);
 342	return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 343}
 344
 345/*
 346 * Copy the page to its new location
 347 */
 348void migrate_page_copy(struct page *newpage, struct page *page)
 349{
 350	if (PageHuge(page))
 
 
 351		copy_huge_page(newpage, page);
 352	else
 353		copy_highpage(newpage, page);
 354
 355	if (PageError(page))
 356		SetPageError(newpage);
 357	if (PageReferenced(page))
 358		SetPageReferenced(newpage);
 359	if (PageUptodate(page))
 360		SetPageUptodate(newpage);
 361	if (TestClearPageActive(page)) {
 362		VM_BUG_ON(PageUnevictable(page));
 363		SetPageActive(newpage);
 364	} else if (TestClearPageUnevictable(page))
 365		SetPageUnevictable(newpage);
 366	if (PageChecked(page))
 367		SetPageChecked(newpage);
 368	if (PageMappedToDisk(page))
 369		SetPageMappedToDisk(newpage);
 370
 371	if (PageDirty(page)) {
 372		clear_page_dirty_for_io(page);
 373		/*
 374		 * Want to mark the page and the radix tree as dirty, and
 375		 * redo the accounting that clear_page_dirty_for_io undid,
 376		 * but we can't use set_page_dirty because that function
 377		 * is actually a signal that all of the page has become dirty.
 378		 * Whereas only part of our page may be dirty.
 379		 */
 380		__set_page_dirty_nobuffers(newpage);
 
 
 
 381 	}
 382
 
 
 
 
 
 
 
 383	mlock_migrate_page(newpage, page);
 384	ksm_migrate_page(newpage, page);
 385
 
 
 
 386	ClearPageSwapCache(page);
 387	ClearPagePrivate(page);
 388	set_page_private(page, 0);
 389	page->mapping = NULL;
 390
 391	/*
 392	 * If any waiters have accumulated on the new page then
 393	 * wake them up.
 394	 */
 395	if (PageWriteback(newpage))
 396		end_page_writeback(newpage);
 397}
 398
 399/************************************************************
 400 *                    Migration functions
 401 ***********************************************************/
 402
 403/* Always fail migration. Used for mappings that are not movable */
 404int fail_migrate_page(struct address_space *mapping,
 405			struct page *newpage, struct page *page)
 406{
 407	return -EIO;
 408}
 409EXPORT_SYMBOL(fail_migrate_page);
 410
 /*
  * Common logic to directly migrate a single page.  Suitable for pages
  * that do not use PagePrivate/PagePrivate2.
  *
  * Pages are locked upon entry and exit.
  *
  * Returns 0 on success or the error reported by
  * migrate_page_move_mapping().
  */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	/* Writeback must be complete */
	BUG_ON(PageWriteback(page));

	rc = migrate_page_move_mapping(mapping, newpage, page);
	if (!rc)
		migrate_page_copy(newpage, page);

	return rc;
}
EXPORT_SYMBOL(migrate_page);
 433
 #ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers.  It may only be used if the
 * underlying filesystem guarantees that no other references to "page"
 * exist.
 *
 * Pages without buffers are handed to migrate_page().  Otherwise the
 * buffer ring is locked, re-pointed at @newpage, and unlocked again once
 * the page contents have been copied.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *head, *bh;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);
	if (rc)
		return rc;

	/* Pin and lock every buffer on the ring. */
	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
	} while ((bh = bh->b_this_page) != head);

	/* Hand the private data, and the reference that goes with it, over. */
	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	/* Re-point each buffer at the new page. */
	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
	} while ((bh = bh->b_this_page) != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	/* Release the buffers again. */
	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
	} while ((bh = bh->b_this_page) != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif
 493
 494/*
 495 * Writeback a page to clean the dirty state
 496 */
 497static int writeout(struct address_space *mapping, struct page *page)
 498{
 499	struct writeback_control wbc = {
 500		.sync_mode = WB_SYNC_NONE,
 501		.nr_to_write = 1,
 502		.range_start = 0,
 503		.range_end = LLONG_MAX,
 504		.for_reclaim = 1
 505	};
 506	int rc;
 507
 508	if (!mapping->a_ops->writepage)
 509		/* No write method for the address space */
 510		return -EINVAL;
 511
 512	if (!clear_page_dirty_for_io(page))
 513		/* Someone else already triggered a write */
 514		return -EAGAIN;
 515
 516	/*
 517	 * A dirty page may imply that the underlying filesystem has
 518	 * the page on some queue. So the page must be clean for
 519	 * migration. Writeout may mean we loose the lock and the
 520	 * page state is no longer what we checked for earlier.
 521	 * At this point we know that the migration attempt cannot
 522	 * be successful.
 523	 */
 524	remove_migration_ptes(page, page);
 525
 526	rc = mapping->a_ops->writepage(page, &wbc);
 527
 528	if (rc != AOP_WRITEPAGE_ACTIVATE)
 529		/* unlocked. Relock */
 530		lock_page(page);
 531
 532	return (rc < 0) ? -EIO : -EAGAIN;
 533}
 534
 535/*
 536 * Default handling if a filesystem does not provide a migration function.
 537 */
 538static int fallback_migrate_page(struct address_space *mapping,
 539	struct page *newpage, struct page *page)
 540{
 541	if (PageDirty(page))
 
 
 
 542		return writeout(mapping, page);
 
 543
 544	/*
 545	 * Buffers may be managed in a filesystem specific way.
 546	 * We must have no buffers or drop them.
 547	 */
 548	if (page_has_private(page) &&
 549	    !try_to_release_page(page, GFP_KERNEL))
 550		return -EAGAIN;
 551
 552	return migrate_page(mapping, newpage, page);
 553}
 554
 555/*
 556 * Move a page to a newly allocated page
 557 * The page is locked and all ptes have been successfully removed.
 558 *
 559 * The new page will have replaced the old page if this function
 560 * is successful.
 561 *
 562 * Return value:
 563 *   < 0 - error code
 564 *  == 0 - success
 565 */
 566static int move_to_new_page(struct page *newpage, struct page *page,
 567					int remap_swapcache, bool sync)
 568{
 569	struct address_space *mapping;
 570	int rc;
 571
 572	/*
 573	 * Block others from accessing the page when we get around to
 574	 * establishing additional references. We are the only one
 575	 * holding a reference to the new page at this point.
 576	 */
 577	if (!trylock_page(newpage))
 578		BUG();
 579
 580	/* Prepare mapping for the new page.*/
 581	newpage->index = page->index;
 582	newpage->mapping = page->mapping;
 583	if (PageSwapBacked(page))
 584		SetPageSwapBacked(newpage);
 585
 586	mapping = page_mapping(page);
 587	if (!mapping)
 588		rc = migrate_page(mapping, newpage, page);
 589	else {
 590		/*
 591		 * Do not writeback pages if !sync and migratepage is
 592		 * not pointing to migrate_page() which is nonblocking
 593		 * (swapcache/tmpfs uses migratepage = migrate_page).
 
 594		 */
 595		if (PageDirty(page) && !sync &&
 596		    mapping->a_ops->migratepage != migrate_page)
 597			rc = -EBUSY;
 598		else if (mapping->a_ops->migratepage)
 599			/*
 600			 * Most pages have a mapping and most filesystems
 601			 * should provide a migration function. Anonymous
 602			 * pages are part of swap space which also has its
 603			 * own migration function. This is the most common
 604			 * path for page migration.
 605			 */
 606			rc = mapping->a_ops->migratepage(mapping,
 607							newpage, page);
 608		else
 609			rc = fallback_migrate_page(mapping, newpage, page);
 610	}
 611
 612	if (rc) {
 613		newpage->mapping = NULL;
 614	} else {
 615		if (remap_swapcache)
 616			remove_migration_ptes(page, newpage);
 
 617	}
 618
 619	unlock_page(newpage);
 620
 621	return rc;
 622}
 623
 624/*
 625 * Obtain the lock on page, remove all ptes and migrate the page
 626 * to the newly allocated page in newpage.
 627 */
 628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 629			struct page *page, int force, bool offlining, bool sync)
 630{
 631	int rc = 0;
 632	int *result = NULL;
 633	struct page *newpage = get_new_page(page, private, &result);
 634	int remap_swapcache = 1;
 635	int charge = 0;
 636	struct mem_cgroup *mem;
 637	struct anon_vma *anon_vma = NULL;
 638
 639	if (!newpage)
 640		return -ENOMEM;
 641
 642	if (page_count(page) == 1) {
 643		/* page was freed from under us. So we are done. */
 644		goto move_newpage;
 645	}
 646	if (unlikely(PageTransHuge(page)))
 647		if (unlikely(split_huge_page(page)))
 648			goto move_newpage;
 649
 650	/* prepare cgroup just returns 0 or -ENOMEM */
 651	rc = -EAGAIN;
 652
 653	if (!trylock_page(page)) {
 654		if (!force || !sync)
 655			goto move_newpage;
 656
 657		/*
 658		 * It's not safe for direct compaction to call lock_page.
 659		 * For example, during page readahead pages are added locked
 660		 * to the LRU. Later, when the IO completes the pages are
 661		 * marked uptodate and unlocked. However, the queueing
 662		 * could be merging multiple pages for one bio (e.g.
 663		 * mpage_readpages). If an allocation happens for the
 664		 * second or third page, the process can end up locking
 665		 * the same page twice and deadlocking. Rather than
 666		 * trying to be clever about what pages can be locked,
 667		 * avoid the use of lock_page for direct compaction
 668		 * altogether.
 669		 */
 670		if (current->flags & PF_MEMALLOC)
 671			goto move_newpage;
 672
 673		lock_page(page);
 674	}
 675
 676	/*
 677	 * Only memory hotplug's offline_pages() caller has locked out KSM,
 678	 * and can safely migrate a KSM page.  The other cases have skipped
 679	 * PageKsm along with PageReserved - but it is only now when we have
 680	 * the page lock that we can be certain it will not go KSM beneath us
 681	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
 682	 * its pagecount raised, but only here do we take the page lock which
 683	 * serializes that).
 684	 */
 685	if (PageKsm(page) && !offlining) {
 686		rc = -EBUSY;
 687		goto unlock;
 688	}
 689
 690	/* charge against new page */
 691	charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
 692	if (charge == -ENOMEM) {
 693		rc = -ENOMEM;
 694		goto unlock;
 695	}
 696	BUG_ON(charge);
 697
 698	if (PageWriteback(page)) {
 699		/*
 700		 * For !sync, there is no point retrying as the retry loop
 701		 * is expected to be too short for PageWriteback to be cleared
 
 
 702		 */
 703		if (!sync) {
 704			rc = -EBUSY;
 705			goto uncharge;
 706		}
 707		if (!force)
 708			goto uncharge;
 709		wait_on_page_writeback(page);
 710	}
 711	/*
 712	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 713	 * we cannot notice that anon_vma is freed while we migrates a page.
 714	 * This get_anon_vma() delays freeing anon_vma pointer until the end
 715	 * of migration. File cache pages are no problem because of page_lock()
 716	 * File Caches may use write_page() or lock_page() in migration, then,
 717	 * just care Anon page here.
 718	 */
 719	if (PageAnon(page)) {
 720		/*
 721		 * Only page_lock_anon_vma() understands the subtleties of
 722		 * getting a hold on an anon_vma from outside one of its mms.
 723		 */
 724		anon_vma = page_get_anon_vma(page);
 725		if (anon_vma) {
 726			/*
 727			 * Anon page
 728			 */
 729		} else if (PageSwapCache(page)) {
 730			/*
 731			 * We cannot be sure that the anon_vma of an unmapped
 732			 * swapcache page is safe to use because we don't
 733			 * know in advance if the VMA that this page belonged
 734			 * to still exists. If the VMA and others sharing the
 735			 * data have been freed, then the anon_vma could
 736			 * already be invalid.
 737			 *
 738			 * To avoid this possibility, swapcache pages get
 739			 * migrated but are not remapped when migration
 740			 * completes
 741			 */
 742			remap_swapcache = 0;
 743		} else {
 744			goto uncharge;
 745		}
 746	}
 747
 
 
 
 
 
 
 
 
 
 
 
 
 748	/*
 749	 * Corner case handling:
 750	 * 1. When a new swap-cache page is read into, it is added to the LRU
 751	 * and treated as swapcache but it has no rmap yet.
 752	 * Calling try_to_unmap() against a page->mapping==NULL page will
 753	 * trigger a BUG.  So handle it here.
 754	 * 2. An orphaned page (see truncate_complete_page) might have
 755	 * fs-private metadata. The page can be picked up due to memory
 756	 * offlining.  Everywhere else except page reclaim, the page is
 757	 * invisible to the vm, so the page can not be migrated.  So try to
 758	 * free the metadata, so the page can be freed.
 759	 */
 760	if (!page->mapping) {
 761		VM_BUG_ON(PageAnon(page));
 762		if (page_has_private(page)) {
 763			try_to_free_buffers(page);
 764			goto uncharge;
 765		}
 766		goto skip_unmap;
 767	}
 768
 769	/* Establish migration ptes or remove ptes */
 770	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 771
 772skip_unmap:
 773	if (!page_mapped(page))
 774		rc = move_to_new_page(newpage, page, remap_swapcache, sync);
 775
 776	if (rc && remap_swapcache)
 777		remove_migration_ptes(page, page);
 778
 779	/* Drop an anon_vma reference if we took one */
 780	if (anon_vma)
 781		put_anon_vma(anon_vma);
 782
 783uncharge:
 784	if (!charge)
 785		mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 786unlock:
 787	unlock_page(page);
 
 
 
 788
 789move_newpage:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 790	if (rc != -EAGAIN) {
 791 		/*
 792 		 * A page that has been migrated has all references
 793 		 * removed and will be freed. A page that has not been
 794 		 * migrated will have kepts its references and be
 795 		 * restored.
 796 		 */
 797 		list_del(&page->lru);
 798		dec_zone_page_state(page, NR_ISOLATED_ANON +
 799				page_is_file_cache(page));
 800		putback_lru_page(page);
 801	}
 802
 803	/*
 804	 * Move the new page to the LRU. If migration was not successful
 805	 * then this will free the page.
 806	 */
 807	putback_lru_page(newpage);
 808
 809	if (result) {
 810		if (rc)
 811			*result = rc;
 812		else
 813			*result = page_to_nid(newpage);
 814	}
 815	return rc;
 816}
 817
 818/*
 819 * Counterpart of unmap_and_move_page() for hugepage migration.
 820 *
 821 * This function doesn't wait the completion of hugepage I/O
 822 * because there is no race between I/O and migration for hugepage.
 823 * Note that currently hugepage I/O occurs only in direct I/O
 824 * where no lock is held and PG_writeback is irrelevant,
 825 * and writeback status of all subpages are counted in the reference
 826 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 827 * under direct I/O, the reference of the head page is 512 and a bit more.)
 828 * This means that when we try to migrate hugepage whose subpages are
 829 * doing direct I/O, some references remain after try_to_unmap() and
 830 * hugepage migration fails without data corruption.
 831 *
 832 * There is also no race when direct I/O is issued on the page under migration,
 833 * because then pte is replaced with migration swap entry and direct I/O code
 834 * will wait in the page fault for migration to complete.
 835 */
 836static int unmap_and_move_huge_page(new_page_t get_new_page,
 837				unsigned long private, struct page *hpage,
 838				int force, bool offlining, bool sync)
 839{
 840	int rc = 0;
 841	int *result = NULL;
 842	struct page *new_hpage = get_new_page(hpage, private, &result);
 843	struct anon_vma *anon_vma = NULL;
 844
 
 
 
 
 
 
 
 
 
 
 
 
 
 845	if (!new_hpage)
 846		return -ENOMEM;
 847
 848	rc = -EAGAIN;
 849
 850	if (!trylock_page(hpage)) {
 851		if (!force || !sync)
 852			goto out;
 853		lock_page(hpage);
 854	}
 855
 856	if (PageAnon(hpage))
 857		anon_vma = page_get_anon_vma(hpage);
 858
 859	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 860
 861	if (!page_mapped(hpage))
 862		rc = move_to_new_page(new_hpage, hpage, 1, sync);
 863
 864	if (rc)
 865		remove_migration_ptes(hpage, hpage);
 866
 867	if (anon_vma)
 868		put_anon_vma(anon_vma);
 869out:
 870	unlock_page(hpage);
 871
 872	if (rc != -EAGAIN) {
 873		list_del(&hpage->lru);
 874		put_page(hpage);
 875	}
 876
 
 
 
 
 877	put_page(new_hpage);
 878
 879	if (result) {
 880		if (rc)
 881			*result = rc;
 882		else
 883			*result = page_to_nid(new_hpage);
 884	}
 885	return rc;
 886}
 887
 888/*
 889 * migrate_pages
 
 890 *
 891 * The function takes one list of pages to migrate and a function
 892 * that determines from the page to be migrated and the private data
 893 * the target of the move and allocates the page.
 
 
 
 
 894 *
 895 * The function returns after 10 attempts or if no pages
 896 * are movable anymore because to has become empty
 897 * or no retryable pages exist anymore.
 898 * Caller should call putback_lru_pages to return pages to the LRU
 899 * or free list only if ret != 0.
 900 *
 901 * Return: Number of pages not migrated or error code.
 902 */
 903int migrate_pages(struct list_head *from,
 904		new_page_t get_new_page, unsigned long private, bool offlining,
 905		bool sync)
 906{
 907	int retry = 1;
 908	int nr_failed = 0;
 
 909	int pass = 0;
 910	struct page *page;
 911	struct page *page2;
 912	int swapwrite = current->flags & PF_SWAPWRITE;
 913	int rc;
 914
 915	if (!swapwrite)
 916		current->flags |= PF_SWAPWRITE;
 917
 918	for(pass = 0; pass < 10 && retry; pass++) {
 919		retry = 0;
 920
 921		list_for_each_entry_safe(page, page2, from, lru) {
 922			cond_resched();
 923
 924			rc = unmap_and_move(get_new_page, private,
 925						page, pass > 2, offlining,
 926						sync);
 
 
 
 927
 928			switch(rc) {
 929			case -ENOMEM:
 930				goto out;
 931			case -EAGAIN:
 932				retry++;
 933				break;
 934			case 0:
 
 935				break;
 936			default:
 937				/* Permanent failure */
 
 
 
 
 
 938				nr_failed++;
 939				break;
 940			}
 941		}
 942	}
 943	rc = 0;
 944out:
 
 
 
 
 
 
 945	if (!swapwrite)
 946		current->flags &= ~PF_SWAPWRITE;
 947
 948	if (rc)
 949		return rc;
 950
 951	return nr_failed + retry;
 952}
 953
 954int migrate_huge_pages(struct list_head *from,
 955		new_page_t get_new_page, unsigned long private, bool offlining,
 956		bool sync)
 957{
 958	int retry = 1;
 959	int nr_failed = 0;
 960	int pass = 0;
 961	struct page *page;
 962	struct page *page2;
 963	int rc;
 964
 965	for (pass = 0; pass < 10 && retry; pass++) {
 966		retry = 0;
 967
 968		list_for_each_entry_safe(page, page2, from, lru) {
 969			cond_resched();
 970
 971			rc = unmap_and_move_huge_page(get_new_page,
 972					private, page, pass > 2, offlining,
 973					sync);
 974
 975			switch(rc) {
 976			case -ENOMEM:
 977				goto out;
 978			case -EAGAIN:
 979				retry++;
 980				break;
 981			case 0:
 982				break;
 983			default:
 984				/* Permanent failure */
 985				nr_failed++;
 986				break;
 987			}
 988		}
 989	}
 990	rc = 0;
 991out:
 992	if (rc)
 993		return rc;
 994
 995	return nr_failed + retry;
 996}
 997
 998#ifdef CONFIG_NUMA
 /*
  * One request of a move_pages() chunk: move a single page to a node.
  * An array of these is terminated by an entry with node == MAX_NUMNODES.
  */
struct page_to_node {
	unsigned long addr;	/* user virtual address of the page */
	struct page *page;	/* page found at @addr */
	int node;		/* target node, or MAX_NUMNODES as end marker */
	int status;		/* per-page result reported back to the caller */
};
1008
1009static struct page *new_page_node(struct page *p, unsigned long private,
1010		int **result)
1011{
1012	struct page_to_node *pm = (struct page_to_node *)private;
1013
1014	while (pm->node != MAX_NUMNODES && pm->page != p)
1015		pm++;
1016
1017	if (pm->node == MAX_NUMNODES)
1018		return NULL;
1019
1020	*result = &pm->status;
1021
1022	return alloc_pages_exact_node(pm->node,
1023				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
 
 
 
 
1024}
1025
1026/*
1027 * Move a set of pages as indicated in the pm array. The addr
1028 * field must be set to the virtual address of the page to be moved
1029 * and the node number must contain a valid target node.
1030 * The pm array ends with node = MAX_NUMNODES.
1031 */
1032static int do_move_page_to_node_array(struct mm_struct *mm,
1033				      struct page_to_node *pm,
1034				      int migrate_all)
1035{
1036	int err;
1037	struct page_to_node *pp;
1038	LIST_HEAD(pagelist);
1039
1040	down_read(&mm->mmap_sem);
1041
1042	/*
1043	 * Build a list of pages to migrate
1044	 */
1045	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1046		struct vm_area_struct *vma;
1047		struct page *page;
1048
1049		err = -EFAULT;
1050		vma = find_vma(mm, pp->addr);
1051		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1052			goto set_status;
1053
1054		page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1055
1056		err = PTR_ERR(page);
1057		if (IS_ERR(page))
1058			goto set_status;
1059
1060		err = -ENOENT;
1061		if (!page)
1062			goto set_status;
1063
1064		/* Use PageReserved to check for zero page */
1065		if (PageReserved(page) || PageKsm(page))
1066			goto put_and_set;
1067
1068		pp->page = page;
1069		err = page_to_nid(page);
1070
1071		if (err == pp->node)
1072			/*
1073			 * Node already in the right place
1074			 */
1075			goto put_and_set;
1076
1077		err = -EACCES;
1078		if (page_mapcount(page) > 1 &&
1079				!migrate_all)
1080			goto put_and_set;
1081
 
 
 
 
 
1082		err = isolate_lru_page(page);
1083		if (!err) {
1084			list_add_tail(&page->lru, &pagelist);
1085			inc_zone_page_state(page, NR_ISOLATED_ANON +
1086					    page_is_file_cache(page));
1087		}
1088put_and_set:
1089		/*
1090		 * Either remove the duplicate refcount from
1091		 * isolate_lru_page() or drop the page ref if it was
1092		 * not isolated.
1093		 */
1094		put_page(page);
1095set_status:
1096		pp->status = err;
1097	}
1098
1099	err = 0;
1100	if (!list_empty(&pagelist)) {
1101		err = migrate_pages(&pagelist, new_page_node,
1102				(unsigned long)pm, 0, true);
1103		if (err)
1104			putback_lru_pages(&pagelist);
1105	}
1106
1107	up_read(&mm->mmap_sem);
1108	return err;
1109}
1110
1111/*
1112 * Migrate an array of page address onto an array of nodes and fill
1113 * the corresponding array of status.
1114 */
1115static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
1116			 unsigned long nr_pages,
1117			 const void __user * __user *pages,
1118			 const int __user *nodes,
1119			 int __user *status, int flags)
1120{
1121	struct page_to_node *pm;
1122	nodemask_t task_nodes;
1123	unsigned long chunk_nr_pages;
1124	unsigned long chunk_start;
1125	int err;
1126
1127	task_nodes = cpuset_mems_allowed(task);
1128
1129	err = -ENOMEM;
1130	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1131	if (!pm)
1132		goto out;
1133
1134	migrate_prep();
1135
1136	/*
1137	 * Store a chunk of page_to_node array in a page,
1138	 * but keep the last one as a marker
1139	 */
1140	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
1141
1142	for (chunk_start = 0;
1143	     chunk_start < nr_pages;
1144	     chunk_start += chunk_nr_pages) {
1145		int j;
1146
1147		if (chunk_start + chunk_nr_pages > nr_pages)
1148			chunk_nr_pages = nr_pages - chunk_start;
1149
1150		/* fill the chunk pm with addrs and nodes from user-space */
1151		for (j = 0; j < chunk_nr_pages; j++) {
1152			const void __user *p;
1153			int node;
1154
1155			err = -EFAULT;
1156			if (get_user(p, pages + j + chunk_start))
1157				goto out_pm;
1158			pm[j].addr = (unsigned long) p;
1159
1160			if (get_user(node, nodes + j + chunk_start))
1161				goto out_pm;
1162
1163			err = -ENODEV;
1164			if (node < 0 || node >= MAX_NUMNODES)
1165				goto out_pm;
1166
1167			if (!node_state(node, N_HIGH_MEMORY))
1168				goto out_pm;
1169
1170			err = -EACCES;
1171			if (!node_isset(node, task_nodes))
1172				goto out_pm;
1173
1174			pm[j].node = node;
1175		}
1176
1177		/* End marker for this chunk */
1178		pm[chunk_nr_pages].node = MAX_NUMNODES;
1179
1180		/* Migrate this chunk */
1181		err = do_move_page_to_node_array(mm, pm,
1182						 flags & MPOL_MF_MOVE_ALL);
1183		if (err < 0)
1184			goto out_pm;
1185
1186		/* Return status information */
1187		for (j = 0; j < chunk_nr_pages; j++)
1188			if (put_user(pm[j].status, status + j + chunk_start)) {
1189				err = -EFAULT;
1190				goto out_pm;
1191			}
1192	}
1193	err = 0;
1194
1195out_pm:
1196	free_page((unsigned long)pm);
1197out:
1198	return err;
1199}
1200
1201/*
1202 * Determine the nodes of an array of pages and store it in an array of status.
1203 */
1204static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1205				const void __user **pages, int *status)
1206{
1207	unsigned long i;
1208
1209	down_read(&mm->mmap_sem);
1210
1211	for (i = 0; i < nr_pages; i++) {
1212		unsigned long addr = (unsigned long)(*pages);
1213		struct vm_area_struct *vma;
1214		struct page *page;
1215		int err = -EFAULT;
1216
1217		vma = find_vma(mm, addr);
1218		if (!vma || addr < vma->vm_start)
1219			goto set_status;
1220
1221		page = follow_page(vma, addr, 0);
1222
1223		err = PTR_ERR(page);
1224		if (IS_ERR(page))
1225			goto set_status;
1226
1227		err = -ENOENT;
1228		/* Use PageReserved to check for zero page */
1229		if (!page || PageReserved(page) || PageKsm(page))
1230			goto set_status;
1231
1232		err = page_to_nid(page);
1233set_status:
1234		*status = err;
1235
1236		pages++;
1237		status++;
1238	}
1239
1240	up_read(&mm->mmap_sem);
1241}
1242
1243/*
1244 * Determine the nodes of a user array of pages and store it in
1245 * a user array of status.
1246 */
1247static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1248			 const void __user * __user *pages,
1249			 int __user *status)
1250{
1251#define DO_PAGES_STAT_CHUNK_NR 16
1252	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1253	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1254
1255	while (nr_pages) {
1256		unsigned long chunk_nr;
1257
1258		chunk_nr = nr_pages;
1259		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1260			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1261
1262		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1263			break;
1264
1265		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1266
1267		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1268			break;
1269
1270		pages += chunk_nr;
1271		status += chunk_nr;
1272		nr_pages -= chunk_nr;
1273	}
1274	return nr_pages ? -EFAULT : 0;
1275}
1276
1277/*
1278 * Move a list of pages in the address space of the currently executing
1279 * process.
1280 */
1281SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1282		const void __user * __user *, pages,
1283		const int __user *, nodes,
1284		int __user *, status, int, flags)
1285{
1286	const struct cred *cred = current_cred(), *tcred;
1287	struct task_struct *task;
1288	struct mm_struct *mm;
1289	int err;
 
1290
1291	/* Check flags */
1292	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1293		return -EINVAL;
1294
1295	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1296		return -EPERM;
1297
1298	/* Find the mm_struct */
1299	rcu_read_lock();
1300	task = pid ? find_task_by_vpid(pid) : current;
1301	if (!task) {
1302		rcu_read_unlock();
1303		return -ESRCH;
1304	}
1305	mm = get_task_mm(task);
1306	rcu_read_unlock();
1307
1308	if (!mm)
1309		return -EINVAL;
1310
1311	/*
1312	 * Check if this process has the right to modify the specified
1313	 * process. The right exists if the process has administrative
1314	 * capabilities, superuser privileges or the same
1315	 * userid as the target process.
1316	 */
1317	rcu_read_lock();
1318	tcred = __task_cred(task);
1319	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1320	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1321	    !capable(CAP_SYS_NICE)) {
1322		rcu_read_unlock();
1323		err = -EPERM;
1324		goto out;
1325	}
1326	rcu_read_unlock();
1327
1328 	err = security_task_movememory(task);
1329 	if (err)
1330		goto out;
1331
1332	if (nodes) {
1333		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1334				    flags);
1335	} else {
 
 
 
 
 
 
 
1336		err = do_pages_stat(mm, nr_pages, pages, status);
1337	}
 
 
1338
1339out:
1340	mmput(mm);
1341	return err;
1342}
1343
1344/*
1345 * Call migration functions in the vma_ops that may prepare
1346 * memory in a vm for migration. migration functions may perform
1347 * the migration for vmas that do not have an underlying page struct.
1348 */
1349int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1350	const nodemask_t *from, unsigned long flags)
1351{
1352 	struct vm_area_struct *vma;
1353 	int err = 0;
1354
1355	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1356 		if (vma->vm_ops && vma->vm_ops->migrate) {
1357 			err = vma->vm_ops->migrate(vma, to, from, flags);
1358 			if (err)
1359 				break;
1360 		}
1361 	}
1362 	return err;
1363}
1364#endif