compaction.c - mm/compaction.c - Linux diff v4.6 - Bootlin Elixir Cross Referencer

   1/*
   2 * linux/mm/compaction.c
   3 *
   4 * Memory compaction for the reduction of external fragmentation. Note that
   5 * this heavily depends upon page migration to do all the real heavy
   6 * lifting
   7 *
   8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
   9 */
  10#include <linux/cpu.h>
  11#include <linux/swap.h>
  12#include <linux/migrate.h>
  13#include <linux/compaction.h>
  14#include <linux/mm_inline.h>
  15#include <linux/backing-dev.h>
  16#include <linux/sysctl.h>
  17#include <linux/sysfs.h>
  18#include <linux/balloon_compaction.h>
  19#include <linux/page-isolation.h>
  20#include <linux/kasan.h>
  21#include <linux/kthread.h>
  22#include <linux/freezer.h>
 
  23#include "internal.h"
  24
  25#ifdef CONFIG_COMPACTION
  26static inline void count_compact_event(enum vm_event_item item)
  27{
  28	count_vm_event(item);
  29}
  30
  31static inline void count_compact_events(enum vm_event_item item, long delta)
  32{
  33	count_vm_events(item, delta);
  34}
  35#else
  36#define count_compact_event(item) do { } while (0)
  37#define count_compact_events(item, delta) do { } while (0)
  38#endif
  39
  40#if defined CONFIG_COMPACTION || defined CONFIG_CMA
  41
  42#define CREATE_TRACE_POINTS
  43#include <trace/events/compaction.h>
  44
 
 
 
 
 
  45static unsigned long release_freepages(struct list_head *freelist)
  46{
  47	struct page *page, *next;
  48	unsigned long high_pfn = 0;
  49
  50	list_for_each_entry_safe(page, next, freelist, lru) {
  51		unsigned long pfn = page_to_pfn(page);
  52		list_del(&page->lru);
  53		__free_page(page);
  54		if (pfn > high_pfn)
  55			high_pfn = pfn;
  56	}
  57
  58	return high_pfn;
  59}
  60
  61static void map_pages(struct list_head *list)
  62{
  63	struct page *page;
 
 
 
 
 
 
 
 
  64
  65	list_for_each_entry(page, list, lru) {
  66		arch_alloc_page(page, 0);
  67		kernel_map_pages(page, 1, 1);
  68		kasan_alloc_pages(page, 0);
 
 
 
 
  69	}
 
 
  70}
  71
  72static inline bool migrate_async_suitable(int migratetype)
  73{
  74	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
  75}
  76
  77#ifdef CONFIG_COMPACTION
  78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  79/* Do not skip compaction more than 64 times */
  80#define COMPACT_MAX_DEFER_SHIFT 6
  81
  82/*
  83 * Compaction is deferred when compaction fails to result in a page
  84 * allocation success. 1 << compact_defer_limit compactions are skipped up
  85 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
  86 */
  87void defer_compaction(struct zone *zone, int order)
  88{
  89	zone->compact_considered = 0;
  90	zone->compact_defer_shift++;
  91
  92	if (order < zone->compact_order_failed)
  93		zone->compact_order_failed = order;
  94
  95	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
  96		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
  97
  98	trace_mm_compaction_defer_compaction(zone, order);
  99}
 100
 101/* Returns true if compaction should be skipped this time */
 102bool compaction_deferred(struct zone *zone, int order)
 103{
 104	unsigned long defer_limit = 1UL << zone->compact_defer_shift;
 105
 106	if (order < zone->compact_order_failed)
 107		return false;
 108
 109	/* Avoid possible overflow */
 110	if (++zone->compact_considered > defer_limit)
 111		zone->compact_considered = defer_limit;
 112
 113	if (zone->compact_considered >= defer_limit)
 114		return false;
 115
 116	trace_mm_compaction_deferred(zone, order);
 117
 118	return true;
 119}
 120
 121/*
 122 * Update defer tracking counters after successful compaction of given order,
 123 * which means an allocation either succeeded (alloc_success == true) or is
 124 * expected to succeed.
 125 */
 126void compaction_defer_reset(struct zone *zone, int order,
 127		bool alloc_success)
 128{
 129	if (alloc_success) {
 130		zone->compact_considered = 0;
 131		zone->compact_defer_shift = 0;
 132	}
 133	if (order >= zone->compact_order_failed)
 134		zone->compact_order_failed = order + 1;
 135
 136	trace_mm_compaction_defer_reset(zone, order);
 137}
 138
 139/* Returns true if restarting compaction after many failures */
 140bool compaction_restarting(struct zone *zone, int order)
 141{
 142	if (order < zone->compact_order_failed)
 143		return false;
 144
 145	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
 146		zone->compact_considered >= 1UL << zone->compact_defer_shift;
 147}
 148
 149/* Returns true if the pageblock should be scanned for pages to isolate. */
 150static inline bool isolation_suitable(struct compact_control *cc,
 151					struct page *page)
 152{
 153	if (cc->ignore_skip_hint)
 154		return true;
 155
 156	return !get_pageblock_skip(page);
 157}
 158
 159static void reset_cached_positions(struct zone *zone)
 160{
 161	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
 162	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 163	zone->compact_cached_free_pfn =
 164			round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
 165}
 166
 167/*
 168 * This function is called to clear all cached information on pageblocks that
 169 * should be skipped for page isolation when the migrate and free page scanner
 170 * meet.
 171 */
 172static void __reset_isolation_suitable(struct zone *zone)
 173{
 174	unsigned long start_pfn = zone->zone_start_pfn;
 175	unsigned long end_pfn = zone_end_pfn(zone);
 176	unsigned long pfn;
 177
 178	zone->compact_blockskip_flush = false;
 179
 180	/* Walk the zone and mark every pageblock as suitable for isolation */
 181	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
 182		struct page *page;
 183
 184		cond_resched();
 185
 186		if (!pfn_valid(pfn))
 187			continue;
 188
 189		page = pfn_to_page(pfn);
 190		if (zone != page_zone(page))
 191			continue;
 192
 193		clear_pageblock_skip(page);
 194	}
 195
 196	reset_cached_positions(zone);
 197}
 198
 199void reset_isolation_suitable(pg_data_t *pgdat)
 200{
 201	int zoneid;
 202
 203	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 204		struct zone *zone = &pgdat->node_zones[zoneid];
 205		if (!populated_zone(zone))
 206			continue;
 207
 208		/* Only flush if a full compaction finished recently */
 209		if (zone->compact_blockskip_flush)
 210			__reset_isolation_suitable(zone);
 211	}
 212}
 213
 214/*
 215 * If no pages were isolated then mark this pageblock to be skipped in the
 216 * future. The information is later cleared by __reset_isolation_suitable().
 217 */
 218static void update_pageblock_skip(struct compact_control *cc,
 219			struct page *page, unsigned long nr_isolated,
 220			bool migrate_scanner)
 221{
 222	struct zone *zone = cc->zone;
 223	unsigned long pfn;
 224
 225	if (cc->ignore_skip_hint)
 226		return;
 227
 228	if (!page)
 229		return;
 230
 231	if (nr_isolated)
 232		return;
 233
 234	set_pageblock_skip(page);
 235
 236	pfn = page_to_pfn(page);
 237
 238	/* Update where async and sync compaction should restart */
 239	if (migrate_scanner) {
 240		if (pfn > zone->compact_cached_migrate_pfn[0])
 241			zone->compact_cached_migrate_pfn[0] = pfn;
 242		if (cc->mode != MIGRATE_ASYNC &&
 243		    pfn > zone->compact_cached_migrate_pfn[1])
 244			zone->compact_cached_migrate_pfn[1] = pfn;
 245	} else {
 246		if (pfn < zone->compact_cached_free_pfn)
 247			zone->compact_cached_free_pfn = pfn;
 248	}
 249}
 250#else
 251static inline bool isolation_suitable(struct compact_control *cc,
 252					struct page *page)
 253{
 254	return true;
 255}
 256
 257static void update_pageblock_skip(struct compact_control *cc,
 258			struct page *page, unsigned long nr_isolated,
 259			bool migrate_scanner)
 260{
 261}
 262#endif /* CONFIG_COMPACTION */
 263
 264/*
 265 * Compaction requires the taking of some coarse locks that are potentially
 266 * very heavily contended. For async compaction, back out if the lock cannot
 267 * be taken immediately. For sync compaction, spin on the lock if needed.
 268 *
 269 * Returns true if the lock is held
 270 * Returns false if the lock is not held and compaction should abort
 271 */
 272static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
 273						struct compact_control *cc)
 274{
 275	if (cc->mode == MIGRATE_ASYNC) {
 276		if (!spin_trylock_irqsave(lock, *flags)) {
 277			cc->contended = COMPACT_CONTENDED_LOCK;
 278			return false;
 279		}
 280	} else {
 281		spin_lock_irqsave(lock, *flags);
 282	}
 283
 284	return true;
 285}
 286
 287/*
 288 * Compaction requires the taking of some coarse locks that are potentially
 289 * very heavily contended. The lock should be periodically unlocked to avoid
 290 * having disabled IRQs for a long time, even when there is nobody waiting on
 291 * the lock. It might also be that allowing the IRQs will result in
 292 * need_resched() becoming true. If scheduling is needed, async compaction
 293 * aborts. Sync compaction schedules.
 294 * Either compaction type will also abort if a fatal signal is pending.
 295 * In either case if the lock was locked, it is dropped and not regained.
 296 *
 297 * Returns true if compaction should abort due to fatal signal pending, or
 298 *		async compaction due to need_resched()
 299 * Returns false when compaction can continue (sync compaction might have
 300 *		scheduled)
 301 */
 302static bool compact_unlock_should_abort(spinlock_t *lock,
 303		unsigned long flags, bool *locked, struct compact_control *cc)
 304{
 305	if (*locked) {
 306		spin_unlock_irqrestore(lock, flags);
 307		*locked = false;
 308	}
 309
 310	if (fatal_signal_pending(current)) {
 311		cc->contended = COMPACT_CONTENDED_SCHED;
 312		return true;
 313	}
 314
 315	if (need_resched()) {
 316		if (cc->mode == MIGRATE_ASYNC) {
 317			cc->contended = COMPACT_CONTENDED_SCHED;
 318			return true;
 319		}
 320		cond_resched();
 321	}
 322
 323	return false;
 324}
 325
 326/*
 327 * Aside from avoiding lock contention, compaction also periodically checks
 328 * need_resched() and either schedules in sync compaction or aborts async
 329 * compaction. This is similar to what compact_unlock_should_abort() does, but
 330 * is used where no lock is concerned.
 331 *
 332 * Returns false when no scheduling was needed, or sync compaction scheduled.
 333 * Returns true when async compaction should abort.
 334 */
 335static inline bool compact_should_abort(struct compact_control *cc)
 336{
 337	/* async compaction aborts if contended */
 338	if (need_resched()) {
 339		if (cc->mode == MIGRATE_ASYNC) {
 340			cc->contended = COMPACT_CONTENDED_SCHED;
 341			return true;
 342		}
 343
 344		cond_resched();
 345	}
 346
 347	return false;
 348}
 349
 350/*
 351 * Isolate free pages onto a private freelist. If @strict is true, will abort
 352 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 353 * (even though it may still end up isolating some pages).
 354 */
 355static unsigned long isolate_freepages_block(struct compact_control *cc,
 356				unsigned long *start_pfn,
 357				unsigned long end_pfn,
 358				struct list_head *freelist,
 359				bool strict)
 360{
 361	int nr_scanned = 0, total_isolated = 0;
 362	struct page *cursor, *valid_page = NULL;
 363	unsigned long flags = 0;
 364	bool locked = false;
 365	unsigned long blockpfn = *start_pfn;
 
 366
 367	cursor = pfn_to_page(blockpfn);
 368
 369	/* Isolate free pages. */
 370	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 371		int isolated, i;
 372		struct page *page = cursor;
 373
 374		/*
 375		 * Periodically drop the lock (if held) regardless of its
 376		 * contention, to give chance to IRQs. Abort if fatal signal
 377		 * pending or async compaction detects need_resched()
 378		 */
 379		if (!(blockpfn % SWAP_CLUSTER_MAX)
 380		    && compact_unlock_should_abort(&cc->zone->lock, flags,
 381								&locked, cc))
 382			break;
 383
 384		nr_scanned++;
 385		if (!pfn_valid_within(blockpfn))
 386			goto isolate_fail;
 387
 388		if (!valid_page)
 389			valid_page = page;
 390
 391		/*
 392		 * For compound pages such as THP and hugetlbfs, we can save
 393		 * potentially a lot of iterations if we skip them at once.
 394		 * The check is racy, but we can consider only valid values
 395		 * and the only danger is skipping too much.
 396		 */
 397		if (PageCompound(page)) {
 398			unsigned int comp_order = compound_order(page);
 399
 400			if (likely(comp_order < MAX_ORDER)) {
 401				blockpfn += (1UL << comp_order) - 1;
 402				cursor += (1UL << comp_order) - 1;
 403			}
 404
 405			goto isolate_fail;
 406		}
 407
 408		if (!PageBuddy(page))
 409			goto isolate_fail;
 410
 411		/*
 412		 * If we already hold the lock, we can skip some rechecking.
 413		 * Note that if we hold the lock now, checked_pageblock was
 414		 * already set in some previous iteration (or strict is true),
 415		 * so it is correct to skip the suitable migration target
 416		 * recheck as well.
 417		 */
 418		if (!locked) {
 419			/*
 420			 * The zone lock must be held to isolate freepages.
 421			 * Unfortunately this is a very coarse lock and can be
 422			 * heavily contended if there are parallel allocations
 423			 * or parallel compactions. For async compaction do not
 424			 * spin on the lock and we acquire the lock as late as
 425			 * possible.
 426			 */
 427			locked = compact_trylock_irqsave(&cc->zone->lock,
 428								&flags, cc);
 429			if (!locked)
 430				break;
 431
 432			/* Recheck this is a buddy page under lock */
 433			if (!PageBuddy(page))
 434				goto isolate_fail;
 435		}
 436
 437		/* Found a free page, break it into order-0 pages */
 438		isolated = split_free_page(page);
 439		total_isolated += isolated;
 440		for (i = 0; i < isolated; i++) {
 441			list_add(&page->lru, freelist);
 442			page++;
 443		}
 444
 445		/* If a page was split, advance to the end of it */
 446		if (isolated) {
 447			cc->nr_freepages += isolated;
 448			if (!strict &&
 449				cc->nr_migratepages <= cc->nr_freepages) {
 450				blockpfn += isolated;
 451				break;
 452			}
 453
 454			blockpfn += isolated - 1;
 455			cursor += isolated - 1;
 456			continue;
 457		}
 
 
 
 
 458
 459isolate_fail:
 460		if (strict)
 461			break;
 462		else
 463			continue;
 464
 465	}
 466
 
 
 
 467	/*
 468	 * There is a tiny chance that we have read bogus compound_order(),
 469	 * so be careful to not go outside of the pageblock.
 470	 */
 471	if (unlikely(blockpfn > end_pfn))
 472		blockpfn = end_pfn;
 473
 474	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
 475					nr_scanned, total_isolated);
 476
 477	/* Record how far we have got within the block */
 478	*start_pfn = blockpfn;
 479
 480	/*
 481	 * If strict isolation is requested by CMA then check that all the
 482	 * pages requested were isolated. If there were any failures, 0 is
 483	 * returned and CMA will fail.
 484	 */
 485	if (strict && blockpfn < end_pfn)
 486		total_isolated = 0;
 487
 488	if (locked)
 489		spin_unlock_irqrestore(&cc->zone->lock, flags);
 490
 491	/* Update the pageblock-skip if the whole pageblock was scanned */
 492	if (blockpfn == end_pfn)
 493		update_pageblock_skip(cc, valid_page, total_isolated, false);
 494
 495	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
 496	if (total_isolated)
 497		count_compact_events(COMPACTISOLATED, total_isolated);
 498	return total_isolated;
 499}
 500
 501/**
 502 * isolate_freepages_range() - isolate free pages.
 503 * @start_pfn: The first PFN to start isolating.
 504 * @end_pfn:   The one-past-last PFN.
 505 *
 506 * Non-free pages, invalid PFNs, or zone boundaries within the
 507 * [start_pfn, end_pfn) range are considered errors, cause function to
 508 * undo its actions and return zero.
 509 *
 510 * Otherwise, function returns one-past-the-last PFN of isolated page
 511 * (which may be greater then end_pfn if end fell in a middle of
 512 * a free page).
 513 */
 514unsigned long
 515isolate_freepages_range(struct compact_control *cc,
 516			unsigned long start_pfn, unsigned long end_pfn)
 517{
 518	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
 519	LIST_HEAD(freelist);
 520
 521	pfn = start_pfn;
 522	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
 523	if (block_start_pfn < cc->zone->zone_start_pfn)
 524		block_start_pfn = cc->zone->zone_start_pfn;
 525	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 526
 527	for (; pfn < end_pfn; pfn += isolated,
 528				block_start_pfn = block_end_pfn,
 529				block_end_pfn += pageblock_nr_pages) {
 530		/* Protect pfn from changing by isolate_freepages_block */
 531		unsigned long isolate_start_pfn = pfn;
 532
 533		block_end_pfn = min(block_end_pfn, end_pfn);
 534
 535		/*
 536		 * pfn could pass the block_end_pfn if isolated freepage
 537		 * is more than pageblock order. In this case, we adjust
 538		 * scanning range to right one.
 539		 */
 540		if (pfn >= block_end_pfn) {
 541			block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
 542			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 543			block_end_pfn = min(block_end_pfn, end_pfn);
 544		}
 545
 546		if (!pageblock_pfn_to_page(block_start_pfn,
 547					block_end_pfn, cc->zone))
 548			break;
 549
 550		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
 551						block_end_pfn, &freelist, true);
 552
 553		/*
 554		 * In strict mode, isolate_freepages_block() returns 0 if
 555		 * there are any holes in the block (ie. invalid PFNs or
 556		 * non-free pages).
 557		 */
 558		if (!isolated)
 559			break;
 560
 561		/*
 562		 * If we managed to isolate pages, it is always (1 << n) *
 563		 * pageblock_nr_pages for some non-negative n.  (Max order
 564		 * page may span two pageblocks).
 565		 */
 566	}
 567
 568	/* split_free_page does not map the pages */
 569	map_pages(&freelist);
 570
 571	if (pfn < end_pfn) {
 572		/* Loop terminated early, cleanup. */
 573		release_freepages(&freelist);
 574		return 0;
 575	}
 576
 577	/* We don't use freelists for anything. */
 578	return pfn;
 579}
 580
 581/* Update the number of anon and file isolated pages in the zone */
 582static void acct_isolated(struct zone *zone, struct compact_control *cc)
 583{
 584	struct page *page;
 585	unsigned int count[2] = { 0, };
 586
 587	if (list_empty(&cc->migratepages))
 588		return;
 589
 590	list_for_each_entry(page, &cc->migratepages, lru)
 591		count[!!page_is_file_cache(page)]++;
 592
 593	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
 594	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 595}
 596
 597/* Similar to reclaim, but different enough that they don't share logic */
 598static bool too_many_isolated(struct zone *zone)
 599{
 600	unsigned long active, inactive, isolated;
 601
 602	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
 603					zone_page_state(zone, NR_INACTIVE_ANON);
 604	active = zone_page_state(zone, NR_ACTIVE_FILE) +
 605					zone_page_state(zone, NR_ACTIVE_ANON);
 606	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
 607					zone_page_state(zone, NR_ISOLATED_ANON);
 608
 609	return isolated > (inactive + active) / 2;
 610}
 611
 612/**
 613 * isolate_migratepages_block() - isolate all migrate-able pages within
 614 *				  a single pageblock
 615 * @cc:		Compaction control structure.
 616 * @low_pfn:	The first PFN to isolate
 617 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
 618 * @isolate_mode: Isolation mode to be used.
 619 *
 620 * Isolate all pages that can be migrated from the range specified by
 621 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 622 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 623 * first page that was not scanned (which may be both less, equal to or more
 624 * than end_pfn).
 625 *
 626 * The pages are isolated on cc->migratepages list (not required to be empty),
 627 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 628 * is neither read nor updated.
 629 */
 630static unsigned long
 631isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 632			unsigned long end_pfn, isolate_mode_t isolate_mode)
 633{
 634	struct zone *zone = cc->zone;
 635	unsigned long nr_scanned = 0, nr_isolated = 0;
 636	struct list_head *migratelist = &cc->migratepages;
 637	struct lruvec *lruvec;
 638	unsigned long flags = 0;
 639	bool locked = false;
 640	struct page *page = NULL, *valid_page = NULL;
 641	unsigned long start_pfn = low_pfn;
 
 
 642
 643	/*
 644	 * Ensure that there are not too many pages isolated from the LRU
 645	 * list by either parallel reclaimers or compaction. If there are,
 646	 * delay for some time until fewer pages are isolated
 647	 */
 648	while (unlikely(too_many_isolated(zone))) {
 649		/* async migration should just abort */
 650		if (cc->mode == MIGRATE_ASYNC)
 651			return 0;
 652
 653		congestion_wait(BLK_RW_ASYNC, HZ/10);
 654
 655		if (fatal_signal_pending(current))
 656			return 0;
 657	}
 658
 659	if (compact_should_abort(cc))
 660		return 0;
 661
 
 
 
 
 
 662	/* Time to isolate some pages for migration */
 663	for (; low_pfn < end_pfn; low_pfn++) {
 664		bool is_lru;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 665
 666		/*
 667		 * Periodically drop the lock (if held) regardless of its
 668		 * contention, to give chance to IRQs. Abort async compaction
 669		 * if contended.
 670		 */
 671		if (!(low_pfn % SWAP_CLUSTER_MAX)
 672		    && compact_unlock_should_abort(&zone->lru_lock, flags,
 673								&locked, cc))
 674			break;
 675
 676		if (!pfn_valid_within(low_pfn))
 677			continue;
 678		nr_scanned++;
 679
 680		page = pfn_to_page(low_pfn);
 681
 682		if (!valid_page)
 683			valid_page = page;
 684
 685		/*
 686		 * Skip if free. We read page order here without zone lock
 687		 * which is generally unsafe, but the race window is small and
 688		 * the worst thing that can happen is that we skip some
 689		 * potential isolation targets.
 690		 */
 691		if (PageBuddy(page)) {
 692			unsigned long freepage_order = page_order_unsafe(page);
 693
 694			/*
 695			 * Without lock, we cannot be sure that what we got is
 696			 * a valid page order. Consider only values in the
 697			 * valid order range to prevent low_pfn overflow.
 698			 */
 699			if (freepage_order > 0 && freepage_order < MAX_ORDER)
 700				low_pfn += (1UL << freepage_order) - 1;
 701			continue;
 702		}
 703
 704		/*
 705		 * Check may be lockless but that's ok as we recheck later.
 706		 * It's possible to migrate LRU pages and balloon pages
 707		 * Skip any other type of page
 708		 */
 709		is_lru = PageLRU(page);
 710		if (!is_lru) {
 711			if (unlikely(balloon_page_movable(page))) {
 712				if (balloon_page_isolate(page)) {
 713					/* Successfully isolated */
 714					goto isolate_success;
 715				}
 716			}
 717		}
 718
 719		/*
 720		 * Regardless of being on LRU, compound pages such as THP and
 721		 * hugetlbfs are not to be compacted. We can potentially save
 722		 * a lot of iterations if we skip them at once. The check is
 723		 * racy, but we can consider only valid values and the only
 724		 * danger is skipping too much.
 725		 */
 726		if (PageCompound(page)) {
 727			unsigned int comp_order = compound_order(page);
 728
 729			if (likely(comp_order < MAX_ORDER))
 730				low_pfn += (1UL << comp_order) - 1;
 731
 732			continue;
 733		}
 734
 735		if (!is_lru)
 736			continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 737
 738		/*
 739		 * Migration will fail if an anonymous page is pinned in memory,
 740		 * so avoid taking lru_lock and isolating it unnecessarily in an
 741		 * admittedly racy check.
 742		 */
 743		if (!page_mapping(page) &&
 744		    page_count(page) > page_mapcount(page))
 745			continue;
 
 
 
 
 
 
 
 746
 747		/* If we already hold the lock, we can skip some rechecking */
 748		if (!locked) {
 749			locked = compact_trylock_irqsave(&zone->lru_lock,
 750								&flags, cc);
 751			if (!locked)
 752				break;
 753
 754			/* Recheck PageLRU and PageCompound under lock */
 755			if (!PageLRU(page))
 756				continue;
 757
 758			/*
 759			 * Page become compound since the non-locked check,
 760			 * and it's on LRU. It can only be a THP so the order
 761			 * is safe to read and it's 0 for tail pages.
 762			 */
 763			if (unlikely(PageCompound(page))) {
 764				low_pfn += (1UL << compound_order(page)) - 1;
 765				continue;
 766			}
 767		}
 768
 769		lruvec = mem_cgroup_page_lruvec(page, zone);
 770
 771		/* Try isolate the page */
 772		if (__isolate_lru_page(page, isolate_mode) != 0)
 773			continue;
 774
 775		VM_BUG_ON_PAGE(PageCompound(page), page);
 776
 777		/* Successfully isolated */
 778		del_page_from_lru_list(page, lruvec, page_lru(page));
 
 
 779
 780isolate_success:
 781		list_add(&page->lru, migratelist);
 782		cc->nr_migratepages++;
 783		nr_isolated++;
 784
 
 
 
 
 
 
 
 
 
 785		/* Avoid isolating too much */
 786		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
 787			++low_pfn;
 788			break;
 789		}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 790	}
 791
 792	/*
 793	 * The PageBuddy() check could have potentially brought us outside
 794	 * the range to be scanned.
 795	 */
 796	if (unlikely(low_pfn > end_pfn))
 797		low_pfn = end_pfn;
 798
 799	if (locked)
 800		spin_unlock_irqrestore(&zone->lru_lock, flags);
 801
 802	/*
 803	 * Update the pageblock-skip information and cached scanner pfn,
 804	 * if the whole pageblock was scanned without isolating any page.
 805	 */
 806	if (low_pfn == end_pfn)
 807		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 808
 809	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
 810						nr_scanned, nr_isolated);
 811
 812	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
 813	if (nr_isolated)
 814		count_compact_events(COMPACTISOLATED, nr_isolated);
 815
 816	return low_pfn;
 817}
 818
 819/**
 820 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 821 * @cc:        Compaction control structure.
 822 * @start_pfn: The first PFN to start isolating.
 823 * @end_pfn:   The one-past-last PFN.
 824 *
 825 * Returns zero if isolation fails fatally due to e.g. pending signal.
 826 * Otherwise, function returns one-past-the-last PFN of isolated page
 827 * (which may be greater than end_pfn if end fell in a middle of a THP page).
 828 */
 829unsigned long
 830isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 831							unsigned long end_pfn)
 832{
 833	unsigned long pfn, block_start_pfn, block_end_pfn;
 834
 835	/* Scan block by block. First and last block may be incomplete */
 836	pfn = start_pfn;
 837	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
 838	if (block_start_pfn < cc->zone->zone_start_pfn)
 839		block_start_pfn = cc->zone->zone_start_pfn;
 840	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 841
 842	for (; pfn < end_pfn; pfn = block_end_pfn,
 843				block_start_pfn = block_end_pfn,
 844				block_end_pfn += pageblock_nr_pages) {
 845
 846		block_end_pfn = min(block_end_pfn, end_pfn);
 847
 848		if (!pageblock_pfn_to_page(block_start_pfn,
 849					block_end_pfn, cc->zone))
 850			continue;
 851
 852		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
 853							ISOLATE_UNEVICTABLE);
 854
 855		if (!pfn)
 856			break;
 857
 858		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
 859			break;
 860	}
 861	acct_isolated(cc->zone, cc);
 862
 863	return pfn;
 864}
 865
 866#endif /* CONFIG_COMPACTION || CONFIG_CMA */
 867#ifdef CONFIG_COMPACTION
 868
 869/* Returns true if the page is within a block suitable for migration to */
 870static bool suitable_migration_target(struct page *page)
 
 871{
 
 
 
 872	/* If the page is a large free page, then disallow migration */
 873	if (PageBuddy(page)) {
 874		/*
 875		 * We are checking page_order without zone->lock taken. But
 876		 * the only small danger is that we skip a potentially suitable
 877		 * pageblock, so it's not worth to check order for valid range.
 878		 */
 879		if (page_order_unsafe(page) >= pageblock_order)
 880			return false;
 881	}
 882
 883	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
 884	if (migrate_async_suitable(get_pageblock_migratetype(page)))
 885		return true;
 886
 887	/* Otherwise skip the block */
 888	return false;
 889}
 890
 891/*
 892 * Test whether the free scanner has reached the same or lower pageblock than
 893 * the migration scanner, and compaction should thus terminate.
 894 */
 895static inline bool compact_scanners_met(struct compact_control *cc)
 896{
 897	return (cc->free_pfn >> pageblock_order)
 898		<= (cc->migrate_pfn >> pageblock_order);
 899}
 900
 901/*
 902 * Based on information in the current compact_control, find blocks
 903 * suitable for isolating free pages from and then isolate them.
 904 */
 905static void isolate_freepages(struct compact_control *cc)
 906{
 907	struct zone *zone = cc->zone;
 908	struct page *page;
 909	unsigned long block_start_pfn;	/* start of current pageblock */
 910	unsigned long isolate_start_pfn; /* exact pfn we start at */
 911	unsigned long block_end_pfn;	/* end of current pageblock */
 912	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
 913	struct list_head *freelist = &cc->freepages;
 914
 915	/*
 916	 * Initialise the free scanner. The starting point is where we last
 917	 * successfully isolated from, zone-cached value, or the end of the
 918	 * zone when isolating for the first time. For looping we also need
 919	 * this pfn aligned down to the pageblock boundary, because we do
 920	 * block_start_pfn -= pageblock_nr_pages in the for loop.
 921	 * For ending point, take care when isolating in last pageblock of a
 922	 * a zone which ends in the middle of a pageblock.
 923	 * The low boundary is the end of the pageblock the migration scanner
 924	 * is using.
 925	 */
 926	isolate_start_pfn = cc->free_pfn;
 927	block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
 928	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
 929						zone_end_pfn(zone));
 930	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
 931
 932	/*
 933	 * Isolate free pages until enough are available to migrate the
 934	 * pages on cc->migratepages. We stop searching if the migrate
 935	 * and free page scanners meet or enough free pages are isolated.
 936	 */
 937	for (; block_start_pfn >= low_pfn;
 938				block_end_pfn = block_start_pfn,
 939				block_start_pfn -= pageblock_nr_pages,
 940				isolate_start_pfn = block_start_pfn) {
 941
 942		/*
 943		 * This can iterate a massively long zone without finding any
 944		 * suitable migration targets, so periodically check if we need
 945		 * to schedule, or even abort async compaction.
 946		 */
 947		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
 948						&& compact_should_abort(cc))
 949			break;
 950
 951		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
 952									zone);
 953		if (!page)
 954			continue;
 955
 956		/* Check the block is suitable for migration */
 957		if (!suitable_migration_target(page))
 958			continue;
 959
 960		/* If isolation recently failed, do not retry */
 961		if (!isolation_suitable(cc, page))
 962			continue;
 963
 964		/* Found a block suitable for isolating free pages from. */
 965		isolate_freepages_block(cc, &isolate_start_pfn,
 966					block_end_pfn, freelist, false);
 967
 968		/*
 969		 * If we isolated enough freepages, or aborted due to async
 970		 * compaction being contended, terminate the loop.
 971		 * Remember where the free scanner should restart next time,
 972		 * which is where isolate_freepages_block() left off.
 973		 * But if it scanned the whole pageblock, isolate_start_pfn
 974		 * now points at block_end_pfn, which is the start of the next
 975		 * pageblock.
 976		 * In that case we will however want to restart at the start
 977		 * of the previous pageblock.
 978		 */
 979		if ((cc->nr_freepages >= cc->nr_migratepages)
 980							|| cc->contended) {
 981			if (isolate_start_pfn >= block_end_pfn)
 
 
 
 
 982				isolate_start_pfn =
 983					block_start_pfn - pageblock_nr_pages;
 
 984			break;
 985		} else {
 986			/*
 987			 * isolate_freepages_block() should not terminate
 988			 * prematurely unless contended, or isolated enough
 989			 */
 990			VM_BUG_ON(isolate_start_pfn < block_end_pfn);
 991		}
 992	}
 993
 994	/* split_free_page does not map the pages */
 995	map_pages(freelist);
 996
 997	/*
 998	 * Record where the free scanner will restart next time. Either we
 999	 * broke from the loop and set isolate_start_pfn based on the last
1000	 * call to isolate_freepages_block(), or we met the migration scanner
1001	 * and the loop terminated due to isolate_start_pfn < low_pfn
1002	 */
1003	cc->free_pfn = isolate_start_pfn;
1004}
1005
1006/*
1007 * This is a migrate-callback that "allocates" freepages by taking pages
1008 * from the isolated freelists in the block we are migrating to.
1009 */
1010static struct page *compaction_alloc(struct page *migratepage,
1011					unsigned long data,
1012					int **result)
1013{
1014	struct compact_control *cc = (struct compact_control *)data;
1015	struct page *freepage;
1016
1017	/*
1018	 * Isolate free pages if necessary, and if we are not aborting due to
1019	 * contention.
1020	 */
1021	if (list_empty(&cc->freepages)) {
1022		if (!cc->contended)
1023			isolate_freepages(cc);
1024
1025		if (list_empty(&cc->freepages))
1026			return NULL;
1027	}
1028
1029	freepage = list_entry(cc->freepages.next, struct page, lru);
1030	list_del(&freepage->lru);
1031	cc->nr_freepages--;
1032
1033	return freepage;
1034}
1035
1036/*
1037 * This is a migrate-callback that "frees" freepages back to the isolated
1038 * freelist.  All pages on the freelist are from the same zone, so there is no
1039 * special handling needed for NUMA.
1040 */
1041static void compaction_free(struct page *page, unsigned long data)
1042{
1043	struct compact_control *cc = (struct compact_control *)data;
1044
1045	list_add(&page->lru, &cc->freepages);
1046	cc->nr_freepages++;
1047}
1048
/* Possible outcomes of isolate_migratepages() */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;
1055
1056/*
1057 * Allow userspace to control policy on scanning the unevictable LRU for
1058 * compactable pages.
1059 */
1060int sysctl_compact_unevictable_allowed __read_mostly = 1;
1061
1062/*
1063 * Isolate all pages that can be migrated from the first suitable block,
1064 * starting at the block pointed to by the migrate scanner pfn within
1065 * compact_control.
1066 */
1067static isolate_migrate_t isolate_migratepages(struct zone *zone,
1068					struct compact_control *cc)
1069{
1070	unsigned long block_start_pfn;
1071	unsigned long block_end_pfn;
1072	unsigned long low_pfn;
1073	unsigned long isolate_start_pfn;
1074	struct page *page;
1075	const isolate_mode_t isolate_mode =
1076		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1077		(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1078
1079	/*
1080	 * Start at where we last stopped, or beginning of the zone as
1081	 * initialized by compact_zone()
1082	 */
1083	low_pfn = cc->migrate_pfn;
1084	block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
1085	if (block_start_pfn < zone->zone_start_pfn)
1086		block_start_pfn = zone->zone_start_pfn;
1087
1088	/* Only scan within a pageblock boundary */
1089	block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
1090
1091	/*
1092	 * Iterate over whole pageblocks until we find the first suitable.
1093	 * Do not cross the free scanner.
1094	 */
1095	for (; block_end_pfn <= cc->free_pfn;
1096			low_pfn = block_end_pfn,
1097			block_start_pfn = block_end_pfn,
1098			block_end_pfn += pageblock_nr_pages) {
1099
1100		/*
1101		 * This can potentially iterate a massively long zone with
1102		 * many pageblocks unsuitable, so periodically check if we
1103		 * need to schedule, or even abort async compaction.
1104		 */
1105		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1106						&& compact_should_abort(cc))
1107			break;
1108
1109		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1110									zone);
1111		if (!page)
1112			continue;
1113
1114		/* If isolation recently failed, do not retry */
1115		if (!isolation_suitable(cc, page))
1116			continue;
1117
1118		/*
1119		 * For async compaction, also only scan in MOVABLE blocks.
1120		 * Async compaction is optimistic to see if the minimum amount
1121		 * of work satisfies the allocation.
1122		 */
1123		if (cc->mode == MIGRATE_ASYNC &&
1124		    !migrate_async_suitable(get_pageblock_migratetype(page)))
1125			continue;
1126
1127		/* Perform the isolation */
1128		isolate_start_pfn = low_pfn;
1129		low_pfn = isolate_migratepages_block(cc, low_pfn,
1130						block_end_pfn, isolate_mode);
1131
1132		if (!low_pfn || cc->contended) {
1133			acct_isolated(zone, cc);
1134			return ISOLATE_ABORT;
1135		}
1136
1137		/*
1138		 * Record where we could have freed pages by migration and not
1139		 * yet flushed them to buddy allocator.
1140		 * - this is the lowest page that could have been isolated and
1141		 * then freed by migration.
1142		 */
1143		if (cc->nr_migratepages && !cc->last_migrated_pfn)
1144			cc->last_migrated_pfn = isolate_start_pfn;
1145
1146		/*
1147		 * Either we isolated something and proceed with migration. Or
1148		 * we failed and compact_zone should decide if we should
1149		 * continue or not.
1150		 */
1151		break;
1152	}
1153
1154	acct_isolated(zone, cc);
1155	/* Record where migration scanner will be restarted. */
1156	cc->migrate_pfn = low_pfn;
1157
1158	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1159}
1160
/*
 * Tell whether @order is the special value -1, which is what callers pass
 * when compaction is requested via /proc/sys/vm/compact_memory.
 */
static inline bool is_via_compact_memory(int order)
{
	return order == -1;
}
1169
1170static int __compact_finished(struct zone *zone, struct compact_control *cc,
1171			    const int migratetype)
1172{
1173	unsigned int order;
1174	unsigned long watermark;
1175
1176	if (cc->contended || fatal_signal_pending(current))
1177		return COMPACT_CONTENDED;
1178
1179	/* Compaction run completes if the migrate and free scanner meet */
1180	if (compact_scanners_met(cc)) {
1181		/* Let the next compaction start anew. */
1182		reset_cached_positions(zone);
1183
1184		/*
1185		 * Mark that the PG_migrate_skip information should be cleared
1186		 * by kswapd when it goes to sleep. kcompactd does not set the
1187		 * flag itself as the decision to be clear should be directly
1188		 * based on an allocation request.
1189		 */
1190		if (cc->direct_compaction)
1191			zone->compact_blockskip_flush = true;
1192
1193		return COMPACT_COMPLETE;
 
 
 
1194	}
1195
1196	if (is_via_compact_memory(cc->order))
1197		return COMPACT_CONTINUE;
1198
1199	/* Compaction run is not finished if the watermark is not met */
1200	watermark = low_wmark_pages(zone);
1201
1202	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
1203							cc->alloc_flags))
1204		return COMPACT_CONTINUE;
1205
1206	/* Direct compactor: Is a suitable page free? */
1207	for (order = cc->order; order < MAX_ORDER; order++) {
1208		struct free_area *area = &zone->free_area[order];
1209		bool can_steal;
1210
1211		/* Job done if page is free of the right migratetype */
1212		if (!list_empty(&area->free_list[migratetype]))
1213			return COMPACT_PARTIAL;
1214
1215#ifdef CONFIG_CMA
1216		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
1217		if (migratetype == MIGRATE_MOVABLE &&
1218			!list_empty(&area->free_list[MIGRATE_CMA]))
1219			return COMPACT_PARTIAL;
1220#endif
1221		/*
1222		 * Job done if allocation would steal freepages from
1223		 * other migratetype buddy lists.
1224		 */
1225		if (find_suitable_fallback(area, order, migratetype,
1226						true, &can_steal) != -1)
1227			return COMPACT_PARTIAL;
1228	}
1229
1230	return COMPACT_NO_SUITABLE_PAGE;
1231}
1232
1233static int compact_finished(struct zone *zone, struct compact_control *cc,
1234			    const int migratetype)
 
1235{
1236	int ret;
1237
1238	ret = __compact_finished(zone, cc, migratetype);
1239	trace_mm_compaction_finished(zone, cc->order, ret);
1240	if (ret == COMPACT_NO_SUITABLE_PAGE)
1241		ret = COMPACT_CONTINUE;
1242
1243	return ret;
1244}
1245
1246/*
1247 * compaction_suitable: Is this suitable to run compaction on this zone now?
1248 * Returns
1249 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
1250 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
1251 *   COMPACT_CONTINUE - If compaction should run now
1252 */
1253static unsigned long __compaction_suitable(struct zone *zone, int order,
1254					int alloc_flags, int classzone_idx)
 
 
1255{
1256	int fragindex;
1257	unsigned long watermark;
1258
1259	if (is_via_compact_memory(order))
1260		return COMPACT_CONTINUE;
1261
1262	watermark = low_wmark_pages(zone);
1263	/*
1264	 * If watermarks for high-order allocation are already met, there
1265	 * should be no need for compaction at all.
1266	 */
1267	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1268								alloc_flags))
1269		return COMPACT_PARTIAL;
1270
1271	/*
1272	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
1273	 * This is because during migration, copies of pages need to be
1274	 * allocated and for a short time, the footprint is higher
 
 
 
 
 
 
 
 
 
1275	 */
1276	watermark += (2UL << order);
1277	if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
 
 
 
1278		return COMPACT_SKIPPED;
1279
 
 
 
 
 
 
 
 
 
 
 
 
1280	/*
1281	 * fragmentation index determines if allocation failures are due to
1282	 * low memory or external fragmentation
1283	 *
1284	 * index of -1000 would imply allocations might succeed depending on
1285	 * watermarks, but we already failed the high-order watermark check
1286	 * index towards 0 implies failure is due to lack of memory
1287	 * index towards 1000 implies failure is due to fragmentation
1288	 *
1289	 * Only compact if a failure would be due to fragmentation.
 
 
 
 
 
1290	 */
1291	fragindex = fragmentation_index(zone, order);
1292	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1293		return COMPACT_NOT_SUITABLE_ZONE;
1294
1295	return COMPACT_CONTINUE;
1296}
1297
1298unsigned long compaction_suitable(struct zone *zone, int order,
1299					int alloc_flags, int classzone_idx)
1300{
1301	unsigned long ret;
1302
1303	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
1304	trace_mm_compaction_suitable(zone, order, ret);
1305	if (ret == COMPACT_NOT_SUITABLE_ZONE)
1306		ret = COMPACT_SKIPPED;
1307
1308	return ret;
1309}
1310
1311static int compact_zone(struct zone *zone, struct compact_control *cc)
 
1312{
1313	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1314	unsigned long start_pfn = zone->zone_start_pfn;
1315	unsigned long end_pfn = zone_end_pfn(zone);
1316	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1317	const bool sync = cc->mode != MIGRATE_ASYNC;
1318
1319	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1320							cc->classzone_idx);
1321	switch (ret) {
1322	case COMPACT_PARTIAL:
1323	case COMPACT_SKIPPED:
1324		/* Compaction is likely to fail */
1325		return ret;
1326	case COMPACT_CONTINUE:
1327		/* Fall through to compaction */
1328		;
1329	}
1330
1331	/*
1332	 * Clear pageblock skip if there were failures recently and compaction
1333	 * is about to be retried after being deferred.
1334	 */
1335	if (compaction_restarting(zone, cc->order))
1336		__reset_isolation_suitable(zone);
1337
1338	/*
1339	 * Setup to move all movable pages to the end of the zone. Used cached
1340	 * information on where the scanners should start but check that it
1341	 * is initialised by ensuring the values are within zone boundaries.
 
1342	 */
1343	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
1344	cc->free_pfn = zone->compact_cached_free_pfn;
1345	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
1346		cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
1347		zone->compact_cached_free_pfn = cc->free_pfn;
1348	}
1349	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
1350		cc->migrate_pfn = start_pfn;
1351		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1352		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1353	}
 
1354	cc->last_migrated_pfn = 0;
1355
1356	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1357				cc->free_pfn, end_pfn, sync);
1358
1359	migrate_prep_local();
1360
1361	while ((ret = compact_finished(zone, cc, migratetype)) ==
1362						COMPACT_CONTINUE) {
1363		int err;
1364
1365		switch (isolate_migratepages(zone, cc)) {
1366		case ISOLATE_ABORT:
1367			ret = COMPACT_CONTENDED;
1368			putback_movable_pages(&cc->migratepages);
1369			cc->nr_migratepages = 0;
1370			goto out;
1371		case ISOLATE_NONE:
1372			/*
1373			 * We haven't isolated and migrated anything, but
1374			 * there might still be unflushed migrations from
1375			 * previous cc->order aligned block.
1376			 */
1377			goto check_drain;
1378		case ISOLATE_SUCCESS:
1379			;
1380		}
1381
1382		err = migrate_pages(&cc->migratepages, compaction_alloc,
1383				compaction_free, (unsigned long)cc, cc->mode,
1384				MR_COMPACTION);
1385
1386		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
1387							&cc->migratepages);
1388
1389		/* All pages were either migrated or will be released */
1390		cc->nr_migratepages = 0;
1391		if (err) {
1392			putback_movable_pages(&cc->migratepages);
1393			/*
1394			 * migrate_pages() may return -ENOMEM when scanners meet
1395			 * and we want compact_finished() to detect it
1396			 */
1397			if (err == -ENOMEM && !compact_scanners_met(cc)) {
1398				ret = COMPACT_CONTENDED;
1399				goto out;
1400			}
 
 
 
 
 
 
 
 
 
 
 
 
1401		}
1402
1403check_drain:
1404		/*
1405		 * Has the migration scanner moved away from the previous
1406		 * cc->order aligned block where we migrated from? If yes,
1407		 * flush the pages that were freed, so that they can merge and
1408		 * compact_finished() can detect immediately if allocation
1409		 * would succeed.
1410		 */
1411		if (cc->order > 0 && cc->last_migrated_pfn) {
1412			int cpu;
1413			unsigned long current_block_start =
1414				cc->migrate_pfn & ~((1UL << cc->order) - 1);
1415
1416			if (cc->last_migrated_pfn < current_block_start) {
1417				cpu = get_cpu();
1418				lru_add_drain_cpu(cpu);
1419				drain_local_pages(zone);
1420				put_cpu();
1421				/* No more flushing until we migrate again */
1422				cc->last_migrated_pfn = 0;
1423			}
1424		}
1425
1426	}
1427
1428out:
1429	/*
1430	 * Release free pages and update where the free scanner should restart,
1431	 * so we don't leave any returned pages behind in the next attempt.
1432	 */
1433	if (cc->nr_freepages > 0) {
1434		unsigned long free_pfn = release_freepages(&cc->freepages);
1435
1436		cc->nr_freepages = 0;
1437		VM_BUG_ON(free_pfn == 0);
1438		/* The cached pfn is always the first in a pageblock */
1439		free_pfn &= ~(pageblock_nr_pages-1);
1440		/*
1441		 * Only go back, not forward. The cached pfn might have been
1442		 * already reset to zone end in compact_finished()
1443		 */
1444		if (free_pfn > zone->compact_cached_free_pfn)
1445			zone->compact_cached_free_pfn = free_pfn;
1446	}
1447
1448	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1449				cc->free_pfn, end_pfn, sync, ret);
1450
1451	if (ret == COMPACT_CONTENDED)
1452		ret = COMPACT_PARTIAL;
1453
1454	return ret;
1455}
1456
1457static unsigned long compact_zone_order(struct zone *zone, int order,
1458		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
1459		int alloc_flags, int classzone_idx)
1460{
1461	unsigned long ret;
1462	struct compact_control cc = {
1463		.nr_freepages = 0,
1464		.nr_migratepages = 0,
1465		.order = order,
1466		.gfp_mask = gfp_mask,
1467		.zone = zone,
1468		.mode = mode,
 
1469		.alloc_flags = alloc_flags,
1470		.classzone_idx = classzone_idx,
1471		.direct_compaction = true,
 
 
 
1472	};
1473	INIT_LIST_HEAD(&cc.freepages);
1474	INIT_LIST_HEAD(&cc.migratepages);
1475
1476	ret = compact_zone(zone, &cc);
1477
1478	VM_BUG_ON(!list_empty(&cc.freepages));
1479	VM_BUG_ON(!list_empty(&cc.migratepages));
1480
1481	*contended = cc.contended;
1482	return ret;
1483}
1484
/*
 * Fragmentation-index threshold: __compaction_suitable() declines to compact
 * a zone whose index lies between 0 and this value (inclusive).
 */
int sysctl_extfrag_threshold = 500;
1486
1487/**
1488 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1489 * @gfp_mask: The GFP mask of the current allocation
1490 * @order: The order of the current allocation
1491 * @alloc_flags: The allocation flags of the current allocation
1492 * @ac: The context of current allocation
1493 * @mode: The migration mode for async, sync light, or sync migration
1494 * @contended: Return value that determines if compaction was aborted due to
1495 *	       need_resched() or lock contention
1496 *
1497 * This is the main entry point for direct page compaction.
1498 */
1499unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1500			int alloc_flags, const struct alloc_context *ac,
1501			enum migrate_mode mode, int *contended)
1502{
1503	int may_enter_fs = gfp_mask & __GFP_FS;
1504	int may_perform_io = gfp_mask & __GFP_IO;
1505	struct zoneref *z;
1506	struct zone *zone;
1507	int rc = COMPACT_DEFERRED;
1508	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1509
1510	*contended = COMPACT_CONTENDED_NONE;
1511
1512	/* Check if the GFP flags allow compaction */
1513	if (!order || !may_enter_fs || !may_perform_io)
 
 
 
1514		return COMPACT_SKIPPED;
1515
1516	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
1517
1518	/* Compact each zone in the list */
1519	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1520								ac->nodemask) {
1521		int status;
1522		int zone_contended;
1523
1524		if (compaction_deferred(zone, order))
 
 
1525			continue;
 
1526
1527		status = compact_zone_order(zone, order, gfp_mask, mode,
1528				&zone_contended, alloc_flags,
1529				ac->classzone_idx);
1530		rc = max(status, rc);
1531		/*
1532		 * It takes at least one zone that wasn't lock contended
1533		 * to clear all_zones_contended.
1534		 */
1535		all_zones_contended &= zone_contended;
1536
1537		/* If a normal allocation would succeed, stop compacting */
1538		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
1539					ac->classzone_idx, alloc_flags)) {
1540			/*
1541			 * We think the allocation will succeed in this zone,
1542			 * but it is not certain, hence the false. The caller
1543			 * will repeat this with true if allocation indeed
1544			 * succeeds in this zone.
1545			 */
1546			compaction_defer_reset(zone, order, false);
1547			/*
1548			 * It is possible that async compaction aborted due to
1549			 * need_resched() and the watermarks were ok thanks to
1550			 * somebody else freeing memory. The allocation can
1551			 * however still fail so we better signal the
1552			 * need_resched() contention anyway (this will not
1553			 * prevent the allocation attempt).
1554			 */
1555			if (zone_contended == COMPACT_CONTENDED_SCHED)
1556				*contended = COMPACT_CONTENDED_SCHED;
1557
1558			goto break_loop;
1559		}
1560
1561		if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
 
1562			/*
1563			 * We think that allocation won't succeed in this zone
1564			 * so we defer compaction there. If it ends up
1565			 * succeeding after all, it will be reset.
1566			 */
1567			defer_compaction(zone, order);
1568		}
1569
1570		/*
1571		 * We might have stopped compacting due to need_resched() in
1572		 * async compaction, or due to a fatal signal detected. In that
1573		 * case do not try further zones and signal need_resched()
1574		 * contention.
1575		 */
1576		if ((zone_contended == COMPACT_CONTENDED_SCHED)
1577					|| fatal_signal_pending(current)) {
1578			*contended = COMPACT_CONTENDED_SCHED;
1579			goto break_loop;
1580		}
1581
1582		continue;
1583break_loop:
1584		/*
1585		 * We might not have tried all the zones, so  be conservative
1586		 * and assume they are not all lock contended.
1587		 */
1588		all_zones_contended = 0;
1589		break;
1590	}
1591
1592	/*
1593	 * If at least one zone wasn't deferred or skipped, we report if all
1594	 * zones that were tried were lock contended.
1595	 */
1596	if (rc > COMPACT_SKIPPED && all_zones_contended)
1597		*contended = COMPACT_CONTENDED_LOCK;
1598
1599	return rc;
1600}
1601
1602
1603/* Compact all zones within a node */
1604static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1605{
 
1606	int zoneid;
1607	struct zone *zone;
 
 
 
 
 
 
 
 
1608
1609	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
1610
1611		zone = &pgdat->node_zones[zoneid];
1612		if (!populated_zone(zone))
1613			continue;
1614
1615		cc->nr_freepages = 0;
1616		cc->nr_migratepages = 0;
1617		cc->zone = zone;
1618		INIT_LIST_HEAD(&cc->freepages);
1619		INIT_LIST_HEAD(&cc->migratepages);
1620
1621		/*
1622		 * When called via /proc/sys/vm/compact_memory
1623		 * this makes sure we compact the whole zone regardless of
1624		 * cached scanner positions.
1625		 */
1626		if (is_via_compact_memory(cc->order))
1627			__reset_isolation_suitable(zone);
1628
1629		if (is_via_compact_memory(cc->order) ||
1630				!compaction_deferred(zone, cc->order))
1631			compact_zone(zone, cc);
1632
1633		VM_BUG_ON(!list_empty(&cc->freepages));
1634		VM_BUG_ON(!list_empty(&cc->migratepages));
1635
1636		if (is_via_compact_memory(cc->order))
1637			continue;
1638
1639		if (zone_watermark_ok(zone, cc->order,
1640				low_wmark_pages(zone), 0, 0))
1641			compaction_defer_reset(zone, cc->order, false);
1642	}
1643}
1644
1645void compact_pgdat(pg_data_t *pgdat, int order)
1646{
1647	struct compact_control cc = {
1648		.order = order,
1649		.mode = MIGRATE_ASYNC,
1650	};
1651
1652	if (!order)
1653		return;
1654
1655	__compact_pgdat(pgdat, &cc);
1656}
1657
1658static void compact_node(int nid)
1659{
1660	struct compact_control cc = {
1661		.order = -1,
1662		.mode = MIGRATE_SYNC,
1663		.ignore_skip_hint = true,
1664	};
1665
1666	__compact_pgdat(NODE_DATA(nid), &cc);
1667}
1668
1669/* Compact all nodes in the system */
1670static void compact_nodes(void)
1671{
1672	int nid;
1673
1674	/* Flush pending updates to the LRU lists */
1675	lru_add_drain_all();
1676
1677	for_each_online_node(nid)
1678		compact_node(nid);
1679}
1680
/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;
1683
1684/*
1685 * This is the entry point for compacting all nodes via
1686 * /proc/sys/vm/compact_memory
1687 */
1688int sysctl_compaction_handler(struct ctl_table *table, int write,
1689			void __user *buffer, size_t *length, loff_t *ppos)
1690{
1691	if (write)
1692		compact_nodes();
1693
1694	return 0;
1695}
1696
1697int sysctl_extfrag_handler(struct ctl_table *table, int write,
1698			void __user *buffer, size_t *length, loff_t *ppos)
1699{
1700	proc_dointvec_minmax(table, write, buffer, length, ppos);
1701
1702	return 0;
1703}
1704
1705#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
1706static ssize_t sysfs_compact_node(struct device *dev,
1707			struct device_attribute *attr,
1708			const char *buf, size_t count)
1709{
1710	int nid = dev->id;
1711
1712	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
1713		/* Flush pending updates to the LRU lists */
1714		lru_add_drain_all();
1715
1716		compact_node(nid);
1717	}
1718
1719	return count;
1720}
1721static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
1722
1723int compaction_register_node(struct node *node)
1724{
1725	return device_create_file(&node->dev, &dev_attr_compact);
1726}
1727
1728void compaction_unregister_node(struct node *node)
1729{
1730	return device_remove_file(&node->dev, &dev_attr_compact);
1731}
1732#endif /* CONFIG_SYSFS && CONFIG_NUMA */
1733
1734static inline bool kcompactd_work_requested(pg_data_t *pgdat)
1735{
1736	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
1737}
1738
1739static bool kcompactd_node_suitable(pg_data_t *pgdat)
1740{
1741	int zoneid;
1742	struct zone *zone;
1743	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
1744
1745	for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
1746		zone = &pgdat->node_zones[zoneid];
1747
1748		if (!populated_zone(zone))
1749			continue;
1750
1751		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
1752					classzone_idx) == COMPACT_CONTINUE)
1753			return true;
1754	}
1755
1756	return false;
1757}
1758
1759static void kcompactd_do_work(pg_data_t *pgdat)
1760{
1761	/*
1762	 * With no special task, compact all zones so that a page of requested
1763	 * order is allocatable.
1764	 */
1765	int zoneid;
1766	struct zone *zone;
1767	struct compact_control cc = {
1768		.order = pgdat->kcompactd_max_order,
1769		.classzone_idx = pgdat->kcompactd_classzone_idx,
1770		.mode = MIGRATE_SYNC_LIGHT,
1771		.ignore_skip_hint = true,
 
1772
1773	};
1774	bool success = false;
1775
1776	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
1777							cc.classzone_idx);
1778	count_vm_event(KCOMPACTD_WAKE);
1779
1780	for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
1781		int status;
1782
1783		zone = &pgdat->node_zones[zoneid];
1784		if (!populated_zone(zone))
1785			continue;
1786
1787		if (compaction_deferred(zone, cc.order))
1788			continue;
1789
1790		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
1791							COMPACT_CONTINUE)
1792			continue;
1793
1794		cc.nr_freepages = 0;
1795		cc.nr_migratepages = 0;
1796		cc.zone = zone;
1797		INIT_LIST_HEAD(&cc.freepages);
1798		INIT_LIST_HEAD(&cc.migratepages);
1799
1800		if (kthread_should_stop())
1801			return;
1802		status = compact_zone(zone, &cc);
1803
1804		if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
1805						cc.classzone_idx, 0)) {
1806			success = true;
1807			compaction_defer_reset(zone, cc.order, false);
1808		} else if (status == COMPACT_COMPLETE) {
1809			/*
1810			 * We use sync migration mode here, so we defer like
1811			 * sync direct compaction does.
1812			 */
1813			defer_compaction(zone, cc.order);
1814		}
1815
1816		VM_BUG_ON(!list_empty(&cc.freepages));
1817		VM_BUG_ON(!list_empty(&cc.migratepages));
1818	}
1819
1820	/*
1821	 * Regardless of success, we are done until woken up next. But remember
1822	 * the requested order/classzone_idx in case it was higher/tighter than
1823	 * our current ones
1824	 */
1825	if (pgdat->kcompactd_max_order <= cc.order)
1826		pgdat->kcompactd_max_order = 0;
1827	if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
1828		pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1829}
1830
1831void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
1832{
1833	if (!order)
1834		return;
1835
1836	if (pgdat->kcompactd_max_order < order)
1837		pgdat->kcompactd_max_order = order;
1838
1839	if (pgdat->kcompactd_classzone_idx > classzone_idx)
1840		pgdat->kcompactd_classzone_idx = classzone_idx;
1841
1842	if (!waitqueue_active(&pgdat->kcompactd_wait))
1843		return;
1844
1845	if (!kcompactd_node_suitable(pgdat))
1846		return;
1847
1848	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
1849							classzone_idx);
1850	wake_up_interruptible(&pgdat->kcompactd_wait);
1851}
1852
1853/*
1854 * The background compaction daemon, started as a kernel thread
1855 * from the init process.
1856 */
1857static int kcompactd(void *p)
1858{
1859	pg_data_t *pgdat = (pg_data_t*)p;
1860	struct task_struct *tsk = current;
1861
1862	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1863
1864	if (!cpumask_empty(cpumask))
1865		set_cpus_allowed_ptr(tsk, cpumask);
1866
1867	set_freezable();
1868
1869	pgdat->kcompactd_max_order = 0;
1870	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1871
1872	while (!kthread_should_stop()) {
1873		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
1874		wait_event_freezable(pgdat->kcompactd_wait,
1875				kcompactd_work_requested(pgdat));
1876
1877		kcompactd_do_work(pgdat);
1878	}
1879
1880	return 0;
1881}
1882
1883/*
1884 * This kcompactd start function will be called by init and node-hot-add.
1885 * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
1886 */
1887int kcompactd_run(int nid)
1888{
1889	pg_data_t *pgdat = NODE_DATA(nid);
1890	int ret = 0;
1891
1892	if (pgdat->kcompactd)
1893		return 0;
1894
1895	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
1896	if (IS_ERR(pgdat->kcompactd)) {
1897		pr_err("Failed to start kcompactd on node %d\n", nid);
1898		ret = PTR_ERR(pgdat->kcompactd);
1899		pgdat->kcompactd = NULL;
1900	}
1901	return ret;
1902}
1903
1904/*
1905 * Called by memory hotplug when all memory in a node is offlined. Caller must
1906 * hold mem_hotplug_begin/end().
1907 */
1908void kcompactd_stop(int nid)
1909{
1910	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
1911
1912	if (kcompactd) {
1913		kthread_stop(kcompactd);
1914		NODE_DATA(nid)->kcompactd = NULL;
1915	}
1916}
1917
1918/*
1919 * It's optimal to keep kcompactd on the same CPUs as their memory, but
1920 * not required for correctness. So if the last cpu in a node goes
1921 * away, we get changed to run anywhere: as the first one comes back,
1922 * restore their cpu bindings.
1923 */
1924static int cpu_callback(struct notifier_block *nfb, unsigned long action,
1925			void *hcpu)
1926{
1927	int nid;
1928
1929	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1930		for_each_node_state(nid, N_MEMORY) {
1931			pg_data_t *pgdat = NODE_DATA(nid);
1932			const struct cpumask *mask;
1933
1934			mask = cpumask_of_node(pgdat->node_id);
1935
1936			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
1937				/* One of our CPUs online: restore mask */
1938				set_cpus_allowed_ptr(pgdat->kcompactd, mask);
1939		}
1940	}
1941	return NOTIFY_OK;
1942}
1943
1944static int __init kcompactd_init(void)
1945{
1946	int nid;
 
 
 
 
 
 
 
 
 
1947
1948	for_each_node_state(nid, N_MEMORY)
1949		kcompactd_run(nid);
1950	hotcpu_notifier(cpu_callback, 0);
1951	return 0;
1952}
1953subsys_initcall(kcompactd_init)
1954
1955#endif /* CONFIG_COMPACTION */

   1/*
   2 * linux/mm/compaction.c
   3 *
   4 * Memory compaction for the reduction of external fragmentation. Note that
   5 * this heavily depends upon page migration to do all the real heavy
   6 * lifting
   7 *
   8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
   9 */
  10#include <linux/cpu.h>
  11#include <linux/swap.h>
  12#include <linux/migrate.h>
  13#include <linux/compaction.h>
  14#include <linux/mm_inline.h>
  15#include <linux/backing-dev.h>
  16#include <linux/sysctl.h>
  17#include <linux/sysfs.h>
 
  18#include <linux/page-isolation.h>
  19#include <linux/kasan.h>
  20#include <linux/kthread.h>
  21#include <linux/freezer.h>
  22#include <linux/page_owner.h>
  23#include "internal.h"
  24
  25#ifdef CONFIG_COMPACTION
  26static inline void count_compact_event(enum vm_event_item item)
  27{
  28	count_vm_event(item);
  29}
  30
  31static inline void count_compact_events(enum vm_event_item item, long delta)
  32{
  33	count_vm_events(item, delta);
  34}
  35#else
  36#define count_compact_event(item) do { } while (0)
  37#define count_compact_events(item, delta) do { } while (0)
  38#endif
  39
  40#if defined CONFIG_COMPACTION || defined CONFIG_CMA
  41
  42#define CREATE_TRACE_POINTS
  43#include <trace/events/compaction.h>
  44
  45#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
  46#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
  47#define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
  48#define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)
  49
  50static unsigned long release_freepages(struct list_head *freelist)
  51{
  52	struct page *page, *next;
  53	unsigned long high_pfn = 0;
  54
  55	list_for_each_entry_safe(page, next, freelist, lru) {
  56		unsigned long pfn = page_to_pfn(page);
  57		list_del(&page->lru);
  58		__free_page(page);
  59		if (pfn > high_pfn)
  60			high_pfn = pfn;
  61	}
  62
  63	return high_pfn;
  64}
  65
  66static void map_pages(struct list_head *list)
  67{
  68	unsigned int i, order, nr_pages;
  69	struct page *page, *next;
  70	LIST_HEAD(tmp_list);
  71
  72	list_for_each_entry_safe(page, next, list, lru) {
  73		list_del(&page->lru);
  74
  75		order = page_private(page);
  76		nr_pages = 1 << order;
  77
  78		post_alloc_hook(page, order, __GFP_MOVABLE);
  79		if (order)
  80			split_page(page, order);
  81
  82		for (i = 0; i < nr_pages; i++) {
  83			list_add(&page->lru, &tmp_list);
  84			page++;
  85		}
  86	}
  87
  88	list_splice(&tmp_list, list);
  89}
  90
  91static inline bool migrate_async_suitable(int migratetype)
  92{
  93	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
  94}
  95
  96#ifdef CONFIG_COMPACTION
  97
  98int PageMovable(struct page *page)
  99{
 100	struct address_space *mapping;
 101
 102	VM_BUG_ON_PAGE(!PageLocked(page), page);
 103	if (!__PageMovable(page))
 104		return 0;
 105
 106	mapping = page_mapping(page);
 107	if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
 108		return 1;
 109
 110	return 0;
 111}
 112EXPORT_SYMBOL(PageMovable);
 113
 114void __SetPageMovable(struct page *page, struct address_space *mapping)
 115{
 116	VM_BUG_ON_PAGE(!PageLocked(page), page);
 117	VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
 118	page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
 119}
 120EXPORT_SYMBOL(__SetPageMovable);
 121
 122void __ClearPageMovable(struct page *page)
 123{
 124	VM_BUG_ON_PAGE(!PageLocked(page), page);
 125	VM_BUG_ON_PAGE(!PageMovable(page), page);
 126	/*
 127	 * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
 128	 * flag so that VM can catch up released page by driver after isolation.
 129	 * With it, VM migration doesn't try to put it back.
 130	 */
 131	page->mapping = (void *)((unsigned long)page->mapping &
 132				PAGE_MAPPING_MOVABLE);
 133}
 134EXPORT_SYMBOL(__ClearPageMovable);
 135
 136/* Do not skip compaction more than 64 times */
 137#define COMPACT_MAX_DEFER_SHIFT 6
 138
 139/*
 140 * Compaction is deferred when compaction fails to result in a page
 141 * allocation success. 1 << compact_defer_limit compactions are skipped up
 142 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 143 */
 144void defer_compaction(struct zone *zone, int order)
 145{
 146	zone->compact_considered = 0;
 147	zone->compact_defer_shift++;
 148
 149	if (order < zone->compact_order_failed)
 150		zone->compact_order_failed = order;
 151
 152	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
 153		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
 154
 155	trace_mm_compaction_defer_compaction(zone, order);
 156}
 157
 158/* Returns true if compaction should be skipped this time */
 159bool compaction_deferred(struct zone *zone, int order)
 160{
 161	unsigned long defer_limit = 1UL << zone->compact_defer_shift;
 162
 163	if (order < zone->compact_order_failed)
 164		return false;
 165
 166	/* Avoid possible overflow */
 167	if (++zone->compact_considered > defer_limit)
 168		zone->compact_considered = defer_limit;
 169
 170	if (zone->compact_considered >= defer_limit)
 171		return false;
 172
 173	trace_mm_compaction_deferred(zone, order);
 174
 175	return true;
 176}
 177
 178/*
 179 * Update defer tracking counters after successful compaction of given order,
 180 * which means an allocation either succeeded (alloc_success == true) or is
 181 * expected to succeed.
 182 */
 183void compaction_defer_reset(struct zone *zone, int order,
 184		bool alloc_success)
 185{
 186	if (alloc_success) {
 187		zone->compact_considered = 0;
 188		zone->compact_defer_shift = 0;
 189	}
 190	if (order >= zone->compact_order_failed)
 191		zone->compact_order_failed = order + 1;
 192
 193	trace_mm_compaction_defer_reset(zone, order);
 194}
 195
 196/* Returns true if restarting compaction after many failures */
 197bool compaction_restarting(struct zone *zone, int order)
 198{
 199	if (order < zone->compact_order_failed)
 200		return false;
 201
 202	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
 203		zone->compact_considered >= 1UL << zone->compact_defer_shift;
 204}
 205
 206/* Returns true if the pageblock should be scanned for pages to isolate. */
 207static inline bool isolation_suitable(struct compact_control *cc,
 208					struct page *page)
 209{
 210	if (cc->ignore_skip_hint)
 211		return true;
 212
 213	return !get_pageblock_skip(page);
 214}
 215
 216static void reset_cached_positions(struct zone *zone)
 217{
 218	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
 219	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 220	zone->compact_cached_free_pfn =
 221				pageblock_start_pfn(zone_end_pfn(zone) - 1);
 222}
 223
 224/*
 225 * This function is called to clear all cached information on pageblocks that
 226 * should be skipped for page isolation when the migrate and free page scanner
 227 * meet.
 228 */
 229static void __reset_isolation_suitable(struct zone *zone)
 230{
 231	unsigned long start_pfn = zone->zone_start_pfn;
 232	unsigned long end_pfn = zone_end_pfn(zone);
 233	unsigned long pfn;
 234
 235	zone->compact_blockskip_flush = false;
 236
 237	/* Walk the zone and mark every pageblock as suitable for isolation */
 238	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
 239		struct page *page;
 240
 241		cond_resched();
 242
 243		if (!pfn_valid(pfn))
 244			continue;
 245
 246		page = pfn_to_page(pfn);
 247		if (zone != page_zone(page))
 248			continue;
 249
 250		clear_pageblock_skip(page);
 251	}
 252
 253	reset_cached_positions(zone);
 254}
 255
 256void reset_isolation_suitable(pg_data_t *pgdat)
 257{
 258	int zoneid;
 259
 260	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 261		struct zone *zone = &pgdat->node_zones[zoneid];
 262		if (!populated_zone(zone))
 263			continue;
 264
 265		/* Only flush if a full compaction finished recently */
 266		if (zone->compact_blockskip_flush)
 267			__reset_isolation_suitable(zone);
 268	}
 269}
 270
 271/*
 272 * If no pages were isolated then mark this pageblock to be skipped in the
 273 * future. The information is later cleared by __reset_isolation_suitable().
 274 */
 275static void update_pageblock_skip(struct compact_control *cc,
 276			struct page *page, unsigned long nr_isolated,
 277			bool migrate_scanner)
 278{
 279	struct zone *zone = cc->zone;
 280	unsigned long pfn;
 281
 282	if (cc->ignore_skip_hint)
 283		return;
 284
 285	if (!page)
 286		return;
 287
 288	if (nr_isolated)
 289		return;
 290
 291	set_pageblock_skip(page);
 292
 293	pfn = page_to_pfn(page);
 294
 295	/* Update where async and sync compaction should restart */
 296	if (migrate_scanner) {
 297		if (pfn > zone->compact_cached_migrate_pfn[0])
 298			zone->compact_cached_migrate_pfn[0] = pfn;
 299		if (cc->mode != MIGRATE_ASYNC &&
 300		    pfn > zone->compact_cached_migrate_pfn[1])
 301			zone->compact_cached_migrate_pfn[1] = pfn;
 302	} else {
 303		if (pfn < zone->compact_cached_free_pfn)
 304			zone->compact_cached_free_pfn = pfn;
 305	}
 306}
 307#else
 308static inline bool isolation_suitable(struct compact_control *cc,
 309					struct page *page)
 310{
 311	return true;
 312}
 313
 314static void update_pageblock_skip(struct compact_control *cc,
 315			struct page *page, unsigned long nr_isolated,
 316			bool migrate_scanner)
 317{
 318}
 319#endif /* CONFIG_COMPACTION */
 320
 321/*
 322 * Compaction requires the taking of some coarse locks that are potentially
 323 * very heavily contended. For async compaction, back out if the lock cannot
 324 * be taken immediately. For sync compaction, spin on the lock if needed.
 325 *
 326 * Returns true if the lock is held
 327 * Returns false if the lock is not held and compaction should abort
 328 */
 329static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
 330						struct compact_control *cc)
 331{
 332	if (cc->mode == MIGRATE_ASYNC) {
 333		if (!spin_trylock_irqsave(lock, *flags)) {
 334			cc->contended = true;
 335			return false;
 336		}
 337	} else {
 338		spin_lock_irqsave(lock, *flags);
 339	}
 340
 341	return true;
 342}
 343
 344/*
 345 * Compaction requires the taking of some coarse locks that are potentially
 346 * very heavily contended. The lock should be periodically unlocked to avoid
 347 * having disabled IRQs for a long time, even when there is nobody waiting on
 348 * the lock. It might also be that allowing the IRQs will result in
 349 * need_resched() becoming true. If scheduling is needed, async compaction
 350 * aborts. Sync compaction schedules.
 351 * Either compaction type will also abort if a fatal signal is pending.
 352 * In either case if the lock was locked, it is dropped and not regained.
 353 *
 354 * Returns true if compaction should abort due to fatal signal pending, or
 355 *		async compaction due to need_resched()
 356 * Returns false when compaction can continue (sync compaction might have
 357 *		scheduled)
 358 */
 359static bool compact_unlock_should_abort(spinlock_t *lock,
 360		unsigned long flags, bool *locked, struct compact_control *cc)
 361{
 362	if (*locked) {
 363		spin_unlock_irqrestore(lock, flags);
 364		*locked = false;
 365	}
 366
 367	if (fatal_signal_pending(current)) {
 368		cc->contended = true;
 369		return true;
 370	}
 371
 372	if (need_resched()) {
 373		if (cc->mode == MIGRATE_ASYNC) {
 374			cc->contended = true;
 375			return true;
 376		}
 377		cond_resched();
 378	}
 379
 380	return false;
 381}
 382
 383/*
 384 * Aside from avoiding lock contention, compaction also periodically checks
 385 * need_resched() and either schedules in sync compaction or aborts async
 386 * compaction. This is similar to what compact_unlock_should_abort() does, but
 387 * is used where no lock is concerned.
 388 *
 389 * Returns false when no scheduling was needed, or sync compaction scheduled.
 390 * Returns true when async compaction should abort.
 391 */
 392static inline bool compact_should_abort(struct compact_control *cc)
 393{
 394	/* async compaction aborts if contended */
 395	if (need_resched()) {
 396		if (cc->mode == MIGRATE_ASYNC) {
 397			cc->contended = true;
 398			return true;
 399		}
 400
 401		cond_resched();
 402	}
 403
 404	return false;
 405}
 406
 407/*
 408 * Isolate free pages onto a private freelist. If @strict is true, will abort
 409 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 410 * (even though it may still end up isolating some pages).
 *
 * @cc:        compaction control; cc->zone->lock is taken while isolating and
 *             cc->nr_freepages grows by the number of pages isolated.
 * @start_pfn: in: first pfn to scan; out: pfn at which scanning stopped
 *             (never beyond @end_pfn).
 * @end_pfn:   one past the last pfn to scan.
 * @freelist:  receives the head page of each isolated free block, with the
 *             block's order stored in page_private().
 * @strict:    stop at the first pfn that cannot be isolated.
 *
 * Returns the number of pages isolated; in strict mode 0 is returned when
 * the scan stopped before @end_pfn.
 411 */
 412static unsigned long isolate_freepages_block(struct compact_control *cc,
 413				unsigned long *start_pfn,
 414				unsigned long end_pfn,
 415				struct list_head *freelist,
 416				bool strict)
 417{
 418	int nr_scanned = 0, total_isolated = 0;
 419	struct page *cursor, *valid_page = NULL;
 420	unsigned long flags = 0;
 421	bool locked = false;
 422	unsigned long blockpfn = *start_pfn;
 423	unsigned int order;
 424
 425	cursor = pfn_to_page(blockpfn);
 426
 427	/* Isolate free pages. */
 428	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 429		int isolated;
 430		struct page *page = cursor;
 431
 432		/*
 433		 * Periodically drop the lock (if held) regardless of its
 434		 * contention, to give chance to IRQs. Abort if fatal signal
 435		 * pending or async compaction detects need_resched()
 436		 */
 437		if (!(blockpfn % SWAP_CLUSTER_MAX)
 438		    && compact_unlock_should_abort(&cc->zone->lock, flags,
 439								&locked, cc))
 440			break;
 441
 442		nr_scanned++;
 443		if (!pfn_valid_within(blockpfn))
 444			goto isolate_fail;
 445
		/* Remember the first valid page for update_pageblock_skip() */
 446		if (!valid_page)
 447			valid_page = page;
 448
 449		/*
 450		 * For compound pages such as THP and hugetlbfs, we can save
 451		 * potentially a lot of iterations if we skip them at once.
 452		 * The check is racy, but we can consider only valid values
 453		 * and the only danger is skipping too much.
 454		 */
 455		if (PageCompound(page)) {
 456			unsigned int comp_order = compound_order(page);
 457
 458			if (likely(comp_order < MAX_ORDER)) {
 459				blockpfn += (1UL << comp_order) - 1;
 460				cursor += (1UL << comp_order) - 1;
 461			}
 462
 463			goto isolate_fail;
 464		}
 465
 466		if (!PageBuddy(page))
 467			goto isolate_fail;
 468
 469		/*
		 * If we already hold the lock, we can skip some rechecking:
		 * PageBuddy() is only re-tested right after the lock has
		 * been newly acquired.
 475		 */
 476		if (!locked) {
 477			/*
 478			 * The zone lock must be held to isolate freepages.
 479			 * Unfortunately this is a very coarse lock and can be
 480			 * heavily contended if there are parallel allocations
 481			 * or parallel compactions. For async compaction do not
 482			 * spin on the lock and we acquire the lock as late as
 483			 * possible.
 484			 */
 485			locked = compact_trylock_irqsave(&cc->zone->lock,
 486								&flags, cc);
 487			if (!locked)
 488				break;
 489
 490			/* Recheck this is a buddy page under lock */
 491			if (!PageBuddy(page))
 492				goto isolate_fail;
 493		}
 494
 495		/* Found a free page, will break it into order-0 pages */
 496		order = page_order(page);
 497		isolated = __isolate_free_page(page, order);
 498		if (!isolated)
 499			break;
		/* Stash the order; map_pages() reads it back via page_private() */
 500		set_page_private(page, order);

 501
 502		total_isolated += isolated;
 503		cc->nr_freepages += isolated;
 504		list_add_tail(&page->lru, freelist);

 505
		/*
		 * Non-strict callers stop once there are at least as many
		 * free pages as pages waiting to be migrated.
		 */
 506		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
 507			blockpfn += isolated;
 508			break;
 509		}
 510		/* Advance to the end of split page */
 511		blockpfn += isolated - 1;
 512		cursor += isolated - 1;
 513		continue;
 514
 515isolate_fail:
 516		if (strict)
 517			break;
 518		else
 519			continue;
 520
 521	}
 522
 523	if (locked)
 524		spin_unlock_irqrestore(&cc->zone->lock, flags);
 525
 526	/*
 527	 * There is a tiny chance that we have read bogus compound_order(),
 528	 * so be careful to not go outside of the pageblock.
 529	 */
 530	if (unlikely(blockpfn > end_pfn))
 531		blockpfn = end_pfn;
 532
 533	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
 534					nr_scanned, total_isolated);
 535
 536	/* Record how far we have got within the block */
 537	*start_pfn = blockpfn;
 538
 539	/*
 540	 * If strict isolation is requested by CMA then check that all the
 541	 * pages requested were isolated. If there were any failures, 0 is
 542	 * returned and CMA will fail.
 543	 */
 544	if (strict && blockpfn < end_pfn)
 545		total_isolated = 0;
 546

 547	/* Update the pageblock-skip if the whole pageblock was scanned */
 548	if (blockpfn == end_pfn)
 549		update_pageblock_skip(cc, valid_page, total_isolated, false);
 550
 551	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
 552	if (total_isolated)
 553		count_compact_events(COMPACTISOLATED, total_isolated);
 554	return total_isolated;
 555}
 556
 557/**
 558 * isolate_freepages_range() - isolate free pages.
 559 * @start_pfn: The first PFN to start isolating.
 560 * @end_pfn:   The one-past-last PFN.
 561 *
 562 * Non-free pages, invalid PFNs, or zone boundaries within the
 563 * [start_pfn, end_pfn) range are considered errors, cause function to
 564 * undo its actions and return zero.
 565 *
 566 * Otherwise, function returns one-past-the-last PFN of isolated page
 567 * (which may be greater then end_pfn if end fell in a middle of
 568 * a free page).
 569 */
 570unsigned long
 571isolate_freepages_range(struct compact_control *cc,
 572			unsigned long start_pfn, unsigned long end_pfn)
 573{
 574	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
 575	LIST_HEAD(freelist);
 576
 577	pfn = start_pfn;
 578	block_start_pfn = pageblock_start_pfn(pfn);
 579	if (block_start_pfn < cc->zone->zone_start_pfn)
 580		block_start_pfn = cc->zone->zone_start_pfn;
 581	block_end_pfn = pageblock_end_pfn(pfn);
 582
 583	for (; pfn < end_pfn; pfn += isolated,
 584				block_start_pfn = block_end_pfn,
 585				block_end_pfn += pageblock_nr_pages) {
 586		/* Protect pfn from changing by isolate_freepages_block */
 587		unsigned long isolate_start_pfn = pfn;
 588
 589		block_end_pfn = min(block_end_pfn, end_pfn);
 590
 591		/*
 592		 * pfn could pass the block_end_pfn if isolated freepage
 593		 * is more than pageblock order. In this case, we adjust
 594		 * scanning range to right one.
 595		 */
 596		if (pfn >= block_end_pfn) {
 597			block_start_pfn = pageblock_start_pfn(pfn);
 598			block_end_pfn = pageblock_end_pfn(pfn);
 599			block_end_pfn = min(block_end_pfn, end_pfn);
 600		}
 601
 602		if (!pageblock_pfn_to_page(block_start_pfn,
 603					block_end_pfn, cc->zone))
 604			break;
 605
 606		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
 607						block_end_pfn, &freelist, true);
 608
 609		/*
 610		 * In strict mode, isolate_freepages_block() returns 0 if
 611		 * there are any holes in the block (ie. invalid PFNs or
 612		 * non-free pages).
 613		 */
 614		if (!isolated)
 615			break;
 616
 617		/*
 618		 * If we managed to isolate pages, it is always (1 << n) *
 619		 * pageblock_nr_pages for some non-negative n.  (Max order
 620		 * page may span two pageblocks).
 621		 */
 622	}
 623
 624	/* __isolate_free_page() does not map the pages */
 625	map_pages(&freelist);
 626
 627	if (pfn < end_pfn) {
 628		/* Loop terminated early, cleanup. */
 629		release_freepages(&freelist);
 630		return 0;
 631	}
 632
 633	/* We don't use freelists for anything. */
 634	return pfn;
 635}
 636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 637/* Similar to reclaim, but different enough that they don't share logic */
 638static bool too_many_isolated(struct zone *zone)
 639{
 640	unsigned long active, inactive, isolated;
 641
 642	inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
 643			node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
 644	active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
 645			node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
 646	isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
 647			node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
 648
 649	return isolated > (inactive + active) / 2;
 650}
 651
 652/**
 653 * isolate_migratepages_block() - isolate all migrate-able pages within
 654 *				  a single pageblock
 655 * @cc:		Compaction control structure.
 656 * @low_pfn:	The first PFN to isolate
 657 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
 658 * @isolate_mode: Isolation mode to be used.
 659 *
 660 * Isolate all pages that can be migrated from the range specified by
 661 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 662 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 663 * first page that was not scanned (which may be both less, equal to or more
 664 * than end_pfn).
 *
 * Zero is also returned before any scanning when async compaction
 * (MIGRATE_ASYNC) finds too_many_isolated() true, or when
 * compact_should_abort() asks for an abort.
 665 *
 666 * The pages are isolated on cc->migratepages list (not required to be empty),
 667 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 668 * is neither read nor updated.
 669 */
 670static unsigned long
 671isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 672			unsigned long end_pfn, isolate_mode_t isolate_mode)
 673{
 674	struct zone *zone = cc->zone;
 675	unsigned long nr_scanned = 0, nr_isolated = 0;
 676	struct lruvec *lruvec;
 677	unsigned long flags = 0;
 678	bool locked = false;
 679	struct page *page = NULL, *valid_page = NULL;
 680	unsigned long start_pfn = low_pfn;
 681	bool skip_on_failure = false;
 682	unsigned long next_skip_pfn = 0;
 683
 684	/*
 685	 * Ensure that there are not too many pages isolated from the LRU
 686	 * list by either parallel reclaimers or compaction. If there are,
 687	 * delay for some time until fewer pages are isolated
 688	 */
 689	while (unlikely(too_many_isolated(zone))) {
 690		/* async migration should just abort */
 691		if (cc->mode == MIGRATE_ASYNC)
 692			return 0;
 693
 694		congestion_wait(BLK_RW_ASYNC, HZ/10);
 695
 696		if (fatal_signal_pending(current))
 697			return 0;
 698	}
 699
 700	if (compact_should_abort(cc))
 701		return 0;
 702
	/*
	 * Direct async compaction works in cc->order-aligned blocks: a
	 * failure inside a block releases what was isolated and jumps to
	 * next_skip_pfn (see isolate_fail below).
	 */
 703	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
 704		skip_on_failure = true;
 705		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
 706	}
 707
 708	/* Time to isolate some pages for migration */
 709	for (; low_pfn < end_pfn; low_pfn++) {
 710
 711		if (skip_on_failure && low_pfn >= next_skip_pfn) {
 712			/*
 713			 * We have isolated all migration candidates in the
 714			 * previous order-aligned block, and did not skip it due
 715			 * to failure. We should migrate the pages now and
 716			 * hopefully succeed compaction.
 717			 */
 718			if (nr_isolated)
 719				break;
 720
 721			/*
 722			 * We failed to isolate in the previous order-aligned
 723			 * block. Set the new boundary to the end of the
 724			 * current block. Note we can't simply increase
 725			 * next_skip_pfn by 1 << order, as low_pfn might have
 726			 * been incremented by a higher number due to skipping
 727			 * a compound or a high-order buddy page in the
 728			 * previous loop iteration.
 729			 */
 730			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
 731		}
 732
 733		/*
 734		 * Periodically drop the lock (if held) regardless of its
 735		 * contention, to give chance to IRQs. Abort async compaction
 736		 * if contended.
 737		 */
 738		if (!(low_pfn % SWAP_CLUSTER_MAX)
 739		    && compact_unlock_should_abort(zone_lru_lock(zone), flags,
 740								&locked, cc))
 741			break;
 742
 743		if (!pfn_valid_within(low_pfn))
 744			goto isolate_fail;
 745		nr_scanned++;
 746
 747		page = pfn_to_page(low_pfn);
 748
		/* Remember the first valid page for update_pageblock_skip() */
 749		if (!valid_page)
 750			valid_page = page;
 751
 752		/*
 753		 * Skip if free. We read page order here without zone lock
 754		 * which is generally unsafe, but the race window is small and
 755		 * the worst thing that can happen is that we skip some
 756		 * potential isolation targets.
 757		 */
 758		if (PageBuddy(page)) {
 759			unsigned long freepage_order = page_order_unsafe(page);
 760
 761			/*
 762			 * Without lock, we cannot be sure that what we got is
 763			 * a valid page order. Consider only values in the
 764			 * valid order range to prevent low_pfn overflow.
 765			 */
 766			if (freepage_order > 0 && freepage_order < MAX_ORDER)
 767				low_pfn += (1UL << freepage_order) - 1;
 768			continue;
 769		}
 770
 771		/*
 772		 * Regardless of being on LRU, compound pages such as THP and
 773		 * hugetlbfs are not to be compacted. We can potentially save
 774		 * a lot of iterations if we skip them at once. The check is
 775		 * racy, but we can consider only valid values and the only
 776		 * danger is skipping too much.
 777		 */
 778		if (PageCompound(page)) {
 779			unsigned int comp_order = compound_order(page);
 780
 781			if (likely(comp_order < MAX_ORDER))
 782				low_pfn += (1UL << comp_order) - 1;
 783
 784			goto isolate_fail;
 785		}
 786
 787		/*
 788		 * Check may be lockless but that's ok as we recheck later.
 789		 * It's possible to migrate LRU and non-lru movable pages.
 790		 * Skip any other type of page
 791		 */
 792		if (!PageLRU(page)) {
 793			/*
 794			 * __PageMovable can return false positive so we need
 795			 * to verify it under page_lock.
 796			 */
 797			if (unlikely(__PageMovable(page)) &&
 798					!PageIsolated(page)) {
				/* The lru lock is dropped before isolate_movable_page() */
 799				if (locked) {
 800					spin_unlock_irqrestore(zone_lru_lock(zone),
 801									flags);
 802					locked = false;
 803				}
 804
 805				if (isolate_movable_page(page, isolate_mode))
 806					goto isolate_success;
 807			}
 808
 809			goto isolate_fail;
 810		}
 811
 812		/*
 813		 * Migration will fail if an anonymous page is pinned in memory,
 814		 * so avoid taking lru_lock and isolating it unnecessarily in an
 815		 * admittedly racy check.
 816		 */
 817		if (!page_mapping(page) &&
 818		    page_count(page) > page_mapcount(page))
 819			goto isolate_fail;
 820
 821		/*
 822		 * Only allow to migrate anonymous pages in GFP_NOFS context
 823		 * because those do not depend on fs locks.
 824		 */
 825		if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
 826			goto isolate_fail;
 827
 828		/* If we already hold the lock, we can skip some rechecking */
 829		if (!locked) {
 830			locked = compact_trylock_irqsave(zone_lru_lock(zone),
 831								&flags, cc);
 832			if (!locked)
 833				break;
 834
 835			/* Recheck PageLRU and PageCompound under lock */
 836			if (!PageLRU(page))
 837				goto isolate_fail;
 838
 839			/*
 840			 * Page become compound since the non-locked check,
 841			 * and it's on LRU. It can only be a THP so the order
 842			 * is safe to read and it's 0 for tail pages.
 843			 */
 844			if (unlikely(PageCompound(page))) {
 845				low_pfn += (1UL << compound_order(page)) - 1;
 846				goto isolate_fail;
 847			}
 848		}
 849
 850		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 851
 852		/* Try isolate the page */
 853		if (__isolate_lru_page(page, isolate_mode) != 0)
 854			goto isolate_fail;
 855
 856		VM_BUG_ON_PAGE(PageCompound(page), page);
 857
 858		/* Successfully isolated */
 859		del_page_from_lru_list(page, lruvec, page_lru(page));
 860		inc_node_page_state(page,
 861				NR_ISOLATED_ANON + page_is_file_cache(page));
 862
		/*
		 * Pages isolated by isolate_movable_page() jump straight here,
		 * bypassing the LRU removal and NR_ISOLATED_* accounting above.
		 */
 863isolate_success:
 864		list_add(&page->lru, &cc->migratepages);
 865		cc->nr_migratepages++;
 866		nr_isolated++;
 867
 868		/*
 869		 * Record where we could have freed pages by migration and not
 870		 * yet flushed them to buddy allocator.
 871		 * - this is the lowest page that was isolated and likely be
 872		 * then freed by migration.
 873		 */
 874		if (!cc->last_migrated_pfn)
 875			cc->last_migrated_pfn = low_pfn;
 876
 877		/* Avoid isolating too much */
 878		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
 879			++low_pfn;
 880			break;
 881		}
 882
 883		continue;
 884isolate_fail:
 885		if (!skip_on_failure)
 886			continue;
 887
 888		/*
 889		 * We have isolated some pages, but then failed. Release them
 890		 * instead of migrating, as we cannot form the cc->order buddy
 891		 * page anyway.
 892		 */
 893		if (nr_isolated) {
 894			if (locked) {
 895				spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 896				locked = false;
 897			}
 898			putback_movable_pages(&cc->migratepages);
 899			cc->nr_migratepages = 0;
 900			cc->last_migrated_pfn = 0;
 901			nr_isolated = 0;
 902		}
 903
 904		if (low_pfn < next_skip_pfn) {
			/* -1 because the loop's low_pfn++ runs next */
 905			low_pfn = next_skip_pfn - 1;
 906			/*
 907			 * The check near the loop beginning would have updated
 908			 * next_skip_pfn too, but this is a bit simpler.
 909			 */
 910			next_skip_pfn += 1UL << cc->order;
 911		}
 912	}
 913
 914	/*
 915	 * The PageBuddy() check could have potentially brought us outside
 916	 * the range to be scanned.
 917	 */
 918	if (unlikely(low_pfn > end_pfn))
 919		low_pfn = end_pfn;
 920
 921	if (locked)
 922		spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 923
 924	/*
 925	 * Update the pageblock-skip information and cached scanner pfn,
 926	 * if the whole pageblock was scanned without isolating any page.
 927	 */
 928	if (low_pfn == end_pfn)
 929		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 930
 931	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
 932						nr_scanned, nr_isolated);
 933
 934	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
 935	if (nr_isolated)
 936		count_compact_events(COMPACTISOLATED, nr_isolated);
 937
 938	return low_pfn;
 939}
 940
 941/**
 942 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 943 * @cc:        Compaction control structure.
 944 * @start_pfn: The first PFN to start isolating.
 945 * @end_pfn:   The one-past-last PFN.
 946 *
 947 * Returns zero if isolation fails fatally due to e.g. pending signal.
 948 * Otherwise, function returns one-past-the-last PFN of isolated page
 949 * (which may be greater than end_pfn if end fell in a middle of a THP page).
 950 */
 951unsigned long
 952isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 953							unsigned long end_pfn)
 954{
 955	unsigned long pfn, block_start_pfn, block_end_pfn;
 956
 957	/* Scan block by block. First and last block may be incomplete */
 958	pfn = start_pfn;
 959	block_start_pfn = pageblock_start_pfn(pfn);
 960	if (block_start_pfn < cc->zone->zone_start_pfn)
 961		block_start_pfn = cc->zone->zone_start_pfn;
 962	block_end_pfn = pageblock_end_pfn(pfn);
 963
 964	for (; pfn < end_pfn; pfn = block_end_pfn,
 965				block_start_pfn = block_end_pfn,
 966				block_end_pfn += pageblock_nr_pages) {
 967
 968		block_end_pfn = min(block_end_pfn, end_pfn);
 969
 970		if (!pageblock_pfn_to_page(block_start_pfn,
 971					block_end_pfn, cc->zone))
 972			continue;
 973
 974		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
 975							ISOLATE_UNEVICTABLE);
 976
 977		if (!pfn)
 978			break;
 979
 980		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
 981			break;
 982	}
 
 983
 984	return pfn;
 985}
 986
 987#endif /* CONFIG_COMPACTION || CONFIG_CMA */
 988#ifdef CONFIG_COMPACTION
 989
 990/* Returns true if the page is within a block suitable for migration to */
 991static bool suitable_migration_target(struct compact_control *cc,
 992							struct page *page)
 993{
 994	if (cc->ignore_block_suitable)
 995		return true;
 996
 997	/* If the page is a large free page, then disallow migration */
 998	if (PageBuddy(page)) {
 999		/*
1000		 * We are checking page_order without zone->lock taken. But
1001		 * the only small danger is that we skip a potentially suitable
1002		 * pageblock, so it's not worth to check order for valid range.
1003		 */
1004		if (page_order_unsafe(page) >= pageblock_order)
1005			return false;
1006	}
1007
1008	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
1009	if (migrate_async_suitable(get_pageblock_migratetype(page)))
1010		return true;
1011
1012	/* Otherwise skip the block */
1013	return false;
1014}
1015
1016/*
1017 * Test whether the free scanner has reached the same or lower pageblock than
1018 * the migration scanner, and compaction should thus terminate.
1019 */
1020static inline bool compact_scanners_met(struct compact_control *cc)
1021{
1022	return (cc->free_pfn >> pageblock_order)
1023		<= (cc->migrate_pfn >> pageblock_order);
1024}
1025
1026/*
1027 * Based on information in the current compact_control, find blocks
1028 * suitable for isolating free pages from and then isolate them.
1029 */
1030static void isolate_freepages(struct compact_control *cc)
1031{
1032	struct zone *zone = cc->zone;
1033	struct page *page;
1034	unsigned long block_start_pfn;	/* start of current pageblock */
1035	unsigned long isolate_start_pfn; /* exact pfn we start at */
1036	unsigned long block_end_pfn;	/* end of current pageblock */
1037	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
1038	struct list_head *freelist = &cc->freepages;
1039
1040	/*
1041	 * Initialise the free scanner. The starting point is where we last
1042	 * successfully isolated from, zone-cached value, or the end of the
1043	 * zone when isolating for the first time. For looping we also need
1044	 * this pfn aligned down to the pageblock boundary, because we do
1045	 * block_start_pfn -= pageblock_nr_pages in the for loop.
1046	 * For ending point, take care when isolating in last pageblock of a
1047	 * zone which ends in the middle of a pageblock.
1048	 * The low boundary is the end of the pageblock the migration scanner
1049	 * is using.
1050	 */
1051	isolate_start_pfn = cc->free_pfn;
1052	block_start_pfn = pageblock_start_pfn(cc->free_pfn);
1053	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
1054						zone_end_pfn(zone));
1055	low_pfn = pageblock_end_pfn(cc->migrate_pfn);
1056
1057	/*
1058	 * Isolate free pages until enough are available to migrate the
1059	 * pages on cc->migratepages. We stop searching if the migrate
1060	 * and free page scanners meet or enough free pages are isolated.
1061	 */
1062	for (; block_start_pfn >= low_pfn;
1063				block_end_pfn = block_start_pfn,
1064				block_start_pfn -= pageblock_nr_pages,
1065				isolate_start_pfn = block_start_pfn) {
 
1066		/*
1067		 * This can iterate a massively long zone without finding any
1068		 * suitable migration targets, so periodically check if we need
1069		 * to schedule, or even abort async compaction.
1070		 */
1071		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1072						&& compact_should_abort(cc))
1073			break;
1074
1075		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1076									zone);
1077		if (!page)
1078			continue;
1079
1080		/* Check the block is suitable for migration */
1081		if (!suitable_migration_target(cc, page))
1082			continue;
1083
1084		/* If isolation recently failed, do not retry */
1085		if (!isolation_suitable(cc, page))
1086			continue;
1087
1088		/* Found a block suitable for isolating free pages from. */
1089		isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
1090					freelist, false);
1091
1092		/*
1093		 * If we isolated enough freepages, or aborted due to lock
1094		 * contention, terminate.
 
 
 
 
 
 
 
1095		 */
1096		if ((cc->nr_freepages >= cc->nr_migratepages)
1097							|| cc->contended) {
1098			if (isolate_start_pfn >= block_end_pfn) {
1099				/*
1100				 * Restart at previous pageblock if more
1101				 * freepages can be isolated next time.
1102				 */
1103				isolate_start_pfn =
1104					block_start_pfn - pageblock_nr_pages;
1105			}
1106			break;
1107		} else if (isolate_start_pfn < block_end_pfn) {
1108			/*
1109			 * If isolation failed early, do not continue
1110			 * needlessly.
1111			 */
1112			break;
1113		}
1114	}
1115
1116	/* __isolate_free_page() does not map the pages */
1117	map_pages(freelist);
1118
1119	/*
1120	 * Record where the free scanner will restart next time. Either we
1121	 * broke from the loop and set isolate_start_pfn based on the last
1122	 * call to isolate_freepages_block(), or we met the migration scanner
1123	 * and the loop terminated due to isolate_start_pfn < low_pfn
1124	 */
1125	cc->free_pfn = isolate_start_pfn;
1126}
1127
1128/*
1129 * This is a migrate-callback that "allocates" freepages by taking pages
1130 * from the isolated freelists in the block we are migrating to.
1131 */
1132static struct page *compaction_alloc(struct page *migratepage,
1133					unsigned long data,
1134					int **result)
1135{
1136	struct compact_control *cc = (struct compact_control *)data;
1137	struct page *freepage;
1138
1139	/*
1140	 * Isolate free pages if necessary, and if we are not aborting due to
1141	 * contention.
1142	 */
1143	if (list_empty(&cc->freepages)) {
1144		if (!cc->contended)
1145			isolate_freepages(cc);
1146
1147		if (list_empty(&cc->freepages))
1148			return NULL;
1149	}
1150
1151	freepage = list_entry(cc->freepages.next, struct page, lru);
1152	list_del(&freepage->lru);
1153	cc->nr_freepages--;
1154
1155	return freepage;
1156}
1157
1158/*
1159 * This is a migrate-callback that "frees" freepages back to the isolated
1160 * freelist.  All pages on the freelist are from the same zone, so there is no
1161 * special handling needed for NUMA.
1162 */
1163static void compaction_free(struct page *page, unsigned long data)
1164{
1165	struct compact_control *cc = (struct compact_control *)data;
1166
1167	list_add(&page->lru, &cc->freepages);
1168	cc->nr_freepages++;
1169}
1170
/*
 * Possible outcomes of isolate_migratepages(); compact_zone() switches on
 * this value to decide whether to bail out, skip migration, or migrate.
 */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;
1177
/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages. When non-zero (the default is 1),
 * isolate_migratepages() includes ISOLATE_UNEVICTABLE in its isolation mode.
 */
int sysctl_compact_unevictable_allowed __read_mostly = 1;
1183
/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 *
 * Returns ISOLATE_ABORT if the block isolation returned 0 or contention was
 * flagged, otherwise ISOLATE_SUCCESS when cc->nr_migratepages is non-zero
 * and ISOLATE_NONE when it is zero. Except on abort, cc->migrate_pfn is
 * updated to the position the scanner resumes from.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long block_start_pfn;
	unsigned long block_end_pfn;
	unsigned long low_pfn;
	struct page *page;
	/*
	 * Unevictable pages are included only if the sysctl permits it;
	 * anything other than full MIGRATE_SYNC isolates in async mode.
	 */
	const isolate_mode_t isolate_mode =
		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);

	/*
	 * Start at where we last stopped, or beginning of the zone as
	 * initialized by compact_zone()
	 */
	low_pfn = cc->migrate_pfn;
	block_start_pfn = pageblock_start_pfn(low_pfn);
	/* Do not let the aligned-down block start fall below the zone start */
	if (block_start_pfn < zone->zone_start_pfn)
		block_start_pfn = zone->zone_start_pfn;

	/* Only scan within a pageblock boundary */
	block_end_pfn = pageblock_end_pfn(low_pfn);

	/*
	 * Iterate over whole pageblocks until we find the first suitable.
	 * Do not cross the free scanner.
	 */
	for (; block_end_pfn <= cc->free_pfn;
			low_pfn = block_end_pfn,
			block_start_pfn = block_end_pfn,
			block_end_pfn += pageblock_nr_pages) {

		/*
		 * This can potentially iterate a massively long zone with
		 * many pageblocks unsuitable, so periodically check if we
		 * need to schedule, or even abort async compaction.
		 */
		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		/* A NULL page means this pageblock cannot be used; skip it */
		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/*
		 * For async compaction, also only scan in MOVABLE blocks.
		 * Async compaction is optimistic to see if the minimum amount
		 * of work satisfies the allocation.
		 */
		if (cc->mode == MIGRATE_ASYNC &&
		    !migrate_async_suitable(get_pageblock_migratetype(page)))
			continue;

		/* Perform the isolation */
		low_pfn = isolate_migratepages_block(cc, low_pfn,
						block_end_pfn, isolate_mode);

		/* Abort without recording a restart position */
		if (!low_pfn || cc->contended)
			return ISOLATE_ABORT;

		/*
		 * Either we isolated something and proceed with migration. Or
		 * we failed and compact_zone should decide if we should
		 * continue or not.
		 */
		break;
	}

	/* Record where migration scanner will be restarted. */
	cc->migrate_pfn = low_pfn;

	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
1268
/*
 * Compaction requested through /proc/sys/vm/compact_memory is identified by
 * the special order value -1; any other order is a regular request.
 */
static inline bool is_via_compact_memory(int order)
{
	if (order != -1)
		return false;

	return true;
}
1277
/*
 * Decide whether the current compaction run on @zone should stop.
 *
 * Returns, in order of checking:
 *   COMPACT_CONTENDED        - contention was flagged or a fatal signal is
 *                              pending
 *   COMPACT_COMPLETE /
 *   COMPACT_PARTIAL_SKIPPED  - the scanners met (whole zone / partial scan)
 *   COMPACT_CONTINUE         - compacting via compact_memory, or the
 *                              watermark for cc->order is not met yet
 *   COMPACT_SUCCESS          - a free page usable for @migratetype exists at
 *                              cc->order or above
 *   COMPACT_NO_SUITABLE_PAGE - watermark met but no such page was found
 */
static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
			    const int migratetype)
{
	unsigned int order;
	unsigned long watermark;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_CONTENDED;

	/* Compaction run completes if the migrate and free scanner meet */
	if (compact_scanners_met(cc)) {
		/* Let the next compaction start anew. */
		reset_cached_positions(zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kcompactd does not set the
		 * flag itself as the decision to clear it should be directly
		 * based on an allocation request.
		 */
		if (cc->direct_compaction)
			zone->compact_blockskip_flush = true;

		if (cc->whole_zone)
			return COMPACT_COMPLETE;
		else
			return COMPACT_PARTIAL_SKIPPED;
	}

	/* Explicit whole-memory compaction only ends when the scanners meet */
	if (is_via_compact_memory(cc->order))
		return COMPACT_CONTINUE;

	/* Compaction run is not finished if the watermark is not met */
	watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];

	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
							cc->alloc_flags))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[migratetype]))
			return COMPACT_SUCCESS;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
			!list_empty(&area->free_list[MIGRATE_CMA]))
			return COMPACT_SUCCESS;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1)
			return COMPACT_SUCCESS;
	}

	return COMPACT_NO_SUITABLE_PAGE;
}
1343
1344static enum compact_result compact_finished(struct zone *zone,
1345			struct compact_control *cc,
1346			const int migratetype)
1347{
1348	int ret;
1349
1350	ret = __compact_finished(zone, cc, migratetype);
1351	trace_mm_compaction_finished(zone, cc->order, ret);
1352	if (ret == COMPACT_NO_SUITABLE_PAGE)
1353		ret = COMPACT_CONTINUE;
1354
1355	return ret;
1356}
1357
1358/*
1359 * compaction_suitable: Is this suitable to run compaction on this zone now?
1360 * Returns
1361 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
1362 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
1363 *   COMPACT_CONTINUE - If compaction should run now
1364 */
1365static enum compact_result __compaction_suitable(struct zone *zone, int order,
1366					unsigned int alloc_flags,
1367					int classzone_idx,
1368					unsigned long wmark_target)
1369{
 
1370	unsigned long watermark;
1371
1372	if (is_via_compact_memory(order))
1373		return COMPACT_CONTINUE;
1374
1375	watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1376	/*
1377	 * If watermarks for high-order allocation are already met, there
1378	 * should be no need for compaction at all.
1379	 */
1380	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1381								alloc_flags))
1382		return COMPACT_SUCCESS;
1383
1384	/*
1385	 * Watermarks for order-0 must be met for compaction to be able to
1386	 * isolate free pages for migration targets. This means that the
1387	 * watermark and alloc_flags have to match, or be more pessimistic than
1388	 * the check in __isolate_free_page(). We don't use the direct
1389	 * compactor's alloc_flags, as they are not relevant for freepage
1390	 * isolation. We however do use the direct compactor's classzone_idx to
1391	 * skip over zones where lowmem reserves would prevent allocation even
1392	 * if compaction succeeds.
1393	 * For costly orders, we require low watermark instead of min for
1394	 * compaction to proceed to increase its chances.
1395	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
1396	 * suitable migration targets
1397	 */
1398	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
1399				low_wmark_pages(zone) : min_wmark_pages(zone);
1400	watermark += compact_gap(order);
1401	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
1402						ALLOC_CMA, wmark_target))
1403		return COMPACT_SKIPPED;
1404
1405	return COMPACT_CONTINUE;
1406}
1407
1408enum compact_result compaction_suitable(struct zone *zone, int order,
1409					unsigned int alloc_flags,
1410					int classzone_idx)
1411{
1412	enum compact_result ret;
1413	int fragindex;
1414
1415	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
1416				    zone_page_state(zone, NR_FREE_PAGES));
1417	/*
1418	 * fragmentation index determines if allocation failures are due to
1419	 * low memory or external fragmentation
1420	 *
1421	 * index of -1000 would imply allocations might succeed depending on
1422	 * watermarks, but we already failed the high-order watermark check
1423	 * index towards 0 implies failure is due to lack of memory
1424	 * index towards 1000 implies failure is due to fragmentation
1425	 *
1426	 * Only compact if a failure would be due to fragmentation. Also
1427	 * ignore fragindex for non-costly orders where the alternative to
1428	 * a successful reclaim/compaction is OOM. Fragindex and the
1429	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
1430	 * excessive compaction for costly orders, but it should not be at the
1431	 * expense of system stability.
1432	 */
1433	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
1434		fragindex = fragmentation_index(zone, order);
1435		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1436			ret = COMPACT_NOT_SUITABLE_ZONE;
1437	}
 
 
 
 
 
 
1438
 
1439	trace_mm_compaction_suitable(zone, order, ret);
1440	if (ret == COMPACT_NOT_SUITABLE_ZONE)
1441		ret = COMPACT_SKIPPED;
1442
1443	return ret;
1444}
1445
1446bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
1447		int alloc_flags)
1448{
1449	struct zone *zone;
1450	struct zoneref *z;
1451
1452	/*
1453	 * Make sure at least one zone would pass __compaction_suitable if we continue
1454	 * retrying the reclaim.
1455	 */
1456	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1457					ac->nodemask) {
1458		unsigned long available;
1459		enum compact_result compact_result;
1460
1461		/*
1462		 * Do not consider all the reclaimable memory because we do not
1463		 * want to trash just for a single high order allocation which
1464		 * is even not guaranteed to appear even if __compaction_suitable
1465		 * is happy about the watermark check.
1466		 */
1467		available = zone_reclaimable_pages(zone) / order;
1468		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
1469		compact_result = __compaction_suitable(zone, order, alloc_flags,
1470				ac_classzone_idx(ac), available);
1471		if (compact_result != COMPACT_SKIPPED)
1472			return true;
1473	}
1474
1475	return false;
1476}
1477
/*
 * Compact @zone as described by @cc.
 *
 * After checking suitability and positioning both scanners, this repeatedly
 * isolates pages with isolate_migratepages() and migrates them with
 * migrate_pages() until compact_finished() returns something other than
 * COMPACT_CONTINUE, or an abort condition is hit. Any free pages still
 * isolated at the end are released. Returns the final compact_result.
 */
static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
{
	enum compact_result ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
	const bool sync = cc->mode != MIGRATE_ASYNC;

	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
							cc->classzone_idx);
	/*
	 * Nothing to do: either the allocation would already succeed without
	 * compaction, or there are too few free pages for compaction.
	 */
	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
		return ret;

	/* huh, compaction_suitable is returning something unexpected */
	VM_BUG_ON(ret != COMPACT_CONTINUE);

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred.
	 */
	if (compaction_restarting(zone, cc->order))
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Use cached
	 * information on where the scanners should start (unless we explicitly
	 * want to compact the whole zone), but check that it is initialised
	 * by ensuring the values are within zone boundaries.
	 */
	if (cc->whole_zone) {
		cc->migrate_pfn = start_pfn;
		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
	} else {
		cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
		cc->free_pfn = zone->compact_cached_free_pfn;
		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
			zone->compact_cached_free_pfn = cc->free_pfn;
		}
		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
			cc->migrate_pfn = start_pfn;
			zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
			zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
		}

		/* Starting from the zone start amounts to a whole-zone scan */
		if (cc->migrate_pfn == start_pfn)
			cc->whole_zone = true;
	}

	cc->last_migrated_pfn = 0;

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	migrate_prep_local();

	while ((ret = compact_finished(zone, cc, migratetype)) ==
						COMPACT_CONTINUE) {
		int err;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			/* Give back whatever was isolated and stop */
			ret = COMPACT_CONTENDED;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			;
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && !compact_scanners_met(cc)) {
				ret = COMPACT_CONTENDED;
				goto out;
			}
			/*
			 * We failed to migrate at least one page in the current
			 * order-aligned block, so skip the rest of it.
			 */
			if (cc->direct_compaction &&
						(cc->mode == MIGRATE_ASYNC)) {
				cc->migrate_pfn = block_end_pfn(
						cc->migrate_pfn - 1, cc->order);
				/* Draining pcplists is useless in this case */
				cc->last_migrated_pfn = 0;

			}
		}

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && cc->last_migrated_pfn) {
			int cpu;
			unsigned long current_block_start =
				block_start_pfn(cc->migrate_pfn, cc->order);

			if (cc->last_migrated_pfn < current_block_start) {
				cpu = get_cpu();
				lru_add_drain_cpu(cpu);
				drain_local_pages(zone);
				put_cpu();
				/* No more flushing until we migrate again */
				cc->last_migrated_pfn = 0;
			}
		}

	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		/* release_freepages() returns the highest pfn it released */
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn = pageblock_start_pfn(free_pfn);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = free_pfn;
	}

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	return ret;
}
1639
1640static enum compact_result compact_zone_order(struct zone *zone, int order,
1641		gfp_t gfp_mask, enum compact_priority prio,
1642		unsigned int alloc_flags, int classzone_idx)
1643{
1644	enum compact_result ret;
1645	struct compact_control cc = {
1646		.nr_freepages = 0,
1647		.nr_migratepages = 0,
1648		.order = order,
1649		.gfp_mask = gfp_mask,
1650		.zone = zone,
1651		.mode = (prio == COMPACT_PRIO_ASYNC) ?
1652					MIGRATE_ASYNC :	MIGRATE_SYNC_LIGHT,
1653		.alloc_flags = alloc_flags,
1654		.classzone_idx = classzone_idx,
1655		.direct_compaction = true,
1656		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
1657		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
1658		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
1659	};
1660	INIT_LIST_HEAD(&cc.freepages);
1661	INIT_LIST_HEAD(&cc.migratepages);
1662
1663	ret = compact_zone(zone, &cc);
1664
1665	VM_BUG_ON(!list_empty(&cc.freepages));
1666	VM_BUG_ON(!list_empty(&cc.migratepages));
1667
 
1668	return ret;
1669}
1670
/*
 * Fragmentation index threshold used by compaction_suitable(): for costly
 * orders, a zone whose index lies in [0, threshold] is not compacted.
 */
int sysctl_extfrag_threshold = 500;
1672
/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @prio: The compaction priority; it selects the migration mode and how
 *	  thoroughly each zone is scanned (see compact_zone_order()), and at
 *	  MIN_COMPACT_PRIORITY deferred zones are compacted as well
 *
 * This is the main entry point for direct page compaction.
 *
 * Return: COMPACT_SKIPPED if @gfp_mask lacks __GFP_IO, otherwise the highest
 * compact_result value seen over the zones that were considered.
 */
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio)
{
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	enum compact_result rc = COMPACT_SKIPPED;

	/*
	 * Check if the GFP flags allow compaction - GFP_NOIO is really
	 * tricky context because the migration might require IO
	 */
	if (!may_perform_io)
		return COMPACT_SKIPPED;

	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		enum compact_result status;

		/*
		 * Skip zones where compaction is currently deferred, unless
		 * we are already at MIN_COMPACT_PRIORITY.
		 */
		if (prio > MIN_COMPACT_PRIORITY
					&& compaction_deferred(zone, order)) {
			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
			continue;
		}

		status = compact_zone_order(zone, order, gfp_mask, prio,
					alloc_flags, ac_classzone_idx(ac));
		rc = max(status, rc);

		/* The allocation should succeed, stop compacting */
		if (status == COMPACT_SUCCESS) {
			/*
			 * We think the allocation will succeed in this zone,
			 * but it is not certain, hence the false. The caller
			 * will repeat this with true if allocation indeed
			 * succeeds in this zone.
			 */
			compaction_defer_reset(zone, order, false);

			break;
		}

		if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
					status == COMPACT_PARTIAL_SKIPPED))
			/*
			 * We think that allocation won't succeed in this zone
			 * so we defer compaction there. If it ends up
			 * succeeding after all, it will be reset.
			 */
			defer_compaction(zone, order);

		/*
		 * We might have stopped compacting due to need_resched() in
		 * async compaction, or due to a fatal signal detected. In that
		 * case do not try further zones
		 */
		if ((prio == COMPACT_PRIO_ASYNC && need_resched())
					|| fatal_signal_pending(current))
			break;
	}

	return rc;
}
1750
1751
1752/* Compact all zones within a node */
1753static void compact_node(int nid)
1754{
1755	pg_data_t *pgdat = NODE_DATA(nid);
1756	int zoneid;
1757	struct zone *zone;
1758	struct compact_control cc = {
1759		.order = -1,
1760		.mode = MIGRATE_SYNC,
1761		.ignore_skip_hint = true,
1762		.whole_zone = true,
1763		.gfp_mask = GFP_KERNEL,
1764	};
1765
1766
1767	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
1768
1769		zone = &pgdat->node_zones[zoneid];
1770		if (!populated_zone(zone))
1771			continue;
1772
1773		cc.nr_freepages = 0;
1774		cc.nr_migratepages = 0;
1775		cc.zone = zone;
1776		INIT_LIST_HEAD(&cc.freepages);
1777		INIT_LIST_HEAD(&cc.migratepages);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1778
1779		compact_zone(zone, &cc);
 
1780
1781		VM_BUG_ON(!list_empty(&cc.freepages));
1782		VM_BUG_ON(!list_empty(&cc.migratepages));
 
1783	}
1784}
1785
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1786/* Compact all nodes in the system */
1787static void compact_nodes(void)
1788{
1789	int nid;
1790
1791	/* Flush pending updates to the LRU lists */
1792	lru_add_drain_all();
1793
1794	for_each_online_node(nid)
1795		compact_node(nid);
1796}
1797
/*
 * The written value is actually unused, all memory is compacted:
 * sysctl_compaction_handler() reacts to any write and never reads this
 * variable (presumably it only provides storage for the sysctl table entry).
 */
int sysctl_compact_memory;
1800
1801/*
1802 * This is the entry point for compacting all nodes via
1803 * /proc/sys/vm/compact_memory
1804 */
1805int sysctl_compaction_handler(struct ctl_table *table, int write,
1806			void __user *buffer, size_t *length, loff_t *ppos)
1807{
1808	if (write)
1809		compact_nodes();
1810
1811	return 0;
1812}
1813
1814int sysctl_extfrag_handler(struct ctl_table *table, int write,
1815			void __user *buffer, size_t *length, loff_t *ppos)
1816{
1817	proc_dointvec_minmax(table, write, buffer, length, ppos);
1818
1819	return 0;
1820}
1821
1822#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
1823static ssize_t sysfs_compact_node(struct device *dev,
1824			struct device_attribute *attr,
1825			const char *buf, size_t count)
1826{
1827	int nid = dev->id;
1828
1829	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
1830		/* Flush pending updates to the LRU lists */
1831		lru_add_drain_all();
1832
1833		compact_node(nid);
1834	}
1835
1836	return count;
1837}
1838static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
1839
1840int compaction_register_node(struct node *node)
1841{
1842	return device_create_file(&node->dev, &dev_attr_compact);
1843}
1844
1845void compaction_unregister_node(struct node *node)
1846{
1847	return device_remove_file(&node->dev, &dev_attr_compact);
1848}
1849#endif /* CONFIG_SYSFS && CONFIG_NUMA */
1850
1851static inline bool kcompactd_work_requested(pg_data_t *pgdat)
1852{
1853	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
1854}
1855
1856static bool kcompactd_node_suitable(pg_data_t *pgdat)
1857{
1858	int zoneid;
1859	struct zone *zone;
1860	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
1861
1862	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
1863		zone = &pgdat->node_zones[zoneid];
1864
1865		if (!populated_zone(zone))
1866			continue;
1867
1868		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
1869					classzone_idx) == COMPACT_CONTINUE)
1870			return true;
1871	}
1872
1873	return false;
1874}
1875
1876static void kcompactd_do_work(pg_data_t *pgdat)
1877{
1878	/*
1879	 * With no special task, compact all zones so that a page of requested
1880	 * order is allocatable.
1881	 */
1882	int zoneid;
1883	struct zone *zone;
1884	struct compact_control cc = {
1885		.order = pgdat->kcompactd_max_order,
1886		.classzone_idx = pgdat->kcompactd_classzone_idx,
1887		.mode = MIGRATE_SYNC_LIGHT,
1888		.ignore_skip_hint = true,
1889		.gfp_mask = GFP_KERNEL,
1890
1891	};
 
 
1892	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
1893							cc.classzone_idx);
1894	count_vm_event(KCOMPACTD_WAKE);
1895
1896	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
1897		int status;
1898
1899		zone = &pgdat->node_zones[zoneid];
1900		if (!populated_zone(zone))
1901			continue;
1902
1903		if (compaction_deferred(zone, cc.order))
1904			continue;
1905
1906		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
1907							COMPACT_CONTINUE)
1908			continue;
1909
1910		cc.nr_freepages = 0;
1911		cc.nr_migratepages = 0;
1912		cc.zone = zone;
1913		INIT_LIST_HEAD(&cc.freepages);
1914		INIT_LIST_HEAD(&cc.migratepages);
1915
1916		if (kthread_should_stop())
1917			return;
1918		status = compact_zone(zone, &cc);
1919
1920		if (status == COMPACT_SUCCESS) {
 
 
1921			compaction_defer_reset(zone, cc.order, false);
1922		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
1923			/*
1924			 * We use sync migration mode here, so we defer like
1925			 * sync direct compaction does.
1926			 */
1927			defer_compaction(zone, cc.order);
1928		}
1929
1930		VM_BUG_ON(!list_empty(&cc.freepages));
1931		VM_BUG_ON(!list_empty(&cc.migratepages));
1932	}
1933
1934	/*
1935	 * Regardless of success, we are done until woken up next. But remember
1936	 * the requested order/classzone_idx in case it was higher/tighter than
1937	 * our current ones
1938	 */
1939	if (pgdat->kcompactd_max_order <= cc.order)
1940		pgdat->kcompactd_max_order = 0;
1941	if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
1942		pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1943}
1944
1945void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
1946{
1947	if (!order)
1948		return;
1949
1950	if (pgdat->kcompactd_max_order < order)
1951		pgdat->kcompactd_max_order = order;
1952
1953	if (pgdat->kcompactd_classzone_idx > classzone_idx)
1954		pgdat->kcompactd_classzone_idx = classzone_idx;
1955
1956	if (!waitqueue_active(&pgdat->kcompactd_wait))
1957		return;
1958
1959	if (!kcompactd_node_suitable(pgdat))
1960		return;
1961
1962	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
1963							classzone_idx);
1964	wake_up_interruptible(&pgdat->kcompactd_wait);
1965}
1966
1967/*
1968 * The background compaction daemon, started as a kernel thread
1969 * from the init process.
1970 */
1971static int kcompactd(void *p)
1972{
1973	pg_data_t *pgdat = (pg_data_t*)p;
1974	struct task_struct *tsk = current;
1975
1976	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1977
1978	if (!cpumask_empty(cpumask))
1979		set_cpus_allowed_ptr(tsk, cpumask);
1980
1981	set_freezable();
1982
1983	pgdat->kcompactd_max_order = 0;
1984	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1985
1986	while (!kthread_should_stop()) {
1987		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
1988		wait_event_freezable(pgdat->kcompactd_wait,
1989				kcompactd_work_requested(pgdat));
1990
1991		kcompactd_do_work(pgdat);
1992	}
1993
1994	return 0;
1995}
1996
1997/*
1998 * This kcompactd start function will be called by init and node-hot-add.
1999 * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
2000 */
2001int kcompactd_run(int nid)
2002{
2003	pg_data_t *pgdat = NODE_DATA(nid);
2004	int ret = 0;
2005
2006	if (pgdat->kcompactd)
2007		return 0;
2008
2009	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
2010	if (IS_ERR(pgdat->kcompactd)) {
2011		pr_err("Failed to start kcompactd on node %d\n", nid);
2012		ret = PTR_ERR(pgdat->kcompactd);
2013		pgdat->kcompactd = NULL;
2014	}
2015	return ret;
2016}
2017
2018/*
2019 * Called by memory hotplug when all memory in a node is offlined. Caller must
2020 * hold mem_hotplug_begin/end().
2021 */
2022void kcompactd_stop(int nid)
2023{
2024	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
2025
2026	if (kcompactd) {
2027		kthread_stop(kcompactd);
2028		NODE_DATA(nid)->kcompactd = NULL;
2029	}
2030}
2031
2032/*
2033 * It's optimal to keep kcompactd on the same CPUs as their memory, but
2034 * not required for correctness. So if the last cpu in a node goes
2035 * away, we get changed to run anywhere: as the first one comes back,
2036 * restore their cpu bindings.
2037 */
2038static int kcompactd_cpu_online(unsigned int cpu)
 
2039{
2040	int nid;
2041
2042	for_each_node_state(nid, N_MEMORY) {
2043		pg_data_t *pgdat = NODE_DATA(nid);
2044		const struct cpumask *mask;
2045
2046		mask = cpumask_of_node(pgdat->node_id);
2047
2048		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2049			/* One of our CPUs online: restore mask */
2050			set_cpus_allowed_ptr(pgdat->kcompactd, mask);
 
 
2051	}
2052	return 0;
2053}
2054
2055static int __init kcompactd_init(void)
2056{
2057	int nid;
2058	int ret;
2059
2060	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
2061					"mm/compaction:online",
2062					kcompactd_cpu_online, NULL);
2063	if (ret < 0) {
2064		pr_err("kcompactd: failed to register hotplug callbacks.\n");
2065		return ret;
2066	}
2067
2068	for_each_node_state(nid, N_MEMORY)
2069		kcompactd_run(nid);
 
2070	return 0;
2071}
2072subsys_initcall(kcompactd_init)
2073
2074#endif /* CONFIG_COMPACTION */