Linux Audio

Check our new training course

Loading...
v3.1
  1/*
  2 * linux/mm/compaction.c
  3 *
  4 * Memory compaction for the reduction of external fragmentation. Note that
  5 * this heavily depends upon page migration to do all the real heavy
  6 * lifting
  7 *
  8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
  9 */
 10#include <linux/swap.h>
 11#include <linux/migrate.h>
 12#include <linux/compaction.h>
 13#include <linux/mm_inline.h>
 14#include <linux/backing-dev.h>
 15#include <linux/sysctl.h>
 16#include <linux/sysfs.h>
 17#include "internal.h"
 18
 
 
 19#define CREATE_TRACE_POINTS
 20#include <trace/events/compaction.h>
 21
 22/*
 23 * compact_control is used to track pages being migrated and the free pages
 24 * they are being migrated to during memory compaction. The free_pfn starts
 25 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 26 * are moved to the end of a zone during a compaction run and the run
 27 * completes when free_pfn <= migrate_pfn
 28 */
 29struct compact_control {
 30	struct list_head freepages;	/* List of free pages to migrate to */
 31	struct list_head migratepages;	/* List of pages being migrated */
 32	unsigned long nr_freepages;	/* Number of isolated free pages */
 33	unsigned long nr_migratepages;	/* Number of pages to migrate */
 34	unsigned long free_pfn;		/* isolate_freepages search base */
 35	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 36	bool sync;			/* Synchronous migration */
 37
 38	/* Account for isolated anon and file pages */
 39	unsigned long nr_anon;
 40	unsigned long nr_file;
 41
 42	unsigned int order;		/* order a direct compactor needs */
 43	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 44	struct zone *zone;
 45};
 46
 47static unsigned long release_freepages(struct list_head *freelist)
 48{
 49	struct page *page, *next;
 50	unsigned long count = 0;
 51
 52	list_for_each_entry_safe(page, next, freelist, lru) {
 53		list_del(&page->lru);
 54		__free_page(page);
 55		count++;
 56	}
 57
 58	return count;
 59}
 60
 61/* Isolate free pages onto a private freelist. Must hold zone->lock */
 62static unsigned long isolate_freepages_block(struct zone *zone,
 63				unsigned long blockpfn,
 64				struct list_head *freelist)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 65{
 66	unsigned long zone_end_pfn, end_pfn;
 67	int nr_scanned = 0, total_isolated = 0;
 68	struct page *cursor;
 69
 70	/* Get the last PFN we should scan for free pages at */
 71	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
 72	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
 73
 74	/* Find the first usable PFN in the block to initialse page cursor */
 75	for (; blockpfn < end_pfn; blockpfn++) {
 76		if (pfn_valid_within(blockpfn))
 77			break;
 78	}
 79	cursor = pfn_to_page(blockpfn);
 80
 81	/* Isolate free pages. This assumes the block is valid */
 82	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 83		int isolated, i;
 84		struct page *page = cursor;
 85
 86		if (!pfn_valid_within(blockpfn))
 
 
 87			continue;
 
 88		nr_scanned++;
 89
 90		if (!PageBuddy(page))
 
 
 91			continue;
 
 92
 93		/* Found a free page, break it into order-0 pages */
 94		isolated = split_free_page(page);
 
 
 95		total_isolated += isolated;
 96		for (i = 0; i < isolated; i++) {
 97			list_add(&page->lru, freelist);
 98			page++;
 99		}
100
101		/* If a page was split, advance to the end of it */
102		if (isolated) {
103			blockpfn += isolated - 1;
104			cursor += isolated - 1;
105		}
106	}
107
108	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
109	return total_isolated;
110}
111
112/* Returns true if the page is within a block suitable for migration to */
113static bool suitable_migration_target(struct page *page)
114{
115
116	int migratetype = get_pageblock_migratetype(page);
117
118	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
119	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
120		return false;
121
122	/* If the page is a large free page, then allow migration */
123	if (PageBuddy(page) && page_order(page) >= pageblock_order)
124		return true;
125
126	/* If the block is MIGRATE_MOVABLE, allow migration */
127	if (migratetype == MIGRATE_MOVABLE)
128		return true;
129
130	/* Otherwise skip the block */
131	return false;
132}
133
134/*
135 * Based on information in the current compact_control, find blocks
136 * suitable for isolating free pages from and then isolate them.
137 */
138static void isolate_freepages(struct zone *zone,
139				struct compact_control *cc)
140{
141	struct page *page;
142	unsigned long high_pfn, low_pfn, pfn;
143	unsigned long flags;
144	int nr_freepages = cc->nr_freepages;
145	struct list_head *freelist = &cc->freepages;
146
147	/*
148	 * Initialise the free scanner. The starting point is where we last
149	 * scanned from (or the end of the zone if starting). The low point
150	 * is the end of the pageblock the migration scanner is using.
151	 */
152	pfn = cc->free_pfn;
153	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
154
155	/*
156	 * Take care that if the migration scanner is at the end of the zone
157	 * that the free scanner does not accidentally move to the next zone
158	 * in the next isolation cycle.
159	 */
160	high_pfn = min(low_pfn, pfn);
161
162	/*
163	 * Isolate free pages until enough are available to migrate the
164	 * pages on cc->migratepages. We stop searching if the migrate
165	 * and free page scanners meet or enough free pages are isolated.
166	 */
167	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
168					pfn -= pageblock_nr_pages) {
169		unsigned long isolated;
170
171		if (!pfn_valid(pfn))
172			continue;
 
173
174		/*
175		 * Check for overlapping nodes/zones. It's possible on some
176		 * configurations to have a setup like
177		 * node0 node1 node0
178		 * i.e. it's possible that all pages within a zones range of
179		 * pages do not belong to a single zone.
180		 */
181		page = pfn_to_page(pfn);
182		if (page_zone(page) != zone)
183			continue;
184
185		/* Check the block is suitable for migration */
186		if (!suitable_migration_target(page))
187			continue;
 
188
189		/*
190		 * Found a block suitable for isolating free pages from. Now
191		 * we disabled interrupts, double check things are ok and
192		 * isolate the pages. This is to minimise the time IRQs
193		 * are disabled
194		 */
195		isolated = 0;
196		spin_lock_irqsave(&zone->lock, flags);
197		if (suitable_migration_target(page)) {
198			isolated = isolate_freepages_block(zone, pfn, freelist);
199			nr_freepages += isolated;
200		}
201		spin_unlock_irqrestore(&zone->lock, flags);
202
203		/*
204		 * Record the highest PFN we isolated pages from. When next
205		 * looking for free pages, the search will restart here as
206		 * page migration may have returned some pages to the allocator
207		 */
208		if (isolated)
209			high_pfn = max(high_pfn, pfn);
210	}
211
212	/* split_free_page does not map the pages */
213	list_for_each_entry(page, freelist, lru) {
214		arch_alloc_page(page, 0);
215		kernel_map_pages(page, 1, 1);
 
 
 
216	}
217
218	cc->free_pfn = high_pfn;
219	cc->nr_freepages = nr_freepages;
220}
221
222/* Update the number of anon and file isolated pages in the zone */
223static void acct_isolated(struct zone *zone, struct compact_control *cc)
224{
225	struct page *page;
226	unsigned int count[NR_LRU_LISTS] = { 0, };
227
228	list_for_each_entry(page, &cc->migratepages, lru) {
229		int lru = page_lru_base_type(page);
230		count[lru]++;
231	}
232
233	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
234	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
235	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
236	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
237}
238
239/* Similar to reclaim, but different enough that they don't share logic */
240static bool too_many_isolated(struct zone *zone)
241{
242	unsigned long active, inactive, isolated;
243
244	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
245					zone_page_state(zone, NR_INACTIVE_ANON);
246	active = zone_page_state(zone, NR_ACTIVE_FILE) +
247					zone_page_state(zone, NR_ACTIVE_ANON);
248	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
249					zone_page_state(zone, NR_ISOLATED_ANON);
250
251	return isolated > (inactive + active) / 2;
252}
253
/* Possible outcomes of isolate_migratepages */
typedef enum {
	/* Abort compaction now */
	ISOLATE_ABORT,
	/* No pages isolated, continue scanning */
	ISOLATE_NONE,
	/* Pages isolated, migrate */
	ISOLATE_SUCCESS,
} isolate_migrate_t;
260
261/*
262 * Isolate all pages that can be migrated from the block pointed to by
263 * the migrate scanner within compact_control.
 
 
 
 
 
 
 
 
264 */
265static isolate_migrate_t isolate_migratepages(struct zone *zone,
266					struct compact_control *cc)
 
267{
268	unsigned long low_pfn, end_pfn;
269	unsigned long last_pageblock_nr = 0, pageblock_nr;
270	unsigned long nr_scanned = 0, nr_isolated = 0;
271	struct list_head *migratelist = &cc->migratepages;
272
273	/* Do not scan outside zone boundaries */
274	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
275
276	/* Only scan within a pageblock boundary */
277	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
278
279	/* Do not cross the free scanner or scan within a memory hole */
280	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
281		cc->migrate_pfn = end_pfn;
282		return ISOLATE_NONE;
283	}
284
285	/*
286	 * Ensure that there are not too many pages isolated from the LRU
287	 * list by either parallel reclaimers or compaction. If there are,
288	 * delay for some time until fewer pages are isolated
289	 */
290	while (unlikely(too_many_isolated(zone))) {
291		/* async migration should just abort */
292		if (!cc->sync)
293			return ISOLATE_ABORT;
294
295		congestion_wait(BLK_RW_ASYNC, HZ/10);
296
297		if (fatal_signal_pending(current))
298			return ISOLATE_ABORT;
299	}
300
301	/* Time to isolate some pages for migration */
302	cond_resched();
303	spin_lock_irq(&zone->lru_lock);
304	for (; low_pfn < end_pfn; low_pfn++) {
305		struct page *page;
306		bool locked = true;
307
308		/* give a chance to irqs before checking need_resched() */
309		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
310			spin_unlock_irq(&zone->lru_lock);
311			locked = false;
312		}
313		if (need_resched() || spin_is_contended(&zone->lru_lock)) {
314			if (locked)
315				spin_unlock_irq(&zone->lru_lock);
316			cond_resched();
317			spin_lock_irq(&zone->lru_lock);
318			if (fatal_signal_pending(current))
319				break;
320		} else if (!locked)
321			spin_lock_irq(&zone->lru_lock);
322
 
 
 
 
 
 
 
 
 
 
 
 
 
323		if (!pfn_valid_within(low_pfn))
324			continue;
325		nr_scanned++;
326
327		/* Get the page and skip if free */
 
 
 
 
 
328		page = pfn_to_page(low_pfn);
 
 
 
 
329		if (PageBuddy(page))
330			continue;
331
332		/*
333		 * For async migration, also only scan in MOVABLE blocks. Async
334		 * migration is optimistic to see if the minimum amount of work
335		 * satisfies the allocation
336		 */
337		pageblock_nr = low_pfn >> pageblock_order;
338		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
339				get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
340			low_pfn += pageblock_nr_pages;
341			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
342			last_pageblock_nr = pageblock_nr;
343			continue;
344		}
345
346		if (!PageLRU(page))
347			continue;
348
349		/*
350		 * PageLRU is set, and lru_lock excludes isolation,
351		 * splitting and collapsing (collapsing has already
352		 * happened if PageLRU is set).
353		 */
354		if (PageTransHuge(page)) {
355			low_pfn += (1 << compound_order(page)) - 1;
356			continue;
357		}
358
 
 
 
 
 
359		/* Try isolate the page */
360		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
361			continue;
362
363		VM_BUG_ON(PageTransCompound(page));
364
365		/* Successfully isolated */
366		del_page_from_lru_list(zone, page, page_lru(page));
367		list_add(&page->lru, migratelist);
368		cc->nr_migratepages++;
369		nr_isolated++;
370
371		/* Avoid isolating too much */
372		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
 
373			break;
 
374	}
375
376	acct_isolated(zone, cc);
377
378	spin_unlock_irq(&zone->lru_lock);
379	cc->migrate_pfn = low_pfn;
380
381	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
382
383	return ISOLATE_SUCCESS;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384}
385
386/*
387 * This is a migrate-callback that "allocates" freepages by taking pages
388 * from the isolated freelists in the block we are migrating to.
389 */
390static struct page *compaction_alloc(struct page *migratepage,
391					unsigned long data,
392					int **result)
393{
394	struct compact_control *cc = (struct compact_control *)data;
395	struct page *freepage;
396
397	/* Isolate free pages if necessary */
398	if (list_empty(&cc->freepages)) {
399		isolate_freepages(cc->zone, cc);
400
401		if (list_empty(&cc->freepages))
402			return NULL;
403	}
404
405	freepage = list_entry(cc->freepages.next, struct page, lru);
406	list_del(&freepage->lru);
407	cc->nr_freepages--;
408
409	return freepage;
410}
411
412/*
413 * We cannot control nr_migratepages and nr_freepages fully when migration is
414 * running as migrate_pages() has no knowledge of compact_control. When
415 * migration is complete, we count the number of pages on the lists by hand.
416 */
417static void update_nr_listpages(struct compact_control *cc)
418{
419	int nr_migratepages = 0;
420	int nr_freepages = 0;
421	struct page *page;
422
423	list_for_each_entry(page, &cc->migratepages, lru)
424		nr_migratepages++;
425	list_for_each_entry(page, &cc->freepages, lru)
426		nr_freepages++;
427
428	cc->nr_migratepages = nr_migratepages;
429	cc->nr_freepages = nr_freepages;
430}
431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432static int compact_finished(struct zone *zone,
433			    struct compact_control *cc)
434{
435	unsigned int order;
436	unsigned long watermark;
437
438	if (fatal_signal_pending(current))
439		return COMPACT_PARTIAL;
440
441	/* Compaction run completes if the migrate and free scanner meet */
442	if (cc->free_pfn <= cc->migrate_pfn)
443		return COMPACT_COMPLETE;
444
445	/*
446	 * order == -1 is expected when compacting via
447	 * /proc/sys/vm/compact_memory
448	 */
449	if (cc->order == -1)
450		return COMPACT_CONTINUE;
451
452	/* Compaction run is not finished if the watermark is not met */
453	watermark = low_wmark_pages(zone);
454	watermark += (1 << cc->order);
455
456	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
457		return COMPACT_CONTINUE;
458
459	/* Direct compactor: Is a suitable page free? */
460	for (order = cc->order; order < MAX_ORDER; order++) {
461		/* Job done if page is free of the right migratetype */
462		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
463			return COMPACT_PARTIAL;
464
465		/* Job done if allocation would set block type */
466		if (order >= pageblock_order && zone->free_area[order].nr_free)
467			return COMPACT_PARTIAL;
468	}
469
470	return COMPACT_CONTINUE;
471}
472
473/*
474 * compaction_suitable: Is this suitable to run compaction on this zone now?
475 * Returns
476 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
477 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
478 *   COMPACT_CONTINUE - If compaction should run now
479 */
480unsigned long compaction_suitable(struct zone *zone, int order)
481{
482	int fragindex;
483	unsigned long watermark;
484
485	/*
486	 * order == -1 is expected when compacting via
487	 * /proc/sys/vm/compact_memory
488	 */
489	if (order == -1)
490		return COMPACT_CONTINUE;
491
492	/*
493	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
494	 * This is because during migration, copies of pages need to be
495	 * allocated and for a short time, the footprint is higher
496	 */
497	watermark = low_wmark_pages(zone) + (2UL << order);
498	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
499		return COMPACT_SKIPPED;
500
501	/*
502	 * fragmentation index determines if allocation failures are due to
503	 * low memory or external fragmentation
504	 *
505	 * index of -1000 implies allocations might succeed depending on
506	 * watermarks
507	 * index towards 0 implies failure is due to lack of memory
508	 * index towards 1000 implies failure is due to fragmentation
509	 *
510	 * Only compact if a failure would be due to fragmentation.
511	 */
512	fragindex = fragmentation_index(zone, order);
513	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
514		return COMPACT_SKIPPED;
515
516	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
517	    0, 0))
518		return COMPACT_PARTIAL;
519
520	return COMPACT_CONTINUE;
521}
522
523static int compact_zone(struct zone *zone, struct compact_control *cc)
524{
525	int ret;
526
527	ret = compaction_suitable(zone, cc->order);
528	switch (ret) {
529	case COMPACT_PARTIAL:
530	case COMPACT_SKIPPED:
531		/* Compaction is likely to fail */
532		return ret;
533	case COMPACT_CONTINUE:
534		/* Fall through to compaction */
535		;
536	}
537
538	/* Setup to move all movable pages to the end of the zone */
539	cc->migrate_pfn = zone->zone_start_pfn;
540	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
541	cc->free_pfn &= ~(pageblock_nr_pages-1);
542
543	migrate_prep_local();
544
545	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
546		unsigned long nr_migrate, nr_remaining;
547		int err;
548
549		switch (isolate_migratepages(zone, cc)) {
550		case ISOLATE_ABORT:
551			ret = COMPACT_PARTIAL;
552			goto out;
553		case ISOLATE_NONE:
554			continue;
555		case ISOLATE_SUCCESS:
556			;
557		}
558
559		nr_migrate = cc->nr_migratepages;
560		err = migrate_pages(&cc->migratepages, compaction_alloc,
561				(unsigned long)cc, false,
562				cc->sync);
563		update_nr_listpages(cc);
564		nr_remaining = cc->nr_migratepages;
565
566		count_vm_event(COMPACTBLOCKS);
567		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
568		if (nr_remaining)
569			count_vm_events(COMPACTPAGEFAILED, nr_remaining);
570		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
571						nr_remaining);
572
573		/* Release LRU pages not migrated */
574		if (err) {
575			putback_lru_pages(&cc->migratepages);
576			cc->nr_migratepages = 0;
 
 
 
 
577		}
578
579	}
580
581out:
582	/* Release free pages and check accounting */
583	cc->nr_freepages -= release_freepages(&cc->freepages);
584	VM_BUG_ON(cc->nr_freepages != 0);
585
586	return ret;
587}
588
589unsigned long compact_zone_order(struct zone *zone,
590				 int order, gfp_t gfp_mask,
591				 bool sync)
592{
593	struct compact_control cc = {
594		.nr_freepages = 0,
595		.nr_migratepages = 0,
596		.order = order,
597		.migratetype = allocflags_to_migratetype(gfp_mask),
598		.zone = zone,
599		.sync = sync,
600	};
601	INIT_LIST_HEAD(&cc.freepages);
602	INIT_LIST_HEAD(&cc.migratepages);
603
604	return compact_zone(zone, &cc);
605}
606
/*
 * compaction_suitable() skips compaction when the fragmentation index of
 * a zone lies between 0 and this threshold (inclusive).
 */
int sysctl_extfrag_threshold = 500;
608
609/**
610 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
611 * @zonelist: The zonelist used for the current allocation
612 * @order: The order of the current allocation
613 * @gfp_mask: The GFP mask of the current allocation
614 * @nodemask: The allowed nodes to allocate from
615 * @sync: Whether migration is synchronous or not
616 *
617 * This is the main entry point for direct page compaction.
618 */
619unsigned long try_to_compact_pages(struct zonelist *zonelist,
620			int order, gfp_t gfp_mask, nodemask_t *nodemask,
621			bool sync)
622{
623	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
624	int may_enter_fs = gfp_mask & __GFP_FS;
625	int may_perform_io = gfp_mask & __GFP_IO;
626	struct zoneref *z;
627	struct zone *zone;
628	int rc = COMPACT_SKIPPED;
629
630	/*
631	 * Check whether it is worth even starting compaction. The order check is
632	 * made because an assumption is made that the page allocator can satisfy
633	 * the "cheaper" orders without taking special steps
634	 */
635	if (!order || !may_enter_fs || !may_perform_io)
636		return rc;
637
638	count_vm_event(COMPACTSTALL);
639
640	/* Compact each zone in the list */
641	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
642								nodemask) {
643		int status;
644
645		status = compact_zone_order(zone, order, gfp_mask, sync);
646		rc = max(status, rc);
647
648		/* If a normal allocation would succeed, stop compacting */
649		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
650			break;
651	}
652
653	return rc;
654}
655
656
657/* Compact all zones within a node */
658static int compact_node(int nid)
659{
660	int zoneid;
661	pg_data_t *pgdat;
662	struct zone *zone;
663
664	if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
665		return -EINVAL;
666	pgdat = NODE_DATA(nid);
667
668	/* Flush pending updates to the LRU lists */
669	lru_add_drain_all();
670
671	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
672		struct compact_control cc = {
673			.nr_freepages = 0,
674			.nr_migratepages = 0,
675			.order = -1,
676		};
677
678		zone = &pgdat->node_zones[zoneid];
679		if (!populated_zone(zone))
680			continue;
681
682		cc.zone = zone;
683		INIT_LIST_HEAD(&cc.freepages);
684		INIT_LIST_HEAD(&cc.migratepages);
685
686		compact_zone(zone, &cc);
 
 
 
 
 
 
 
 
 
 
 
 
 
687
688		VM_BUG_ON(!list_empty(&cc.freepages));
689		VM_BUG_ON(!list_empty(&cc.migratepages));
690	}
691
692	return 0;
693}
694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695/* Compact all nodes in the system */
696static int compact_nodes(void)
697{
698	int nid;
699
 
 
 
700	for_each_online_node(nid)
701		compact_node(nid);
702
703	return COMPACT_COMPLETE;
704}
705
/*
 * Backing variable for the compact_memory sysctl. The value written is
 * never looked at: any write compacts all memory.
 */
int sysctl_compact_memory;
708
709/* This is the entry point for compacting all nodes via /proc/sys/vm */
710int sysctl_compaction_handler(struct ctl_table *table, int write,
711			void __user *buffer, size_t *length, loff_t *ppos)
712{
713	if (write)
714		return compact_nodes();
715
716	return 0;
717}
718
719int sysctl_extfrag_handler(struct ctl_table *table, int write,
720			void __user *buffer, size_t *length, loff_t *ppos)
721{
722	proc_dointvec_minmax(table, write, buffer, length, ppos);
723
724	return 0;
725}
726
727#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
728ssize_t sysfs_compact_node(struct sys_device *dev,
729			struct sysdev_attribute *attr,
730			const char *buf, size_t count)
731{
732	compact_node(dev->id);
 
 
 
 
 
 
 
733
734	return count;
735}
736static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
737
738int compaction_register_node(struct node *node)
739{
740	return sysdev_create_file(&node->sysdev, &attr_compact);
741}
742
743void compaction_unregister_node(struct node *node)
744{
745	return sysdev_remove_file(&node->sysdev, &attr_compact);
746}
747#endif /* CONFIG_SYSFS && CONFIG_NUMA */
v3.5.6
  1/*
  2 * linux/mm/compaction.c
  3 *
  4 * Memory compaction for the reduction of external fragmentation. Note that
  5 * this heavily depends upon page migration to do all the real heavy
  6 * lifting
  7 *
  8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
  9 */
 10#include <linux/swap.h>
 11#include <linux/migrate.h>
 12#include <linux/compaction.h>
 13#include <linux/mm_inline.h>
 14#include <linux/backing-dev.h>
 15#include <linux/sysctl.h>
 16#include <linux/sysfs.h>
 17#include "internal.h"
 18
 19#if defined CONFIG_COMPACTION || defined CONFIG_CMA
 20
 21#define CREATE_TRACE_POINTS
 22#include <trace/events/compaction.h>
 23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 24static unsigned long release_freepages(struct list_head *freelist)
 25{
 26	struct page *page, *next;
 27	unsigned long count = 0;
 28
 29	list_for_each_entry_safe(page, next, freelist, lru) {
 30		list_del(&page->lru);
 31		__free_page(page);
 32		count++;
 33	}
 34
 35	return count;
 36}
 37
 38static void map_pages(struct list_head *list)
 39{
 40	struct page *page;
 41
 42	list_for_each_entry(page, list, lru) {
 43		arch_alloc_page(page, 0);
 44		kernel_map_pages(page, 1, 1);
 45	}
 46}
 47
 48static inline bool migrate_async_suitable(int migratetype)
 49{
 50	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 51}
 52
 53/*
 54 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
 55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
 56 * pages inside of the pageblock (even though it may still end up isolating
 57 * some pages).
 58 */
 59static unsigned long isolate_freepages_block(unsigned long blockpfn,
 60				unsigned long end_pfn,
 61				struct list_head *freelist,
 62				bool strict)
 63{
 
 64	int nr_scanned = 0, total_isolated = 0;
 65	struct page *cursor;
 66
 
 
 
 
 
 
 
 
 
 67	cursor = pfn_to_page(blockpfn);
 68
 69	/* Isolate free pages. This assumes the block is valid */
 70	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 71		int isolated, i;
 72		struct page *page = cursor;
 73
 74		if (!pfn_valid_within(blockpfn)) {
 75			if (strict)
 76				return 0;
 77			continue;
 78		}
 79		nr_scanned++;
 80
 81		if (!PageBuddy(page)) {
 82			if (strict)
 83				return 0;
 84			continue;
 85		}
 86
 87		/* Found a free page, break it into order-0 pages */
 88		isolated = split_free_page(page);
 89		if (!isolated && strict)
 90			return 0;
 91		total_isolated += isolated;
 92		for (i = 0; i < isolated; i++) {
 93			list_add(&page->lru, freelist);
 94			page++;
 95		}
 96
 97		/* If a page was split, advance to the end of it */
 98		if (isolated) {
 99			blockpfn += isolated - 1;
100			cursor += isolated - 1;
101		}
102	}
103
104	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
105	return total_isolated;
106}
107
108/**
109 * isolate_freepages_range() - isolate free pages.
110 * @start_pfn: The first PFN to start isolating.
111 * @end_pfn:   The one-past-last PFN.
112 *
113 * Non-free pages, invalid PFNs, or zone boundaries within the
114 * [start_pfn, end_pfn) range are considered errors, cause function to
115 * undo its actions and return zero.
116 *
117 * Otherwise, function returns one-past-the-last PFN of isolated page
118 * (which may be greater then end_pfn if end fell in a middle of
119 * a free page).
 
 
 
 
 
 
 
 
 
 
 
 
 
120 */
121unsigned long
122isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
123{
124	unsigned long isolated, pfn, block_end_pfn, flags;
125	struct zone *zone = NULL;
126	LIST_HEAD(freelist);
 
 
127
128	if (pfn_valid(start_pfn))
129		zone = page_zone(pfn_to_page(start_pfn));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
131	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
132		if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
133			break;
134
135		/*
136		 * On subsequent iterations ALIGN() is actually not needed,
137		 * but we keep it that we not to complicate the code.
 
 
 
138		 */
139		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
140		block_end_pfn = min(block_end_pfn, end_pfn);
 
141
142		spin_lock_irqsave(&zone->lock, flags);
143		isolated = isolate_freepages_block(pfn, block_end_pfn,
144						   &freelist, true);
145		spin_unlock_irqrestore(&zone->lock, flags);
146
147		/*
148		 * In strict mode, isolate_freepages_block() returns 0 if
149		 * there are any holes in the block (ie. invalid PFNs or
150		 * non-free pages).
 
151		 */
152		if (!isolated)
153			break;
 
 
 
 
 
154
155		/*
156		 * If we managed to isolate pages, it is always (1 << n) *
157		 * pageblock_nr_pages for some non-negative n.  (Max order
158		 * page may span two pageblocks).
159		 */
 
 
160	}
161
162	/* split_free_page does not map the pages */
163	map_pages(&freelist);
164
165	if (pfn < end_pfn) {
166		/* Loop terminated early, cleanup. */
167		release_freepages(&freelist);
168		return 0;
169	}
170
171	/* We don't use freelists for anything. */
172	return pfn;
173}
174
175/* Update the number of anon and file isolated pages in the zone */
176static void acct_isolated(struct zone *zone, struct compact_control *cc)
177{
178	struct page *page;
179	unsigned int count[2] = { 0, };
180
181	list_for_each_entry(page, &cc->migratepages, lru)
182		count[!!page_is_file_cache(page)]++;
 
 
183
184	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
185	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 
 
186}
187
188/* Similar to reclaim, but different enough that they don't share logic */
189static bool too_many_isolated(struct zone *zone)
190{
191	unsigned long active, inactive, isolated;
192
193	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
194					zone_page_state(zone, NR_INACTIVE_ANON);
195	active = zone_page_state(zone, NR_ACTIVE_FILE) +
196					zone_page_state(zone, NR_ACTIVE_ANON);
197	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
198					zone_page_state(zone, NR_ISOLATED_ANON);
199
200	return isolated > (inactive + active) / 2;
201}
202
203/**
204 * isolate_migratepages_range() - isolate all migrate-able pages in range.
205 * @zone:	Zone pages are in.
206 * @cc:		Compaction control structure.
207 * @low_pfn:	The first PFN of the range.
208 * @end_pfn:	The one-past-the-last PFN of the range.
209 *
 210 * Isolate all pages that can be migrated from the range specified by
 211 * [low_pfn, end_pfn).  Returns zero if there is a fatal signal
 212 * pending, otherwise the PFN of the first page that was not scanned
 213 * (which may be less than, equal to or more than end_pfn).
214 *
215 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
216 * zero.
217 *
218 * Apart from cc->migratepages and cc->nr_migratetypes this function
219 * does not modify any cc's fields, in particular it does not modify
220 * (or read for that matter) cc->migrate_pfn.
221 */
222unsigned long
223isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
224			   unsigned long low_pfn, unsigned long end_pfn)
225{
 
226	unsigned long last_pageblock_nr = 0, pageblock_nr;
227	unsigned long nr_scanned = 0, nr_isolated = 0;
228	struct list_head *migratelist = &cc->migratepages;
229	isolate_mode_t mode = 0;
230	struct lruvec *lruvec;
 
 
 
 
 
 
 
 
 
 
231
232	/*
233	 * Ensure that there are not too many pages isolated from the LRU
234	 * list by either parallel reclaimers or compaction. If there are,
235	 * delay for some time until fewer pages are isolated
236	 */
237	while (unlikely(too_many_isolated(zone))) {
238		/* async migration should just abort */
239		if (!cc->sync)
240			return 0;
241
242		congestion_wait(BLK_RW_ASYNC, HZ/10);
243
244		if (fatal_signal_pending(current))
245			return 0;
246	}
247
248	/* Time to isolate some pages for migration */
249	cond_resched();
250	spin_lock_irq(&zone->lru_lock);
251	for (; low_pfn < end_pfn; low_pfn++) {
252		struct page *page;
253		bool locked = true;
254
255		/* give a chance to irqs before checking need_resched() */
256		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257			spin_unlock_irq(&zone->lru_lock);
258			locked = false;
259		}
260		if (need_resched() || spin_is_contended(&zone->lru_lock)) {
261			if (locked)
262				spin_unlock_irq(&zone->lru_lock);
263			cond_resched();
264			spin_lock_irq(&zone->lru_lock);
265			if (fatal_signal_pending(current))
266				break;
267		} else if (!locked)
268			spin_lock_irq(&zone->lru_lock);
269
270		/*
271		 * migrate_pfn does not necessarily start aligned to a
272		 * pageblock. Ensure that pfn_valid is called when moving
273		 * into a new MAX_ORDER_NR_PAGES range in case of large
274		 * memory holes within the zone
275		 */
276		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
277			if (!pfn_valid(low_pfn)) {
278				low_pfn += MAX_ORDER_NR_PAGES - 1;
279				continue;
280			}
281		}
282
283		if (!pfn_valid_within(low_pfn))
284			continue;
285		nr_scanned++;
286
287		/*
288		 * Get the page and ensure the page is within the same zone.
289		 * See the comment in isolate_freepages about overlapping
290		 * nodes. It is deliberate that the new zone lock is not taken
291		 * as memory compaction should not move pages between nodes.
292		 */
293		page = pfn_to_page(low_pfn);
294		if (page_zone(page) != zone)
295			continue;
296
297		/* Skip if free */
298		if (PageBuddy(page))
299			continue;
300
301		/*
302		 * For async migration, also only scan in MOVABLE blocks. Async
303		 * migration is optimistic to see if the minimum amount of work
304		 * satisfies the allocation
305		 */
306		pageblock_nr = low_pfn >> pageblock_order;
307		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
308		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
309			low_pfn += pageblock_nr_pages;
310			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
311			last_pageblock_nr = pageblock_nr;
312			continue;
313		}
314
315		if (!PageLRU(page))
316			continue;
317
318		/*
319		 * PageLRU is set, and lru_lock excludes isolation,
320		 * splitting and collapsing (collapsing has already
321		 * happened if PageLRU is set).
322		 */
323		if (PageTransHuge(page)) {
324			low_pfn += (1 << compound_order(page)) - 1;
325			continue;
326		}
327
328		if (!cc->sync)
329			mode |= ISOLATE_ASYNC_MIGRATE;
330
331		lruvec = mem_cgroup_page_lruvec(page, zone);
332
333		/* Try isolate the page */
334		if (__isolate_lru_page(page, mode) != 0)
335			continue;
336
337		VM_BUG_ON(PageTransCompound(page));
338
339		/* Successfully isolated */
340		del_page_from_lru_list(page, lruvec, page_lru(page));
341		list_add(&page->lru, migratelist);
342		cc->nr_migratepages++;
343		nr_isolated++;
344
345		/* Avoid isolating too much */
346		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
347			++low_pfn;
348			break;
349		}
350	}
351
352	acct_isolated(zone, cc);
353
354	spin_unlock_irq(&zone->lru_lock);
 
355
356	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357
358	return low_pfn;
359}
360
361#endif /* CONFIG_COMPACTION || CONFIG_CMA */
362#ifdef CONFIG_COMPACTION
363
364/* Returns true if the page is within a block suitable for migration to */
365static bool suitable_migration_target(struct page *page)
366{
367
368	int migratetype = get_pageblock_migratetype(page);
369
370	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
371	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
372		return false;
373
374	/* If the page is a large free page, then allow migration */
375	if (PageBuddy(page) && page_order(page) >= pageblock_order)
376		return true;
377
378	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
379	if (migrate_async_suitable(migratetype))
380		return true;
381
382	/* Otherwise skip the block */
383	return false;
384}
385
386/*
387 * Based on information in the current compact_control, find blocks
388 * suitable for isolating free pages from and then isolate them.
389 */
390static void isolate_freepages(struct zone *zone,
391				struct compact_control *cc)
392{
393	struct page *page;
394	unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
395	unsigned long flags;
396	int nr_freepages = cc->nr_freepages;
397	struct list_head *freelist = &cc->freepages;
398
399	/*
400	 * Initialise the free scanner. The starting point is where we last
401	 * scanned from (or the end of the zone if starting). The low point
402	 * is the end of the pageblock the migration scanner is using.
403	 */
404	pfn = cc->free_pfn;
405	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
406
407	/*
408	 * Take care that if the migration scanner is at the end of the zone
409	 * that the free scanner does not accidentally move to the next zone
410	 * in the next isolation cycle.
411	 */
412	high_pfn = min(low_pfn, pfn);
413
414	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
415
416	/*
417	 * Isolate free pages until enough are available to migrate the
418	 * pages on cc->migratepages. We stop searching if the migrate
419	 * and free page scanners meet or enough free pages are isolated.
420	 */
421	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
422					pfn -= pageblock_nr_pages) {
423		unsigned long isolated;
424
425		if (!pfn_valid(pfn))
426			continue;
427
428		/*
429		 * Check for overlapping nodes/zones. It's possible on some
430		 * configurations to have a setup like
431		 * node0 node1 node0
432		 * i.e. it's possible that all pages within a zones range of
433		 * pages do not belong to a single zone.
434		 */
435		page = pfn_to_page(pfn);
436		if (page_zone(page) != zone)
437			continue;
438
439		/* Check the block is suitable for migration */
440		if (!suitable_migration_target(page))
441			continue;
442
443		/*
444		 * Found a block suitable for isolating free pages from. Now
445		 * we disabled interrupts, double check things are ok and
446		 * isolate the pages. This is to minimise the time IRQs
447		 * are disabled
448		 */
449		isolated = 0;
450		spin_lock_irqsave(&zone->lock, flags);
451		if (suitable_migration_target(page)) {
452			end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
453			isolated = isolate_freepages_block(pfn, end_pfn,
454							   freelist, false);
455			nr_freepages += isolated;
456		}
457		spin_unlock_irqrestore(&zone->lock, flags);
458
459		/*
460		 * Record the highest PFN we isolated pages from. When next
461		 * looking for free pages, the search will restart here as
462		 * page migration may have returned some pages to the allocator
463		 */
464		if (isolated)
465			high_pfn = max(high_pfn, pfn);
466	}
467
468	/* split_free_page does not map the pages */
469	map_pages(freelist);
470
471	cc->free_pfn = high_pfn;
472	cc->nr_freepages = nr_freepages;
473}
474
475/*
476 * This is a migrate-callback that "allocates" freepages by taking pages
477 * from the isolated freelists in the block we are migrating to.
478 */
479static struct page *compaction_alloc(struct page *migratepage,
480					unsigned long data,
481					int **result)
482{
483	struct compact_control *cc = (struct compact_control *)data;
484	struct page *freepage;
485
486	/* Isolate free pages if necessary */
487	if (list_empty(&cc->freepages)) {
488		isolate_freepages(cc->zone, cc);
489
490		if (list_empty(&cc->freepages))
491			return NULL;
492	}
493
494	freepage = list_entry(cc->freepages.next, struct page, lru);
495	list_del(&freepage->lru);
496	cc->nr_freepages--;
497
498	return freepage;
499}
500
501/*
502 * We cannot control nr_migratepages and nr_freepages fully when migration is
503 * running as migrate_pages() has no knowledge of compact_control. When
504 * migration is complete, we count the number of pages on the lists by hand.
505 */
506static void update_nr_listpages(struct compact_control *cc)
507{
508	int nr_migratepages = 0;
509	int nr_freepages = 0;
510	struct page *page;
511
512	list_for_each_entry(page, &cc->migratepages, lru)
513		nr_migratepages++;
514	list_for_each_entry(page, &cc->freepages, lru)
515		nr_freepages++;
516
517	cc->nr_migratepages = nr_migratepages;
518	cc->nr_freepages = nr_freepages;
519}
520
521/* possible outcome of isolate_migratepages */
522typedef enum {
523	ISOLATE_ABORT,		/* Abort compaction now */
524	ISOLATE_NONE,		/* No pages isolated, continue scanning */
525	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
526} isolate_migrate_t;
527
528/*
529 * Isolate all pages that can be migrated from the block pointed to by
530 * the migrate scanner within compact_control.
531 */
532static isolate_migrate_t isolate_migratepages(struct zone *zone,
533					struct compact_control *cc)
534{
535	unsigned long low_pfn, end_pfn;
536
537	/* Do not scan outside zone boundaries */
538	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
539
540	/* Only scan within a pageblock boundary */
541	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
542
543	/* Do not cross the free scanner or scan within a memory hole */
544	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
545		cc->migrate_pfn = end_pfn;
546		return ISOLATE_NONE;
547	}
548
549	/* Perform the isolation */
550	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
551	if (!low_pfn)
552		return ISOLATE_ABORT;
553
554	cc->migrate_pfn = low_pfn;
555
556	return ISOLATE_SUCCESS;
557}
558
559static int compact_finished(struct zone *zone,
560			    struct compact_control *cc)
561{
562	unsigned int order;
563	unsigned long watermark;
564
565	if (fatal_signal_pending(current))
566		return COMPACT_PARTIAL;
567
568	/* Compaction run completes if the migrate and free scanner meet */
569	if (cc->free_pfn <= cc->migrate_pfn)
570		return COMPACT_COMPLETE;
571
572	/*
573	 * order == -1 is expected when compacting via
574	 * /proc/sys/vm/compact_memory
575	 */
576	if (cc->order == -1)
577		return COMPACT_CONTINUE;
578
579	/* Compaction run is not finished if the watermark is not met */
580	watermark = low_wmark_pages(zone);
581	watermark += (1 << cc->order);
582
583	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
584		return COMPACT_CONTINUE;
585
586	/* Direct compactor: Is a suitable page free? */
587	for (order = cc->order; order < MAX_ORDER; order++) {
588		/* Job done if page is free of the right migratetype */
589		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
590			return COMPACT_PARTIAL;
591
592		/* Job done if allocation would set block type */
593		if (order >= pageblock_order && zone->free_area[order].nr_free)
594			return COMPACT_PARTIAL;
595	}
596
597	return COMPACT_CONTINUE;
598}
599
600/*
601 * compaction_suitable: Is this suitable to run compaction on this zone now?
602 * Returns
603 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
604 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
605 *   COMPACT_CONTINUE - If compaction should run now
606 */
607unsigned long compaction_suitable(struct zone *zone, int order)
608{
609	int fragindex;
610	unsigned long watermark;
611
612	/*
613	 * order == -1 is expected when compacting via
614	 * /proc/sys/vm/compact_memory
615	 */
616	if (order == -1)
617		return COMPACT_CONTINUE;
618
619	/*
620	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
621	 * This is because during migration, copies of pages need to be
622	 * allocated and for a short time, the footprint is higher
623	 */
624	watermark = low_wmark_pages(zone) + (2UL << order);
625	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
626		return COMPACT_SKIPPED;
627
628	/*
629	 * fragmentation index determines if allocation failures are due to
630	 * low memory or external fragmentation
631	 *
632	 * index of -1000 implies allocations might succeed depending on
633	 * watermarks
634	 * index towards 0 implies failure is due to lack of memory
635	 * index towards 1000 implies failure is due to fragmentation
636	 *
637	 * Only compact if a failure would be due to fragmentation.
638	 */
639	fragindex = fragmentation_index(zone, order);
640	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
641		return COMPACT_SKIPPED;
642
643	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
644	    0, 0))
645		return COMPACT_PARTIAL;
646
647	return COMPACT_CONTINUE;
648}
649
650static int compact_zone(struct zone *zone, struct compact_control *cc)
651{
652	int ret;
653
654	ret = compaction_suitable(zone, cc->order);
655	switch (ret) {
656	case COMPACT_PARTIAL:
657	case COMPACT_SKIPPED:
658		/* Compaction is likely to fail */
659		return ret;
660	case COMPACT_CONTINUE:
661		/* Fall through to compaction */
662		;
663	}
664
665	/* Setup to move all movable pages to the end of the zone */
666	cc->migrate_pfn = zone->zone_start_pfn;
667	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
668	cc->free_pfn &= ~(pageblock_nr_pages-1);
669
670	migrate_prep_local();
671
672	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
673		unsigned long nr_migrate, nr_remaining;
674		int err;
675
676		switch (isolate_migratepages(zone, cc)) {
677		case ISOLATE_ABORT:
678			ret = COMPACT_PARTIAL;
679			goto out;
680		case ISOLATE_NONE:
681			continue;
682		case ISOLATE_SUCCESS:
683			;
684		}
685
686		nr_migrate = cc->nr_migratepages;
687		err = migrate_pages(&cc->migratepages, compaction_alloc,
688				(unsigned long)cc, false,
689				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
690		update_nr_listpages(cc);
691		nr_remaining = cc->nr_migratepages;
692
693		count_vm_event(COMPACTBLOCKS);
694		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
695		if (nr_remaining)
696			count_vm_events(COMPACTPAGEFAILED, nr_remaining);
697		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
698						nr_remaining);
699
700		/* Release LRU pages not migrated */
701		if (err) {
702			putback_lru_pages(&cc->migratepages);
703			cc->nr_migratepages = 0;
704			if (err == -ENOMEM) {
705				ret = COMPACT_PARTIAL;
706				goto out;
707			}
708		}
 
709	}
710
711out:
712	/* Release free pages and check accounting */
713	cc->nr_freepages -= release_freepages(&cc->freepages);
714	VM_BUG_ON(cc->nr_freepages != 0);
715
716	return ret;
717}
718
719static unsigned long compact_zone_order(struct zone *zone,
720				 int order, gfp_t gfp_mask,
721				 bool sync)
722{
723	struct compact_control cc = {
724		.nr_freepages = 0,
725		.nr_migratepages = 0,
726		.order = order,
727		.migratetype = allocflags_to_migratetype(gfp_mask),
728		.zone = zone,
729		.sync = sync,
730	};
731	INIT_LIST_HEAD(&cc.freepages);
732	INIT_LIST_HEAD(&cc.migratepages);
733
734	return compact_zone(zone, &cc);
735}
736
/*
 * Upper bound on the fragmentation index below which compaction_suitable()
 * skips a zone: an index in the range [0, sysctl_extfrag_threshold] yields
 * COMPACT_SKIPPED. The initial value is 500.
 */
737int sysctl_extfrag_threshold = 500;
738
739/**
740 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
741 * @zonelist: The zonelist used for the current allocation
742 * @order: The order of the current allocation
743 * @gfp_mask: The GFP mask of the current allocation
744 * @nodemask: The allowed nodes to allocate from
745 * @sync: Whether migration is synchronous or not
746 *
747 * This is the main entry point for direct page compaction.
748 */
749unsigned long try_to_compact_pages(struct zonelist *zonelist,
750			int order, gfp_t gfp_mask, nodemask_t *nodemask,
751			bool sync)
752{
753	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
754	int may_enter_fs = gfp_mask & __GFP_FS;
755	int may_perform_io = gfp_mask & __GFP_IO;
756	struct zoneref *z;
757	struct zone *zone;
758	int rc = COMPACT_SKIPPED;
759
760	/*
761	 * Check whether it is worth even starting compaction. The order check is
762	 * made because an assumption is made that the page allocator can satisfy
763	 * the "cheaper" orders without taking special steps
764	 */
765	if (!order || !may_enter_fs || !may_perform_io)
766		return rc;
767
768	count_vm_event(COMPACTSTALL);
769
770	/* Compact each zone in the list */
771	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
772								nodemask) {
773		int status;
774
775		status = compact_zone_order(zone, order, gfp_mask, sync);
776		rc = max(status, rc);
777
778		/* If a normal allocation would succeed, stop compacting */
779		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
780			break;
781	}
782
783	return rc;
784}
785
786
787/* Compact all zones within a node */
788static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
789{
790	int zoneid;
 
791	struct zone *zone;
792
 
 
 
 
 
 
 
793	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 
 
 
 
 
794
795		zone = &pgdat->node_zones[zoneid];
796		if (!populated_zone(zone))
797			continue;
798
799		cc->nr_freepages = 0;
800		cc->nr_migratepages = 0;
801		cc->zone = zone;
802		INIT_LIST_HEAD(&cc->freepages);
803		INIT_LIST_HEAD(&cc->migratepages);
804
805		if (cc->order == -1 || !compaction_deferred(zone, cc->order))
806			compact_zone(zone, cc);
807
808		if (cc->order > 0) {
809			int ok = zone_watermark_ok(zone, cc->order,
810						low_wmark_pages(zone), 0, 0);
811			if (ok && cc->order > zone->compact_order_failed)
812				zone->compact_order_failed = cc->order + 1;
813			/* Currently async compaction is never deferred. */
814			else if (!ok && cc->sync)
815				defer_compaction(zone, cc->order);
816		}
817
818		VM_BUG_ON(!list_empty(&cc->freepages));
819		VM_BUG_ON(!list_empty(&cc->migratepages));
820	}
821
822	return 0;
823}
824
825int compact_pgdat(pg_data_t *pgdat, int order)
826{
827	struct compact_control cc = {
828		.order = order,
829		.sync = false,
830	};
831
832	return __compact_pgdat(pgdat, &cc);
833}
834
835static int compact_node(int nid)
836{
837	struct compact_control cc = {
838		.order = -1,
839		.sync = true,
840	};
841
842	return __compact_pgdat(NODE_DATA(nid), &cc);
843}
844
845/* Compact all nodes in the system */
846static int compact_nodes(void)
847{
848	int nid;
849
850	/* Flush pending updates to the LRU lists */
851	lru_add_drain_all();
852
853	for_each_online_node(nid)
854		compact_node(nid);
855
856	return COMPACT_COMPLETE;
857}
858
859/* The written value is actually unused, all memory is compacted */
860int sysctl_compact_memory;
861
862/* This is the entry point for compacting all nodes via /proc/sys/vm */
863int sysctl_compaction_handler(struct ctl_table *table, int write,
864			void __user *buffer, size_t *length, loff_t *ppos)
865{
866	if (write)
867		return compact_nodes();
868
869	return 0;
870}
871
872int sysctl_extfrag_handler(struct ctl_table *table, int write,
873			void __user *buffer, size_t *length, loff_t *ppos)
874{
875	proc_dointvec_minmax(table, write, buffer, length, ppos);
876
877	return 0;
878}
879
880#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
881ssize_t sysfs_compact_node(struct device *dev,
882			struct device_attribute *attr,
883			const char *buf, size_t count)
884{
885	int nid = dev->id;
886
887	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
888		/* Flush pending updates to the LRU lists */
889		lru_add_drain_all();
890
891		compact_node(nid);
892	}
893
894	return count;
895}
896static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
897
898int compaction_register_node(struct node *node)
899{
900	return device_create_file(&node->dev, &dev_attr_compact);
901}
902
903void compaction_unregister_node(struct node *node)
904{
905	return device_remove_file(&node->dev, &dev_attr_compact);
906}
907#endif /* CONFIG_SYSFS && CONFIG_NUMA */
908
909#endif /* CONFIG_COMPACTION */