1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 */
23
24#include <linux/res_counter.h>
25#include <linux/memcontrol.h>
26#include <linux/cgroup.h>
27#include <linux/mm.h>
28#include <linux/hugetlb.h>
29#include <linux/pagemap.h>
30#include <linux/smp.h>
31#include <linux/page-flags.h>
32#include <linux/backing-dev.h>
33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h>
35#include <linux/limits.h>
36#include <linux/mutex.h>
37#include <linux/rbtree.h>
38#include <linux/slab.h>
39#include <linux/swap.h>
40#include <linux/swapops.h>
41#include <linux/spinlock.h>
42#include <linux/eventfd.h>
43#include <linux/sort.h>
44#include <linux/fs.h>
45#include <linux/seq_file.h>
46#include <linux/vmalloc.h>
47#include <linux/mm_inline.h>
48#include <linux/page_cgroup.h>
49#include <linux/cpu.h>
50#include <linux/oom.h>
51#include "internal.h"
52
53#include <asm/uaccess.h>
54
55#include <trace/events/vmscan.h>
56
57struct cgroup_subsys mem_cgroup_subsys __read_mostly;
58#define MEM_CGROUP_RECLAIM_RETRIES 5
59struct mem_cgroup *root_mem_cgroup __read_mostly;
60
61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63int do_swap_account __read_mostly;
64
65/* for remembering the boot option */
66#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
67static int really_do_swap_account __initdata = 1;
68#else
69static int really_do_swap_account __initdata = 0;
70#endif
71
72#else
73#define do_swap_account (0)
74#endif
75
76
77/*
78 * Statistics for memory cgroup.
79 */
80enum mem_cgroup_stat_index {
81 /*
82 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
83 */
84 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
85 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
86 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
87 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
88 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
89 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
90 MEM_CGROUP_STAT_NSTATS,
91};
92
93enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
98 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
99 MEM_CGROUP_EVENTS_NSTATS,
100};
101/*
102 * Per memcg event counter is incremented at every pagein/pageout. With THP,
103 * it will be incremented by the number of pages. This counter is used
104 * to trigger some periodic events. This is straightforward and better
105 * than using jiffies etc. for handling periodic memcg events.
106 */
107enum mem_cgroup_events_target {
108 MEM_CGROUP_TARGET_THRESH,
109 MEM_CGROUP_TARGET_SOFTLIMIT,
110 MEM_CGROUP_TARGET_NUMAINFO,
111 MEM_CGROUP_NTARGETS,
112};
113#define THRESHOLDS_EVENTS_TARGET (128)
114#define SOFTLIMIT_EVENTS_TARGET (1024)
115#define NUMAINFO_EVENTS_TARGET (1024)
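/*
 * For illustration of the targets above: memcg_check_events() fires
 * mem_cgroup_threshold() roughly once per 128 charge/uncharge events,
 * softlimit tree updates roughly once per 1024 events, and (on NUMA)
 * scan_nodes refreshes roughly once per 1024 events. Events are counted
 * in base pages, so a THP charge advances the counter by many events at
 * once.
 */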
116
117struct mem_cgroup_stat_cpu {
118 long count[MEM_CGROUP_STAT_NSTATS];
119 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
120 unsigned long targets[MEM_CGROUP_NTARGETS];
121};
122
123/*
124 * per-zone information in memory controller.
125 */
126struct mem_cgroup_per_zone {
127 /*
128 * spin_lock to protect the per cgroup LRU
129 */
130 struct list_head lists[NR_LRU_LISTS];
131 unsigned long count[NR_LRU_LISTS];
132
133 struct zone_reclaim_stat reclaim_stat;
134 struct rb_node tree_node; /* RB tree node */
135 unsigned long long usage_in_excess;/* Set to the value by which */
136 /* the soft limit is exceeded*/
137 bool on_tree;
138 struct mem_cgroup *mem; /* Back pointer, we cannot */
139 /* use container_of */
140};
141/* Macro for accessing counter */
142#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
143
144struct mem_cgroup_per_node {
145 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
146};
147
148struct mem_cgroup_lru_info {
149 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
150};
151
152/*
153 * Cgroups above their limits are maintained in a RB-Tree, independent of
154 * their hierarchy representation
155 */
156
157struct mem_cgroup_tree_per_zone {
158 struct rb_root rb_root;
159 spinlock_t lock;
160};
161
162struct mem_cgroup_tree_per_node {
163 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
164};
165
166struct mem_cgroup_tree {
167 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
168};
169
170static struct mem_cgroup_tree soft_limit_tree __read_mostly;
171
172struct mem_cgroup_threshold {
173 struct eventfd_ctx *eventfd;
174 u64 threshold;
175};
176
177/* For threshold */
178struct mem_cgroup_threshold_ary {
179 /* An array index points to threshold just below usage. */
180 int current_threshold;
181 /* Size of entries[] */
182 unsigned int size;
183 /* Array of thresholds */
184 struct mem_cgroup_threshold entries[0];
185};
186
187struct mem_cgroup_thresholds {
188 /* Primary thresholds array */
189 struct mem_cgroup_threshold_ary *primary;
190 /*
191 * Spare threshold array.
192 * This is needed to make mem_cgroup_unregister_event() "never fail".
193 * It must be able to store at least primary->size - 1 entries.
194 */
195 struct mem_cgroup_threshold_ary *spare;
196};
197
198/* for OOM */
199struct mem_cgroup_eventfd_list {
200 struct list_head list;
201 struct eventfd_ctx *eventfd;
202};
203
204static void mem_cgroup_threshold(struct mem_cgroup *mem);
205static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
206
207/*
208 * The memory controller data structure. The memory controller controls both
209 * page cache and RSS per cgroup. We would eventually like to provide
210 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
211 * to help the administrator determine what knobs to tune.
212 *
213 * TODO: Add a water mark for the memory controller. Reclaim will begin when
214 * we hit the water mark. Maybe even add a low water mark, such that
215 * no reclaim occurs from a cgroup at its low water mark; this is
216 * a feature that will be implemented much later in the future.
217 */
218struct mem_cgroup {
219 struct cgroup_subsys_state css;
220 /*
221 * the counter to account for memory usage
222 */
223 struct res_counter res;
224 /*
225 * the counter to account for mem+swap usage.
226 */
227 struct res_counter memsw;
228 /*
229 * Per cgroup active and inactive list, similar to the
230 * per zone LRU lists.
231 */
232 struct mem_cgroup_lru_info info;
233 /*
234 * While reclaiming in a hierarchy, we cache the last child we
235 * reclaimed from.
236 */
237 int last_scanned_child;
238 int last_scanned_node;
239#if MAX_NUMNODES > 1
240 nodemask_t scan_nodes;
241 atomic_t numainfo_events;
242 atomic_t numainfo_updating;
243#endif
244 /*
245 * Should the accounting and control be hierarchical, per subtree?
246 */
247 bool use_hierarchy;
248
249 bool oom_lock;
250 atomic_t under_oom;
251
252 atomic_t refcnt;
253
254 int swappiness;
255 /* OOM-Killer disable */
256 int oom_kill_disable;
257
258 /* set when res.limit == memsw.limit */
259 bool memsw_is_minimum;
260
261 /* protect arrays of thresholds */
262 struct mutex thresholds_lock;
263
264 /* thresholds for memory usage. RCU-protected */
265 struct mem_cgroup_thresholds thresholds;
266
267 /* thresholds for mem+swap usage. RCU-protected */
268 struct mem_cgroup_thresholds memsw_thresholds;
269
270 /* For oom notifier event fd */
271 struct list_head oom_notify;
272
273 /*
274 * Should we move charges of a task when a task is moved into this
275 * mem_cgroup ? And what type of charges should we move ?
276 */
277 unsigned long move_charge_at_immigrate;
278 /*
279 * percpu counter.
280 */
281 struct mem_cgroup_stat_cpu *stat;
282 /*
283 * used when a cpu is offlined or other synchronizations
284 * See mem_cgroup_read_stat().
285 */
286 struct mem_cgroup_stat_cpu nocpu_base;
287 spinlock_t pcp_counter_lock;
288};
289
290/* Stuffs for move charges at task migration. */
291/*
292 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
293 * left-shifted bitmap of these types.
294 */
295enum move_type {
296 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
297 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
298 NR_MOVE_TYPE,
299};
300
301/* "mc" and its members are protected by cgroup_mutex */
302static struct move_charge_struct {
303 spinlock_t lock; /* for from, to */
304 struct mem_cgroup *from;
305 struct mem_cgroup *to;
306 unsigned long precharge;
307 unsigned long moved_charge;
308 unsigned long moved_swap;
309 struct task_struct *moving_task; /* a task moving charges */
310 wait_queue_head_t waitq; /* a waitq for other context */
311} mc = {
312 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
313 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
314};
315
316static bool move_anon(void)
317{
318 return test_bit(MOVE_CHARGE_TYPE_ANON,
319 &mc.to->move_charge_at_immigrate);
320}
321
322static bool move_file(void)
323{
324 return test_bit(MOVE_CHARGE_TYPE_FILE,
325 &mc.to->move_charge_at_immigrate);
326}
327
328/*
329 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
330 * limit reclaim to prevent infinite loops, if they ever occur.
331 */
332#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
333#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
334
335enum charge_type {
336 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
337 MEM_CGROUP_CHARGE_TYPE_MAPPED,
338 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
339 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
340 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
341 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
342 NR_CHARGE_TYPE,
343};
344
345/* for encoding cft->private value on file */
346#define _MEM (0)
347#define _MEMSWAP (1)
348#define _OOM_TYPE (2)
349#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
350#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
351#define MEMFILE_ATTR(val) ((val) & 0xffff)
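/*
 * For illustration: a cftype such as memory.limit_in_bytes sets
 * .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT); its read/write handlers then
 * recover the counter type with MEMFILE_TYPE() (_MEM vs. _MEMSWAP vs.
 * _OOM_TYPE) and the attribute with MEMFILE_ATTR() (RES_LIMIT here) from
 * that packed value.
 */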
352/* Used for OOM notifier */
353#define OOM_CONTROL (0)
354
355/*
356 * Reclaim flags for mem_cgroup_hierarchical_reclaim
357 */
358#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
359#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
360#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
361#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
362#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
363#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
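/*
 * For illustration: a limit-shrinking path would pass
 * MEM_CGROUP_RECLAIM_SHRINK (ORed with MEM_CGROUP_RECLAIM_NOSWAP when the
 * mem+swap limit is shrunk), while soft limit reclaim passes
 * MEM_CGROUP_RECLAIM_SOFT; mem_cgroup_hierarchical_reclaim() below tests
 * these bits as noswap, shrink and check_soft.
 */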
364
365static void mem_cgroup_get(struct mem_cgroup *mem);
366static void mem_cgroup_put(struct mem_cgroup *mem);
367static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
368static void drain_all_stock_async(struct mem_cgroup *mem);
369
370static struct mem_cgroup_per_zone *
371mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
372{
373 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
374}
375
376struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
377{
378 return &mem->css;
379}
380
381static struct mem_cgroup_per_zone *
382page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
383{
384 int nid = page_to_nid(page);
385 int zid = page_zonenum(page);
386
387 return mem_cgroup_zoneinfo(mem, nid, zid);
388}
389
390static struct mem_cgroup_tree_per_zone *
391soft_limit_tree_node_zone(int nid, int zid)
392{
393 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
394}
395
396static struct mem_cgroup_tree_per_zone *
397soft_limit_tree_from_page(struct page *page)
398{
399 int nid = page_to_nid(page);
400 int zid = page_zonenum(page);
401
402 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
403}
404
405static void
406__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
407 struct mem_cgroup_per_zone *mz,
408 struct mem_cgroup_tree_per_zone *mctz,
409 unsigned long long new_usage_in_excess)
410{
411 struct rb_node **p = &mctz->rb_root.rb_node;
412 struct rb_node *parent = NULL;
413 struct mem_cgroup_per_zone *mz_node;
414
415 if (mz->on_tree)
416 return;
417
418 mz->usage_in_excess = new_usage_in_excess;
419 if (!mz->usage_in_excess)
420 return;
421 while (*p) {
422 parent = *p;
423 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
424 tree_node);
425 if (mz->usage_in_excess < mz_node->usage_in_excess)
426 p = &(*p)->rb_left;
427 /*
428 * We can't avoid mem cgroups that are over their soft
429 * limit by the same amount
430 */
431 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
432 p = &(*p)->rb_right;
433 }
434 rb_link_node(&mz->tree_node, parent, p);
435 rb_insert_color(&mz->tree_node, &mctz->rb_root);
436 mz->on_tree = true;
437}
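/*
 * A brief worked example of the ordering above: cgroups with a larger
 * usage_in_excess land further to the right, so the rightmost node is the
 * one most over its soft limit; __mem_cgroup_largest_soft_limit_node()
 * below simply takes rb_last() to pick the next reclaim victim.
 */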
438
439static void
440__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
441 struct mem_cgroup_per_zone *mz,
442 struct mem_cgroup_tree_per_zone *mctz)
443{
444 if (!mz->on_tree)
445 return;
446 rb_erase(&mz->tree_node, &mctz->rb_root);
447 mz->on_tree = false;
448}
449
450static void
451mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
452 struct mem_cgroup_per_zone *mz,
453 struct mem_cgroup_tree_per_zone *mctz)
454{
455 spin_lock(&mctz->lock);
456 __mem_cgroup_remove_exceeded(mem, mz, mctz);
457 spin_unlock(&mctz->lock);
458}
459
460
461static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
462{
463 unsigned long long excess;
464 struct mem_cgroup_per_zone *mz;
465 struct mem_cgroup_tree_per_zone *mctz;
466 int nid = page_to_nid(page);
467 int zid = page_zonenum(page);
468 mctz = soft_limit_tree_from_page(page);
469
470 /*
471 * Necessary to update all ancestors when hierarchy is used,
472 * because their event counters are not touched.
473 */
474 for (; mem; mem = parent_mem_cgroup(mem)) {
475 mz = mem_cgroup_zoneinfo(mem, nid, zid);
476 excess = res_counter_soft_limit_excess(&mem->res);
477 /*
478 * We have to update the tree if mz is on RB-tree or
479 * mem is over its softlimit.
480 */
481 if (excess || mz->on_tree) {
482 spin_lock(&mctz->lock);
483 /* if on-tree, remove it */
484 if (mz->on_tree)
485 __mem_cgroup_remove_exceeded(mem, mz, mctz);
486 /*
487 * Insert again. mz->usage_in_excess will be updated.
488 * If excess is 0, no tree ops.
489 */
490 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
491 spin_unlock(&mctz->lock);
492 }
493 }
494}
495
496static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
497{
498 int node, zone;
499 struct mem_cgroup_per_zone *mz;
500 struct mem_cgroup_tree_per_zone *mctz;
501
502 for_each_node_state(node, N_POSSIBLE) {
503 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
504 mz = mem_cgroup_zoneinfo(mem, node, zone);
505 mctz = soft_limit_tree_node_zone(node, zone);
506 mem_cgroup_remove_exceeded(mem, mz, mctz);
507 }
508 }
509}
510
511static struct mem_cgroup_per_zone *
512__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
513{
514 struct rb_node *rightmost = NULL;
515 struct mem_cgroup_per_zone *mz;
516
517retry:
518 mz = NULL;
519 rightmost = rb_last(&mctz->rb_root);
520 if (!rightmost)
521 goto done; /* Nothing to reclaim from */
522
523 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
524 /*
525 * Remove the node now but someone else can add it back,
526 * we will add it back at the end of reclaim to its correct
527 * position in the tree.
528 */
529 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
530 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
531 !css_tryget(&mz->mem->css))
532 goto retry;
533done:
534 return mz;
535}
536
537static struct mem_cgroup_per_zone *
538mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
539{
540 struct mem_cgroup_per_zone *mz;
541
542 spin_lock(&mctz->lock);
543 mz = __mem_cgroup_largest_soft_limit_node(mctz);
544 spin_unlock(&mctz->lock);
545 return mz;
546}
547
548/*
549 * Implementation Note: reading percpu statistics for memcg.
550 *
551 * Both vmstat[] and percpu_counter use thresholds and periodic
552 * synchronization to implement a "quick" read. There is a trade-off between
553 * reading cost and precision of the value, so we may later implement
554 * periodic synchronization of the counters in memcg as well.
555 *
556 * But this _read() function is currently used for the user interface. Users
557 * account memory usage per memory cgroup and _always_ require an exact
558 * value. Even with a quick-and-fuzzy read, we would still have to visit
559 * all online cpus and sum up the values. So, for now, the extra
560 * synchronization is not implemented (it is only done for cpu hotplug).
561 *
562 * If there are kernel-internal users which can make use of a not-exact
563 * value, and reading all cpu values becomes a performance bottleneck in some
564 * common workload, thresholds and synchronization as in vmstat[] should be
565 * implemented.
566 */
567static long mem_cgroup_read_stat(struct mem_cgroup *mem,
568 enum mem_cgroup_stat_index idx)
569{
570 long val = 0;
571 int cpu;
572
573 get_online_cpus();
574 for_each_online_cpu(cpu)
575 val += per_cpu(mem->stat->count[idx], cpu);
576#ifdef CONFIG_HOTPLUG_CPU
577 spin_lock(&mem->pcp_counter_lock);
578 val += mem->nocpu_base.count[idx];
579 spin_unlock(&mem->pcp_counter_lock);
580#endif
581 put_online_cpus();
582 return val;
583}
584
585static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
586 bool charge)
587{
588 int val = (charge) ? 1 : -1;
589 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
590}
591
592void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
593{
594 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
595}
596
597void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
598{
599 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
600}
601
602static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
603 enum mem_cgroup_events_index idx)
604{
605 unsigned long val = 0;
606 int cpu;
607
608 for_each_online_cpu(cpu)
609 val += per_cpu(mem->stat->events[idx], cpu);
610#ifdef CONFIG_HOTPLUG_CPU
611 spin_lock(&mem->pcp_counter_lock);
612 val += mem->nocpu_base.events[idx];
613 spin_unlock(&mem->pcp_counter_lock);
614#endif
615 return val;
616}
617
618static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
619 bool file, int nr_pages)
620{
621 preempt_disable();
622
623 if (file)
624 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
625 else
626 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
627
628 /* pagein of a big page is an event. So, ignore page size */
629 if (nr_pages > 0)
630 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
631 else {
632 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
633 nr_pages = -nr_pages; /* for event */
634 }
635
636 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
637
638 preempt_enable();
639}
640
641unsigned long
642mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
643 unsigned int lru_mask)
644{
645 struct mem_cgroup_per_zone *mz;
646 enum lru_list l;
647 unsigned long ret = 0;
648
649 mz = mem_cgroup_zoneinfo(mem, nid, zid);
650
651 for_each_lru(l) {
652 if (BIT(l) & lru_mask)
653 ret += MEM_CGROUP_ZSTAT(mz, l);
654 }
655 return ret;
656}
657
658static unsigned long
659mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
660 int nid, unsigned int lru_mask)
661{
662 u64 total = 0;
663 int zid;
664
665 for (zid = 0; zid < MAX_NR_ZONES; zid++)
666 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
667
668 return total;
669}
670
671static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
672 unsigned int lru_mask)
673{
674 int nid;
675 u64 total = 0;
676
677 for_each_node_state(nid, N_HIGH_MEMORY)
678 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
679 return total;
680}
681
682static bool __memcg_event_check(struct mem_cgroup *mem, int target)
683{
684 unsigned long val, next;
685
686 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
687 next = this_cpu_read(mem->stat->targets[target]);
688 /* from time_after() in jiffies.h */
689 return ((long)next - (long)val < 0);
690}
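/*
 * For illustration, the time_after()-style comparison above stays correct
 * across counter wraparound: if the event count has wrapped to val == 5
 * while the stored target is next == ULONG_MAX - 10, then
 * (long)next - (long)val is negative and the target is treated as reached,
 * which is the desired behaviour.
 */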
691
692static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
693{
694 unsigned long val, next;
695
696 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
697
698 switch (target) {
699 case MEM_CGROUP_TARGET_THRESH:
700 next = val + THRESHOLDS_EVENTS_TARGET;
701 break;
702 case MEM_CGROUP_TARGET_SOFTLIMIT:
703 next = val + SOFTLIMIT_EVENTS_TARGET;
704 break;
705 case MEM_CGROUP_TARGET_NUMAINFO:
706 next = val + NUMAINFO_EVENTS_TARGET;
707 break;
708 default:
709 return;
710 }
711
712 this_cpu_write(mem->stat->targets[target], next);
713}
714
715/*
716 * Check events in order.
717 *
718 */
719static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
720{
721 /* threshold event is triggered in finer grain than soft limit */
722 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
723 mem_cgroup_threshold(mem);
724 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
725 if (unlikely(__memcg_event_check(mem,
726 MEM_CGROUP_TARGET_SOFTLIMIT))) {
727 mem_cgroup_update_tree(mem, page);
728 __mem_cgroup_target_update(mem,
729 MEM_CGROUP_TARGET_SOFTLIMIT);
730 }
731#if MAX_NUMNODES > 1
732 if (unlikely(__memcg_event_check(mem,
733 MEM_CGROUP_TARGET_NUMAINFO))) {
734 atomic_inc(&mem->numainfo_events);
735 __mem_cgroup_target_update(mem,
736 MEM_CGROUP_TARGET_NUMAINFO);
737 }
738#endif
739 }
740}
741
742static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
743{
744 return container_of(cgroup_subsys_state(cont,
745 mem_cgroup_subsys_id), struct mem_cgroup,
746 css);
747}
748
749struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
750{
751 /*
752 * mm_update_next_owner() may clear mm->owner to NULL
753 * if it races with swapoff, page migration, etc.
754 * So this can be called with p == NULL.
755 */
756 if (unlikely(!p))
757 return NULL;
758
759 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
760 struct mem_cgroup, css);
761}
762
763struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
764{
765 struct mem_cgroup *mem = NULL;
766
767 if (!mm)
768 return NULL;
769 /*
770 * Because we have no locks, mm->owner may be being moved to another
771 * cgroup. We use css_tryget() here even if this looks
772 * pessimistic (rather than adding locks here).
773 */
774 rcu_read_lock();
775 do {
776 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
777 if (unlikely(!mem))
778 break;
779 } while (!css_tryget(&mem->css));
780 rcu_read_unlock();
781 return mem;
782}
783
784/* The caller has to guarantee "mem" exists before calling this */
785static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
786{
787 struct cgroup_subsys_state *css;
788 int found;
789
790 if (!mem) /* ROOT cgroup has the smallest ID */
791 return root_mem_cgroup; /*css_put/get against root is ignored*/
792 if (!mem->use_hierarchy) {
793 if (css_tryget(&mem->css))
794 return mem;
795 return NULL;
796 }
797 rcu_read_lock();
798 /*
799 * searching a memory cgroup which has the smallest ID under given
800 * ROOT cgroup. (ID >= 1)
801 */
802 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
803 if (css && css_tryget(css))
804 mem = container_of(css, struct mem_cgroup, css);
805 else
806 mem = NULL;
807 rcu_read_unlock();
808 return mem;
809}
810
811static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
812 struct mem_cgroup *root,
813 bool cond)
814{
815 int nextid = css_id(&iter->css) + 1;
816 int found;
817 int hierarchy_used;
818 struct cgroup_subsys_state *css;
819
820 hierarchy_used = iter->use_hierarchy;
821
822 css_put(&iter->css);
823 /* If no ROOT, walk all, ignore hierarchy */
824 if (!cond || (root && !hierarchy_used))
825 return NULL;
826
827 if (!root)
828 root = root_mem_cgroup;
829
830 do {
831 iter = NULL;
832 rcu_read_lock();
833
834 css = css_get_next(&mem_cgroup_subsys, nextid,
835 &root->css, &found);
836 if (css && css_tryget(css))
837 iter = container_of(css, struct mem_cgroup, css);
838 rcu_read_unlock();
839 /* If css is NULL, no more cgroups will be found */
840 nextid = found + 1;
841 } while (css && !iter);
842
843 return iter;
844}
845/*
846 * for_each_mem_cgroup_tree() visits all cgroups under the tree. Be careful:
847 * breaking out of the loop is not allowed because we hold reference counts.
848 * Instead, set "cond" to false and "continue" to exit the loop.
849 */
850#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
851 for (iter = mem_cgroup_start_loop(root);\
852 iter != NULL;\
853 iter = mem_cgroup_get_next(iter, root, cond))
854
855#define for_each_mem_cgroup_tree(iter, root) \
856 for_each_mem_cgroup_tree_cond(iter, root, true)
857
858#define for_each_mem_cgroup_all(iter) \
859 for_each_mem_cgroup_tree_cond(iter, NULL, true)
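/*
 * Usage sketch (with a hypothetical helper name): to stop the walk early,
 * flip "cond" instead of breaking, so the iterator can still drop its css
 * reference on the next step. mem_cgroup_oom_lock() below is a real user
 * of this pattern.
 *
 *	bool cond = true;
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree_cond(iter, root, cond) {
 *		if (my_visit(iter))	 (hypothetical per-cgroup callback)
 *			cond = false;	 (exit after this iteration)
 *	}
 */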
860
861
862static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
863{
864 return (mem == root_mem_cgroup);
865}
866
867void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
868{
869 struct mem_cgroup *mem;
870
871 if (!mm)
872 return;
873
874 rcu_read_lock();
875 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
876 if (unlikely(!mem))
877 goto out;
878
879 switch (idx) {
880 case PGMAJFAULT:
881 mem_cgroup_pgmajfault(mem, 1);
882 break;
883 case PGFAULT:
884 mem_cgroup_pgfault(mem, 1);
885 break;
886 default:
887 BUG();
888 }
889out:
890 rcu_read_unlock();
891}
892EXPORT_SYMBOL(mem_cgroup_count_vm_event);
893
894/*
895 * Following LRU functions are allowed to be used without PCG_LOCK.
896 * Operations are called by global LRU routines independently of memcg.
897 * What we have to take care of here is the validity of pc->mem_cgroup.
898 *
899 * Changes to pc->mem_cgroup happen on
900 * 1. charge
901 * 2. moving account
902 * In the typical case, "charge" is done before add-to-lru. The exception is
903 * SwapCache, which is added to the LRU before it is charged.
904 * If the PCG_USED bit is not set, page_cgroup is not added to this private LRU.
905 * When moving the account, the page is not on the LRU. It is isolated.
906 */
907
908void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
909{
910 struct page_cgroup *pc;
911 struct mem_cgroup_per_zone *mz;
912
913 if (mem_cgroup_disabled())
914 return;
915 pc = lookup_page_cgroup(page);
916 /* can happen while we handle swapcache. */
917 if (!TestClearPageCgroupAcctLRU(pc))
918 return;
919 VM_BUG_ON(!pc->mem_cgroup);
920 /*
921 * We don't check PCG_USED bit. It's cleared when the "page" is finally
922 * removed from global LRU.
923 */
924 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
925 /* huge page split is done under lru_lock. so, we have no races. */
926 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
927 if (mem_cgroup_is_root(pc->mem_cgroup))
928 return;
929 VM_BUG_ON(list_empty(&pc->lru));
930 list_del_init(&pc->lru);
931}
932
933void mem_cgroup_del_lru(struct page *page)
934{
935 mem_cgroup_del_lru_list(page, page_lru(page));
936}
937
938/*
939 * Writeback is about to end against a page which has been marked for immediate
940 * reclaim. If it still appears to be reclaimable, move it to the tail of the
941 * inactive list.
942 */
943void mem_cgroup_rotate_reclaimable_page(struct page *page)
944{
945 struct mem_cgroup_per_zone *mz;
946 struct page_cgroup *pc;
947 enum lru_list lru = page_lru(page);
948
949 if (mem_cgroup_disabled())
950 return;
951
952 pc = lookup_page_cgroup(page);
953 /* unused or root page is not rotated. */
954 if (!PageCgroupUsed(pc))
955 return;
956 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
957 smp_rmb();
958 if (mem_cgroup_is_root(pc->mem_cgroup))
959 return;
960 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
961 list_move_tail(&pc->lru, &mz->lists[lru]);
962}
963
964void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
965{
966 struct mem_cgroup_per_zone *mz;
967 struct page_cgroup *pc;
968
969 if (mem_cgroup_disabled())
970 return;
971
972 pc = lookup_page_cgroup(page);
973 /* unused or root page is not rotated. */
974 if (!PageCgroupUsed(pc))
975 return;
976 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
977 smp_rmb();
978 if (mem_cgroup_is_root(pc->mem_cgroup))
979 return;
980 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
981 list_move(&pc->lru, &mz->lists[lru]);
982}
983
984void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
985{
986 struct page_cgroup *pc;
987 struct mem_cgroup_per_zone *mz;
988
989 if (mem_cgroup_disabled())
990 return;
991 pc = lookup_page_cgroup(page);
992 VM_BUG_ON(PageCgroupAcctLRU(pc));
993 if (!PageCgroupUsed(pc))
994 return;
995 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
996 smp_rmb();
997 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
998 /* huge page split is done under lru_lock. so, we have no races. */
999 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1000 SetPageCgroupAcctLRU(pc);
1001 if (mem_cgroup_is_root(pc->mem_cgroup))
1002 return;
1003 list_add(&pc->lru, &mz->lists[lru]);
1004}
1005
1006/*
1007 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1008 * while it's linked to the lru because the page may be reused after it's fully
1009 * uncharged. To handle that, unlink page_cgroup from the LRU when we charge it
1010 * again. This is done under lock_page; zone->lru_lock is never held here.
1011 */
1012static void mem_cgroup_lru_del_before_commit(struct page *page)
1013{
1014 unsigned long flags;
1015 struct zone *zone = page_zone(page);
1016 struct page_cgroup *pc = lookup_page_cgroup(page);
1017
1018 /*
1019 * Doing this check without taking ->lru_lock seems wrong but it
1020 * is safe, because if page_cgroup's USED bit is unset, the page
1021 * will not be added to any memcg's LRU, and if page_cgroup's USED bit is
1022 * set, the commit after this will fail anyway.
1023 * All of this charge/uncharge is done under mutual exclusion,
1024 * so we don't need to take care of changes to the USED bit.
1025 */
1026 if (likely(!PageLRU(page)))
1027 return;
1028
1029 spin_lock_irqsave(&zone->lru_lock, flags);
1030 /*
1031 * Forget old LRU when this page_cgroup is *not* used. This Used bit
1032 * is guarded by lock_page() because the page is SwapCache.
1033 */
1034 if (!PageCgroupUsed(pc))
1035 mem_cgroup_del_lru_list(page, page_lru(page));
1036 spin_unlock_irqrestore(&zone->lru_lock, flags);
1037}
1038
1039static void mem_cgroup_lru_add_after_commit(struct page *page)
1040{
1041 unsigned long flags;
1042 struct zone *zone = page_zone(page);
1043 struct page_cgroup *pc = lookup_page_cgroup(page);
1044
1045	/* handle the case where the page is added to the LRU while we commit it */
1046 if (likely(!PageLRU(page)))
1047 return;
1048 spin_lock_irqsave(&zone->lru_lock, flags);
1049 /* link when the page is linked to LRU but page_cgroup isn't */
1050 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1051 mem_cgroup_add_lru_list(page, page_lru(page));
1052 spin_unlock_irqrestore(&zone->lru_lock, flags);
1053}
1054
1055
1056void mem_cgroup_move_lists(struct page *page,
1057 enum lru_list from, enum lru_list to)
1058{
1059 if (mem_cgroup_disabled())
1060 return;
1061 mem_cgroup_del_lru_list(page, from);
1062 mem_cgroup_add_lru_list(page, to);
1063}
1064
1065/*
1066 * Checks whether the given mem is the same as root_mem or lies within
1067 * root_mem's hierarchy subtree.
1068 */
1069static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
1070 struct mem_cgroup *mem)
1071{
1072 if (root_mem != mem) {
1073 return (root_mem->use_hierarchy &&
1074 css_is_ancestor(&mem->css, &root_mem->css));
1075 }
1076
1077 return true;
1078}
1079
1080int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1081{
1082 int ret;
1083 struct mem_cgroup *curr = NULL;
1084 struct task_struct *p;
1085
1086 p = find_lock_task_mm(task);
1087 if (!p)
1088 return 0;
1089 curr = try_get_mem_cgroup_from_mm(p->mm);
1090 task_unlock(p);
1091 if (!curr)
1092 return 0;
1093 /*
1094 * We should check use_hierarchy of "mem", not "curr". Checking
1095 * use_hierarchy of "curr" here would make this function return true if
1096 * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the
1097 * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
1098 */
1099 ret = mem_cgroup_same_or_subtree(mem, curr);
1100 css_put(&curr->css);
1101 return ret;
1102}
1103
1104static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
1105{
1106 unsigned long active;
1107 unsigned long inactive;
1108 unsigned long gb;
1109 unsigned long inactive_ratio;
1110
1111 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
1112 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1113
1114 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1115 if (gb)
1116 inactive_ratio = int_sqrt(10 * gb);
1117 else
1118 inactive_ratio = 1;
1119
1120 if (present_pages) {
1121 present_pages[0] = inactive;
1122 present_pages[1] = active;
1123 }
1124
1125 return inactive_ratio;
1126}
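/*
 * Worked example of the ratio above: with roughly 4GB of anon pages
 * (inactive + active), gb = 4 and inactive_ratio = int_sqrt(40) = 6, so
 * mem_cgroup_inactive_anon_is_low() below reports "low" once
 * inactive * 6 < active. Below 1GB the ratio falls back to 1.
 */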
1127
1128int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
1129{
1130 unsigned long active;
1131 unsigned long inactive;
1132 unsigned long present_pages[2];
1133 unsigned long inactive_ratio;
1134
1135 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1136
1137 inactive = present_pages[0];
1138 active = present_pages[1];
1139
1140 if (inactive * inactive_ratio < active)
1141 return 1;
1142
1143 return 0;
1144}
1145
1146int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1147{
1148 unsigned long active;
1149 unsigned long inactive;
1150
1151 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
1152 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
1153
1154 return (active > inactive);
1155}
1156
1157struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1158 struct zone *zone)
1159{
1160 int nid = zone_to_nid(zone);
1161 int zid = zone_idx(zone);
1162 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1163
1164 return &mz->reclaim_stat;
1165}
1166
1167struct zone_reclaim_stat *
1168mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1169{
1170 struct page_cgroup *pc;
1171 struct mem_cgroup_per_zone *mz;
1172
1173 if (mem_cgroup_disabled())
1174 return NULL;
1175
1176 pc = lookup_page_cgroup(page);
1177 if (!PageCgroupUsed(pc))
1178 return NULL;
1179 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1180 smp_rmb();
1181 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1182 return &mz->reclaim_stat;
1183}
1184
1185unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1186 struct list_head *dst,
1187 unsigned long *scanned, int order,
1188 int mode, struct zone *z,
1189 struct mem_cgroup *mem_cont,
1190 int active, int file)
1191{
1192 unsigned long nr_taken = 0;
1193 struct page *page;
1194 unsigned long scan;
1195 LIST_HEAD(pc_list);
1196 struct list_head *src;
1197 struct page_cgroup *pc, *tmp;
1198 int nid = zone_to_nid(z);
1199 int zid = zone_idx(z);
1200 struct mem_cgroup_per_zone *mz;
1201 int lru = LRU_FILE * file + active;
1202 int ret;
1203
1204 BUG_ON(!mem_cont);
1205 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1206 src = &mz->lists[lru];
1207
1208 scan = 0;
1209 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1210 if (scan >= nr_to_scan)
1211 break;
1212
1213 if (unlikely(!PageCgroupUsed(pc)))
1214 continue;
1215
1216 page = lookup_cgroup_page(pc);
1217
1218 if (unlikely(!PageLRU(page)))
1219 continue;
1220
1221 scan++;
1222 ret = __isolate_lru_page(page, mode, file);
1223 switch (ret) {
1224 case 0:
1225 list_move(&page->lru, dst);
1226 mem_cgroup_del_lru(page);
1227 nr_taken += hpage_nr_pages(page);
1228 break;
1229 case -EBUSY:
1230 /* we don't affect global LRU but rotate in our LRU */
1231 mem_cgroup_rotate_lru_list(page, page_lru(page));
1232 break;
1233 default:
1234 break;
1235 }
1236 }
1237
1238 *scanned = scan;
1239
1240 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1241 0, 0, 0, mode);
1242
1243 return nr_taken;
1244}
1245
1246#define mem_cgroup_from_res_counter(counter, member) \
1247 container_of(counter, struct mem_cgroup, member)
1248
1249/**
1250 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1251 * @mem: the memory cgroup
1252 *
1253 * Returns the maximum amount of memory @mem can be charged with, in
1254 * pages.
1255 */
1256static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1257{
1258 unsigned long long margin;
1259
1260 margin = res_counter_margin(&mem->res);
1261 if (do_swap_account)
1262 margin = min(margin, res_counter_margin(&mem->memsw));
1263 return margin >> PAGE_SHIFT;
1264}
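/*
 * Worked example for the margin above (assuming 4KB pages): with a 100MB
 * limit and 96MB of usage, res_counter_margin() returns 4MB and this
 * function reports 1024 pages; with swap accounting enabled, the smaller
 * of the memory and mem+swap margins wins.
 */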
1265
1266int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1267{
1268 struct cgroup *cgrp = memcg->css.cgroup;
1269
1270 /* root ? */
1271 if (cgrp->parent == NULL)
1272 return vm_swappiness;
1273
1274 return memcg->swappiness;
1275}
1276
1277static void mem_cgroup_start_move(struct mem_cgroup *mem)
1278{
1279 int cpu;
1280
1281 get_online_cpus();
1282 spin_lock(&mem->pcp_counter_lock);
1283 for_each_online_cpu(cpu)
1284 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1285 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1286 spin_unlock(&mem->pcp_counter_lock);
1287 put_online_cpus();
1288
1289 synchronize_rcu();
1290}
1291
1292static void mem_cgroup_end_move(struct mem_cgroup *mem)
1293{
1294 int cpu;
1295
1296 if (!mem)
1297 return;
1298 get_online_cpus();
1299 spin_lock(&mem->pcp_counter_lock);
1300 for_each_online_cpu(cpu)
1301 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1302 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1303 spin_unlock(&mem->pcp_counter_lock);
1304 put_online_cpus();
1305}
1306/*
1307 * Two routines for checking whether "mem" is under move_account() or not.
1308 *
1309 * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
1310 *			  to avoid races in accounting. If true,
1311 *			  pc->mem_cgroup may be overwritten.
1312 *
1313 * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
1314 *			  under the hierarchy of moving cgroups. This is used
1315 *			  for waiting at high memory pressure caused by "move".
1316 */
1317
1318static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1319{
1320 VM_BUG_ON(!rcu_read_lock_held());
1321 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1322}
1323
1324static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1325{
1326 struct mem_cgroup *from;
1327 struct mem_cgroup *to;
1328 bool ret = false;
1329 /*
1330 * Unlike the task_move routines, we access mc.to and mc.from without the
1331 * mutual exclusion of cgroup_mutex. Here, we take the spinlock instead.
1332 */
1333 spin_lock(&mc.lock);
1334 from = mc.from;
1335 to = mc.to;
1336 if (!from)
1337 goto unlock;
1338
1339 ret = mem_cgroup_same_or_subtree(mem, from)
1340 || mem_cgroup_same_or_subtree(mem, to);
1341unlock:
1342 spin_unlock(&mc.lock);
1343 return ret;
1344}
1345
1346static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1347{
1348 if (mc.moving_task && current != mc.moving_task) {
1349 if (mem_cgroup_under_move(mem)) {
1350 DEFINE_WAIT(wait);
1351 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1352 /* moving charge context might have finished. */
1353 if (mc.moving_task)
1354 schedule();
1355 finish_wait(&mc.waitq, &wait);
1356 return true;
1357 }
1358 }
1359 return false;
1360}
1361
1362/**
1363 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1364 * @memcg: The memory cgroup that went over limit
1365 * @p: Task that is going to be killed
1366 *
1367 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1368 * enabled
1369 */
1370void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1371{
1372 struct cgroup *task_cgrp;
1373 struct cgroup *mem_cgrp;
1374 /*
1375 * Need a buffer in BSS, can't rely on allocations. The code relies
1376 * on the assumption that OOM is serialized for memory controller.
1377 * If this assumption is broken, revisit this code.
1378 */
1379 static char memcg_name[PATH_MAX];
1380 int ret;
1381
1382 if (!memcg || !p)
1383 return;
1384
1385
1386 rcu_read_lock();
1387
1388 mem_cgrp = memcg->css.cgroup;
1389 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1390
1391 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1392 if (ret < 0) {
1393 /*
1394 * Unfortunately, we are unable to convert to a useful name,
1395 * but we'll still print out the usage information.
1396 */
1397 rcu_read_unlock();
1398 goto done;
1399 }
1400 rcu_read_unlock();
1401
1402 printk(KERN_INFO "Task in %s killed", memcg_name);
1403
1404 rcu_read_lock();
1405 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1406 if (ret < 0) {
1407 rcu_read_unlock();
1408 goto done;
1409 }
1410 rcu_read_unlock();
1411
1412 /*
1413 * Continues from above, so we don't need a KERN_ level
1414 */
1415 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1416done:
1417
1418 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1419 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1420 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1421 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1422 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1423 "failcnt %llu\n",
1424 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1425 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1426 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1427}
1428
1429/*
1430 * This function returns the number of memcgs under the hierarchy tree.
1431 * Returns 1 (self count) if there are no children.
1432 */
1433static int mem_cgroup_count_children(struct mem_cgroup *mem)
1434{
1435 int num = 0;
1436 struct mem_cgroup *iter;
1437
1438 for_each_mem_cgroup_tree(iter, mem)
1439 num++;
1440 return num;
1441}
1442
1443/*
1444 * Return the memory (and swap, if configured) limit for a memcg.
1445 */
1446u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1447{
1448 u64 limit;
1449 u64 memsw;
1450
1451 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1452 limit += total_swap_pages << PAGE_SHIFT;
1453
1454 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1455 /*
1456 * If memsw is finite and limits the amount of swap space available
1457 * to this memcg, return that limit.
1458 */
1459 return min(limit, memsw);
1460}
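/*
 * For illustration: with a 1GB memory limit, 2GB of total swap and an
 * unlimited mem+swap counter, this returns min(1GB + 2GB, RESOURCE_MAX),
 * i.e. 3GB; if memsw were capped at 1.5GB instead, that tighter limit
 * would be returned.
 */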
1461
1462/*
1463 * Visit the first child (need not be the first child as per the ordering
1464 * of the cgroup list, since we track last_scanned_child) of @mem and use
1465 * that to reclaim free pages from.
1466 */
1467static struct mem_cgroup *
1468mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1469{
1470 struct mem_cgroup *ret = NULL;
1471 struct cgroup_subsys_state *css;
1472 int nextid, found;
1473
1474 if (!root_mem->use_hierarchy) {
1475 css_get(&root_mem->css);
1476 ret = root_mem;
1477 }
1478
1479 while (!ret) {
1480 rcu_read_lock();
1481 nextid = root_mem->last_scanned_child + 1;
1482 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1483 &found);
1484 if (css && css_tryget(css))
1485 ret = container_of(css, struct mem_cgroup, css);
1486
1487 rcu_read_unlock();
1488 /* Updates scanning parameter */
1489 if (!css) {
1490 /* this means start scan from ID:1 */
1491 root_mem->last_scanned_child = 0;
1492 } else
1493 root_mem->last_scanned_child = found;
1494 }
1495
1496 return ret;
1497}
1498
1499/**
1500 * test_mem_cgroup_node_reclaimable
1501 * @mem: the target memcg
1502 * @nid: the node ID to be checked.
1503 * @noswap : specify true here if the user wants file-only information.
1504 *
1505 * This function returns whether the specified memcg contains any
1506 * reclaimable pages on a node. Returns true if there are any reclaimable
1507 * pages in the node.
1508 */
1509static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1510 int nid, bool noswap)
1511{
1512 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
1513 return true;
1514 if (noswap || !total_swap_pages)
1515 return false;
1516 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
1517 return true;
1518 return false;
1519
1520}
1521#if MAX_NUMNODES > 1
1522
1523/*
1524 * Always updating the nodemask is not very good - even if we have an empty
1525 * list or the wrong list here, we can start from some node and traverse all
1526 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1527 *
1528 */
1529static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1530{
1531 int nid;
1532 /*
1533 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1534 * pagein/pageout changes since the last update.
1535 */
1536 if (!atomic_read(&mem->numainfo_events))
1537 return;
1538 if (atomic_inc_return(&mem->numainfo_updating) > 1)
1539 return;
1540
1541 /* make a nodemask where this memcg uses memory from */
1542 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1543
1544 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1545
1546 if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
1547 node_clear(nid, mem->scan_nodes);
1548 }
1549
1550 atomic_set(&mem->numainfo_events, 0);
1551 atomic_set(&mem->numainfo_updating, 0);
1552}
1553
1554/*
1555 * Selecting a node where we start reclaim from. Because all we need is to
1556 * reduce the usage counter, starting from anywhere is OK. Considering
1557 * memory reclaim from the current node, there are pros and cons.
1558 *
1559 * Freeing memory from current node means freeing memory from a node which
1560 * we'll use or we've used. So, it may make LRU bad. And if several threads
1561 * hit limits, it will see a contention on a node. But freeing from remote
1562 * node means more costs for memory reclaim because of memory latency.
1563 *
1564 * Now, we use round-robin. Better algorithm is welcomed.
1565 */
1566int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1567{
1568 int node;
1569
1570 mem_cgroup_may_update_nodemask(mem);
1571 node = mem->last_scanned_node;
1572
1573 node = next_node(node, mem->scan_nodes);
1574 if (node == MAX_NUMNODES)
1575 node = first_node(mem->scan_nodes);
1576 /*
1577 * We call this when we hit the limit, not when pages are added to the LRU.
1578 * No LRU may hold pages because all pages are UNEVICTABLE, or because the
1579 * memcg is too small and all pages are not on any LRU. In that case,
1580 * we use the current node.
1581 */
1582 if (unlikely(node == MAX_NUMNODES))
1583 node = numa_node_id();
1584
1585 mem->last_scanned_node = node;
1586 return node;
1587}
1588
1589/*
1590 * Check all nodes whether it contains reclaimable pages or not.
1591 * For quick scan, we make use of scan_nodes. This will allow us to skip
1592 * unused nodes. But scan_nodes is lazily updated and may not contain
1593 * enough new information. We need to double check.
1594 */
1595bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1596{
1597 int nid;
1598
1599 /*
1600 * quick check...making use of scan_node.
1601 * We can skip unused nodes.
1602 */
1603 if (!nodes_empty(mem->scan_nodes)) {
1604 for (nid = first_node(mem->scan_nodes);
1605 nid < MAX_NUMNODES;
1606 nid = next_node(nid, mem->scan_nodes)) {
1607
1608 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1609 return true;
1610 }
1611 }
1612 /*
1613 * Check rest of nodes.
1614 */
1615 for_each_node_state(nid, N_HIGH_MEMORY) {
1616 if (node_isset(nid, mem->scan_nodes))
1617 continue;
1618 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1619 return true;
1620 }
1621 return false;
1622}
1623
1624#else
1625int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1626{
1627 return 0;
1628}
1629
1630bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1631{
1632 return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
1633}
1634#endif
1635
1636/*
1637 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1638 * we reclaimed from, so that we don't end up penalizing one child extensively
1639 * based on its position in the children list.
1640 *
1641 * root_mem is the original ancestor that we've been reclaiming from.
1642 *
1643 * We give up and return to the caller when we visit root_mem twice.
1644 * (other groups can be removed while we're walking....)
1645 *
1646 * If shrink==true, this returns immediately, to avoid freeing too much.
1647 */
1648static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1649 struct zone *zone,
1650 gfp_t gfp_mask,
1651 unsigned long reclaim_options,
1652 unsigned long *total_scanned)
1653{
1654 struct mem_cgroup *victim;
1655 int ret, total = 0;
1656 int loop = 0;
1657 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1658 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1659 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1660 unsigned long excess;
1661 unsigned long nr_scanned;
1662
1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1664
1665	/* If memsw_is_minimum==1, swapping out is of no use. */
1666 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1667 noswap = true;
1668
1669 while (1) {
1670 victim = mem_cgroup_select_victim(root_mem);
1671 if (victim == root_mem) {
1672 loop++;
1673 /*
1674 * We are not draining per cpu cached charges during
1675 * soft limit reclaim because global reclaim doesn't
1676 * care about charges. It tries to free some memory and
1677 * charges will not give any.
1678 */
1679 if (!check_soft && loop >= 1)
1680 drain_all_stock_async(root_mem);
1681 if (loop >= 2) {
1682 /*
1683 * If we have not been able to reclaim
1684 * anything, it might be because there are
1685 * no reclaimable pages under this hierarchy
1686 */
1687 if (!check_soft || !total) {
1688 css_put(&victim->css);
1689 break;
1690 }
1691 /*
1692 * We want to do more targeted reclaim.
1693 * excess >> 2 is not too excessive, so that we do not
1694 * reclaim too much, nor too little, so that we do not keep
1695 * coming back to reclaim from this cgroup.
1696 */
1697 if (total >= (excess >> 2) ||
1698 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1699 css_put(&victim->css);
1700 break;
1701 }
1702 }
1703 }
1704 if (!mem_cgroup_reclaimable(victim, noswap)) {
1705 /* this cgroup's local usage == 0 */
1706 css_put(&victim->css);
1707 continue;
1708 }
1709 /* we use swappiness of local cgroup */
1710 if (check_soft) {
1711 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1712 noswap, zone, &nr_scanned);
1713 *total_scanned += nr_scanned;
1714 } else
1715 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1716 noswap);
1717 css_put(&victim->css);
1718 /*
1719 * When shrinking usage, we can't check whether we should stop here or
1720 * reclaim more. It depends on the caller. last_scanned_child
1721 * is enough for keeping fairness under the tree.
1722 */
1723 if (shrink)
1724 return ret;
1725 total += ret;
1726 if (check_soft) {
1727 if (!res_counter_soft_limit_excess(&root_mem->res))
1728 return total;
1729 } else if (mem_cgroup_margin(root_mem))
1730 return total;
1731 }
1732 return total;
1733}
1734
1735/*
1736 * Check whether the OOM killer is already running under our hierarchy.
1737 * If someone else is running it, return false.
1738 * Has to be called with memcg_oom_lock
1739 */
1740static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1741{
1742 struct mem_cgroup *iter, *failed = NULL;
1743 bool cond = true;
1744
1745 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1746 if (iter->oom_lock) {
1747 /*
1748 * this subtree of our hierarchy is already locked,
1749 * so we cannot grant the lock.
1750 */
1751 failed = iter;
1752 cond = false;
1753 } else
1754 iter->oom_lock = true;
1755 }
1756
1757 if (!failed)
1758 return true;
1759
1760 /*
1761 * OK, we failed to lock the whole subtree, so we have to clean up
1762 * what we set up, up to the failing subtree.
1763 */
1764 cond = true;
1765 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1766 if (iter == failed) {
1767 cond = false;
1768 continue;
1769 }
1770 iter->oom_lock = false;
1771 }
1772 return false;
1773}
1774
1775/*
1776 * Has to be called with memcg_oom_lock
1777 */
1778static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1779{
1780 struct mem_cgroup *iter;
1781
1782 for_each_mem_cgroup_tree(iter, mem)
1783 iter->oom_lock = false;
1784 return 0;
1785}
1786
1787static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1788{
1789 struct mem_cgroup *iter;
1790
1791 for_each_mem_cgroup_tree(iter, mem)
1792 atomic_inc(&iter->under_oom);
1793}
1794
1795static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1796{
1797 struct mem_cgroup *iter;
1798
1799 /*
1800 * When a new child is created while the hierarchy is under oom,
1801 * mem_cgroup_oom_lock() may not be called. We have to use
1802 * atomic_add_unless() here.
1803 */
1804 for_each_mem_cgroup_tree(iter, mem)
1805 atomic_add_unless(&iter->under_oom, -1, 0);
1806}
1807
1808static DEFINE_SPINLOCK(memcg_oom_lock);
1809static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1810
1811struct oom_wait_info {
1812 struct mem_cgroup *mem;
1813 wait_queue_t wait;
1814};
1815
1816static int memcg_oom_wake_function(wait_queue_t *wait,
1817 unsigned mode, int sync, void *arg)
1818{
1819 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
1820 *oom_wait_mem;
1821 struct oom_wait_info *oom_wait_info;
1822
1823 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1824 oom_wait_mem = oom_wait_info->mem;
1825
1826 /*
1827 * Both oom_wait_info->mem and wake_mem are stable under us,
1828 * so we can use css_is_ancestor() without worrying about RCU.
1829 */
1830 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
1831 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
1832 return 0;
1833 return autoremove_wake_function(wait, mode, sync, arg);
1834}
1835
1836static void memcg_wakeup_oom(struct mem_cgroup *mem)
1837{
1838 /* for filtering, pass "mem" as argument. */
1839 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1840}
1841
1842static void memcg_oom_recover(struct mem_cgroup *mem)
1843{
1844 if (mem && atomic_read(&mem->under_oom))
1845 memcg_wakeup_oom(mem);
1846}
1847
1848/*
1849 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1850 */
1851bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1852{
1853 struct oom_wait_info owait;
1854 bool locked, need_to_kill;
1855
1856 owait.mem = mem;
1857 owait.wait.flags = 0;
1858 owait.wait.func = memcg_oom_wake_function;
1859 owait.wait.private = current;
1860 INIT_LIST_HEAD(&owait.wait.task_list);
1861 need_to_kill = true;
1862 mem_cgroup_mark_under_oom(mem);
1863
1864 /* At first, try to OOM lock hierarchy under mem.*/
1865 spin_lock(&memcg_oom_lock);
1866 locked = mem_cgroup_oom_lock(mem);
1867 /*
1868 * Even if signal_pending(), we can't quit charge() loop without
1869 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1870 * under OOM is always welcomed, use TASK_KILLABLE here.
1871 */
1872 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1873 if (!locked || mem->oom_kill_disable)
1874 need_to_kill = false;
1875 if (locked)
1876 mem_cgroup_oom_notify(mem);
1877 spin_unlock(&memcg_oom_lock);
1878
1879 if (need_to_kill) {
1880 finish_wait(&memcg_oom_waitq, &owait.wait);
1881 mem_cgroup_out_of_memory(mem, mask);
1882 } else {
1883 schedule();
1884 finish_wait(&memcg_oom_waitq, &owait.wait);
1885 }
1886 spin_lock(&memcg_oom_lock);
1887 if (locked)
1888 mem_cgroup_oom_unlock(mem);
1889 memcg_wakeup_oom(mem);
1890 spin_unlock(&memcg_oom_lock);
1891
1892 mem_cgroup_unmark_under_oom(mem);
1893
1894 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1895 return false;
1896 /* Give chance to dying process */
1897 schedule_timeout(1);
1898 return true;
1899}
1900
1901/*
1902 * Currently used to update mapped file statistics, but the routine can be
1903 * generalized to update other statistics as well.
1904 *
1905 * Notes: Race condition
1906 *
1907 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
1908 * it tends to be costly. Under some conditions, however, we don't need
1909 * to do so _always_.
1910 *
1911 * Considering "charge", lock_page_cgroup() is not required because all
1912 * file-stat operations happen after a page is attached to the radix-tree.
1913 * There is no race with "charge".
1914 *
1915 * Considering "uncharge", we know that memcg intentionally doesn't clear
1916 * pc->mem_cgroup at "uncharge". So, we always see a valid pc->mem_cgroup
1917 * even if there is a race with "uncharge". The statistics themselves are
1918 * properly handled by flags.
1919 *
1920 * Considering "move", this is the only case where we see a race. To make the
1921 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
1922 * whether there is a possibility of a race. If there is, we take a lock.
1923 */
1924
1925void mem_cgroup_update_page_stat(struct page *page,
1926 enum mem_cgroup_page_stat_item idx, int val)
1927{
1928 struct mem_cgroup *mem;
1929 struct page_cgroup *pc = lookup_page_cgroup(page);
1930 bool need_unlock = false;
1931 unsigned long uninitialized_var(flags);
1932
1933 if (unlikely(!pc))
1934 return;
1935
1936 rcu_read_lock();
1937 mem = pc->mem_cgroup;
1938 if (unlikely(!mem || !PageCgroupUsed(pc)))
1939 goto out;
1940 /* pc->mem_cgroup is unstable ? */
1941 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
1942		/* take a lock to access pc->mem_cgroup safely */
1943 move_lock_page_cgroup(pc, &flags);
1944 need_unlock = true;
1945 mem = pc->mem_cgroup;
1946 if (!mem || !PageCgroupUsed(pc))
1947 goto out;
1948 }
1949
1950 switch (idx) {
1951 case MEMCG_NR_FILE_MAPPED:
1952 if (val > 0)
1953 SetPageCgroupFileMapped(pc);
1954 else if (!page_mapped(page))
1955 ClearPageCgroupFileMapped(pc);
1956 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1957 break;
1958 default:
1959 BUG();
1960 }
1961
1962 this_cpu_add(mem->stat->count[idx], val);
1963
1964out:
1965 if (unlikely(need_unlock))
1966 move_unlock_page_cgroup(pc, &flags);
1967 rcu_read_unlock();
1968 return;
1969}
1970EXPORT_SYMBOL(mem_cgroup_update_page_stat);
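
/*
 * A minimal usage sketch (hypothetical caller name; the real callers live
 * in the rmap code and must follow the locking rules described above):
 *
 *	static void file_rmap_example(struct page *page, int mapped)
 *	{
 *		mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED,
 *					    mapped ? 1 : -1);
 *	}
 */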
1971
1972/*
1973 * size of the first charge trial. "32" comes from vmscan.c's magic value.
1974 * TODO: larger batches may be necessary on big iron.
1975 */
1976#define CHARGE_BATCH 32U
1977struct memcg_stock_pcp {
1978 struct mem_cgroup *cached; /* this is never the root cgroup */
1979 unsigned int nr_pages;
1980 struct work_struct work;
1981 unsigned long flags;
1982#define FLUSHING_CACHED_CHARGE (0)
1983};
1984static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1985static DEFINE_MUTEX(percpu_charge_mutex);
1986
1987/*
1988 * Try to consume stocked charge on this cpu. If successful, one page's worth
1989 * of charge is consumed from the local stock and true is returned. If the
1990 * stock is empty or holds charges from a cgroup other than the current
1991 * target, false is returned and the stock will be refilled.
1992 */
1993static bool consume_stock(struct mem_cgroup *mem)
1994{
1995 struct memcg_stock_pcp *stock;
1996 bool ret = true;
1997
1998 stock = &get_cpu_var(memcg_stock);
1999 if (mem == stock->cached && stock->nr_pages)
2000 stock->nr_pages--;
2001 else /* need to call res_counter_charge */
2002 ret = false;
2003 put_cpu_var(memcg_stock);
2004 return ret;
2005}
2006
2007/*
2008 * Returns stocks cached in percpu to res_counter and reset cached information.
2009 */
2010static void drain_stock(struct memcg_stock_pcp *stock)
2011{
2012 struct mem_cgroup *old = stock->cached;
2013
2014 if (stock->nr_pages) {
2015 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2016
2017 res_counter_uncharge(&old->res, bytes);
2018 if (do_swap_account)
2019 res_counter_uncharge(&old->memsw, bytes);
2020 stock->nr_pages = 0;
2021 }
2022 stock->cached = NULL;
2023}
2024
2025/*
2026 * This must be called with preemption disabled, or by a thread that is
2027 * pinned to the local cpu.
2028 */
2029static void drain_local_stock(struct work_struct *dummy)
2030{
2031 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2032 drain_stock(stock);
2033 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2034}
2035
2036/*
2037 * Cache charges obtained from the res_counter in the local per-cpu area.
2038 * They will be consumed by consume_stock() later.
2039 */
2040static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2041{
2042 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2043
2044 if (stock->cached != mem) { /* reset if necessary */
2045 drain_stock(stock);
2046 stock->cached = mem;
2047 }
2048 stock->nr_pages += nr_pages;
2049 put_cpu_var(memcg_stock);
2050}
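
/*
 * A sketch of how the stock is intended to be used by the charge path
 * (the real logic is in __mem_cgroup_try_charge()/mem_cgroup_do_charge()
 * below; this is illustrative only):
 *
 *	if (nr_pages == 1 && consume_stock(mem))
 *		return 0;	charged from the local per-cpu cache
 *
 *	otherwise charge a whole CHARGE_BATCH from the res_counter, and
 *	park the surplus for later consume_stock() calls:
 *
 *	refill_stock(mem, CHARGE_BATCH - nr_pages);
 */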
2051
2052/*
2053 * Drain all per-CPU charge caches for the given root_mem and the
2054 * hierarchy subtree under it. The sync flag says whether we should block
2055 * until the work is done.
2056 */
2057static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2058{
2059 int cpu, curcpu;
2060
2061 /* Notify other cpus that system-wide "drain" is running */
2062 get_online_cpus();
2063 curcpu = get_cpu();
2064 for_each_online_cpu(cpu) {
2065 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2066 struct mem_cgroup *mem;
2067
2068 mem = stock->cached;
2069 if (!mem || !stock->nr_pages)
2070 continue;
2071 if (!mem_cgroup_same_or_subtree(root_mem, mem))
2072 continue;
2073 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2074 if (cpu == curcpu)
2075 drain_local_stock(&stock->work);
2076 else
2077 schedule_work_on(cpu, &stock->work);
2078 }
2079 }
2080 put_cpu();
2081
2082 if (!sync)
2083 goto out;
2084
2085 for_each_online_cpu(cpu) {
2086 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2087 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2088 flush_work(&stock->work);
2089 }
2090out:
2091 put_online_cpus();
2092}
2093
2094/*
2095 * Tries to drain stocked charges on other cpus. This function is asynchronous
2096 * and just queues a work item per cpu to drain locally on each cpu. The caller
2097 * can expect some charges to be returned to the res_counter later, but cannot
2098 * wait for that to happen.
2099 */
2100static void drain_all_stock_async(struct mem_cgroup *root_mem)
2101{
2102 /*
2103 * If someone calls draining, avoid adding more kworker runs.
2104 */
2105 if (!mutex_trylock(&percpu_charge_mutex))
2106 return;
2107 drain_all_stock(root_mem, false);
2108 mutex_unlock(&percpu_charge_mutex);
2109}
2110
2111/* This is a synchronous drain interface. */
2112static void drain_all_stock_sync(struct mem_cgroup *root_mem)
2113{
2114 /* called when force_empty is called */
2115 mutex_lock(&percpu_charge_mutex);
2116 drain_all_stock(root_mem, true);
2117 mutex_unlock(&percpu_charge_mutex);
2118}
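
/*
 * Illustrative call pattern for the two interfaces above (the real callers
 * are elsewhere in this file and in the reclaim path):
 *
 *	drain_all_stock_async(root_mem);	never sleeps on the mutex,
 *						charges trickle back later
 *	drain_all_stock_sync(root_mem);		used by force_empty; returns
 *						only after all scheduled
 *						per-cpu drain work has run
 */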
2119
2120/*
2121 * This function drains the percpu counter values from a DEAD cpu and
2122 * moves them to the local cpu. Note that this function can be preempted.
2123 */
2124static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
2125{
2126 int i;
2127
2128 spin_lock(&mem->pcp_counter_lock);
2129 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2130 long x = per_cpu(mem->stat->count[i], cpu);
2131
2132 per_cpu(mem->stat->count[i], cpu) = 0;
2133 mem->nocpu_base.count[i] += x;
2134 }
2135 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2136 unsigned long x = per_cpu(mem->stat->events[i], cpu);
2137
2138 per_cpu(mem->stat->events[i], cpu) = 0;
2139 mem->nocpu_base.events[i] += x;
2140 }
2141 /* need to clear ON_MOVE value, works as a kind of lock. */
2142 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2143 spin_unlock(&mem->pcp_counter_lock);
2144}
2145
2146static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
2147{
2148 int idx = MEM_CGROUP_ON_MOVE;
2149
2150 spin_lock(&mem->pcp_counter_lock);
2151 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
2152 spin_unlock(&mem->pcp_counter_lock);
2153}
2154
2155static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2156 unsigned long action,
2157 void *hcpu)
2158{
2159 int cpu = (unsigned long)hcpu;
2160 struct memcg_stock_pcp *stock;
2161 struct mem_cgroup *iter;
2162
2163 if (action == CPU_ONLINE) {
2164 for_each_mem_cgroup_all(iter)
2165 synchronize_mem_cgroup_on_move(iter, cpu);
2166 return NOTIFY_OK;
2167 }
2168
2169 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2170 return NOTIFY_OK;
2171
2172 for_each_mem_cgroup_all(iter)
2173 mem_cgroup_drain_pcp_counter(iter, cpu);
2174
2175 stock = &per_cpu(memcg_stock, cpu);
2176 drain_stock(stock);
2177 return NOTIFY_OK;
2178}
2179
2180
2181/* See __mem_cgroup_try_charge() for details */
2182enum {
2183 CHARGE_OK, /* success */
2184 CHARGE_RETRY, /* need to retry but retry is not bad */
2185 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
2186 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */
2187 CHARGE_OOM_DIE, /* the current is killed because of OOM */
2188};
2189
2190static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2191 unsigned int nr_pages, bool oom_check)
2192{
2193 unsigned long csize = nr_pages * PAGE_SIZE;
2194 struct mem_cgroup *mem_over_limit;
2195 struct res_counter *fail_res;
2196 unsigned long flags = 0;
2197 int ret;
2198
2199 ret = res_counter_charge(&mem->res, csize, &fail_res);
2200
2201 if (likely(!ret)) {
2202 if (!do_swap_account)
2203 return CHARGE_OK;
2204 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
2205 if (likely(!ret))
2206 return CHARGE_OK;
2207
2208 res_counter_uncharge(&mem->res, csize);
2209 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2210 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2211 } else
2212 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2213 /*
2214 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2215 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2216 *
2217 * Never reclaim on behalf of optional batching, retry with a
2218 * single page instead.
2219 */
2220 if (nr_pages == CHARGE_BATCH)
2221 return CHARGE_RETRY;
2222
2223 if (!(gfp_mask & __GFP_WAIT))
2224 return CHARGE_WOULDBLOCK;
2225
2226 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
2227 gfp_mask, flags, NULL);
2228 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2229 return CHARGE_RETRY;
2230 /*
2231 * Even though the limit is exceeded at this point, reclaim
2232 * may have been able to free some pages. Retry the charge
2233 * before killing the task.
2234 *
2235 * Only for regular pages, though: huge pages are rather
2236 * unlikely to succeed so close to the limit, and we fall back
2237 * to regular pages anyway in case of failure.
2238 */
2239 if (nr_pages == 1 && ret)
2240 return CHARGE_RETRY;
2241
2242 /*
2243 * At task move, charge accounts can be doubly counted. So, it's
2244 * better to wait until the end of task_move if something is going on.
2245 */
2246 if (mem_cgroup_wait_acct_move(mem_over_limit))
2247 return CHARGE_RETRY;
2248
2249 /* If we don't need to call the oom-killer at all, return immediately */
2250 if (!oom_check)
2251 return CHARGE_NOMEM;
2252 /* check OOM */
2253 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
2254 return CHARGE_OOM_DIE;
2255
2256 return CHARGE_RETRY;
2257}
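
/*
 * A worked example of the nr_pages cases handled above (numbers assume
 * HPAGE_PMD_NR == 512 on the architecture in question):
 *
 *	nr_pages == 1			plain page: reclaim, then retry
 *	nr_pages == CHARGE_BATCH (32)	optional batching: never reclaim,
 *					return CHARGE_RETRY so the caller
 *					retries with a single page
 *	nr_pages == 512 (THP)		reclaim once, but don't retry a
 *					failed huge charge; the fault path
 *					falls back to regular pages
 */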
2258
2259/*
2260 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
2261 * the oom-killer can be invoked.
2262 */
2263static int __mem_cgroup_try_charge(struct mm_struct *mm,
2264 gfp_t gfp_mask,
2265 unsigned int nr_pages,
2266 struct mem_cgroup **memcg,
2267 bool oom)
2268{
2269 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2270 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2271 struct mem_cgroup *mem = NULL;
2272 int ret;
2273
2274 /*
2275	 * Unlike the global VM's OOM-kill, we're not under a system-level
2276	 * memory shortage. So let a dying process proceed, in addition to a
2277	 * MEMDIE process.
2278 */
2279 if (unlikely(test_thread_flag(TIF_MEMDIE)
2280 || fatal_signal_pending(current)))
2281 goto bypass;
2282
2283 /*
2284 * We always charge the cgroup the mm_struct belongs to.
2285 * The mm_struct's mem_cgroup changes on task migration if the
2286 * thread group leader migrates. It's possible that mm is not
2287 * set, if so charge the init_mm (happens for pagecache usage).
2288 */
2289 if (!*memcg && !mm)
2290 goto bypass;
2291again:
2292 if (*memcg) { /* css should be a valid one */
2293 mem = *memcg;
2294 VM_BUG_ON(css_is_removed(&mem->css));
2295 if (mem_cgroup_is_root(mem))
2296 goto done;
2297 if (nr_pages == 1 && consume_stock(mem))
2298 goto done;
2299 css_get(&mem->css);
2300 } else {
2301 struct task_struct *p;
2302
2303 rcu_read_lock();
2304 p = rcu_dereference(mm->owner);
2305 /*
2306		 * Because we don't have task_lock(), "p" can exit.
2307		 * In that case, "mem" can point to root, or p can be NULL due
2308		 * to a race with swapoff. Then we have a small risk of
2309		 * mis-accounting. But that kind of mis-accounting due to races
2310		 * can always happen because we don't hold cgroup_mutex();
2311		 * taking it would be overkill, so we allow that small race here.
2312		 * (*) swapoff and friends charge against the mm_struct, not
2313		 * against the task_struct. So mm->owner can be NULL.
2314 */
2315 mem = mem_cgroup_from_task(p);
2316 if (!mem || mem_cgroup_is_root(mem)) {
2317 rcu_read_unlock();
2318 goto done;
2319 }
2320 if (nr_pages == 1 && consume_stock(mem)) {
2321 /*
2322			 * It seems dangerous to access memcg without css_get().
2323			 * But considering how consume_stock works, it's not
2324			 * necessary. If consume_stock succeeds, some charges
2325 * from this memcg are cached on this cpu. So, we
2326 * don't need to call css_get()/css_tryget() before
2327 * calling consume_stock().
2328 */
2329 rcu_read_unlock();
2330 goto done;
2331 }
2332 /* after here, we may be blocked. we need to get refcnt */
2333 if (!css_tryget(&mem->css)) {
2334 rcu_read_unlock();
2335 goto again;
2336 }
2337 rcu_read_unlock();
2338 }
2339
2340 do {
2341 bool oom_check;
2342
2343 /* If killed, bypass charge */
2344 if (fatal_signal_pending(current)) {
2345 css_put(&mem->css);
2346 goto bypass;
2347 }
2348
2349 oom_check = false;
2350 if (oom && !nr_oom_retries) {
2351 oom_check = true;
2352 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2353 }
2354
2355 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
2356 switch (ret) {
2357 case CHARGE_OK:
2358 break;
2359 case CHARGE_RETRY: /* not in OOM situation but retry */
2360 batch = nr_pages;
2361 css_put(&mem->css);
2362 mem = NULL;
2363 goto again;
2364 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2365 css_put(&mem->css);
2366 goto nomem;
2367 case CHARGE_NOMEM: /* OOM routine works */
2368 if (!oom) {
2369 css_put(&mem->css);
2370 goto nomem;
2371 }
2372 /* If oom, we never return -ENOMEM */
2373 nr_oom_retries--;
2374 break;
2375 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2376 css_put(&mem->css);
2377 goto bypass;
2378 }
2379 } while (ret != CHARGE_OK);
2380
2381 if (batch > nr_pages)
2382 refill_stock(mem, batch - nr_pages);
2383 css_put(&mem->css);
2384done:
2385 *memcg = mem;
2386 return 0;
2387nomem:
2388 *memcg = NULL;
2389 return -ENOMEM;
2390bypass:
2391 *memcg = NULL;
2392 return 0;
2393}
2394
2395/*
2396 * Sometimes we have to undo a charge we got by try_charge().
2397 * This function uncharges it and drops the css refcount that was
2398 * taken by try_charge().
2399 */
2400static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2401 unsigned int nr_pages)
2402{
2403 if (!mem_cgroup_is_root(mem)) {
2404 unsigned long bytes = nr_pages * PAGE_SIZE;
2405
2406 res_counter_uncharge(&mem->res, bytes);
2407 if (do_swap_account)
2408 res_counter_uncharge(&mem->memsw, bytes);
2409 }
2410}
2411
2412/*
2413 * A helper function to get a mem_cgroup from an ID. Must be called under
2414 * rcu_read_lock(). The caller must check css_is_removed() or similar if
2415 * that is a concern. (Dropping a refcnt from swap can be called against a
2416 * removed memcg.)
2417 */
2418static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2419{
2420 struct cgroup_subsys_state *css;
2421
2422 /* ID 0 is unused ID */
2423 if (!id)
2424 return NULL;
2425 css = css_lookup(&mem_cgroup_subsys, id);
2426 if (!css)
2427 return NULL;
2428 return container_of(css, struct mem_cgroup, css);
2429}
2430
2431struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2432{
2433 struct mem_cgroup *mem = NULL;
2434 struct page_cgroup *pc;
2435 unsigned short id;
2436 swp_entry_t ent;
2437
2438 VM_BUG_ON(!PageLocked(page));
2439
2440 pc = lookup_page_cgroup(page);
2441 lock_page_cgroup(pc);
2442 if (PageCgroupUsed(pc)) {
2443 mem = pc->mem_cgroup;
2444 if (mem && !css_tryget(&mem->css))
2445 mem = NULL;
2446 } else if (PageSwapCache(page)) {
2447 ent.val = page_private(page);
2448 id = lookup_swap_cgroup(ent);
2449 rcu_read_lock();
2450 mem = mem_cgroup_lookup(id);
2451 if (mem && !css_tryget(&mem->css))
2452 mem = NULL;
2453 rcu_read_unlock();
2454 }
2455 unlock_page_cgroup(pc);
2456 return mem;
2457}
2458
2459static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2460 struct page *page,
2461 unsigned int nr_pages,
2462 struct page_cgroup *pc,
2463 enum charge_type ctype)
2464{
2465 lock_page_cgroup(pc);
2466 if (unlikely(PageCgroupUsed(pc))) {
2467 unlock_page_cgroup(pc);
2468 __mem_cgroup_cancel_charge(mem, nr_pages);
2469 return;
2470 }
2471 /*
2472	 * we don't need page_cgroup_lock for tail pages, because they are not
2473 * accessed by any other context at this point.
2474 */
2475 pc->mem_cgroup = mem;
2476 /*
2477 * We access a page_cgroup asynchronously without lock_page_cgroup().
2478 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2479 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2480 * before USED bit, we need memory barrier here.
2481 * See mem_cgroup_add_lru_list(), etc.
2482 */
2483 smp_wmb();
2484 switch (ctype) {
2485 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2486 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2487 SetPageCgroupCache(pc);
2488 SetPageCgroupUsed(pc);
2489 break;
2490 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2491 ClearPageCgroupCache(pc);
2492 SetPageCgroupUsed(pc);
2493 break;
2494 default:
2495 break;
2496 }
2497
2498 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
2499 unlock_page_cgroup(pc);
2500 /*
2501 * "charge_statistics" updated event counter. Then, check it.
2502 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2503 * if they exceeds softlimit.
2504 */
2505 memcg_check_events(mem, page);
2506}
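
/*
 * The smp_wmb() above pairs with a read-side barrier in the lockless
 * readers. A reader that tests the USED bit first looks roughly like
 * this (a sketch; see mem_cgroup_add_lru_list() for a real instance):
 *
 *	pc = lookup_page_cgroup(page);
 *	if (PageCgroupUsed(pc)) {
 *		smp_rmb();	pairs with the smp_wmb() in commit_charge
 *		mem = pc->mem_cgroup;	guaranteed to be set by now
 *	}
 */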
2507
2508#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2509
2510#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2511 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2512/*
2513 * Because tail pages are not marked as "used", set the flag here. We're under
2514 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2515 */
2516void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2517{
2518 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2519 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2520 unsigned long flags;
2521
2522 if (mem_cgroup_disabled())
2523 return;
2524 /*
2525 * We have no races with charge/uncharge but will have races with
2526 * page state accounting.
2527 */
2528 move_lock_page_cgroup(head_pc, &flags);
2529
2530 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2531 smp_wmb(); /* see __commit_charge() */
2532 if (PageCgroupAcctLRU(head_pc)) {
2533 enum lru_list lru;
2534 struct mem_cgroup_per_zone *mz;
2535
2536 /*
2537		 * LRU flags cannot be copied because we need to add the tail
2538		 * page to the LRU by a generic call and our hook will be called.
2539		 * We hold lru_lock, so reduce the counter directly.
2540 */
2541 lru = page_lru(head);
2542 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2543 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2544 }
2545 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2546 move_unlock_page_cgroup(head_pc, &flags);
2547}
2548#endif
2549
2550/**
2551 * mem_cgroup_move_account - move account of the page
2552 * @page: the page
2553 * @nr_pages: number of regular pages (>1 for huge pages)
2554 * @pc: page_cgroup of the page.
2555 * @from: mem_cgroup which the page is moved from.
2556 * @to: mem_cgroup which the page is moved to. @from != @to.
2557 * @uncharge: whether we should call uncharge and css_put against @from.
2558 *
2559 * The caller must confirm following.
2560 * - page is not on LRU (isolate_page() is useful.)
2561 * - compound_lock is held when nr_pages > 1
2562 *
2563 * This function does neither "charge" nor css_get for the new cgroup; that
2564 * must be done by the caller (__mem_cgroup_try_charge would be useful). If
2565 * @uncharge is true, this function does "uncharge" from the old cgroup; if
2566 * @uncharge is false, the caller must do the "uncharge" itself.
2567 */
2568static int mem_cgroup_move_account(struct page *page,
2569 unsigned int nr_pages,
2570 struct page_cgroup *pc,
2571 struct mem_cgroup *from,
2572 struct mem_cgroup *to,
2573 bool uncharge)
2574{
2575 unsigned long flags;
2576 int ret;
2577
2578 VM_BUG_ON(from == to);
2579 VM_BUG_ON(PageLRU(page));
2580 /*
2581 * The page is isolated from LRU. So, collapse function
2582 * will not handle this page. But page splitting can happen.
2583 * Do this check under compound_page_lock(). The caller should
2584 * hold it.
2585 */
2586 ret = -EBUSY;
2587 if (nr_pages > 1 && !PageTransHuge(page))
2588 goto out;
2589
2590 lock_page_cgroup(pc);
2591
2592 ret = -EINVAL;
2593 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2594 goto unlock;
2595
2596 move_lock_page_cgroup(pc, &flags);
2597
2598 if (PageCgroupFileMapped(pc)) {
2599 /* Update mapped_file data for mem_cgroup */
2600 preempt_disable();
2601 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2602 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2603 preempt_enable();
2604 }
2605 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2606 if (uncharge)
2607 /* This is not "cancel", but cancel_charge does all we need. */
2608 __mem_cgroup_cancel_charge(from, nr_pages);
2609
2610 /* caller should have done css_get */
2611 pc->mem_cgroup = to;
2612 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2613 /*
2614 * We charges against "to" which may not have any tasks. Then, "to"
2615 * can be under rmdir(). But in current implementation, caller of
2616 * this function is just force_empty() and move charge, so it's
2617 * guaranteed that "to" is never removed. So, we don't check rmdir
2618 * status here.
2619 */
2620 move_unlock_page_cgroup(pc, &flags);
2621 ret = 0;
2622unlock:
2623 unlock_page_cgroup(pc);
2624 /*
2625 * check events
2626 */
2627 memcg_check_events(to, page);
2628 memcg_check_events(from, page);
2629out:
2630 return ret;
2631}
2632
2633/*
2634 * move charges to its parent.
2635 */
2636
2637static int mem_cgroup_move_parent(struct page *page,
2638 struct page_cgroup *pc,
2639 struct mem_cgroup *child,
2640 gfp_t gfp_mask)
2641{
2642 struct cgroup *cg = child->css.cgroup;
2643 struct cgroup *pcg = cg->parent;
2644 struct mem_cgroup *parent;
2645 unsigned int nr_pages;
2646 unsigned long uninitialized_var(flags);
2647 int ret;
2648
2649 /* Is ROOT ? */
2650 if (!pcg)
2651 return -EINVAL;
2652
2653 ret = -EBUSY;
2654 if (!get_page_unless_zero(page))
2655 goto out;
2656 if (isolate_lru_page(page))
2657 goto put;
2658
2659 nr_pages = hpage_nr_pages(page);
2660
2661 parent = mem_cgroup_from_cont(pcg);
2662 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2663 if (ret || !parent)
2664 goto put_back;
2665
2666 if (nr_pages > 1)
2667 flags = compound_lock_irqsave(page);
2668
2669 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2670 if (ret)
2671 __mem_cgroup_cancel_charge(parent, nr_pages);
2672
2673 if (nr_pages > 1)
2674 compound_unlock_irqrestore(page, flags);
2675put_back:
2676 putback_lru_page(page);
2677put:
2678 put_page(page);
2679out:
2680 return ret;
2681}
2682
2683/*
2684 * Charge the memory controller for page usage.
2685 * Return
2686 * 0 if the charge was successful
2687 * < 0 if the cgroup is over its limit
2688 */
2689static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2690 gfp_t gfp_mask, enum charge_type ctype)
2691{
2692 struct mem_cgroup *mem = NULL;
2693 unsigned int nr_pages = 1;
2694 struct page_cgroup *pc;
2695 bool oom = true;
2696 int ret;
2697
2698 if (PageTransHuge(page)) {
2699 nr_pages <<= compound_order(page);
2700 VM_BUG_ON(!PageTransHuge(page));
2701 /*
2702 * Never OOM-kill a process for a huge page. The
2703 * fault handler will fall back to regular pages.
2704 */
2705 oom = false;
2706 }
2707
2708 pc = lookup_page_cgroup(page);
2709 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2710
2711 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2712 if (ret || !mem)
2713 return ret;
2714
2715 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2716 return 0;
2717}
2718
2719int mem_cgroup_newpage_charge(struct page *page,
2720 struct mm_struct *mm, gfp_t gfp_mask)
2721{
2722 if (mem_cgroup_disabled())
2723 return 0;
2724 /*
2725 * If already mapped, we don't have to account.
2726 * If page cache, page->mapping has address_space.
2727	 * But page->mapping may hold a stale anon_vma pointer;
2728	 * detect that with the PageAnon() check. A newly-mapped anon page's
2729	 * page->mapping is NULL.
2730 */
2731 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2732 return 0;
2733 if (unlikely(!mm))
2734 mm = &init_mm;
2735 return mem_cgroup_charge_common(page, mm, gfp_mask,
2736 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2737}
2738
2739static void
2740__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2741 enum charge_type ctype);
2742
2743static void
2744__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2745 enum charge_type ctype)
2746{
2747 struct page_cgroup *pc = lookup_page_cgroup(page);
2748 /*
2749	 * In some cases, SwapCache and FUSE (splice_buf->radixtree), the page
2750	 * is already on the LRU. That means the page may be on some other
2751	 * page_cgroup's LRU. Take care of it.
2752 */
2753 mem_cgroup_lru_del_before_commit(page);
2754 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2755 mem_cgroup_lru_add_after_commit(page);
2756 return;
2757}
2758
2759int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2760 gfp_t gfp_mask)
2761{
2762 struct mem_cgroup *mem = NULL;
2763 int ret;
2764
2765 if (mem_cgroup_disabled())
2766 return 0;
2767 if (PageCompound(page))
2768 return 0;
2769
2770 if (unlikely(!mm))
2771 mm = &init_mm;
2772
2773 if (page_is_file_cache(page)) {
2774 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2775 if (ret || !mem)
2776 return ret;
2777
2778 /*
2779 * FUSE reuses pages without going through the final
2780 * put that would remove them from the LRU list, make
2781 * sure that they get relinked properly.
2782 */
2783 __mem_cgroup_commit_charge_lrucare(page, mem,
2784 MEM_CGROUP_CHARGE_TYPE_CACHE);
2785 return ret;
2786 }
2787 /* shmem */
2788 if (PageSwapCache(page)) {
2789 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2790 if (!ret)
2791 __mem_cgroup_commit_charge_swapin(page, mem,
2792 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2793 } else
2794 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2795 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2796
2797 return ret;
2798}
2799
2800/*
2801 * During swap-in, try_charge -> commit or cancel, the page is locked.
2802 * And when try_charge() returns successfully, one refcnt to the memcg,
2803 * not tied to a struct page_cgroup, is acquired. This refcnt will be
2804 * consumed by "commit()" or released by "cancel()".
2805 */
2806int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2807 struct page *page,
2808 gfp_t mask, struct mem_cgroup **ptr)
2809{
2810 struct mem_cgroup *mem;
2811 int ret;
2812
2813 *ptr = NULL;
2814
2815 if (mem_cgroup_disabled())
2816 return 0;
2817
2818 if (!do_swap_account)
2819 goto charge_cur_mm;
2820 /*
2821 * A racing thread's fault, or swapoff, may have already updated
2822 * the pte, and even removed page from swap cache: in those cases
2823 * do_swap_page()'s pte_same() test will fail; but there's also a
2824 * KSM case which does need to charge the page.
2825 */
2826 if (!PageSwapCache(page))
2827 goto charge_cur_mm;
2828 mem = try_get_mem_cgroup_from_page(page);
2829 if (!mem)
2830 goto charge_cur_mm;
2831 *ptr = mem;
2832 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2833 css_put(&mem->css);
2834 return ret;
2835charge_cur_mm:
2836 if (unlikely(!mm))
2837 mm = &init_mm;
2838 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2839}
2840
2841static void
2842__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2843 enum charge_type ctype)
2844{
2845 if (mem_cgroup_disabled())
2846 return;
2847 if (!ptr)
2848 return;
2849 cgroup_exclude_rmdir(&ptr->css);
2850
2851 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2852 /*
2853	 * Now the swap is in memory. This means this page may be
2854	 * counted both as mem and swap, i.e. double counted.
2855	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2856	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2857	 * may call delete_from_swap_cache() before we reach here.
2858 */
2859 if (do_swap_account && PageSwapCache(page)) {
2860 swp_entry_t ent = {.val = page_private(page)};
2861 unsigned short id;
2862 struct mem_cgroup *memcg;
2863
2864 id = swap_cgroup_record(ent, 0);
2865 rcu_read_lock();
2866 memcg = mem_cgroup_lookup(id);
2867 if (memcg) {
2868 /*
2869			 * This recorded memcg can be an obsolete one. So, avoid
2870			 * calling css_tryget().
2871 */
2872 if (!mem_cgroup_is_root(memcg))
2873 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2874 mem_cgroup_swap_statistics(memcg, false);
2875 mem_cgroup_put(memcg);
2876 }
2877 rcu_read_unlock();
2878 }
2879 /*
2880	 * At swapin, we may charge against a cgroup which has no tasks.
2881	 * So, rmdir()->pre_destroy() can be called while we do this charge.
2882	 * In that case, we need to call pre_destroy() again. Check it here.
2883 */
2884 cgroup_release_and_wakeup_rmdir(&ptr->css);
2885}
2886
2887void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2888{
2889 __mem_cgroup_commit_charge_swapin(page, ptr,
2890 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2891}
2892
2893void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2894{
2895 if (mem_cgroup_disabled())
2896 return;
2897 if (!mem)
2898 return;
2899 __mem_cgroup_cancel_charge(mem, 1);
2900}
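
/*
 * A sketch of the swap-in charge protocol from the caller's side
 * (modelled loosely on do_swap_page(); error handling simplified):
 *
 *	struct mem_cgroup *ptr;
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *		goto out_nomem;
 *	...				try to map the page
 *	if (mapped_ok)
 *		mem_cgroup_commit_charge_swapin(page, ptr);
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);
 */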
2901
2902static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2903 unsigned int nr_pages,
2904 const enum charge_type ctype)
2905{
2906 struct memcg_batch_info *batch = NULL;
2907 bool uncharge_memsw = true;
2908
2909 /* If swapout, usage of swap doesn't decrease */
2910 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2911 uncharge_memsw = false;
2912
2913 batch = &current->memcg_batch;
2914 /*
2915	 * Usually, we do css_get() when we remember a memcg pointer.
2916	 * But in this case, we keep res->usage until the end of a series of
2917	 * uncharges. Then, it's ok to ignore the memcg's refcnt.
2918 */
2919 if (!batch->memcg)
2920 batch->memcg = mem;
2921 /*
2922 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2923	 * In those cases, all pages freed in a row can be expected to be in
2924	 * the same cgroup, and we have a chance to coalesce uncharges.
2925	 * But we uncharge one by one if this task is killed by OOM (TIF_MEMDIE),
2926	 * because we want to do the uncharge as soon as possible.
2927 */
2928
2929 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2930 goto direct_uncharge;
2931
2932 if (nr_pages > 1)
2933 goto direct_uncharge;
2934
2935 /*
2936	 * In the typical case, batch->memcg == mem. This means we can
2937	 * merge a series of uncharges into one uncharge of the res_counter.
2938	 * If not, we uncharge the res_counter one by one.
2939 */
2940 if (batch->memcg != mem)
2941 goto direct_uncharge;
2942 /* remember freed charge and uncharge it later */
2943 batch->nr_pages++;
2944 if (uncharge_memsw)
2945 batch->memsw_nr_pages++;
2946 return;
2947direct_uncharge:
2948 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2949 if (uncharge_memsw)
2950 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2951 if (unlikely(batch->memcg != mem))
2952 memcg_oom_recover(mem);
2953 return;
2954}
2955
2956/*
2957 * uncharge if !page_mapped(page)
2958 */
2959static struct mem_cgroup *
2960__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2961{
2962 struct mem_cgroup *mem = NULL;
2963 unsigned int nr_pages = 1;
2964 struct page_cgroup *pc;
2965
2966 if (mem_cgroup_disabled())
2967 return NULL;
2968
2969 if (PageSwapCache(page))
2970 return NULL;
2971
2972 if (PageTransHuge(page)) {
2973 nr_pages <<= compound_order(page);
2974 VM_BUG_ON(!PageTransHuge(page));
2975 }
2976 /*
2977 * Check if our page_cgroup is valid
2978 */
2979 pc = lookup_page_cgroup(page);
2980 if (unlikely(!pc || !PageCgroupUsed(pc)))
2981 return NULL;
2982
2983 lock_page_cgroup(pc);
2984
2985 mem = pc->mem_cgroup;
2986
2987 if (!PageCgroupUsed(pc))
2988 goto unlock_out;
2989
2990 switch (ctype) {
2991 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2992 case MEM_CGROUP_CHARGE_TYPE_DROP:
2993 /* See mem_cgroup_prepare_migration() */
2994 if (page_mapped(page) || PageCgroupMigration(pc))
2995 goto unlock_out;
2996 break;
2997 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2998 if (!PageAnon(page)) { /* Shared memory */
2999 if (page->mapping && !page_is_file_cache(page))
3000 goto unlock_out;
3001 } else if (page_mapped(page)) /* Anon */
3002 goto unlock_out;
3003 break;
3004 default:
3005 break;
3006 }
3007
3008 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
3009
3010 ClearPageCgroupUsed(pc);
3011 /*
3012 * pc->mem_cgroup is not cleared here. It will be accessed when it's
3013 * freed from LRU. This is safe because uncharged page is expected not
3014 * to be reused (freed soon). Exception is SwapCache, it's handled by
3015 * special functions.
3016 */
3017
3018 unlock_page_cgroup(pc);
3019 /*
3020 * even after unlock, we have mem->res.usage here and this memcg
3021 * will never be freed.
3022 */
3023 memcg_check_events(mem, page);
3024 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3025 mem_cgroup_swap_statistics(mem, true);
3026 mem_cgroup_get(mem);
3027 }
3028 if (!mem_cgroup_is_root(mem))
3029 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
3030
3031 return mem;
3032
3033unlock_out:
3034 unlock_page_cgroup(pc);
3035 return NULL;
3036}
3037
3038void mem_cgroup_uncharge_page(struct page *page)
3039{
3040 /* early check. */
3041 if (page_mapped(page))
3042 return;
3043 if (page->mapping && !PageAnon(page))
3044 return;
3045 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3046}
3047
3048void mem_cgroup_uncharge_cache_page(struct page *page)
3049{
3050 VM_BUG_ON(page_mapped(page));
3051 VM_BUG_ON(page->mapping);
3052 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3053}
3054
3055/*
3056 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
3057 * In those cases, pages are freed continuously and we can expect the pages
3058 * to be in the same memcg. These callers themselves limit the number of
3059 * pages freed at once, so uncharge_start/end() is called appropriately.
3060 * This may be called multiple (2) times in a context.
3061 */
3062
3063void mem_cgroup_uncharge_start(void)
3064{
3065 current->memcg_batch.do_batch++;
3066 /* We can do nest. */
3067 if (current->memcg_batch.do_batch == 1) {
3068 current->memcg_batch.memcg = NULL;
3069 current->memcg_batch.nr_pages = 0;
3070 current->memcg_batch.memsw_nr_pages = 0;
3071 }
3072}
3073
3074void mem_cgroup_uncharge_end(void)
3075{
3076 struct memcg_batch_info *batch = &current->memcg_batch;
3077
3078 if (!batch->do_batch)
3079 return;
3080
3081 batch->do_batch--;
3082 if (batch->do_batch) /* If stacked, do nothing. */
3083 return;
3084
3085 if (!batch->memcg)
3086 return;
3087 /*
3088 * This "batch->memcg" is valid without any css_get/put etc...
3089	 * because we hide charges behind us.
3090 */
3091 if (batch->nr_pages)
3092 res_counter_uncharge(&batch->memcg->res,
3093 batch->nr_pages * PAGE_SIZE);
3094 if (batch->memsw_nr_pages)
3095 res_counter_uncharge(&batch->memcg->memsw,
3096 batch->memsw_nr_pages * PAGE_SIZE);
3097 memcg_oom_recover(batch->memcg);
3098 /* forget this pointer (for sanity check) */
3099 batch->memcg = NULL;
3100}
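
/*
 * Typical use of the batching interface above (illustrative; the real
 * callers are the unmap/truncate paths mentioned in the comment):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page freed in this run:
 *		mem_cgroup_uncharge_page(page);	   or ..._cache_page()
 *	mem_cgroup_uncharge_end();
 *
 * Uncharges against the same memcg inside start/end are accumulated in
 * current->memcg_batch and returned to the res_counter in one go by
 * mem_cgroup_uncharge_end().
 */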
3101
3102#ifdef CONFIG_SWAP
3103/*
3104 * called after __delete_from_swap_cache(); drops the "page" account.
3105 * The memcg information is recorded in the swap_cgroup of "ent".
3106 */
3107void
3108mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3109{
3110 struct mem_cgroup *memcg;
3111 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3112
3113 if (!swapout) /* this was a swap cache but the swap is unused ! */
3114 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3115
3116 memcg = __mem_cgroup_uncharge_common(page, ctype);
3117
3118 /*
3119 * record memcg information, if swapout && memcg != NULL,
3120 * mem_cgroup_get() was called in uncharge().
3121 */
3122 if (do_swap_account && swapout && memcg)
3123 swap_cgroup_record(ent, css_id(&memcg->css));
3124}
3125#endif
3126
3127#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3128/*
3129 * called from swap_entry_free(). remove record in swap_cgroup and
3130 * uncharge "memsw" account.
3131 */
3132void mem_cgroup_uncharge_swap(swp_entry_t ent)
3133{
3134 struct mem_cgroup *memcg;
3135 unsigned short id;
3136
3137 if (!do_swap_account)
3138 return;
3139
3140 id = swap_cgroup_record(ent, 0);
3141 rcu_read_lock();
3142 memcg = mem_cgroup_lookup(id);
3143 if (memcg) {
3144 /*
3145 * We uncharge this because swap is freed.
3146		 * This memcg can be an obsolete one. We avoid calling css_tryget().
3147 */
3148 if (!mem_cgroup_is_root(memcg))
3149 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3150 mem_cgroup_swap_statistics(memcg, false);
3151 mem_cgroup_put(memcg);
3152 }
3153 rcu_read_unlock();
3154}
3155
3156/**
3157 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3158 * @entry: swap entry to be moved
3159 * @from: mem_cgroup which the entry is moved from
3160 * @to: mem_cgroup which the entry is moved to
3161 * @need_fixup: whether we should fixup res_counters and refcounts.
3162 *
3163 * It succeeds only when the swap_cgroup's record for this entry is the same
3164 * as the mem_cgroup's id of @from.
3165 *
3166 * Returns 0 on success, -EINVAL on failure.
3167 *
3168 * The caller must have charged to @to, IOW, called res_counter_charge() about
3169 * both res and memsw, and called css_get().
3170 */
3171static int mem_cgroup_move_swap_account(swp_entry_t entry,
3172 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3173{
3174 unsigned short old_id, new_id;
3175
3176 old_id = css_id(&from->css);
3177 new_id = css_id(&to->css);
3178
3179 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3180 mem_cgroup_swap_statistics(from, false);
3181 mem_cgroup_swap_statistics(to, true);
3182 /*
3183 * This function is only called from task migration context now.
3184 * It postpones res_counter and refcount handling till the end
3185 * of task migration(mem_cgroup_clear_mc()) for performance
3186 * improvement. But we cannot postpone mem_cgroup_get(to)
3187 * because if the process that has been moved to @to does
3188 * swap-in, the refcount of @to might be decreased to 0.
3189 */
3190 mem_cgroup_get(to);
3191 if (need_fixup) {
3192 if (!mem_cgroup_is_root(from))
3193 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3194 mem_cgroup_put(from);
3195 /*
3196 * we charged both to->res and to->memsw, so we should
3197 * uncharge to->res.
3198 */
3199 if (!mem_cgroup_is_root(to))
3200 res_counter_uncharge(&to->res, PAGE_SIZE);
3201 }
3202 return 0;
3203 }
3204 return -EINVAL;
3205}
3206#else
3207static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3208 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3209{
3210 return -EINVAL;
3211}
3212#endif
3213
3214/*
3215 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3216 * page belongs to.
3217 */
3218int mem_cgroup_prepare_migration(struct page *page,
3219 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3220{
3221 struct mem_cgroup *mem = NULL;
3222 struct page_cgroup *pc;
3223 enum charge_type ctype;
3224 int ret = 0;
3225
3226 *ptr = NULL;
3227
3228 VM_BUG_ON(PageTransHuge(page));
3229 if (mem_cgroup_disabled())
3230 return 0;
3231
3232 pc = lookup_page_cgroup(page);
3233 lock_page_cgroup(pc);
3234 if (PageCgroupUsed(pc)) {
3235 mem = pc->mem_cgroup;
3236 css_get(&mem->css);
3237 /*
3238 * At migrating an anonymous page, its mapcount goes down
3239 * to 0 and uncharge() will be called. But, even if it's fully
3240 * unmapped, migration may fail and this page has to be
3241 * charged again. We set MIGRATION flag here and delay uncharge
3242 * until end_migration() is called
3243 *
3244 * Corner Case Thinking
3245 * A)
3246 * When the old page was mapped as Anon and it's unmap-and-freed
3247 * while migration was ongoing.
3248 * If unmap finds the old page, uncharge() of it will be delayed
3249 * until end_migration(). If unmap finds a new page, it's
3250		 * uncharged when it makes the mapcount go from 1 to 0. If the unmap code
3251 * finds swap_migration_entry, the new page will not be mapped
3252 * and end_migration() will find it(mapcount==0).
3253 *
3254 * B)
3255		 * When the old page was mapped but migration fails, the kernel
3256 * remaps it. A charge for it is kept by MIGRATION flag even
3257 * if mapcount goes down to 0. We can do remap successfully
3258 * without charging it again.
3259 *
3260 * C)
3261 * The "old" page is under lock_page() until the end of
3262 * migration, so, the old page itself will not be swapped-out.
3263		 * If the new page is swapped out before end_migration(), our
3264		 * hook into the usual swap-out path will catch the event.
3265 */
3266 if (PageAnon(page))
3267 SetPageCgroupMigration(pc);
3268 }
3269 unlock_page_cgroup(pc);
3270 /*
3271 * If the page is not charged at this point,
3272 * we return here.
3273 */
3274 if (!mem)
3275 return 0;
3276
3277 *ptr = mem;
3278 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3279 css_put(&mem->css);/* drop extra refcnt */
3280 if (ret || *ptr == NULL) {
3281 if (PageAnon(page)) {
3282 lock_page_cgroup(pc);
3283 ClearPageCgroupMigration(pc);
3284 unlock_page_cgroup(pc);
3285 /*
3286 * The old page may be fully unmapped while we kept it.
3287 */
3288 mem_cgroup_uncharge_page(page);
3289 }
3290 return -ENOMEM;
3291 }
3292 /*
3293	 * We charge the new page before it's used/mapped. So, even if unlock_page()
3294	 * is called before end_migration, we can catch all events on this new
3295	 * page. In case the new page is migrated but not remapped, the new page's
3296	 * mapcount will finally be 0 and we call uncharge in end_migration().
3297 */
3298 pc = lookup_page_cgroup(newpage);
3299 if (PageAnon(page))
3300 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3301 else if (page_is_file_cache(page))
3302 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3303 else
3304 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3305 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
3306 return ret;
3307}
3308
3309/* remove redundant charge if migration failed*/
3310void mem_cgroup_end_migration(struct mem_cgroup *mem,
3311 struct page *oldpage, struct page *newpage, bool migration_ok)
3312{
3313 struct page *used, *unused;
3314 struct page_cgroup *pc;
3315
3316 if (!mem)
3317 return;
3318 /* blocks rmdir() */
3319 cgroup_exclude_rmdir(&mem->css);
3320 if (!migration_ok) {
3321 used = oldpage;
3322 unused = newpage;
3323 } else {
3324 used = newpage;
3325 unused = oldpage;
3326 }
3327 /*
3328	 * We disallowed uncharging pages under migration because the mapcount
3329	 * of the page goes down to zero, temporarily.
3330	 * Clear the flag and check whether the page should be charged.
3331 */
3332 pc = lookup_page_cgroup(oldpage);
3333 lock_page_cgroup(pc);
3334 ClearPageCgroupMigration(pc);
3335 unlock_page_cgroup(pc);
3336
3337 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3338
3339 /*
3340	 * If the page is a file cache, the radix-tree replacement is atomic
3341	 * and we can skip this check. When it was an Anon page, its mapcount
3342	 * goes down to 0. But because we added the MIGRATION flag, it's not
3343	 * uncharged yet. There are several cases, but the page->mapcount check
3344	 * and the USED bit check in mem_cgroup_uncharge_page() will do enough
3345	 * checking. (See prepare_charge() as well.)
3346 */
3347 if (PageAnon(used))
3348 mem_cgroup_uncharge_page(used);
3349 /*
3350 * At migration, we may charge account against cgroup which has no
3351 * tasks.
3352 * So, rmdir()->pre_destroy() can be called while we do this charge.
3353 * In that case, we need to call pre_destroy() again. check it here.
3354 */
3355 cgroup_release_and_wakeup_rmdir(&mem->css);
3356}
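
/*
 * The migration charge protocol, as seen from a simplified, hypothetical
 * caller (the real one lives in mm/migrate.c):
 *
 *	struct mem_cgroup *mem = NULL;
 *	int charge, rc;
 *
 *	charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
 *	if (charge == -ENOMEM)
 *		return -ENOMEM;		old page keeps its charge
 *	rc = move_page_contents();	the actual migration, hypothetical
 *	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 */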
3357
3358#ifdef CONFIG_DEBUG_VM
3359static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3360{
3361 struct page_cgroup *pc;
3362
3363 pc = lookup_page_cgroup(page);
3364 if (likely(pc) && PageCgroupUsed(pc))
3365 return pc;
3366 return NULL;
3367}
3368
3369bool mem_cgroup_bad_page_check(struct page *page)
3370{
3371 if (mem_cgroup_disabled())
3372 return false;
3373
3374 return lookup_page_cgroup_used(page) != NULL;
3375}
3376
3377void mem_cgroup_print_bad_page(struct page *page)
3378{
3379 struct page_cgroup *pc;
3380
3381 pc = lookup_page_cgroup_used(page);
3382 if (pc) {
3383 int ret = -1;
3384 char *path;
3385
3386 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3387 pc, pc->flags, pc->mem_cgroup);
3388
3389 path = kmalloc(PATH_MAX, GFP_KERNEL);
3390 if (path) {
3391 rcu_read_lock();
3392 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3393 path, PATH_MAX);
3394 rcu_read_unlock();
3395 }
3396
3397 printk(KERN_CONT "(%s)\n",
3398 (ret < 0) ? "cannot get the path" : path);
3399 kfree(path);
3400 }
3401}
3402#endif
3403
3404static DEFINE_MUTEX(set_limit_mutex);
3405
3406static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3407 unsigned long long val)
3408{
3409 int retry_count;
3410 u64 memswlimit, memlimit;
3411 int ret = 0;
3412 int children = mem_cgroup_count_children(memcg);
3413 u64 curusage, oldusage;
3414 int enlarge;
3415
3416 /*
3417	 * To keep hierarchical_reclaim simple, how long we should retry
3418	 * depends on the caller. We set our retry-count to be a function
3419	 * of the number of children we have to visit in this loop.
3420 */
3421 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3422
3423 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3424
3425 enlarge = 0;
3426 while (retry_count) {
3427 if (signal_pending(current)) {
3428 ret = -EINTR;
3429 break;
3430 }
3431 /*
3432		 * Rather than hiding all of this in some function, I do it in an
3433		 * open-coded manner so you can see what it really does.
3434 * We have to guarantee mem->res.limit < mem->memsw.limit.
3435 */
3436 mutex_lock(&set_limit_mutex);
3437 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3438 if (memswlimit < val) {
3439 ret = -EINVAL;
3440 mutex_unlock(&set_limit_mutex);
3441 break;
3442 }
3443
3444 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3445 if (memlimit < val)
3446 enlarge = 1;
3447
3448 ret = res_counter_set_limit(&memcg->res, val);
3449 if (!ret) {
3450 if (memswlimit == val)
3451 memcg->memsw_is_minimum = true;
3452 else
3453 memcg->memsw_is_minimum = false;
3454 }
3455 mutex_unlock(&set_limit_mutex);
3456
3457 if (!ret)
3458 break;
3459
3460 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3461 MEM_CGROUP_RECLAIM_SHRINK,
3462 NULL);
3463 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3464 /* Usage is reduced ? */
3465 if (curusage >= oldusage)
3466 retry_count--;
3467 else
3468 oldusage = curusage;
3469 }
3470 if (!ret && enlarge)
3471 memcg_oom_recover(memcg);
3472
3473 return ret;
3474}
3475
3476static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3477 unsigned long long val)
3478{
3479 int retry_count;
3480 u64 memlimit, memswlimit, oldusage, curusage;
3481 int children = mem_cgroup_count_children(memcg);
3482 int ret = -EBUSY;
3483 int enlarge = 0;
3484
3485 /* see mem_cgroup_resize_res_limit */
3486 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3487 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3488 while (retry_count) {
3489 if (signal_pending(current)) {
3490 ret = -EINTR;
3491 break;
3492 }
3493 /*
3494		 * Rather than hiding all of this in some function, I do it in an
3495		 * open-coded manner so you can see what it really does.
3496 * We have to guarantee mem->res.limit < mem->memsw.limit.
3497 */
3498 mutex_lock(&set_limit_mutex);
3499 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3500 if (memlimit > val) {
3501 ret = -EINVAL;
3502 mutex_unlock(&set_limit_mutex);
3503 break;
3504 }
3505 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3506 if (memswlimit < val)
3507 enlarge = 1;
3508 ret = res_counter_set_limit(&memcg->memsw, val);
3509 if (!ret) {
3510 if (memlimit == val)
3511 memcg->memsw_is_minimum = true;
3512 else
3513 memcg->memsw_is_minimum = false;
3514 }
3515 mutex_unlock(&set_limit_mutex);
3516
3517 if (!ret)
3518 break;
3519
3520 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3521 MEM_CGROUP_RECLAIM_NOSWAP |
3522 MEM_CGROUP_RECLAIM_SHRINK,
3523 NULL);
3524 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3525 /* Usage is reduced ? */
3526 if (curusage >= oldusage)
3527 retry_count--;
3528 else
3529 oldusage = curusage;
3530 }
3531 if (!ret && enlarge)
3532 memcg_oom_recover(memcg);
3533 return ret;
3534}
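
/*
 * Because mem->res.limit may never exceed mem->memsw.limit, the two
 * limits have to be updated in the right order. Illustrative sketch
 * (values are arbitrary):
 *
 *	raising both: enlarge memsw first, then res
 *		mem_cgroup_resize_memsw_limit(memcg, 4ULL << 30);
 *		mem_cgroup_resize_limit(memcg, 2ULL << 30);
 *	lowering both: shrink res first, then memsw
 *		mem_cgroup_resize_limit(memcg, 256ULL << 20);
 *		mem_cgroup_resize_memsw_limit(memcg, 256ULL << 20);
 *
 * The opposite order trips the -EINVAL checks in the two functions above.
 */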
3535
3536unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3537 gfp_t gfp_mask,
3538 unsigned long *total_scanned)
3539{
3540 unsigned long nr_reclaimed = 0;
3541 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3542 unsigned long reclaimed;
3543 int loop = 0;
3544 struct mem_cgroup_tree_per_zone *mctz;
3545 unsigned long long excess;
3546 unsigned long nr_scanned;
3547
3548 if (order > 0)
3549 return 0;
3550
3551 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3552 /*
3553	 * This loop can run a while, especially if mem_cgroups continuously
3554	 * keep exceeding their soft limit and putting the system under
3555	 * pressure.
3556 */
3557 do {
3558 if (next_mz)
3559 mz = next_mz;
3560 else
3561 mz = mem_cgroup_largest_soft_limit_node(mctz);
3562 if (!mz)
3563 break;
3564
3565 nr_scanned = 0;
3566 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3567 gfp_mask,
3568 MEM_CGROUP_RECLAIM_SOFT,
3569 &nr_scanned);
3570 nr_reclaimed += reclaimed;
3571 *total_scanned += nr_scanned;
3572 spin_lock(&mctz->lock);
3573
3574 /*
3575 * If we failed to reclaim anything from this memory cgroup
3576 * it is time to move on to the next cgroup
3577 */
3578 next_mz = NULL;
3579 if (!reclaimed) {
3580 do {
3581 /*
3582 * Loop until we find yet another one.
3583 *
3584 * By the time we get the soft_limit lock
3585				 * again, someone might have added the
3586 * group back on the RB tree. Iterate to
3587 * make sure we get a different mem.
3588 * mem_cgroup_largest_soft_limit_node returns
3589 * NULL if no other cgroup is present on
3590 * the tree
3591 */
3592 next_mz =
3593 __mem_cgroup_largest_soft_limit_node(mctz);
3594 if (next_mz == mz)
3595 css_put(&next_mz->mem->css);
3596 else /* next_mz == NULL or other memcg */
3597 break;
3598 } while (1);
3599 }
3600 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3601 excess = res_counter_soft_limit_excess(&mz->mem->res);
3602 /*
3603 * One school of thought says that we should not add
3604 * back the node to the tree if reclaim returns 0.
3605 * But our reclaim could return 0, simply because due
3606 * to priority we are exposing a smaller subset of
3607 * memory to reclaim from. Consider this as a longer
3608 * term TODO.
3609 */
3610 /* If excess == 0, no tree ops */
3611 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3612 spin_unlock(&mctz->lock);
3613 css_put(&mz->mem->css);
3614 loop++;
3615 /*
3616 * Could not reclaim anything and there are no more
3617 * mem cgroups to try or we seem to be looping without
3618 * reclaiming anything.
3619 */
3620 if (!nr_reclaimed &&
3621 (next_mz == NULL ||
3622 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3623 break;
3624 } while (!nr_reclaimed);
3625 if (next_mz)
3626 css_put(&next_mz->mem->css);
3627 return nr_reclaimed;
3628}
3629
3630/*
3631 * This routine traverses the page_cgroups on the given list and drops them all.
3632 * *And* it doesn't reclaim the pages themselves, it just removes the page_cgroups.
3633 */
3634static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3635 int node, int zid, enum lru_list lru)
3636{
3637 struct zone *zone;
3638 struct mem_cgroup_per_zone *mz;
3639 struct page_cgroup *pc, *busy;
3640 unsigned long flags, loop;
3641 struct list_head *list;
3642 int ret = 0;
3643
3644 zone = &NODE_DATA(node)->node_zones[zid];
3645 mz = mem_cgroup_zoneinfo(mem, node, zid);
3646 list = &mz->lists[lru];
3647
3648 loop = MEM_CGROUP_ZSTAT(mz, lru);
3649 /* give some margin against EBUSY etc...*/
3650 loop += 256;
3651 busy = NULL;
3652 while (loop--) {
3653 struct page *page;
3654
3655 ret = 0;
3656 spin_lock_irqsave(&zone->lru_lock, flags);
3657 if (list_empty(list)) {
3658 spin_unlock_irqrestore(&zone->lru_lock, flags);
3659 break;
3660 }
3661 pc = list_entry(list->prev, struct page_cgroup, lru);
3662 if (busy == pc) {
3663 list_move(&pc->lru, list);
3664 busy = NULL;
3665 spin_unlock_irqrestore(&zone->lru_lock, flags);
3666 continue;
3667 }
3668 spin_unlock_irqrestore(&zone->lru_lock, flags);
3669
3670 page = lookup_cgroup_page(pc);
3671
3672 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3673 if (ret == -ENOMEM)
3674 break;
3675
3676 if (ret == -EBUSY || ret == -EINVAL) {
3677 /* found lock contention or "pc" is obsolete. */
3678 busy = pc;
3679 cond_resched();
3680 } else
3681 busy = NULL;
3682 }
3683
3684 if (!ret && !list_empty(list))
3685 return -EBUSY;
3686 return ret;
3687}
3688
3689/*
3690 * Make the mem_cgroup's charge 0 if there is no task.
3691 * This makes it possible to delete this mem_cgroup.
3692 */
3693static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3694{
3695 int ret;
3696 int node, zid, shrink;
3697 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3698 struct cgroup *cgrp = mem->css.cgroup;
3699
3700 css_get(&mem->css);
3701
3702 shrink = 0;
3703 /* should free all ? */
3704 if (free_all)
3705 goto try_to_free;
3706move_account:
3707 do {
3708 ret = -EBUSY;
3709 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3710 goto out;
3711 ret = -EINTR;
3712 if (signal_pending(current))
3713 goto out;
3714 /* This is for making all *used* pages to be on LRU. */
3715 lru_add_drain_all();
3716 drain_all_stock_sync(mem);
3717 ret = 0;
3718 mem_cgroup_start_move(mem);
3719 for_each_node_state(node, N_HIGH_MEMORY) {
3720 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3721 enum lru_list l;
3722 for_each_lru(l) {
3723 ret = mem_cgroup_force_empty_list(mem,
3724 node, zid, l);
3725 if (ret)
3726 break;
3727 }
3728 }
3729 if (ret)
3730 break;
3731 }
3732 mem_cgroup_end_move(mem);
3733 memcg_oom_recover(mem);
3734 /* it seems parent cgroup doesn't have enough mem */
3735 if (ret == -ENOMEM)
3736 goto try_to_free;
3737 cond_resched();
3738 /* "ret" should also be checked to ensure all lists are empty. */
3739 } while (mem->res.usage > 0 || ret);
3740out:
3741 css_put(&mem->css);
3742 return ret;
3743
3744try_to_free:
3745 /* returns EBUSY if there is a task or if we come here twice. */
3746 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3747 ret = -EBUSY;
3748 goto out;
3749 }
3750	/* we call try-to-free pages to make this cgroup empty */
3751 lru_add_drain_all();
3752 /* try to free all pages in this cgroup */
3753 shrink = 1;
3754 while (nr_retries && mem->res.usage > 0) {
3755 int progress;
3756
3757 if (signal_pending(current)) {
3758 ret = -EINTR;
3759 goto out;
3760 }
3761 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3762 false);
3763 if (!progress) {
3764 nr_retries--;
3765 /* maybe some writeback is necessary */
3766 congestion_wait(BLK_RW_ASYNC, HZ/10);
3767 }
3768
3769 }
3770 lru_add_drain();
3771 /* try move_account...there may be some *locked* pages. */
3772 goto move_account;
3773}
3774
3775int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3776{
3777 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3778}
3779
3780
3781static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3782{
3783 return mem_cgroup_from_cont(cont)->use_hierarchy;
3784}
3785
3786static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3787 u64 val)
3788{
3789 int retval = 0;
3790 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3791 struct cgroup *parent = cont->parent;
3792 struct mem_cgroup *parent_mem = NULL;
3793
3794 if (parent)
3795 parent_mem = mem_cgroup_from_cont(parent);
3796
3797 cgroup_lock();
3798 /*
3799 * If parent's use_hierarchy is set, we can't make any modifications
3800 * in the child subtrees. If it is unset, then the change can
3801 * occur, provided the current cgroup has no children.
3802 *
3803	 * For the root cgroup, parent_mem is NULL, so we allow the value to
3804	 * be set as long as there are no children.
3805 */
3806 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3807 (val == 1 || val == 0)) {
3808 if (list_empty(&cont->children))
3809 mem->use_hierarchy = val;
3810 else
3811 retval = -EBUSY;
3812 } else
3813 retval = -EINVAL;
3814 cgroup_unlock();
3815
3816 return retval;
3817}
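/*
 * Usage example (illustrative, paths assumed): use_hierarchy can only
 * be flipped while the cgroup has no children (and the parent has it
 * unset), so it is typically enabled right after the group is created
 * and before any child groups are made:
 *
 *	mkdir("/sys/fs/cgroup/memory/parent", 0755);
 *	fd = open("/sys/fs/cgroup/memory/parent/memory.use_hierarchy", O_WRONLY);
 *	write(fd, "1", 1);
 *	mkdir("/sys/fs/cgroup/memory/parent/child", 0755);
 */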
3818
3819
3820static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3821 enum mem_cgroup_stat_index idx)
3822{
3823 struct mem_cgroup *iter;
3824 long val = 0;
3825
3826 /* Per-cpu values can be negative, use a signed accumulator */
3827 for_each_mem_cgroup_tree(iter, mem)
3828 val += mem_cgroup_read_stat(iter, idx);
3829
3830 if (val < 0) /* race ? */
3831 val = 0;
3832 return val;
3833}
3834
3835static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3836{
3837 u64 val;
3838
3839 if (!mem_cgroup_is_root(mem)) {
3840 if (!swap)
3841 return res_counter_read_u64(&mem->res, RES_USAGE);
3842 else
3843 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3844 }
3845
3846 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3847 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3848
3849 if (swap)
3850 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3851
3852 return val << PAGE_SHIFT;
3853}
3854
3855static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3856{
3857 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3858 u64 val;
3859 int type, name;
3860
3861 type = MEMFILE_TYPE(cft->private);
3862 name = MEMFILE_ATTR(cft->private);
3863 switch (type) {
3864 case _MEM:
3865 if (name == RES_USAGE)
3866 val = mem_cgroup_usage(mem, false);
3867 else
3868 val = res_counter_read_u64(&mem->res, name);
3869 break;
3870 case _MEMSWAP:
3871 if (name == RES_USAGE)
3872 val = mem_cgroup_usage(mem, true);
3873 else
3874 val = res_counter_read_u64(&mem->memsw, name);
3875 break;
3876 default:
3877 BUG();
3878 break;
3879 }
3880 return val;
3881}
3882/*
3883 * The user of this function is...
3884 * RES_LIMIT.
3885 */
3886static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3887 const char *buffer)
3888{
3889 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3890 int type, name;
3891 unsigned long long val;
3892 int ret;
3893
3894 type = MEMFILE_TYPE(cft->private);
3895 name = MEMFILE_ATTR(cft->private);
3896 switch (name) {
3897 case RES_LIMIT:
3898 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3899 ret = -EINVAL;
3900 break;
3901 }
3902		/* This function does all the necessary parsing...reuse it */
3903 ret = res_counter_memparse_write_strategy(buffer, &val);
3904 if (ret)
3905 break;
3906 if (type == _MEM)
3907 ret = mem_cgroup_resize_limit(memcg, val);
3908 else
3909 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3910 break;
3911 case RES_SOFT_LIMIT:
3912 ret = res_counter_memparse_write_strategy(buffer, &val);
3913 if (ret)
3914 break;
3915 /*
3916 * For memsw, soft limits are hard to implement in terms
3917		 * of semantics. For now, we only support soft limits for
3918		 * memory control without swap.
3919 */
3920 if (type == _MEM)
3921 ret = res_counter_set_soft_limit(&memcg->res, val);
3922 else
3923 ret = -EINVAL;
3924 break;
3925 default:
3926 ret = -EINVAL; /* should be BUG() ? */
3927 break;
3928 }
3929 return ret;
3930}
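/*
 * Usage example (illustrative): limit values are parsed by
 * res_counter_memparse_write_strategy(), so the usual memparse
 * suffixes (K, M, G) are accepted and "-1" means unlimited.  Writing
 * "512M" to memory.limit_in_bytes sets a 512 MiB hard limit and
 * writing "-1" removes it again; memory.soft_limit_in_bytes takes the
 * same format but, per the comment above, is only supported for the
 * plain memory counter.
 */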
3931
3932static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3933 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3934{
3935 struct cgroup *cgroup;
3936 unsigned long long min_limit, min_memsw_limit, tmp;
3937
3938 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3939 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3940 cgroup = memcg->css.cgroup;
3941 if (!memcg->use_hierarchy)
3942 goto out;
3943
3944 while (cgroup->parent) {
3945 cgroup = cgroup->parent;
3946 memcg = mem_cgroup_from_cont(cgroup);
3947 if (!memcg->use_hierarchy)
3948 break;
3949 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3950 min_limit = min(min_limit, tmp);
3951 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3952 min_memsw_limit = min(min_memsw_limit, tmp);
3953 }
3954out:
3955 *mem_limit = min_limit;
3956 *memsw_limit = min_memsw_limit;
3957 return;
3958}
3959
3960static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3961{
3962 struct mem_cgroup *mem;
3963 int type, name;
3964
3965 mem = mem_cgroup_from_cont(cont);
3966 type = MEMFILE_TYPE(event);
3967 name = MEMFILE_ATTR(event);
3968 switch (name) {
3969 case RES_MAX_USAGE:
3970 if (type == _MEM)
3971 res_counter_reset_max(&mem->res);
3972 else
3973 res_counter_reset_max(&mem->memsw);
3974 break;
3975 case RES_FAILCNT:
3976 if (type == _MEM)
3977 res_counter_reset_failcnt(&mem->res);
3978 else
3979 res_counter_reset_failcnt(&mem->memsw);
3980 break;
3981 }
3982
3983 return 0;
3984}
3985
3986static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3987 struct cftype *cft)
3988{
3989 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3990}
3991
3992#ifdef CONFIG_MMU
3993static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3994 struct cftype *cft, u64 val)
3995{
3996 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3997
3998 if (val >= (1 << NR_MOVE_TYPE))
3999 return -EINVAL;
4000 /*
4001	 * We check this value several times in both can_attach() and
4002	 * attach(), so we need the cgroup lock to prevent this value from
4003	 * becoming inconsistent.
4004 */
4005 cgroup_lock();
4006 mem->move_charge_at_immigrate = val;
4007 cgroup_unlock();
4008
4009 return 0;
4010}
4011#else
4012static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4013 struct cftype *cft, u64 val)
4014{
4015 return -ENOSYS;
4016}
4017#endif
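/*
 * Usage example (illustrative): move_charge_at_immigrate is a bitmask
 * of move types.  In the documented v1 interface bit 0 selects moving
 * charges of anonymous pages and bit 1 selects moving charges of file
 * pages, so writing "3" to memory.move_charge_at_immigrate moves both
 * kinds of charge when the mm owner migrates into the group.
 */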
4018
4019
4020/* For read statistics */
4021enum {
4022 MCS_CACHE,
4023 MCS_RSS,
4024 MCS_FILE_MAPPED,
4025 MCS_PGPGIN,
4026 MCS_PGPGOUT,
4027 MCS_SWAP,
4028 MCS_PGFAULT,
4029 MCS_PGMAJFAULT,
4030 MCS_INACTIVE_ANON,
4031 MCS_ACTIVE_ANON,
4032 MCS_INACTIVE_FILE,
4033 MCS_ACTIVE_FILE,
4034 MCS_UNEVICTABLE,
4035 NR_MCS_STAT,
4036};
4037
4038struct mcs_total_stat {
4039 s64 stat[NR_MCS_STAT];
4040};
4041
4042struct {
4043 char *local_name;
4044 char *total_name;
4045} memcg_stat_strings[NR_MCS_STAT] = {
4046 {"cache", "total_cache"},
4047 {"rss", "total_rss"},
4048 {"mapped_file", "total_mapped_file"},
4049 {"pgpgin", "total_pgpgin"},
4050 {"pgpgout", "total_pgpgout"},
4051 {"swap", "total_swap"},
4052 {"pgfault", "total_pgfault"},
4053 {"pgmajfault", "total_pgmajfault"},
4054 {"inactive_anon", "total_inactive_anon"},
4055 {"active_anon", "total_active_anon"},
4056 {"inactive_file", "total_inactive_file"},
4057 {"active_file", "total_active_file"},
4058 {"unevictable", "total_unevictable"}
4059};
4060
4061
4062static void
4063mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4064{
4065 s64 val;
4066
4067 /* per cpu stat */
4068 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
4069 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4070 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
4071 s->stat[MCS_RSS] += val * PAGE_SIZE;
4072 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
4073 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4074 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
4075 s->stat[MCS_PGPGIN] += val;
4076 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
4077 s->stat[MCS_PGPGOUT] += val;
4078 if (do_swap_account) {
4079 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
4080 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4081 }
4082 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4083 s->stat[MCS_PGFAULT] += val;
4084 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4085 s->stat[MCS_PGMAJFAULT] += val;
4086
4087 /* per zone stat */
4088 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
4089 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4090 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
4091 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4092 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
4093 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4094 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
4095 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4096 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
4097 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4098}
4099
4100static void
4101mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4102{
4103 struct mem_cgroup *iter;
4104
4105 for_each_mem_cgroup_tree(iter, mem)
4106 mem_cgroup_get_local_stat(iter, s);
4107}
4108
4109#ifdef CONFIG_NUMA
4110static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4111{
4112 int nid;
4113 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4114 unsigned long node_nr;
4115 struct cgroup *cont = m->private;
4116 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4117
4118 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4119 seq_printf(m, "total=%lu", total_nr);
4120 for_each_node_state(nid, N_HIGH_MEMORY) {
4121 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4122 seq_printf(m, " N%d=%lu", nid, node_nr);
4123 }
4124 seq_putc(m, '\n');
4125
4126 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4127 seq_printf(m, "file=%lu", file_nr);
4128 for_each_node_state(nid, N_HIGH_MEMORY) {
4129 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4130 LRU_ALL_FILE);
4131 seq_printf(m, " N%d=%lu", nid, node_nr);
4132 }
4133 seq_putc(m, '\n');
4134
4135 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4136 seq_printf(m, "anon=%lu", anon_nr);
4137 for_each_node_state(nid, N_HIGH_MEMORY) {
4138 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4139 LRU_ALL_ANON);
4140 seq_printf(m, " N%d=%lu", nid, node_nr);
4141 }
4142 seq_putc(m, '\n');
4143
4144 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4145 seq_printf(m, "unevictable=%lu", unevictable_nr);
4146 for_each_node_state(nid, N_HIGH_MEMORY) {
4147 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4148 BIT(LRU_UNEVICTABLE));
4149 seq_printf(m, " N%d=%lu", nid, node_nr);
4150 }
4151 seq_putc(m, '\n');
4152 return 0;
4153}
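/*
 * Sample memory.numa_stat output produced by the function above
 * (illustrative, numbers made up, two-node machine assumed):
 *
 *	total=12345 N0=8000 N1=4345
 *	file=4000 N0=3000 N1=1000
 *	anon=8000 N0=4800 N1=3200
 *	unevictable=345 N0=200 N1=145
 */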
4154#endif /* CONFIG_NUMA */
4155
4156static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4157 struct cgroup_map_cb *cb)
4158{
4159 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4160 struct mcs_total_stat mystat;
4161 int i;
4162
4163 memset(&mystat, 0, sizeof(mystat));
4164 mem_cgroup_get_local_stat(mem_cont, &mystat);
4165
4166
4167 for (i = 0; i < NR_MCS_STAT; i++) {
4168 if (i == MCS_SWAP && !do_swap_account)
4169 continue;
4170 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4171 }
4172
4173 /* Hierarchical information */
4174 {
4175 unsigned long long limit, memsw_limit;
4176 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4177 cb->fill(cb, "hierarchical_memory_limit", limit);
4178 if (do_swap_account)
4179 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4180 }
4181
4182 memset(&mystat, 0, sizeof(mystat));
4183 mem_cgroup_get_total_stat(mem_cont, &mystat);
4184 for (i = 0; i < NR_MCS_STAT; i++) {
4185 if (i == MCS_SWAP && !do_swap_account)
4186 continue;
4187 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4188 }
4189
4190#ifdef CONFIG_DEBUG_VM
4191 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4192
4193 {
4194 int nid, zid;
4195 struct mem_cgroup_per_zone *mz;
4196 unsigned long recent_rotated[2] = {0, 0};
4197 unsigned long recent_scanned[2] = {0, 0};
4198
4199 for_each_online_node(nid)
4200 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4201 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4202
4203 recent_rotated[0] +=
4204 mz->reclaim_stat.recent_rotated[0];
4205 recent_rotated[1] +=
4206 mz->reclaim_stat.recent_rotated[1];
4207 recent_scanned[0] +=
4208 mz->reclaim_stat.recent_scanned[0];
4209 recent_scanned[1] +=
4210 mz->reclaim_stat.recent_scanned[1];
4211 }
4212 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4213 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4214 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4215 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4216 }
4217#endif
4218
4219 return 0;
4220}
4221
4222static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4223{
4224 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4225
4226 return mem_cgroup_swappiness(memcg);
4227}
4228
4229static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4230 u64 val)
4231{
4232 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4233 struct mem_cgroup *parent;
4234
4235 if (val > 100)
4236 return -EINVAL;
4237
4238 if (cgrp->parent == NULL)
4239 return -EINVAL;
4240
4241 parent = mem_cgroup_from_cont(cgrp->parent);
4242
4243 cgroup_lock();
4244
4245 /* If under hierarchy, only empty-root can set this value */
4246 if ((parent->use_hierarchy) ||
4247 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4248 cgroup_unlock();
4249 return -EINVAL;
4250 }
4251
4252 memcg->swappiness = val;
4253
4254 cgroup_unlock();
4255
4256 return 0;
4257}
4258
4259static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4260{
4261 struct mem_cgroup_threshold_ary *t;
4262 u64 usage;
4263 int i;
4264
4265 rcu_read_lock();
4266 if (!swap)
4267 t = rcu_dereference(memcg->thresholds.primary);
4268 else
4269 t = rcu_dereference(memcg->memsw_thresholds.primary);
4270
4271 if (!t)
4272 goto unlock;
4273
4274 usage = mem_cgroup_usage(memcg, swap);
4275
4276 /*
4277	 * current_threshold points to the threshold just below usage.
4278	 * If that is no longer true, a threshold was crossed after the
4279	 * last call of __mem_cgroup_threshold().
4280 */
4281 i = t->current_threshold;
4282
4283 /*
4284	 * Iterate backward over the array of thresholds starting from
4285	 * current_threshold and check if a threshold is crossed.
4286	 * If none of the thresholds below usage is crossed, we read
4287 * only one element of the array here.
4288 */
4289 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4290 eventfd_signal(t->entries[i].eventfd, 1);
4291
4292 /* i = current_threshold + 1 */
4293 i++;
4294
4295 /*
4296	 * Iterate forward over the array of thresholds starting from
4297	 * current_threshold+1 and check if a threshold is crossed.
4298	 * If none of the thresholds above usage is crossed, we read
4299 * only one element of the array here.
4300 */
4301 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4302 eventfd_signal(t->entries[i].eventfd, 1);
4303
4304 /* Update current_threshold */
4305 t->current_threshold = i - 1;
4306unlock:
4307 rcu_read_unlock();
4308}
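/*
 * Worked example of the scan above (illustrative): with sorted
 * thresholds {4M, 8M, 16M} and current_threshold pointing at 8M, a
 * usage drop to 5M makes the backward loop signal the 8M eventfd and
 * stop at 4M, and current_threshold is updated to point at 4M.  A
 * later rise to 20M then makes the forward loop signal 8M and 16M in
 * turn and leaves current_threshold pointing at 16M.
 */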
4309
4310static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4311{
4312 while (memcg) {
4313 __mem_cgroup_threshold(memcg, false);
4314 if (do_swap_account)
4315 __mem_cgroup_threshold(memcg, true);
4316
4317 memcg = parent_mem_cgroup(memcg);
4318 }
4319}
4320
4321static int compare_thresholds(const void *a, const void *b)
4322{
4323 const struct mem_cgroup_threshold *_a = a;
4324 const struct mem_cgroup_threshold *_b = b;
4325
4326	/*
	 * Do not return the u64 difference truncated to int: a large
	 * difference could flip the sign of the result.  Compare
	 * explicitly instead.
	 */
	if (_a->threshold > _b->threshold)
		return 1;
	if (_a->threshold < _b->threshold)
		return -1;
	return 0;
4327}
4328
4329static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
4330{
4331 struct mem_cgroup_eventfd_list *ev;
4332
4333 list_for_each_entry(ev, &mem->oom_notify, list)
4334 eventfd_signal(ev->eventfd, 1);
4335 return 0;
4336}
4337
4338static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
4339{
4340 struct mem_cgroup *iter;
4341
4342 for_each_mem_cgroup_tree(iter, mem)
4343 mem_cgroup_oom_notify_cb(iter);
4344}
4345
4346static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4347 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4348{
4349 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4350 struct mem_cgroup_thresholds *thresholds;
4351 struct mem_cgroup_threshold_ary *new;
4352 int type = MEMFILE_TYPE(cft->private);
4353 u64 threshold, usage;
4354 int i, size, ret;
4355
4356 ret = res_counter_memparse_write_strategy(args, &threshold);
4357 if (ret)
4358 return ret;
4359
4360 mutex_lock(&memcg->thresholds_lock);
4361
4362 if (type == _MEM)
4363 thresholds = &memcg->thresholds;
4364 else if (type == _MEMSWAP)
4365 thresholds = &memcg->memsw_thresholds;
4366 else
4367 BUG();
4368
4369 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4370
4371 /* Check if a threshold crossed before adding a new one */
4372 if (thresholds->primary)
4373 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4374
4375 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4376
4377 /* Allocate memory for new array of thresholds */
4378 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4379 GFP_KERNEL);
4380 if (!new) {
4381 ret = -ENOMEM;
4382 goto unlock;
4383 }
4384 new->size = size;
4385
4386 /* Copy thresholds (if any) to new array */
4387 if (thresholds->primary) {
4388 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4389 sizeof(struct mem_cgroup_threshold));
4390 }
4391
4392 /* Add new threshold */
4393 new->entries[size - 1].eventfd = eventfd;
4394 new->entries[size - 1].threshold = threshold;
4395
4396	/* Sort the thresholds. Registering a new threshold isn't time-critical */
4397 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4398 compare_thresholds, NULL);
4399
4400 /* Find current threshold */
4401 new->current_threshold = -1;
4402 for (i = 0; i < size; i++) {
4403 if (new->entries[i].threshold < usage) {
4404 /*
4405 * new->current_threshold will not be used until
4406 * rcu_assign_pointer(), so it's safe to increment
4407 * it here.
4408 */
4409 ++new->current_threshold;
4410 }
4411 }
4412
4413 /* Free old spare buffer and save old primary buffer as spare */
4414 kfree(thresholds->spare);
4415 thresholds->spare = thresholds->primary;
4416
4417 rcu_assign_pointer(thresholds->primary, new);
4418
4419 /* To be sure that nobody uses thresholds */
4420 synchronize_rcu();
4421
4422unlock:
4423 mutex_unlock(&memcg->thresholds_lock);
4424
4425 return ret;
4426}
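/*
 * Usage example (illustrative userspace sketch, cgroup path assumed):
 * with the v1 interface a threshold is armed by writing
 * "<event_fd> <usage_fd> <threshold in bytes>" to cgroup.event_control:
 *
 *	int efd = eventfd(0, 0);
 *	int ufd = open("<cgroup dir>/memory.usage_in_bytes", O_RDONLY);
 *	int cfd = open("<cgroup dir>/cgroup.event_control", O_WRONLY);
 *	char buf[64];
 *	uint64_t ticks;
 *
 *	snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 50ULL << 20);
 *	write(cfd, buf, strlen(buf));
 *	read(efd, &ticks, sizeof(ticks));	blocks until 50M is crossed
 *
 * Using memory.memsw.usage_in_bytes as <usage_fd> arms a mem+swap
 * threshold instead.
 */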
4427
4428static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4429 struct cftype *cft, struct eventfd_ctx *eventfd)
4430{
4431 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4432 struct mem_cgroup_thresholds *thresholds;
4433 struct mem_cgroup_threshold_ary *new;
4434 int type = MEMFILE_TYPE(cft->private);
4435 u64 usage;
4436 int i, j, size;
4437
4438 mutex_lock(&memcg->thresholds_lock);
4439 if (type == _MEM)
4440 thresholds = &memcg->thresholds;
4441 else if (type == _MEMSWAP)
4442 thresholds = &memcg->memsw_thresholds;
4443 else
4444 BUG();
4445
4446 /*
4447	 * Something went wrong if we are trying to unregister a threshold
4448	 * when we don't have any thresholds.
4449 */
4450 BUG_ON(!thresholds);
4451
4452 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4453
4454	/* Check if a threshold was crossed before removing */
4455 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4456
4457 /* Calculate new number of threshold */
4458 size = 0;
4459 for (i = 0; i < thresholds->primary->size; i++) {
4460 if (thresholds->primary->entries[i].eventfd != eventfd)
4461 size++;
4462 }
4463
4464 new = thresholds->spare;
4465
4466 /* Set thresholds array to NULL if we don't have thresholds */
4467 if (!size) {
4468 kfree(new);
4469 new = NULL;
4470 goto swap_buffers;
4471 }
4472
4473 new->size = size;
4474
4475 /* Copy thresholds and find current threshold */
4476 new->current_threshold = -1;
4477 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4478 if (thresholds->primary->entries[i].eventfd == eventfd)
4479 continue;
4480
4481 new->entries[j] = thresholds->primary->entries[i];
4482 if (new->entries[j].threshold < usage) {
4483 /*
4484 * new->current_threshold will not be used
4485 * until rcu_assign_pointer(), so it's safe to increment
4486 * it here.
4487 */
4488 ++new->current_threshold;
4489 }
4490 j++;
4491 }
4492
4493swap_buffers:
4494 /* Swap primary and spare array */
4495 thresholds->spare = thresholds->primary;
4496 rcu_assign_pointer(thresholds->primary, new);
4497
4498 /* To be sure that nobody uses thresholds */
4499 synchronize_rcu();
4500
4501 mutex_unlock(&memcg->thresholds_lock);
4502}
4503
4504static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4505 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4506{
4507 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4508 struct mem_cgroup_eventfd_list *event;
4509 int type = MEMFILE_TYPE(cft->private);
4510
4511 BUG_ON(type != _OOM_TYPE);
4512 event = kmalloc(sizeof(*event), GFP_KERNEL);
4513 if (!event)
4514 return -ENOMEM;
4515
4516 spin_lock(&memcg_oom_lock);
4517
4518 event->eventfd = eventfd;
4519 list_add(&event->list, &memcg->oom_notify);
4520
4521 /* already in OOM ? */
4522 if (atomic_read(&memcg->under_oom))
4523 eventfd_signal(eventfd, 1);
4524 spin_unlock(&memcg_oom_lock);
4525
4526 return 0;
4527}
4528
4529static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4530 struct cftype *cft, struct eventfd_ctx *eventfd)
4531{
4532 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4533 struct mem_cgroup_eventfd_list *ev, *tmp;
4534 int type = MEMFILE_TYPE(cft->private);
4535
4536 BUG_ON(type != _OOM_TYPE);
4537
4538 spin_lock(&memcg_oom_lock);
4539
4540 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4541 if (ev->eventfd == eventfd) {
4542 list_del(&ev->list);
4543 kfree(ev);
4544 }
4545 }
4546
4547 spin_unlock(&memcg_oom_lock);
4548}
4549
4550static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4551 struct cftype *cft, struct cgroup_map_cb *cb)
4552{
4553 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4554
4555 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4556
4557 if (atomic_read(&mem->under_oom))
4558 cb->fill(cb, "under_oom", 1);
4559 else
4560 cb->fill(cb, "under_oom", 0);
4561 return 0;
4562}
4563
4564static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4565 struct cftype *cft, u64 val)
4566{
4567 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4568 struct mem_cgroup *parent;
4569
4570	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */
4571 if (!cgrp->parent || !((val == 0) || (val == 1)))
4572 return -EINVAL;
4573
4574 parent = mem_cgroup_from_cont(cgrp->parent);
4575
4576 cgroup_lock();
4577 /* oom-kill-disable is a flag for subhierarchy. */
4578 if ((parent->use_hierarchy) ||
4579 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4580 cgroup_unlock();
4581 return -EINVAL;
4582 }
4583 mem->oom_kill_disable = val;
4584 if (!val)
4585 memcg_oom_recover(mem);
4586 cgroup_unlock();
4587 return 0;
4588}
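/*
 * Usage example (illustrative, paths assumed): OOM notifications use
 * the same cgroup.event_control mechanism, but only the eventfd and the
 * memory.oom_control fd are written ("<event_fd> <oom_control_fd>").
 * Writing "1" to memory.oom_control itself sets oom_kill_disable, so
 * tasks wait in the per-memcg OOM waitqueue (waking the eventfd)
 * instead of being killed.
 */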
4589
4590#ifdef CONFIG_NUMA
4591static const struct file_operations mem_control_numa_stat_file_operations = {
4592 .read = seq_read,
4593 .llseek = seq_lseek,
4594 .release = single_release,
4595};
4596
4597static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4598{
4599 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4600
4601 file->f_op = &mem_control_numa_stat_file_operations;
4602 return single_open(file, mem_control_numa_stat_show, cont);
4603}
4604#endif /* CONFIG_NUMA */
4605
4606static struct cftype mem_cgroup_files[] = {
4607 {
4608 .name = "usage_in_bytes",
4609 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4610 .read_u64 = mem_cgroup_read,
4611 .register_event = mem_cgroup_usage_register_event,
4612 .unregister_event = mem_cgroup_usage_unregister_event,
4613 },
4614 {
4615 .name = "max_usage_in_bytes",
4616 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4617 .trigger = mem_cgroup_reset,
4618 .read_u64 = mem_cgroup_read,
4619 },
4620 {
4621 .name = "limit_in_bytes",
4622 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4623 .write_string = mem_cgroup_write,
4624 .read_u64 = mem_cgroup_read,
4625 },
4626 {
4627 .name = "soft_limit_in_bytes",
4628 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4629 .write_string = mem_cgroup_write,
4630 .read_u64 = mem_cgroup_read,
4631 },
4632 {
4633 .name = "failcnt",
4634 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4635 .trigger = mem_cgroup_reset,
4636 .read_u64 = mem_cgroup_read,
4637 },
4638 {
4639 .name = "stat",
4640 .read_map = mem_control_stat_show,
4641 },
4642 {
4643 .name = "force_empty",
4644 .trigger = mem_cgroup_force_empty_write,
4645 },
4646 {
4647 .name = "use_hierarchy",
4648 .write_u64 = mem_cgroup_hierarchy_write,
4649 .read_u64 = mem_cgroup_hierarchy_read,
4650 },
4651 {
4652 .name = "swappiness",
4653 .read_u64 = mem_cgroup_swappiness_read,
4654 .write_u64 = mem_cgroup_swappiness_write,
4655 },
4656 {
4657 .name = "move_charge_at_immigrate",
4658 .read_u64 = mem_cgroup_move_charge_read,
4659 .write_u64 = mem_cgroup_move_charge_write,
4660 },
4661 {
4662 .name = "oom_control",
4663 .read_map = mem_cgroup_oom_control_read,
4664 .write_u64 = mem_cgroup_oom_control_write,
4665 .register_event = mem_cgroup_oom_register_event,
4666 .unregister_event = mem_cgroup_oom_unregister_event,
4667 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4668 },
4669#ifdef CONFIG_NUMA
4670 {
4671 .name = "numa_stat",
4672 .open = mem_control_numa_stat_open,
4673 .mode = S_IRUGO,
4674 },
4675#endif
4676};
4677
4678#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4679static struct cftype memsw_cgroup_files[] = {
4680 {
4681 .name = "memsw.usage_in_bytes",
4682 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4683 .read_u64 = mem_cgroup_read,
4684 .register_event = mem_cgroup_usage_register_event,
4685 .unregister_event = mem_cgroup_usage_unregister_event,
4686 },
4687 {
4688 .name = "memsw.max_usage_in_bytes",
4689 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4690 .trigger = mem_cgroup_reset,
4691 .read_u64 = mem_cgroup_read,
4692 },
4693 {
4694 .name = "memsw.limit_in_bytes",
4695 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4696 .write_string = mem_cgroup_write,
4697 .read_u64 = mem_cgroup_read,
4698 },
4699 {
4700 .name = "memsw.failcnt",
4701 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4702 .trigger = mem_cgroup_reset,
4703 .read_u64 = mem_cgroup_read,
4704 },
4705};
4706
4707static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4708{
4709 if (!do_swap_account)
4710 return 0;
4711 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4712 ARRAY_SIZE(memsw_cgroup_files));
4713};
4714#else
4715static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4716{
4717 return 0;
4718}
4719#endif
4720
4721static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4722{
4723 struct mem_cgroup_per_node *pn;
4724 struct mem_cgroup_per_zone *mz;
4725 enum lru_list l;
4726 int zone, tmp = node;
4727 /*
4728 * This routine is called against possible nodes.
4729 * But it's BUG to call kmalloc() against offline node.
4730 *
4731 * TODO: this routine can waste much memory for nodes which will
4732 * never be onlined. It's better to use memory hotplug callback
4733 * function.
4734 */
4735 if (!node_state(node, N_NORMAL_MEMORY))
4736 tmp = -1;
4737 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4738 if (!pn)
4739 return 1;
4740
4741 mem->info.nodeinfo[node] = pn;
4742 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4743 mz = &pn->zoneinfo[zone];
4744 for_each_lru(l)
4745 INIT_LIST_HEAD(&mz->lists[l]);
4746 mz->usage_in_excess = 0;
4747 mz->on_tree = false;
4748 mz->mem = mem;
4749 }
4750 return 0;
4751}
4752
4753static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4754{
4755 kfree(mem->info.nodeinfo[node]);
4756}
4757
4758static struct mem_cgroup *mem_cgroup_alloc(void)
4759{
4760 struct mem_cgroup *mem;
4761 int size = sizeof(struct mem_cgroup);
4762
4763 /* Can be very big if MAX_NUMNODES is very big */
4764 if (size < PAGE_SIZE)
4765 mem = kzalloc(size, GFP_KERNEL);
4766 else
4767 mem = vzalloc(size);
4768
4769 if (!mem)
4770 return NULL;
4771
4772 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4773 if (!mem->stat)
4774 goto out_free;
4775 spin_lock_init(&mem->pcp_counter_lock);
4776 return mem;
4777
4778out_free:
4779 if (size < PAGE_SIZE)
4780 kfree(mem);
4781 else
4782 vfree(mem);
4783 return NULL;
4784}
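/*
 * The size-conditional kzalloc()/vzalloc() pair above, together with
 * the matching kfree()/vfree() in __mem_cgroup_free(), is the
 * open-coded equivalent of kvzalloc()/kvfree() where those helpers are
 * available; a minimal sketch of the same allocation:
 *
 *	mem = kvzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
 *	...
 *	kvfree(mem);
 */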
4785
4786/*
4787 * At destroying mem_cgroup, references from swap_cgroup can remain.
4788 * (scanning all at force_empty is too costly...)
4789 *
4790 * Instead of clearing all references at force_empty, we remember
4791 * the number of reference from swap_cgroup and free mem_cgroup when
4792 * it goes down to 0.
4793 *
4794 * Removal of cgroup itself succeeds regardless of refs from swap.
4795 */
4796
4797static void __mem_cgroup_free(struct mem_cgroup *mem)
4798{
4799 int node;
4800
4801 mem_cgroup_remove_from_trees(mem);
4802 free_css_id(&mem_cgroup_subsys, &mem->css);
4803
4804 for_each_node_state(node, N_POSSIBLE)
4805 free_mem_cgroup_per_zone_info(mem, node);
4806
4807 free_percpu(mem->stat);
4808 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4809 kfree(mem);
4810 else
4811 vfree(mem);
4812}
4813
4814static void mem_cgroup_get(struct mem_cgroup *mem)
4815{
4816 atomic_inc(&mem->refcnt);
4817}
4818
4819static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4820{
4821 if (atomic_sub_and_test(count, &mem->refcnt)) {
4822 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4823 __mem_cgroup_free(mem);
4824 if (parent)
4825 mem_cgroup_put(parent);
4826 }
4827}
4828
4829static void mem_cgroup_put(struct mem_cgroup *mem)
4830{
4831 __mem_cgroup_put(mem, 1);
4832}
4833
4834/*
4835 * Returns the parent mem_cgroup in the memcg hierarchy, when use_hierarchy is enabled.
4836 */
4837static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4838{
4839 if (!mem->res.parent)
4840 return NULL;
4841 return mem_cgroup_from_res_counter(mem->res.parent, res);
4842}
4843
4844#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4845static void __init enable_swap_cgroup(void)
4846{
4847 if (!mem_cgroup_disabled() && really_do_swap_account)
4848 do_swap_account = 1;
4849}
4850#else
4851static void __init enable_swap_cgroup(void)
4852{
4853}
4854#endif
4855
4856static int mem_cgroup_soft_limit_tree_init(void)
4857{
4858 struct mem_cgroup_tree_per_node *rtpn;
4859 struct mem_cgroup_tree_per_zone *rtpz;
4860 int tmp, node, zone;
4861
4862 for_each_node_state(node, N_POSSIBLE) {
4863 tmp = node;
4864 if (!node_state(node, N_NORMAL_MEMORY))
4865 tmp = -1;
4866 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4867 if (!rtpn)
4868 return 1;
4869
4870 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4871
4872 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4873 rtpz = &rtpn->rb_tree_per_zone[zone];
4874 rtpz->rb_root = RB_ROOT;
4875 spin_lock_init(&rtpz->lock);
4876 }
4877 }
4878 return 0;
4879}
4880
4881static struct cgroup_subsys_state * __ref
4882mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4883{
4884 struct mem_cgroup *mem, *parent;
4885 long error = -ENOMEM;
4886 int node;
4887
4888 mem = mem_cgroup_alloc();
4889 if (!mem)
4890 return ERR_PTR(error);
4891
4892 for_each_node_state(node, N_POSSIBLE)
4893 if (alloc_mem_cgroup_per_zone_info(mem, node))
4894 goto free_out;
4895
4896 /* root ? */
4897 if (cont->parent == NULL) {
4898 int cpu;
4899 enable_swap_cgroup();
4900 parent = NULL;
4901 root_mem_cgroup = mem;
4902 if (mem_cgroup_soft_limit_tree_init())
4903 goto free_out;
4904 for_each_possible_cpu(cpu) {
4905 struct memcg_stock_pcp *stock =
4906 &per_cpu(memcg_stock, cpu);
4907 INIT_WORK(&stock->work, drain_local_stock);
4908 }
4909 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4910 } else {
4911 parent = mem_cgroup_from_cont(cont->parent);
4912 mem->use_hierarchy = parent->use_hierarchy;
4913 mem->oom_kill_disable = parent->oom_kill_disable;
4914 }
4915
4916 if (parent && parent->use_hierarchy) {
4917 res_counter_init(&mem->res, &parent->res);
4918 res_counter_init(&mem->memsw, &parent->memsw);
4919 /*
4920 * We increment refcnt of the parent to ensure that we can
4921 * safely access it on res_counter_charge/uncharge.
4922 * This refcnt will be decremented when freeing this
4923 * mem_cgroup(see mem_cgroup_put).
4924 */
4925 mem_cgroup_get(parent);
4926 } else {
4927 res_counter_init(&mem->res, NULL);
4928 res_counter_init(&mem->memsw, NULL);
4929 }
4930 mem->last_scanned_child = 0;
4931 mem->last_scanned_node = MAX_NUMNODES;
4932 INIT_LIST_HEAD(&mem->oom_notify);
4933
4934 if (parent)
4935 mem->swappiness = mem_cgroup_swappiness(parent);
4936 atomic_set(&mem->refcnt, 1);
4937 mem->move_charge_at_immigrate = 0;
4938 mutex_init(&mem->thresholds_lock);
4939 return &mem->css;
4940free_out:
4941 __mem_cgroup_free(mem);
4942 root_mem_cgroup = NULL;
4943 return ERR_PTR(error);
4944}
4945
4946static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4947 struct cgroup *cont)
4948{
4949 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4950
4951 return mem_cgroup_force_empty(mem, false);
4952}
4953
4954static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4955 struct cgroup *cont)
4956{
4957 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4958
4959 mem_cgroup_put(mem);
4960}
4961
4962static int mem_cgroup_populate(struct cgroup_subsys *ss,
4963 struct cgroup *cont)
4964{
4965 int ret;
4966
4967 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4968 ARRAY_SIZE(mem_cgroup_files));
4969
4970 if (!ret)
4971 ret = register_memsw_files(cont, ss);
4972 return ret;
4973}
4974
4975#ifdef CONFIG_MMU
4976/* Handlers for move charge at task migration. */
4977#define PRECHARGE_COUNT_AT_ONCE 256
4978static int mem_cgroup_do_precharge(unsigned long count)
4979{
4980 int ret = 0;
4981 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4982 struct mem_cgroup *mem = mc.to;
4983
4984 if (mem_cgroup_is_root(mem)) {
4985 mc.precharge += count;
4986 /* we don't need css_get for root */
4987 return ret;
4988 }
4989 /* try to charge at once */
4990 if (count > 1) {
4991 struct res_counter *dummy;
4992 /*
4993 * "mem" cannot be under rmdir() because we've already checked
4994 * by cgroup_lock_live_cgroup() that it is not removed and we
4995 * are still under the same cgroup_mutex. So we can postpone
4996 * css_get().
4997 */
4998 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4999 goto one_by_one;
5000 if (do_swap_account && res_counter_charge(&mem->memsw,
5001 PAGE_SIZE * count, &dummy)) {
5002 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
5003 goto one_by_one;
5004 }
5005 mc.precharge += count;
5006 return ret;
5007 }
5008one_by_one:
5009 /* fall back to one by one charge */
5010 while (count--) {
5011 if (signal_pending(current)) {
5012 ret = -EINTR;
5013 break;
5014 }
5015 if (!batch_count--) {
5016 batch_count = PRECHARGE_COUNT_AT_ONCE;
5017 cond_resched();
5018 }
5019 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
5020 if (ret || !mem)
5021 /* mem_cgroup_clear_mc() will do uncharge later */
5022 return -ENOMEM;
5023 mc.precharge++;
5024 }
5025 return ret;
5026}
5027
5028/**
5029 * is_target_pte_for_mc - check a pte whether it is valid for move charge
5030 * @vma: the vma the pte to be checked belongs
5031 * @addr: the address corresponding to the pte to be checked
5032 * @ptent: the pte to be checked
5033 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
5034 *
5035 * Returns
5036 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
5037 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5038 * move charge. If @target is not NULL, the page is stored in target->page
5039 * with an extra refcount taken (callers should handle it).
5040 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5041 * target for charge migration. If @target is not NULL, the entry is
5042 * stored in target->ent.
5043 *
5044 * Called with pte lock held.
5045 */
5046union mc_target {
5047 struct page *page;
5048 swp_entry_t ent;
5049};
5050
5051enum mc_target_type {
5052 MC_TARGET_NONE, /* not used */
5053 MC_TARGET_PAGE,
5054 MC_TARGET_SWAP,
5055};
5056
5057static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5058 unsigned long addr, pte_t ptent)
5059{
5060 struct page *page = vm_normal_page(vma, addr, ptent);
5061
5062 if (!page || !page_mapped(page))
5063 return NULL;
5064 if (PageAnon(page)) {
5065 /* we don't move shared anon */
5066 if (!move_anon() || page_mapcount(page) > 2)
5067 return NULL;
5068 } else if (!move_file())
5069 /* we ignore mapcount for file pages */
5070 return NULL;
5071 if (!get_page_unless_zero(page))
5072 return NULL;
5073
5074 return page;
5075}
5076
5077static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5078 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5079{
5080 int usage_count;
5081 struct page *page = NULL;
5082 swp_entry_t ent = pte_to_swp_entry(ptent);
5083
5084 if (!move_anon() || non_swap_entry(ent))
5085 return NULL;
5086 usage_count = mem_cgroup_count_swap_user(ent, &page);
5087 if (usage_count > 1) { /* we don't move shared anon */
5088 if (page)
5089 put_page(page);
5090 return NULL;
5091 }
5092 if (do_swap_account)
5093 entry->val = ent.val;
5094
5095 return page;
5096}
5097
5098static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5099 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5100{
5101 struct page *page = NULL;
5102 struct inode *inode;
5103 struct address_space *mapping;
5104 pgoff_t pgoff;
5105
5106 if (!vma->vm_file) /* anonymous vma */
5107 return NULL;
5108 if (!move_file())
5109 return NULL;
5110
5111 inode = vma->vm_file->f_path.dentry->d_inode;
5112 mapping = vma->vm_file->f_mapping;
5113 if (pte_none(ptent))
5114 pgoff = linear_page_index(vma, addr);
5115 else /* pte_file(ptent) is true */
5116 pgoff = pte_to_pgoff(ptent);
5117
5118	/* page is moved even if it's not RSS of this task (page-faulted). */
5119 page = find_get_page(mapping, pgoff);
5120
5121#ifdef CONFIG_SWAP
5122 /* shmem/tmpfs may report page out on swap: account for that too. */
5123 if (radix_tree_exceptional_entry(page)) {
5124 swp_entry_t swap = radix_to_swp_entry(page);
5125 if (do_swap_account)
5126 *entry = swap;
5127 page = find_get_page(&swapper_space, swap.val);
5128 }
5129#endif
5130 return page;
5131}
5132
5133static int is_target_pte_for_mc(struct vm_area_struct *vma,
5134 unsigned long addr, pte_t ptent, union mc_target *target)
5135{
5136 struct page *page = NULL;
5137 struct page_cgroup *pc;
5138 int ret = 0;
5139 swp_entry_t ent = { .val = 0 };
5140
5141 if (pte_present(ptent))
5142 page = mc_handle_present_pte(vma, addr, ptent);
5143 else if (is_swap_pte(ptent))
5144 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5145 else if (pte_none(ptent) || pte_file(ptent))
5146 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5147
5148 if (!page && !ent.val)
5149 return 0;
5150 if (page) {
5151 pc = lookup_page_cgroup(page);
5152 /*
5153		 * Do only a loose check without the page_cgroup lock.
5154		 * mem_cgroup_move_account() checks whether the pc is valid
5155		 * under the lock.
5156 */
5157 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5158 ret = MC_TARGET_PAGE;
5159 if (target)
5160 target->page = page;
5161 }
5162 if (!ret || !target)
5163 put_page(page);
5164 }
5165 /* There is a swap entry and a page doesn't exist or isn't charged */
5166 if (ent.val && !ret &&
5167 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
5168 ret = MC_TARGET_SWAP;
5169 if (target)
5170 target->ent = ent;
5171 }
5172 return ret;
5173}
5174
5175static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5176 unsigned long addr, unsigned long end,
5177 struct mm_walk *walk)
5178{
5179 struct vm_area_struct *vma = walk->private;
5180 pte_t *pte;
5181 spinlock_t *ptl;
5182
5183 split_huge_page_pmd(walk->mm, pmd);
5184
5185 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5186 for (; addr != end; pte++, addr += PAGE_SIZE)
5187 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
5188 mc.precharge++; /* increment precharge temporarily */
5189 pte_unmap_unlock(pte - 1, ptl);
5190 cond_resched();
5191
5192 return 0;
5193}
5194
5195static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5196{
5197 unsigned long precharge;
5198 struct vm_area_struct *vma;
5199
5200 down_read(&mm->mmap_sem);
5201 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5202 struct mm_walk mem_cgroup_count_precharge_walk = {
5203 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5204 .mm = mm,
5205 .private = vma,
5206 };
5207 if (is_vm_hugetlb_page(vma))
5208 continue;
5209 walk_page_range(vma->vm_start, vma->vm_end,
5210 &mem_cgroup_count_precharge_walk);
5211 }
5212 up_read(&mm->mmap_sem);
5213
5214 precharge = mc.precharge;
5215 mc.precharge = 0;
5216
5217 return precharge;
5218}
5219
5220static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5221{
5222 unsigned long precharge = mem_cgroup_count_precharge(mm);
5223
5224 VM_BUG_ON(mc.moving_task);
5225 mc.moving_task = current;
5226 return mem_cgroup_do_precharge(precharge);
5227}
5228
5229/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5230static void __mem_cgroup_clear_mc(void)
5231{
5232 struct mem_cgroup *from = mc.from;
5233 struct mem_cgroup *to = mc.to;
5234
5235 /* we must uncharge all the leftover precharges from mc.to */
5236 if (mc.precharge) {
5237 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
5238 mc.precharge = 0;
5239 }
5240 /*
5241 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5242 * we must uncharge here.
5243 */
5244 if (mc.moved_charge) {
5245 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
5246 mc.moved_charge = 0;
5247 }
5248 /* we must fixup refcnts and charges */
5249 if (mc.moved_swap) {
5250 /* uncharge swap account from the old cgroup */
5251 if (!mem_cgroup_is_root(mc.from))
5252 res_counter_uncharge(&mc.from->memsw,
5253 PAGE_SIZE * mc.moved_swap);
5254 __mem_cgroup_put(mc.from, mc.moved_swap);
5255
5256 if (!mem_cgroup_is_root(mc.to)) {
5257 /*
5258 * we charged both to->res and to->memsw, so we should
5259 * uncharge to->res.
5260 */
5261 res_counter_uncharge(&mc.to->res,
5262 PAGE_SIZE * mc.moved_swap);
5263 }
5264 /* we've already done mem_cgroup_get(mc.to) */
5265 mc.moved_swap = 0;
5266 }
5267 memcg_oom_recover(from);
5268 memcg_oom_recover(to);
5269 wake_up_all(&mc.waitq);
5270}
5271
5272static void mem_cgroup_clear_mc(void)
5273{
5274 struct mem_cgroup *from = mc.from;
5275
5276 /*
5277 * we must clear moving_task before waking up waiters at the end of
5278 * task migration.
5279 */
5280 mc.moving_task = NULL;
5281 __mem_cgroup_clear_mc();
5282 spin_lock(&mc.lock);
5283 mc.from = NULL;
5284 mc.to = NULL;
5285 spin_unlock(&mc.lock);
5286 mem_cgroup_end_move(from);
5287}
5288
5289static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5290 struct cgroup *cgroup,
5291 struct task_struct *p)
5292{
5293 int ret = 0;
5294 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
5295
5296 if (mem->move_charge_at_immigrate) {
5297 struct mm_struct *mm;
5298 struct mem_cgroup *from = mem_cgroup_from_task(p);
5299
5300 VM_BUG_ON(from == mem);
5301
5302 mm = get_task_mm(p);
5303 if (!mm)
5304 return 0;
5305		/* We move charges only when we move the owner of the mm */
5306 if (mm->owner == p) {
5307 VM_BUG_ON(mc.from);
5308 VM_BUG_ON(mc.to);
5309 VM_BUG_ON(mc.precharge);
5310 VM_BUG_ON(mc.moved_charge);
5311 VM_BUG_ON(mc.moved_swap);
5312 mem_cgroup_start_move(from);
5313 spin_lock(&mc.lock);
5314 mc.from = from;
5315 mc.to = mem;
5316 spin_unlock(&mc.lock);
5317 /* We set mc.moving_task later */
5318
5319 ret = mem_cgroup_precharge_mc(mm);
5320 if (ret)
5321 mem_cgroup_clear_mc();
5322 }
5323 mmput(mm);
5324 }
5325 return ret;
5326}
5327
5328static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5329 struct cgroup *cgroup,
5330 struct task_struct *p)
5331{
5332 mem_cgroup_clear_mc();
5333}
5334
5335static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5336 unsigned long addr, unsigned long end,
5337 struct mm_walk *walk)
5338{
5339 int ret = 0;
5340 struct vm_area_struct *vma = walk->private;
5341 pte_t *pte;
5342 spinlock_t *ptl;
5343
5344 split_huge_page_pmd(walk->mm, pmd);
5345retry:
5346 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5347 for (; addr != end; addr += PAGE_SIZE) {
5348 pte_t ptent = *(pte++);
5349 union mc_target target;
5350 int type;
5351 struct page *page;
5352 struct page_cgroup *pc;
5353 swp_entry_t ent;
5354
5355 if (!mc.precharge)
5356 break;
5357
5358 type = is_target_pte_for_mc(vma, addr, ptent, &target);
5359 switch (type) {
5360 case MC_TARGET_PAGE:
5361 page = target.page;
5362 if (isolate_lru_page(page))
5363 goto put;
5364 pc = lookup_page_cgroup(page);
5365 if (!mem_cgroup_move_account(page, 1, pc,
5366 mc.from, mc.to, false)) {
5367 mc.precharge--;
5368 /* we uncharge from mc.from later. */
5369 mc.moved_charge++;
5370 }
5371 putback_lru_page(page);
5372put: /* is_target_pte_for_mc() gets the page */
5373 put_page(page);
5374 break;
5375 case MC_TARGET_SWAP:
5376 ent = target.ent;
5377 if (!mem_cgroup_move_swap_account(ent,
5378 mc.from, mc.to, false)) {
5379 mc.precharge--;
5380 /* we fixup refcnts and charges later. */
5381 mc.moved_swap++;
5382 }
5383 break;
5384 default:
5385 break;
5386 }
5387 }
5388 pte_unmap_unlock(pte - 1, ptl);
5389 cond_resched();
5390
5391 if (addr != end) {
5392 /*
5393 * We have consumed all precharges we got in can_attach().
5394 * We try charge one by one, but don't do any additional
5395 * charges to mc.to if we have failed in charge once in attach()
5396 * phase.
5397 */
5398 ret = mem_cgroup_do_precharge(1);
5399 if (!ret)
5400 goto retry;
5401 }
5402
5403 return ret;
5404}
5405
5406static void mem_cgroup_move_charge(struct mm_struct *mm)
5407{
5408 struct vm_area_struct *vma;
5409
5410 lru_add_drain_all();
5411retry:
5412 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5413 /*
5414		 * Someone who is holding the mmap_sem might be waiting on the
5415		 * waitq. So we cancel all extra charges, wake up all waiters,
5416 * and retry. Because we cancel precharges, we might not be able
5417 * to move enough charges, but moving charge is a best-effort
5418 * feature anyway, so it wouldn't be a big problem.
5419 */
5420 __mem_cgroup_clear_mc();
5421 cond_resched();
5422 goto retry;
5423 }
5424 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5425 int ret;
5426 struct mm_walk mem_cgroup_move_charge_walk = {
5427 .pmd_entry = mem_cgroup_move_charge_pte_range,
5428 .mm = mm,
5429 .private = vma,
5430 };
5431 if (is_vm_hugetlb_page(vma))
5432 continue;
5433 ret = walk_page_range(vma->vm_start, vma->vm_end,
5434 &mem_cgroup_move_charge_walk);
5435 if (ret)
5436 /*
5437			 * This means we have consumed all precharges and failed
5438			 * to do an additional charge. Just abandon here.
5439 */
5440 break;
5441 }
5442 up_read(&mm->mmap_sem);
5443}
5444
5445static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5446 struct cgroup *cont,
5447 struct cgroup *old_cont,
5448 struct task_struct *p)
5449{
5450 struct mm_struct *mm = get_task_mm(p);
5451
5452 if (mm) {
5453 if (mc.to)
5454 mem_cgroup_move_charge(mm);
5455 put_swap_token(mm);
5456 mmput(mm);
5457 }
5458 if (mc.to)
5459 mem_cgroup_clear_mc();
5460}
5461#else /* !CONFIG_MMU */
5462static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5463 struct cgroup *cgroup,
5464 struct task_struct *p)
5465{
5466 return 0;
5467}
5468static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5469 struct cgroup *cgroup,
5470 struct task_struct *p)
5471{
5472}
5473static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5474 struct cgroup *cont,
5475 struct cgroup *old_cont,
5476 struct task_struct *p)
5477{
5478}
5479#endif
5480
5481struct cgroup_subsys mem_cgroup_subsys = {
5482 .name = "memory",
5483 .subsys_id = mem_cgroup_subsys_id,
5484 .create = mem_cgroup_create,
5485 .pre_destroy = mem_cgroup_pre_destroy,
5486 .destroy = mem_cgroup_destroy,
5487 .populate = mem_cgroup_populate,
5488 .can_attach = mem_cgroup_can_attach,
5489 .cancel_attach = mem_cgroup_cancel_attach,
5490 .attach = mem_cgroup_move_task,
5491 .early_init = 0,
5492 .use_id = 1,
5493};
5494
5495#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5496static int __init enable_swap_account(char *s)
5497{
5498 /* consider enabled if no parameter or 1 is given */
5499 if (!strcmp(s, "1"))
5500 really_do_swap_account = 1;
5501 else if (!strcmp(s, "0"))
5502 really_do_swap_account = 0;
5503 return 1;
5504}
5505__setup("swapaccount=", enable_swap_account);
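/*
 * Usage example (illustrative): swap accounting is toggled on the
 * kernel command line, e.g. booting with "swapaccount=0" disables it
 * even when the config default would enable it, and "swapaccount=1"
 * enables it.
 */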
5506
5507#endif
1// SPDX-License-Identifier: GPL-2.0-or-later
2/* memcontrol.c - Memory Controller
3 *
4 * Copyright IBM Corporation, 2007
5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
6 *
7 * Copyright 2007 OpenVZ SWsoft Inc
8 * Author: Pavel Emelianov <xemul@openvz.org>
9 *
10 * Memory thresholds
11 * Copyright (C) 2009 Nokia Corporation
12 * Author: Kirill A. Shutemov
13 *
14 * Kernel Memory Controller
15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
16 * Authors: Glauber Costa and Suleiman Souhlal
17 *
18 * Native page reclaim
19 * Charge lifetime sanitation
20 * Lockless page tracking & accounting
21 * Unified hierarchy configuration model
22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
23 *
24 * Per memcg lru locking
25 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
26 */
27
28#include <linux/cgroup-defs.h>
29#include <linux/page_counter.h>
30#include <linux/memcontrol.h>
31#include <linux/cgroup.h>
32#include <linux/sched/mm.h>
33#include <linux/shmem_fs.h>
34#include <linux/hugetlb.h>
35#include <linux/pagemap.h>
36#include <linux/pagevec.h>
37#include <linux/vm_event_item.h>
38#include <linux/smp.h>
39#include <linux/page-flags.h>
40#include <linux/backing-dev.h>
41#include <linux/bit_spinlock.h>
42#include <linux/rcupdate.h>
43#include <linux/limits.h>
44#include <linux/export.h>
45#include <linux/list.h>
46#include <linux/mutex.h>
47#include <linux/rbtree.h>
48#include <linux/slab.h>
49#include <linux/swapops.h>
50#include <linux/spinlock.h>
51#include <linux/fs.h>
52#include <linux/seq_file.h>
53#include <linux/parser.h>
54#include <linux/vmpressure.h>
55#include <linux/memremap.h>
56#include <linux/mm_inline.h>
57#include <linux/swap_cgroup.h>
58#include <linux/cpu.h>
59#include <linux/oom.h>
60#include <linux/lockdep.h>
61#include <linux/resume_user_mode.h>
62#include <linux/psi.h>
63#include <linux/seq_buf.h>
64#include <linux/sched/isolation.h>
65#include <linux/kmemleak.h>
66#include "internal.h"
67#include <net/sock.h>
68#include <net/ip.h>
69#include "slab.h"
70#include "memcontrol-v1.h"
71
72#include <linux/uaccess.h>
73
74#define CREATE_TRACE_POINTS
75#include <trace/events/memcg.h>
76#undef CREATE_TRACE_POINTS
77
78#include <trace/events/vmscan.h>
79
80struct cgroup_subsys memory_cgrp_subsys __read_mostly;
81EXPORT_SYMBOL(memory_cgrp_subsys);
82
83struct mem_cgroup *root_mem_cgroup __read_mostly;
84
85/* Active memory cgroup to use from an interrupt context */
86DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
87EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
88
89/* Socket memory accounting disabled? */
90static bool cgroup_memory_nosocket __ro_after_init;
91
92/* Kernel memory accounting disabled? */
93static bool cgroup_memory_nokmem __ro_after_init;
94
95/* BPF memory accounting disabled? */
96static bool cgroup_memory_nobpf __ro_after_init;
97
98#ifdef CONFIG_CGROUP_WRITEBACK
99static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
100#endif
101
102static inline bool task_is_dying(void)
103{
104 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
105 (current->flags & PF_EXITING);
106}
107
108/* Some nice accessors for the vmpressure. */
109struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
110{
111 if (!memcg)
112 memcg = root_mem_cgroup;
113 return &memcg->vmpressure;
114}
115
116struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
117{
118 return container_of(vmpr, struct mem_cgroup, vmpressure);
119}
120
121#define SEQ_BUF_SIZE SZ_4K
122#define CURRENT_OBJCG_UPDATE_BIT 0
123#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
124
125static DEFINE_SPINLOCK(objcg_lock);
126
127bool mem_cgroup_kmem_disabled(void)
128{
129 return cgroup_memory_nokmem;
130}
131
132static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
133 unsigned int nr_pages);
134
135static void obj_cgroup_release(struct percpu_ref *ref)
136{
137 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
138 unsigned int nr_bytes;
139 unsigned int nr_pages;
140 unsigned long flags;
141
142 /*
143 * At this point all allocated objects are freed, and
144 * objcg->nr_charged_bytes can't have an arbitrary byte value.
145 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
146 *
147 * The following sequence can lead to it:
148 * 1) CPU0: objcg == stock->cached_objcg
149 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
150 * PAGE_SIZE bytes are charged
151 * 3) CPU1: a process from another memcg is allocating something,
152 * the stock is flushed,
153 * objcg->nr_charged_bytes = PAGE_SIZE - 92
154 * 4) CPU0: we release this object,
155 * 92 bytes are added to stock->nr_bytes
156 * 5) CPU0: the stock is flushed,
157 * 92 bytes are added to objcg->nr_charged_bytes
158 *
159 * As a result, nr_charged_bytes == PAGE_SIZE.
160 * This page will be uncharged in obj_cgroup_release().
161 */
162 nr_bytes = atomic_read(&objcg->nr_charged_bytes);
163 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
164 nr_pages = nr_bytes >> PAGE_SHIFT;
165
166 if (nr_pages)
167 obj_cgroup_uncharge_pages(objcg, nr_pages);
168
169 spin_lock_irqsave(&objcg_lock, flags);
170 list_del(&objcg->list);
171 spin_unlock_irqrestore(&objcg_lock, flags);
172
173 percpu_ref_exit(ref);
174 kfree_rcu(objcg, rcu);
175}
176
177static struct obj_cgroup *obj_cgroup_alloc(void)
178{
179 struct obj_cgroup *objcg;
180 int ret;
181
182 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
183 if (!objcg)
184 return NULL;
185
186 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
187 GFP_KERNEL);
188 if (ret) {
189 kfree(objcg);
190 return NULL;
191 }
192 INIT_LIST_HEAD(&objcg->list);
193 return objcg;
194}
195
196static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
197 struct mem_cgroup *parent)
198{
199 struct obj_cgroup *objcg, *iter;
200
201 objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
202
203 spin_lock_irq(&objcg_lock);
204
205 /* 1) Ready to reparent active objcg. */
206 list_add(&objcg->list, &memcg->objcg_list);
207 /* 2) Reparent active objcg and already reparented objcgs to parent. */
208 list_for_each_entry(iter, &memcg->objcg_list, list)
209 WRITE_ONCE(iter->memcg, parent);
210 /* 3) Move already reparented objcgs to the parent's list */
211 list_splice(&memcg->objcg_list, &parent->objcg_list);
212
213 spin_unlock_irq(&objcg_lock);
214
215 percpu_ref_kill(&objcg->refcnt);
216}
217
218/*
219 * A lot of the calls to the cache allocation functions are expected to be
220 * inlined by the compiler. Since the calls to memcg_slab_post_alloc_hook() are
221 * conditional on this static branch, we'll have to allow modules that do
222 * kmem_cache_alloc and the like to see this symbol as well
223 */
224DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
225EXPORT_SYMBOL(memcg_kmem_online_key);
226
227DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
228EXPORT_SYMBOL(memcg_bpf_enabled_key);
229
230/**
231 * mem_cgroup_css_from_folio - css of the memcg associated with a folio
232 * @folio: folio of interest
233 *
234 * If memcg is bound to the default hierarchy, css of the memcg associated
235 * with @folio is returned. The returned css remains associated with @folio
236 * until it is released.
237 *
238 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
239 * is returned.
240 */
241struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
242{
243 struct mem_cgroup *memcg = folio_memcg(folio);
244
245 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
246 memcg = root_mem_cgroup;
247
248 return &memcg->css;
249}
250
251/**
252 * page_cgroup_ino - return inode number of the memcg a page is charged to
253 * @page: the page
254 *
255 * Look up the closest online ancestor of the memory cgroup @page is charged to
256 * and return its inode number or 0 if @page is not charged to any cgroup. It
257 * is safe to call this function without holding a reference to @page.
258 *
259 * Note, this function is inherently racy, because there is nothing to prevent
260 * the cgroup inode from getting torn down and potentially reallocated a moment
261 * after page_cgroup_ino() returns, so it only should be used by callers that
262 * do not care (such as procfs interfaces).
263 */
264ino_t page_cgroup_ino(struct page *page)
265{
266 struct mem_cgroup *memcg;
267 unsigned long ino = 0;
268
269 rcu_read_lock();
270 /* page_folio() is racy here, but the entire function is racy anyway */
271 memcg = folio_memcg_check(page_folio(page));
272
273 while (memcg && !(memcg->css.flags & CSS_ONLINE))
274 memcg = parent_mem_cgroup(memcg);
275 if (memcg)
276 ino = cgroup_ino(memcg->css.cgroup);
277 rcu_read_unlock();
278 return ino;
279}
280
281/* Subset of node_stat_item for memcg stats */
282static const unsigned int memcg_node_stat_items[] = {
283 NR_INACTIVE_ANON,
284 NR_ACTIVE_ANON,
285 NR_INACTIVE_FILE,
286 NR_ACTIVE_FILE,
287 NR_UNEVICTABLE,
288 NR_SLAB_RECLAIMABLE_B,
289 NR_SLAB_UNRECLAIMABLE_B,
290 WORKINGSET_REFAULT_ANON,
291 WORKINGSET_REFAULT_FILE,
292 WORKINGSET_ACTIVATE_ANON,
293 WORKINGSET_ACTIVATE_FILE,
294 WORKINGSET_RESTORE_ANON,
295 WORKINGSET_RESTORE_FILE,
296 WORKINGSET_NODERECLAIM,
297 NR_ANON_MAPPED,
298 NR_FILE_MAPPED,
299 NR_FILE_PAGES,
300 NR_FILE_DIRTY,
301 NR_WRITEBACK,
302 NR_SHMEM,
303 NR_SHMEM_THPS,
304 NR_FILE_THPS,
305 NR_ANON_THPS,
306 NR_KERNEL_STACK_KB,
307 NR_PAGETABLE,
308 NR_SECONDARY_PAGETABLE,
309#ifdef CONFIG_SWAP
310 NR_SWAPCACHE,
311#endif
312#ifdef CONFIG_NUMA_BALANCING
313 PGPROMOTE_SUCCESS,
314#endif
315 PGDEMOTE_KSWAPD,
316 PGDEMOTE_DIRECT,
317 PGDEMOTE_KHUGEPAGED,
318#ifdef CONFIG_HUGETLB_PAGE
319 NR_HUGETLB,
320#endif
321};
322
323static const unsigned int memcg_stat_items[] = {
324 MEMCG_SWAP,
325 MEMCG_SOCK,
326 MEMCG_PERCPU_B,
327 MEMCG_VMALLOC,
328 MEMCG_KMEM,
329 MEMCG_ZSWAP_B,
330 MEMCG_ZSWAPPED,
331};
332
333#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
334#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
335 ARRAY_SIZE(memcg_stat_items))
336#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
337static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
338
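/*
 * Build the mapping from the sparse global stat indices (node_stat_item and
 * memcg_stat_item values) to the dense slots used by the memcg stat arrays.
 * Items that are not tracked per-memcg keep the value U8_MAX, which the
 * BAD_STAT_IDX() checks in the accessors below catch.
 */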
339static void init_memcg_stats(void)
340{
341 u8 i, j = 0;
342
343 BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);
344
345 memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));
346
347 for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
348 mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;
349
350 for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
351 mem_cgroup_stats_index[memcg_stat_items[i]] = j;
352}
353
354static inline int memcg_stats_index(int idx)
355{
356 return mem_cgroup_stats_index[idx];
357}
358
359struct lruvec_stats_percpu {
360 /* Local (CPU and cgroup) state */
361 long state[NR_MEMCG_NODE_STAT_ITEMS];
362
363 /* Delta calculation for lockless upward propagation */
364 long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
365};
366
367struct lruvec_stats {
368 /* Aggregated (CPU and subtree) state */
369 long state[NR_MEMCG_NODE_STAT_ITEMS];
370
371 /* Non-hierarchical (CPU aggregated) state */
372 long state_local[NR_MEMCG_NODE_STAT_ITEMS];
373
374 /* Pending child counts during tree propagation */
375 long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
376};
377
378unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
379{
380 struct mem_cgroup_per_node *pn;
381 long x;
382 int i;
383
384 if (mem_cgroup_disabled())
385 return node_page_state(lruvec_pgdat(lruvec), idx);
386
387 i = memcg_stats_index(idx);
388 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
389 return 0;
390
391 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
392 x = READ_ONCE(pn->lruvec_stats->state[i]);
393#ifdef CONFIG_SMP
394 if (x < 0)
395 x = 0;
396#endif
397 return x;
398}
399
400unsigned long lruvec_page_state_local(struct lruvec *lruvec,
401 enum node_stat_item idx)
402{
403 struct mem_cgroup_per_node *pn;
404 long x;
405 int i;
406
407 if (mem_cgroup_disabled())
408 return node_page_state(lruvec_pgdat(lruvec), idx);
409
410 i = memcg_stats_index(idx);
411 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
412 return 0;
413
414 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
415 x = READ_ONCE(pn->lruvec_stats->state_local[i]);
416#ifdef CONFIG_SMP
417 if (x < 0)
418 x = 0;
419#endif
420 return x;
421}
422
423/* Subset of vm_event_item to report for memcg event stats */
424static const unsigned int memcg_vm_event_stat[] = {
425#ifdef CONFIG_MEMCG_V1
426 PGPGIN,
427 PGPGOUT,
428#endif
429 PSWPIN,
430 PSWPOUT,
431 PGSCAN_KSWAPD,
432 PGSCAN_DIRECT,
433 PGSCAN_KHUGEPAGED,
434 PGSTEAL_KSWAPD,
435 PGSTEAL_DIRECT,
436 PGSTEAL_KHUGEPAGED,
437 PGFAULT,
438 PGMAJFAULT,
439 PGREFILL,
440 PGACTIVATE,
441 PGDEACTIVATE,
442 PGLAZYFREE,
443 PGLAZYFREED,
444#ifdef CONFIG_SWAP
445 SWPIN_ZERO,
446 SWPOUT_ZERO,
447#endif
448#ifdef CONFIG_ZSWAP
449 ZSWPIN,
450 ZSWPOUT,
451 ZSWPWB,
452#endif
453#ifdef CONFIG_TRANSPARENT_HUGEPAGE
454 THP_FAULT_ALLOC,
455 THP_COLLAPSE_ALLOC,
456 THP_SWPOUT,
457 THP_SWPOUT_FALLBACK,
458#endif
459#ifdef CONFIG_NUMA_BALANCING
460 NUMA_PAGE_MIGRATE,
461 NUMA_PTE_UPDATES,
462 NUMA_HINT_FAULTS,
463#endif
464};
465
466#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
467static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
468
469static void init_memcg_events(void)
470{
471 u8 i;
472
473 BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);
474
475 memset(mem_cgroup_events_index, U8_MAX,
476 sizeof(mem_cgroup_events_index));
477
478 for (i = 0; i < NR_MEMCG_EVENTS; ++i)
479 mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
480}
481
482static inline int memcg_events_index(enum vm_event_item idx)
483{
484 return mem_cgroup_events_index[idx];
485}
486
487struct memcg_vmstats_percpu {
488 /* Stats updates since the last flush */
489 unsigned int stats_updates;
490
491 /* Cached pointers for fast iteration in memcg_rstat_updated() */
492 struct memcg_vmstats_percpu *parent;
493 struct memcg_vmstats *vmstats;
494
495 /* The above should fit a single cacheline for memcg_rstat_updated() */
496
497 /* Local (CPU and cgroup) page state & events */
498 long state[MEMCG_VMSTAT_SIZE];
499 unsigned long events[NR_MEMCG_EVENTS];
500
501 /* Delta calculation for lockless upward propagation */
502 long state_prev[MEMCG_VMSTAT_SIZE];
503 unsigned long events_prev[NR_MEMCG_EVENTS];
504} ____cacheline_aligned;
505
506struct memcg_vmstats {
507 /* Aggregated (CPU and subtree) page state & events */
508 long state[MEMCG_VMSTAT_SIZE];
509 unsigned long events[NR_MEMCG_EVENTS];
510
511 /* Non-hierarchical (CPU aggregated) page state & events */
512 long state_local[MEMCG_VMSTAT_SIZE];
513 unsigned long events_local[NR_MEMCG_EVENTS];
514
515 /* Pending child counts during tree propagation */
516 long state_pending[MEMCG_VMSTAT_SIZE];
517 unsigned long events_pending[NR_MEMCG_EVENTS];
518
519 /* Stats updates since the last flush */
520 atomic64_t stats_updates;
521};
522
523/*
524 * memcg and lruvec stats flushing
525 *
526 * Many codepaths leading to stats update or read are performance sensitive and
527 * adding stats flushing in such codepaths is not desirable. So, to optimize
528 * flushing, the kernel does:
529 *
530 * 1) Periodically and asynchronously flush the stats every 2 seconds so that
531 * the rstat update tree does not grow unbounded.
532 *
533 * 2) Flush the stats synchronously on the reader side only when there are more
534 * than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This can leave the stats
535 * out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) updates, but only
536 * for 2 seconds due to (1).
537 */
538static void flush_memcg_stats_dwork(struct work_struct *w);
539static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
540static u64 flush_last_time;
541
542#define FLUSH_TIME (2UL*HZ)
543
544/*
545 * Accessors to ensure that preemption is disabled. On PREEMPT_RT, we cannot
546 * rely on preemption being disabled as part of an acquired spinlock_t lock.
547 * These functions are never used in hardirq context on PREEMPT_RT and
548 * therefore disabling preemption is sufficient.
549 */
550static void memcg_stats_lock(void)
551{
552 preempt_disable_nested();
553 VM_WARN_ON_IRQS_ENABLED();
554}
555
556static void __memcg_stats_lock(void)
557{
558 preempt_disable_nested();
559}
560
561static void memcg_stats_unlock(void)
562{
563 preempt_enable_nested();
564}
565
566
567static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
568{
569 return atomic64_read(&vmstats->stats_updates) >
570 MEMCG_CHARGE_BATCH * num_online_cpus();
571}
572
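/*
 * Note @val updates on this CPU and propagate the per-CPU update count up
 * the parent chain. Once a level accumulates MEMCG_CHARGE_BATCH updates,
 * the count is folded into that memcg's atomic stats_updates so that
 * memcg_vmstats_needs_flush() can observe it.
 */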
573static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
574{
575 struct memcg_vmstats_percpu *statc;
576 int cpu = smp_processor_id();
577 unsigned int stats_updates;
578
579 if (!val)
580 return;
581
582 cgroup_rstat_updated(memcg->css.cgroup, cpu);
583 statc = this_cpu_ptr(memcg->vmstats_percpu);
584 for (; statc; statc = statc->parent) {
585 stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
586 WRITE_ONCE(statc->stats_updates, stats_updates);
587 if (stats_updates < MEMCG_CHARGE_BATCH)
588 continue;
589
590 /*
591 * If @memcg is already flush-able, increasing stats_updates is
592 * redundant. Avoid the overhead of the atomic update.
593 */
594 if (!memcg_vmstats_needs_flush(statc->vmstats))
595 atomic64_add(stats_updates,
596 &statc->vmstats->stats_updates);
597 WRITE_ONCE(statc->stats_updates, 0);
598 }
599}
600
601static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
602{
603 bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
604
605 trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
606 force, needs_flush);
607
608 if (!force && !needs_flush)
609 return;
610
611 if (mem_cgroup_is_root(memcg))
612 WRITE_ONCE(flush_last_time, jiffies_64);
613
614 cgroup_rstat_flush(memcg->css.cgroup);
615}
616
617/*
618 * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
619 * @memcg: root of the subtree to flush
620 *
621 * Flushing is serialized by the underlying global rstat lock. There is also a
622 * minimum amount of work to be done even if there are no stat updates to flush.
623 * Hence, we only flush the stats if the updates delta exceeds a threshold. This
624 * avoids unnecessary work and contention on the underlying lock.
625 */
626void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
627{
628 if (mem_cgroup_disabled())
629 return;
630
631 if (!memcg)
632 memcg = root_mem_cgroup;
633
634 __mem_cgroup_flush_stats(memcg, false);
635}
636
637void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
638{
639 /* Only flush if the periodic flusher is one full cycle late */
640 if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
641 mem_cgroup_flush_stats(memcg);
642}
643
644static void flush_memcg_stats_dwork(struct work_struct *w)
645{
646 /*
647 * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
648 * in latency-sensitive paths is as cheap as possible.
649 */
650 __mem_cgroup_flush_stats(root_mem_cgroup, true);
651 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
652}
653
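/*
 * Hierarchical page state of @memcg's subtree, as of the last flush.
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */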
654unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
655{
656 long x;
657 int i = memcg_stats_index(idx);
658
659 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
660 return 0;
661
662 x = READ_ONCE(memcg->vmstats->state[i]);
663#ifdef CONFIG_SMP
664 if (x < 0)
665 x = 0;
666#endif
667 return x;
668}
669
670static int memcg_page_state_unit(int item);
671
672/*
673 * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
674 * up non-zero sub-page updates to 1 page as zero page updates are ignored.
675 */
676static int memcg_state_val_in_pages(int idx, int val)
677{
678 int unit = memcg_page_state_unit(idx);
679
680 if (!val || unit == PAGE_SIZE)
681 return val;
682 else
683 return max(val * unit / PAGE_SIZE, 1UL);
684}
685
686/**
687 * __mod_memcg_state - update cgroup memory statistics
688 * @memcg: the memory cgroup
689 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
690 * @val: delta to add to the counter, can be negative
691 */
692void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
693 int val)
694{
695 int i = memcg_stats_index(idx);
696
697 if (mem_cgroup_disabled())
698 return;
699
700 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
701 return;
702
703 __this_cpu_add(memcg->vmstats_percpu->state[i], val);
704 val = memcg_state_val_in_pages(idx, val);
705 memcg_rstat_updated(memcg, val);
706 trace_mod_memcg_state(memcg, idx, val);
707}
708
709/* idx can be of type enum memcg_stat_item or node_stat_item. */
710unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
711{
712 long x;
713 int i = memcg_stats_index(idx);
714
715 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
716 return 0;
717
718 x = READ_ONCE(memcg->vmstats->state_local[i]);
719#ifdef CONFIG_SMP
720 if (x < 0)
721 x = 0;
722#endif
723 return x;
724}
725
726static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
727 enum node_stat_item idx,
728 int val)
729{
730 struct mem_cgroup_per_node *pn;
731 struct mem_cgroup *memcg;
732 int i = memcg_stats_index(idx);
733
734 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
735 return;
736
737 pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
738 memcg = pn->memcg;
739
740 /*
741 * The callers from rmap rely on disabled preemption because they never
742 * update their counters from interrupt context. For those counters we
743 * check that the update is never performed from an interrupt context,
744 * while other callers need to have interrupts disabled.
745 */
746 __memcg_stats_lock();
747 if (IS_ENABLED(CONFIG_DEBUG_VM)) {
748 switch (idx) {
749 case NR_ANON_MAPPED:
750 case NR_FILE_MAPPED:
751 case NR_ANON_THPS:
752 WARN_ON_ONCE(!in_task());
753 break;
754 default:
755 VM_WARN_ON_IRQS_ENABLED();
756 }
757 }
758
759 /* Update memcg */
760 __this_cpu_add(memcg->vmstats_percpu->state[i], val);
761
762 /* Update lruvec */
763 __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
764
765 val = memcg_state_val_in_pages(idx, val);
766 memcg_rstat_updated(memcg, val);
767 trace_mod_memcg_lruvec_state(memcg, idx, val);
768 memcg_stats_unlock();
769}
770
771/**
772 * __mod_lruvec_state - update lruvec memory statistics
773 * @lruvec: the lruvec
774 * @idx: the stat item
775 * @val: delta to add to the counter, can be negative
776 *
777 * The lruvec is the intersection of the NUMA node and a cgroup. This
778 * function updates the all three counters that are affected by a
779 * change of state at this level: per-node, per-cgroup, per-lruvec.
780 */
781void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
782 int val)
783{
784 /* Update node */
785 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
786
787 /* Update memcg and lruvec */
788 if (!mem_cgroup_disabled())
789 __mod_memcg_lruvec_state(lruvec, idx, val);
790}
791
792void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
793 int val)
794{
795 struct mem_cgroup *memcg;
796 pg_data_t *pgdat = folio_pgdat(folio);
797 struct lruvec *lruvec;
798
799 rcu_read_lock();
800 memcg = folio_memcg(folio);
801 /* Untracked pages have no memcg, no lruvec. Update only the node */
802 if (!memcg) {
803 rcu_read_unlock();
804 __mod_node_page_state(pgdat, idx, val);
805 return;
806 }
807
808 lruvec = mem_cgroup_lruvec(memcg, pgdat);
809 __mod_lruvec_state(lruvec, idx, val);
810 rcu_read_unlock();
811}
812EXPORT_SYMBOL(__lruvec_stat_mod_folio);
813
814void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
815{
816 pg_data_t *pgdat = page_pgdat(virt_to_page(p));
817 struct mem_cgroup *memcg;
818 struct lruvec *lruvec;
819
820 rcu_read_lock();
821 memcg = mem_cgroup_from_slab_obj(p);
822
823 /*
824 * Untracked pages have no memcg, no lruvec. Update only the
825 * node. If the slab objects have been reparented to the root memcg,
826 * we still need to update the per-memcg vmstats when the slab object
827 * is freed, to keep them correct for the root memcg.
828 */
829 if (!memcg) {
830 __mod_node_page_state(pgdat, idx, val);
831 } else {
832 lruvec = mem_cgroup_lruvec(memcg, pgdat);
833 __mod_lruvec_state(lruvec, idx, val);
834 }
835 rcu_read_unlock();
836}
837
838/**
839 * __count_memcg_events - account VM events in a cgroup
840 * @memcg: the memory cgroup
841 * @idx: the event item
842 * @count: the number of events that occurred
843 */
844void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
845 unsigned long count)
846{
847 int i = memcg_events_index(idx);
848
849 if (mem_cgroup_disabled())
850 return;
851
852 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
853 return;
854
855 memcg_stats_lock();
856 __this_cpu_add(memcg->vmstats_percpu->events[i], count);
857 memcg_rstat_updated(memcg, count);
858 trace_count_memcg_events(memcg, idx, count);
859 memcg_stats_unlock();
860}
861
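/* Hierarchical count of @event for @memcg and its subtree, as of the last flush. */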
862unsigned long memcg_events(struct mem_cgroup *memcg, int event)
863{
864 int i = memcg_events_index(event);
865
866 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
867 return 0;
868
869 return READ_ONCE(memcg->vmstats->events[i]);
870}
871
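/* Non-hierarchical count of @event for @memcg alone, as of the last flush. */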
872unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
873{
874 int i = memcg_events_index(event);
875
876 if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
877 return 0;
878
879 return READ_ONCE(memcg->vmstats->events_local[i]);
880}
881
882struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
883{
884 /*
885 * mm_update_next_owner() may clear mm->owner to NULL
886 * if it races with swapoff, page migration, etc.
887 * So this can be called with p == NULL.
888 */
889 if (unlikely(!p))
890 return NULL;
891
892 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
893}
894EXPORT_SYMBOL(mem_cgroup_from_task);
895
896static __always_inline struct mem_cgroup *active_memcg(void)
897{
898 if (!in_task())
899 return this_cpu_read(int_active_memcg);
900 else
901 return current->active_memcg;
902}
903
904/**
905 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
906 * @mm: mm from which memcg should be extracted. It can be NULL.
907 *
908 * Obtain a reference on mm->memcg and return it if successful. If mm
909 * is NULL, then the memcg is chosen as follows:
910 * 1) The active memcg, if set.
911 * 2) current->mm->memcg, if available
912 * 3) root memcg
913 * If mem_cgroup is disabled, NULL is returned.
914 */
915struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
916{
917 struct mem_cgroup *memcg;
918
919 if (mem_cgroup_disabled())
920 return NULL;
921
922 /*
923 * Page cache insertions can happen without an
924 * actual mm context, e.g. during disk probing
925 * on boot, loopback IO, acct() writes etc.
926 *
927 * No need to css_get on root memcg as the reference
928 * counting is disabled on the root level in the
929 * cgroup core. See CSS_NO_REF.
930 */
931 if (unlikely(!mm)) {
932 memcg = active_memcg();
933 if (unlikely(memcg)) {
934 /* remote memcg must hold a ref */
935 css_get(&memcg->css);
936 return memcg;
937 }
938 mm = current->mm;
939 if (unlikely(!mm))
940 return root_mem_cgroup;
941 }
942
943 rcu_read_lock();
944 do {
945 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
946 if (unlikely(!memcg))
947 memcg = root_mem_cgroup;
948 } while (!css_tryget(&memcg->css));
949 rcu_read_unlock();
950 return memcg;
951}
952EXPORT_SYMBOL(get_mem_cgroup_from_mm);
953
954/**
955 * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
956 */
957struct mem_cgroup *get_mem_cgroup_from_current(void)
958{
959 struct mem_cgroup *memcg;
960
961 if (mem_cgroup_disabled())
962 return NULL;
963
964again:
965 rcu_read_lock();
966 memcg = mem_cgroup_from_task(current);
967 if (!css_tryget(&memcg->css)) {
968 rcu_read_unlock();
969 goto again;
970 }
971 rcu_read_unlock();
972 return memcg;
973}
974
975/**
976 * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
977 * @folio: folio from which memcg should be extracted.
978 */
979struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
980{
981 struct mem_cgroup *memcg = folio_memcg(folio);
982
983 if (mem_cgroup_disabled())
984 return NULL;
985
986 rcu_read_lock();
987 if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
988 memcg = root_mem_cgroup;
989 rcu_read_unlock();
990 return memcg;
991}
992
993/**
994 * mem_cgroup_iter - iterate over memory cgroup hierarchy
995 * @root: hierarchy root
996 * @prev: previously returned memcg, NULL on first invocation
997 * @reclaim: cookie for shared reclaim walks, NULL for full walks
998 *
999 * Returns references to children of the hierarchy below @root, or
1000 * @root itself, or %NULL after a full round-trip.
1001 *
1002 * Caller must pass the return value in @prev on subsequent
1003 * invocations for reference counting, or use mem_cgroup_iter_break()
1004 * to cancel a hierarchy walk before the round-trip is complete.
1005 *
1006 * Reclaimers can specify a node in @reclaim to divide up the memcgs
1007 * in the hierarchy among all concurrent reclaimers operating on the
1008 * same node.
1009 */
1010struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1011 struct mem_cgroup *prev,
1012 struct mem_cgroup_reclaim_cookie *reclaim)
1013{
1014 struct mem_cgroup_reclaim_iter *iter;
1015 struct cgroup_subsys_state *css;
1016 struct mem_cgroup *pos;
1017 struct mem_cgroup *next;
1018
1019 if (mem_cgroup_disabled())
1020 return NULL;
1021
1022 if (!root)
1023 root = root_mem_cgroup;
1024
1025 rcu_read_lock();
1026restart:
1027 next = NULL;
1028
1029 if (reclaim) {
1030 int gen;
1031 int nid = reclaim->pgdat->node_id;
1032
1033 iter = &root->nodeinfo[nid]->iter;
1034 gen = atomic_read(&iter->generation);
1035
1036 /*
1037 * On start, join the current reclaim iteration cycle.
1038 * Exit when a concurrent walker completes it.
1039 */
1040 if (!prev)
1041 reclaim->generation = gen;
1042 else if (reclaim->generation != gen)
1043 goto out_unlock;
1044
1045 pos = READ_ONCE(iter->position);
1046 } else
1047 pos = prev;
1048
1049 css = pos ? &pos->css : NULL;
1050
1051 while ((css = css_next_descendant_pre(css, &root->css))) {
1052 /*
1053 * Verify the css and acquire a reference. The root
1054 * is provided by the caller, so we know it's alive
1055 * and kicking, and don't take an extra reference.
1056 */
1057 if (css == &root->css || css_tryget(css))
1058 break;
1059 }
1060
1061 next = mem_cgroup_from_css(css);
1062
1063 if (reclaim) {
1064 /*
1065 * The position could have already been updated by a competing
1066 * thread, so check that the value hasn't changed since we read
1067 * it to avoid reclaiming from the same cgroup twice.
1068 */
1069 if (cmpxchg(&iter->position, pos, next) != pos) {
1070 if (css && css != &root->css)
1071 css_put(css);
1072 goto restart;
1073 }
1074
1075 if (!next) {
1076 atomic_inc(&iter->generation);
1077
1078 /*
1079 * Reclaimers share the hierarchy walk, and a
1080 * new one might jump in right at the end of
1081 * the hierarchy - make sure they see at least
1082 * one group and restart from the beginning.
1083 */
1084 if (!prev)
1085 goto restart;
1086 }
1087 }
1088
1089out_unlock:
1090 rcu_read_unlock();
1091 if (prev && prev != root)
1092 css_put(&prev->css);
1093
1094 return next;
1095}
1096
1097/**
1098 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1099 * @root: hierarchy root
1100 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1101 */
1102void mem_cgroup_iter_break(struct mem_cgroup *root,
1103 struct mem_cgroup *prev)
1104{
1105 if (!root)
1106 root = root_mem_cgroup;
1107 if (prev && prev != root)
1108 css_put(&prev->css);
1109}
1110
1111static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1112 struct mem_cgroup *dead_memcg)
1113{
1114 struct mem_cgroup_reclaim_iter *iter;
1115 struct mem_cgroup_per_node *mz;
1116 int nid;
1117
1118 for_each_node(nid) {
1119 mz = from->nodeinfo[nid];
1120 iter = &mz->iter;
1121 cmpxchg(&iter->position, dead_memcg, NULL);
1122 }
1123}
1124
1125static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1126{
1127 struct mem_cgroup *memcg = dead_memcg;
1128 struct mem_cgroup *last;
1129
1130 do {
1131 __invalidate_reclaim_iterators(memcg, dead_memcg);
1132 last = memcg;
1133 } while ((memcg = parent_mem_cgroup(memcg)));
1134
1135 /*
1136 * When cgroup1 non-hierarchy mode is used,
1137 * parent_mem_cgroup() does not walk all the way up to the
1138 * cgroup root (root_mem_cgroup). So we have to handle
1139 * dead_memcg from cgroup root separately.
1140 */
1141 if (!mem_cgroup_is_root(last))
1142 __invalidate_reclaim_iterators(root_mem_cgroup,
1143 dead_memcg);
1144}
1145
1146/**
1147 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1148 * @memcg: hierarchy root
1149 * @fn: function to call for each task
1150 * @arg: argument passed to @fn
1151 *
1152 * This function iterates over tasks attached to @memcg or to any of its
1153 * descendants and calls @fn for each task. If @fn returns a non-zero
1154 * value, the iteration stops; otherwise @fn is called for every task in
1155 * the hierarchy.
1156 *
1157 * This function must not be called for the root memory cgroup.
1158 */
1159void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1160 int (*fn)(struct task_struct *, void *), void *arg)
1161{
1162 struct mem_cgroup *iter;
1163 int ret = 0;
1164 int i = 0;
1165
1166 BUG_ON(mem_cgroup_is_root(memcg));
1167
1168 for_each_mem_cgroup_tree(iter, memcg) {
1169 struct css_task_iter it;
1170 struct task_struct *task;
1171
1172 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1173 while (!ret && (task = css_task_iter_next(&it))) {
1174 /* Avoid potential softlockup warning */
1175 if ((++i & 1023) == 0)
1176 cond_resched();
1177 ret = fn(task, arg);
1178 }
1179 css_task_iter_end(&it);
1180 if (ret) {
1181 mem_cgroup_iter_break(memcg, iter);
1182 break;
1183 }
1184 }
1185}
1186
1187#ifdef CONFIG_DEBUG_VM
1188void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
1189{
1190 struct mem_cgroup *memcg;
1191
1192 if (mem_cgroup_disabled())
1193 return;
1194
1195 memcg = folio_memcg(folio);
1196
1197 if (!memcg)
1198 VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
1199 else
1200 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
1201}
1202#endif
1203
1204/**
1205 * folio_lruvec_lock - Lock the lruvec for a folio.
1206 * @folio: Pointer to the folio.
1207 *
1208 * These functions are safe to use under any of the following conditions:
1209 * - folio locked
1210 * - folio_test_lru false
1211 * - folio frozen (refcount of 0)
1212 *
1213 * Return: The lruvec this folio is on with its lock held.
1214 */
1215struct lruvec *folio_lruvec_lock(struct folio *folio)
1216{
1217 struct lruvec *lruvec = folio_lruvec(folio);
1218
1219 spin_lock(&lruvec->lru_lock);
1220 lruvec_memcg_debug(lruvec, folio);
1221
1222 return lruvec;
1223}
1224
1225/**
1226 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1227 * @folio: Pointer to the folio.
1228 *
1229 * These functions are safe to use under any of the following conditions:
1230 * - folio locked
1231 * - folio_test_lru false
1232 * - folio frozen (refcount of 0)
1233 *
1234 * Return: The lruvec this folio is on with its lock held and interrupts
1235 * disabled.
1236 */
1237struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1238{
1239 struct lruvec *lruvec = folio_lruvec(folio);
1240
1241 spin_lock_irq(&lruvec->lru_lock);
1242 lruvec_memcg_debug(lruvec, folio);
1243
1244 return lruvec;
1245}
1246
1247/**
1248 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1249 * @folio: Pointer to the folio.
1250 * @flags: Pointer to irqsave flags.
1251 *
1252 * These functions are safe to use under any of the following conditions:
1253 * - folio locked
1254 * - folio_test_lru false
1255 * - folio frozen (refcount of 0)
1256 *
1257 * Return: The lruvec this folio is on with its lock held and interrupts
1258 * disabled.
1259 */
1260struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1261 unsigned long *flags)
1262{
1263 struct lruvec *lruvec = folio_lruvec(folio);
1264
1265 spin_lock_irqsave(&lruvec->lru_lock, *flags);
1266 lruvec_memcg_debug(lruvec, folio);
1267
1268 return lruvec;
1269}
1270
1271/**
1272 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1273 * @lruvec: mem_cgroup per zone lru vector
1274 * @lru: index of lru list the page is sitting on
1275 * @zid: zone id of the accounted pages
1276 * @nr_pages: positive when adding or negative when removing
1277 *
1278 * This function must be called under lru_lock, just before a page is added
1279 * to or just after a page is removed from an lru list.
1280 */
1281void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1282 int zid, int nr_pages)
1283{
1284 struct mem_cgroup_per_node *mz;
1285 unsigned long *lru_size;
1286 long size;
1287
1288 if (mem_cgroup_disabled())
1289 return;
1290
1291 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1292 lru_size = &mz->lru_zone_size[zid][lru];
1293
1294 if (nr_pages < 0)
1295 *lru_size += nr_pages;
1296
1297 size = *lru_size;
1298 if (WARN_ONCE(size < 0,
1299 "%s(%p, %d, %d): lru_size %ld\n",
1300 __func__, lruvec, lru, nr_pages, size)) {
1301 VM_BUG_ON(1);
1302 *lru_size = 0;
1303 }
1304
1305 if (nr_pages > 0)
1306 *lru_size += nr_pages;
1307}
1308
1309/**
1310 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1311 * @memcg: the memory cgroup
1312 *
1313 * Returns the maximum amount of memory @memcg can be charged with, in
1314 * pages.
1315 */
1316static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1317{
1318 unsigned long margin = 0;
1319 unsigned long count;
1320 unsigned long limit;
1321
1322 count = page_counter_read(&memcg->memory);
1323 limit = READ_ONCE(memcg->memory.max);
1324 if (count < limit)
1325 margin = limit - count;
1326
1327 if (do_memsw_account()) {
1328 count = page_counter_read(&memcg->memsw);
1329 limit = READ_ONCE(memcg->memsw.max);
1330 if (count < limit)
1331 margin = min(margin, limit - count);
1332 else
1333 margin = 0;
1334 }
1335
1336 return margin;
1337}
1338
1339struct memory_stat {
1340 const char *name;
1341 unsigned int idx;
1342};
1343
1344static const struct memory_stat memory_stats[] = {
1345 { "anon", NR_ANON_MAPPED },
1346 { "file", NR_FILE_PAGES },
1347 { "kernel", MEMCG_KMEM },
1348 { "kernel_stack", NR_KERNEL_STACK_KB },
1349 { "pagetables", NR_PAGETABLE },
1350 { "sec_pagetables", NR_SECONDARY_PAGETABLE },
1351 { "percpu", MEMCG_PERCPU_B },
1352 { "sock", MEMCG_SOCK },
1353 { "vmalloc", MEMCG_VMALLOC },
1354 { "shmem", NR_SHMEM },
1355#ifdef CONFIG_ZSWAP
1356 { "zswap", MEMCG_ZSWAP_B },
1357 { "zswapped", MEMCG_ZSWAPPED },
1358#endif
1359 { "file_mapped", NR_FILE_MAPPED },
1360 { "file_dirty", NR_FILE_DIRTY },
1361 { "file_writeback", NR_WRITEBACK },
1362#ifdef CONFIG_SWAP
1363 { "swapcached", NR_SWAPCACHE },
1364#endif
1365#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1366 { "anon_thp", NR_ANON_THPS },
1367 { "file_thp", NR_FILE_THPS },
1368 { "shmem_thp", NR_SHMEM_THPS },
1369#endif
1370 { "inactive_anon", NR_INACTIVE_ANON },
1371 { "active_anon", NR_ACTIVE_ANON },
1372 { "inactive_file", NR_INACTIVE_FILE },
1373 { "active_file", NR_ACTIVE_FILE },
1374 { "unevictable", NR_UNEVICTABLE },
1375 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
1376 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
1377#ifdef CONFIG_HUGETLB_PAGE
1378 { "hugetlb", NR_HUGETLB },
1379#endif
1380
1381 /* The memory events */
1382 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
1383 { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
1384 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
1385 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
1386 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
1387 { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
1388 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
1389
1390 { "pgdemote_kswapd", PGDEMOTE_KSWAPD },
1391 { "pgdemote_direct", PGDEMOTE_DIRECT },
1392 { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
1393#ifdef CONFIG_NUMA_BALANCING
1394 { "pgpromote_success", PGPROMOTE_SUCCESS },
1395#endif
1396};
1397
1398/* The actual unit of the state item, not the same as the output unit */
1399static int memcg_page_state_unit(int item)
1400{
1401 switch (item) {
1402 case MEMCG_PERCPU_B:
1403 case MEMCG_ZSWAP_B:
1404 case NR_SLAB_RECLAIMABLE_B:
1405 case NR_SLAB_UNRECLAIMABLE_B:
1406 return 1;
1407 case NR_KERNEL_STACK_KB:
1408 return SZ_1K;
1409 default:
1410 return PAGE_SIZE;
1411 }
1412}
1413
1414/* Translate stat items to the correct unit for memory.stat output */
1415static int memcg_page_state_output_unit(int item)
1416{
1417 /*
1418 * Workingset state is actually in pages, but we export it to userspace
1419 * as a scalar count of events, so special case it here.
1420 *
1421 * Demotion and promotion activities are exported in pages, consistent
1422 * with their global counterparts.
1423 */
1424 switch (item) {
1425 case WORKINGSET_REFAULT_ANON:
1426 case WORKINGSET_REFAULT_FILE:
1427 case WORKINGSET_ACTIVATE_ANON:
1428 case WORKINGSET_ACTIVATE_FILE:
1429 case WORKINGSET_RESTORE_ANON:
1430 case WORKINGSET_RESTORE_FILE:
1431 case WORKINGSET_NODERECLAIM:
1432 case PGDEMOTE_KSWAPD:
1433 case PGDEMOTE_DIRECT:
1434 case PGDEMOTE_KHUGEPAGED:
1435#ifdef CONFIG_NUMA_BALANCING
1436 case PGPROMOTE_SUCCESS:
1437#endif
1438 return 1;
1439 default:
1440 return memcg_page_state_unit(item);
1441 }
1442}
1443
1444unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
1445{
1446 return memcg_page_state(memcg, item) *
1447 memcg_page_state_output_unit(item);
1448}
1449
1450unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
1451{
1452 return memcg_page_state_local(memcg, item) *
1453 memcg_page_state_output_unit(item);
1454}
1455
1456static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1457{
1458 int i;
1459
1460 /*
1461 * Provide statistics on the state of the memory subsystem as
1462 * well as cumulative event counters that show past behavior.
1463 *
1464 * This list is ordered following a combination of these gradients:
1465 * 1) generic big picture -> specifics and details
1466 * 2) reflecting userspace activity -> reflecting kernel heuristics
1467 *
1468 * Current memory state:
1469 */
1470 mem_cgroup_flush_stats(memcg);
1471
1472 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1473 u64 size;
1474
1475#ifdef CONFIG_HUGETLB_PAGE
1476 if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
1477 !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
1478 continue;
1479#endif
1480 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1481 seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
1482
1483 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1484 size += memcg_page_state_output(memcg,
1485 NR_SLAB_RECLAIMABLE_B);
1486 seq_buf_printf(s, "slab %llu\n", size);
1487 }
1488 }
1489
1490 /* Accumulated memory events */
1491 seq_buf_printf(s, "pgscan %lu\n",
1492 memcg_events(memcg, PGSCAN_KSWAPD) +
1493 memcg_events(memcg, PGSCAN_DIRECT) +
1494 memcg_events(memcg, PGSCAN_KHUGEPAGED));
1495 seq_buf_printf(s, "pgsteal %lu\n",
1496 memcg_events(memcg, PGSTEAL_KSWAPD) +
1497 memcg_events(memcg, PGSTEAL_DIRECT) +
1498 memcg_events(memcg, PGSTEAL_KHUGEPAGED));
1499
1500 for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
1501#ifdef CONFIG_MEMCG_V1
1502 if (memcg_vm_event_stat[i] == PGPGIN ||
1503 memcg_vm_event_stat[i] == PGPGOUT)
1504 continue;
1505#endif
1506 seq_buf_printf(s, "%s %lu\n",
1507 vm_event_name(memcg_vm_event_stat[i]),
1508 memcg_events(memcg, memcg_vm_event_stat[i]));
1509 }
1510}
1511
1512static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1513{
1514 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1515 memcg_stat_format(memcg, s);
1516 else
1517 memcg1_stat_format(memcg, s);
1518 if (seq_buf_has_overflowed(s))
1519 pr_warn("%s: Warning, stat buffer overflow, please report\n", __func__);
1520}
1521
1522/**
1523 * mem_cgroup_print_oom_context: Print OOM information relevant to
1524 * memory controller.
1525 * @memcg: The memory cgroup that went over limit
1526 * @p: Task that is going to be killed
1527 *
1528 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1529 * enabled
1530 */
1531void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1532{
1533 rcu_read_lock();
1534
1535 if (memcg) {
1536 pr_cont(",oom_memcg=");
1537 pr_cont_cgroup_path(memcg->css.cgroup);
1538 } else
1539 pr_cont(",global_oom");
1540 if (p) {
1541 pr_cont(",task_memcg=");
1542 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1543 }
1544 rcu_read_unlock();
1545}
1546
1547/**
1548 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1549 * memory controller.
1550 * @memcg: The memory cgroup that went over limit
1551 */
1552void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1553{
1554 /* Use a static buffer; the caller is holding oom_lock. */
1555 static char buf[SEQ_BUF_SIZE];
1556 struct seq_buf s;
1557
1558 lockdep_assert_held(&oom_lock);
1559
1560 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1561 K((u64)page_counter_read(&memcg->memory)),
1562 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1563 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1564 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1565 K((u64)page_counter_read(&memcg->swap)),
1566 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1567#ifdef CONFIG_MEMCG_V1
1568 else {
1569 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1570 K((u64)page_counter_read(&memcg->memsw)),
1571 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1572 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1573 K((u64)page_counter_read(&memcg->kmem)),
1574 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1575 }
1576#endif
1577
1578 pr_info("Memory cgroup stats for ");
1579 pr_cont_cgroup_path(memcg->css.cgroup);
1580 pr_cont(":");
1581 seq_buf_init(&s, buf, SEQ_BUF_SIZE);
1582 memory_stat_format(memcg, &s);
1583 seq_buf_do_printk(&s, KERN_INFO);
1584}
1585
1586/*
1587 * Return the memory (and swap, if configured) limit for a memcg.
1588 */
1589unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1590{
1591 unsigned long max = READ_ONCE(memcg->memory.max);
1592
1593 if (do_memsw_account()) {
1594 if (mem_cgroup_swappiness(memcg)) {
1595 /* Calculate swap excess capacity from memsw limit */
1596 unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1597
1598 max += min(swap, (unsigned long)total_swap_pages);
1599 }
1600 } else {
1601 if (mem_cgroup_swappiness(memcg))
1602 max += min(READ_ONCE(memcg->swap.max),
1603 (unsigned long)total_swap_pages);
1604 }
1605 return max;
1606}
1607
1608unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1609{
1610 return page_counter_read(&memcg->memory);
1611}
1612
1613static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1614 int order)
1615{
1616 struct oom_control oc = {
1617 .zonelist = NULL,
1618 .nodemask = NULL,
1619 .memcg = memcg,
1620 .gfp_mask = gfp_mask,
1621 .order = order,
1622 };
1623 bool ret = true;
1624
1625 if (mutex_lock_killable(&oom_lock))
1626 return true;
1627
1628 if (mem_cgroup_margin(memcg) >= (1 << order))
1629 goto unlock;
1630
1631 /*
1632 * A few threads which were not waiting at mutex_lock_killable() can
1633 * fail to bail out. Therefore, check again after holding oom_lock.
1634 */
1635 ret = task_is_dying() || out_of_memory(&oc);
1636
1637unlock:
1638 mutex_unlock(&oom_lock);
1639 return ret;
1640}
1641
1642/*
1643 * Returns true if it successfully killed one or more processes, though in
1644 * some corner cases it can return true even without killing any process.
1645 */
1646static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1647{
1648 bool locked, ret;
1649
1650 if (order > PAGE_ALLOC_COSTLY_ORDER)
1651 return false;
1652
1653 memcg_memory_event(memcg, MEMCG_OOM);
1654
1655 if (!memcg1_oom_prepare(memcg, &locked))
1656 return false;
1657
1658 ret = mem_cgroup_out_of_memory(memcg, mask, order);
1659
1660 memcg1_oom_finish(memcg, locked);
1661
1662 return ret;
1663}
1664
1665/**
1666 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1667 * @victim: task to be killed by the OOM killer
1668 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1669 *
1670 * Returns a pointer to a memory cgroup, which has to be cleaned up
1671 * by killing all OOM-killable tasks belonging to it.
1672 *
1673 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1674 */
1675struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1676 struct mem_cgroup *oom_domain)
1677{
1678 struct mem_cgroup *oom_group = NULL;
1679 struct mem_cgroup *memcg;
1680
1681 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1682 return NULL;
1683
1684 if (!oom_domain)
1685 oom_domain = root_mem_cgroup;
1686
1687 rcu_read_lock();
1688
1689 memcg = mem_cgroup_from_task(victim);
1690 if (mem_cgroup_is_root(memcg))
1691 goto out;
1692
1693 /*
1694 * If the victim task has been asynchronously moved to a different
1695 * memory cgroup, we might end up killing tasks outside oom_domain.
1696 * In this case it's better to ignore memory.group.oom.
1697 */
1698 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1699 goto out;
1700
1701 /*
1702 * Traverse the memory cgroup hierarchy from the victim task's
1703 * cgroup up to the OOMing cgroup (or root) to find the
1704 * highest-level memory cgroup with oom.group set.
1705 */
1706 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1707 if (READ_ONCE(memcg->oom_group))
1708 oom_group = memcg;
1709
1710 if (memcg == oom_domain)
1711 break;
1712 }
1713
1714 if (oom_group)
1715 css_get(&oom_group->css);
1716out:
1717 rcu_read_unlock();
1718
1719 return oom_group;
1720}
1721
1722void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1723{
1724 pr_info("Tasks in ");
1725 pr_cont_cgroup_path(memcg->css.cgroup);
1726 pr_cont(" are going to be killed due to memory.oom.group set\n");
1727}
1728
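/*
 * Per-CPU cache ("stock") of pre-charged pages and objcg bytes.
 * consume_stock() serves charges from it without touching the page counters,
 * refill_stock() tops it up with surplus charges, and drain_stock() /
 * drain_all_stock() return the cached charges to the page counters.
 */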
1729struct memcg_stock_pcp {
1730 local_lock_t stock_lock;
1731 struct mem_cgroup *cached; /* this is never the root cgroup */
1732 unsigned int nr_pages;
1733
1734 struct obj_cgroup *cached_objcg;
1735 struct pglist_data *cached_pgdat;
1736 unsigned int nr_bytes;
1737 int nr_slab_reclaimable_b;
1738 int nr_slab_unreclaimable_b;
1739
1740 struct work_struct work;
1741 unsigned long flags;
1742#define FLUSHING_CACHED_CHARGE 0
1743};
1744static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
1745 .stock_lock = INIT_LOCAL_LOCK(stock_lock),
1746};
1747static DEFINE_MUTEX(percpu_charge_mutex);
1748
1749static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
1750static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
1751 struct mem_cgroup *root_memcg);
1752
1753/**
1754 * consume_stock: Try to consume stocked charge on this cpu.
1755 * @memcg: memcg to consume from.
1756 * @nr_pages: how many pages to charge.
1757 *
1758 * The charges will only happen if @memcg matches the current cpu's memcg
1759 * stock, and at least @nr_pages are available in that stock. Failure to
1760 * service an allocation will refill the stock.
1761 *
1762 * returns true if successful, false otherwise.
1763 */
1764static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1765{
1766 struct memcg_stock_pcp *stock;
1767 unsigned int stock_pages;
1768 unsigned long flags;
1769 bool ret = false;
1770
1771 if (nr_pages > MEMCG_CHARGE_BATCH)
1772 return ret;
1773
1774 local_lock_irqsave(&memcg_stock.stock_lock, flags);
1775
1776 stock = this_cpu_ptr(&memcg_stock);
1777 stock_pages = READ_ONCE(stock->nr_pages);
1778 if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
1779 WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
1780 ret = true;
1781 }
1782
1783 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1784
1785 return ret;
1786}
1787
1788/*
1789 * Returns stocks cached in percpu and reset cached information.
1790 */
1791static void drain_stock(struct memcg_stock_pcp *stock)
1792{
1793 unsigned int stock_pages = READ_ONCE(stock->nr_pages);
1794 struct mem_cgroup *old = READ_ONCE(stock->cached);
1795
1796 if (!old)
1797 return;
1798
1799 if (stock_pages) {
1800 page_counter_uncharge(&old->memory, stock_pages);
1801 if (do_memsw_account())
1802 page_counter_uncharge(&old->memsw, stock_pages);
1803
1804 WRITE_ONCE(stock->nr_pages, 0);
1805 }
1806
1807 css_put(&old->css);
1808 WRITE_ONCE(stock->cached, NULL);
1809}
1810
1811static void drain_local_stock(struct work_struct *dummy)
1812{
1813 struct memcg_stock_pcp *stock;
1814 struct obj_cgroup *old = NULL;
1815 unsigned long flags;
1816
1817 /*
1818 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
1819 * drain_stock races is that we always operate on the local CPU stock
1820 * here with IRQs disabled.
1821 */
1822 local_lock_irqsave(&memcg_stock.stock_lock, flags);
1823
1824 stock = this_cpu_ptr(&memcg_stock);
1825 old = drain_obj_stock(stock);
1826 drain_stock(stock);
1827 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1828
1829 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1830 obj_cgroup_put(old);
1831}
1832
1833/*
1834 * Cache charges (nr_pages) in the local per-CPU area.
1835 * They will be consumed by consume_stock() later.
1836 */
1837static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1838{
1839 struct memcg_stock_pcp *stock;
1840 unsigned int stock_pages;
1841
1842 stock = this_cpu_ptr(&memcg_stock);
1843 if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
1844 drain_stock(stock);
1845 css_get(&memcg->css);
1846 WRITE_ONCE(stock->cached, memcg);
1847 }
1848 stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
1849 WRITE_ONCE(stock->nr_pages, stock_pages);
1850
1851 if (stock_pages > MEMCG_CHARGE_BATCH)
1852 drain_stock(stock);
1853}
1854
1855static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1856{
1857 unsigned long flags;
1858
1859 local_lock_irqsave(&memcg_stock.stock_lock, flags);
1860 __refill_stock(memcg, nr_pages);
1861 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1862}
1863
1864/*
1865 * Drain all per-CPU charge caches for the given root_memcg and the
1866 * subtree of the hierarchy under it.
1867 */
1868void drain_all_stock(struct mem_cgroup *root_memcg)
1869{
1870 int cpu, curcpu;
1871
1872 /* If someone's already draining, avoid running more workers. */
1873 if (!mutex_trylock(&percpu_charge_mutex))
1874 return;
1875 /*
1876 * Notify other cpus that a system-wide "drain" is running.
1877 * We do not care about races with the cpu hotplug because cpu down
1878 * as well as workers from this path always operate on the local
1879 * per-cpu data. CPU up doesn't touch memcg_stock at all.
1880 */
1881 migrate_disable();
1882 curcpu = smp_processor_id();
1883 for_each_online_cpu(cpu) {
1884 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1885 struct mem_cgroup *memcg;
1886 bool flush = false;
1887
1888 rcu_read_lock();
1889 memcg = READ_ONCE(stock->cached);
1890 if (memcg && READ_ONCE(stock->nr_pages) &&
1891 mem_cgroup_is_descendant(memcg, root_memcg))
1892 flush = true;
1893 else if (obj_stock_flush_required(stock, root_memcg))
1894 flush = true;
1895 rcu_read_unlock();
1896
1897 if (flush &&
1898 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1899 if (cpu == curcpu)
1900 drain_local_stock(&stock->work);
1901 else if (!cpu_is_isolated(cpu))
1902 schedule_work_on(cpu, &stock->work);
1903 }
1904 }
1905 migrate_enable();
1906 mutex_unlock(&percpu_charge_mutex);
1907}
1908
1909static int memcg_hotplug_cpu_dead(unsigned int cpu)
1910{
1911 struct memcg_stock_pcp *stock;
1912
1913 stock = &per_cpu(memcg_stock, cpu);
1914 drain_stock(stock);
1915
1916 return 0;
1917}
1918
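/*
 * Reclaim from @memcg and each of its ancestors that is above its
 * memory.high, recording a MEMCG_HIGH event for every such level.
 * Returns the total number of pages reclaimed.
 */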
1919static unsigned long reclaim_high(struct mem_cgroup *memcg,
1920 unsigned int nr_pages,
1921 gfp_t gfp_mask)
1922{
1923 unsigned long nr_reclaimed = 0;
1924
1925 do {
1926 unsigned long pflags;
1927
1928 if (page_counter_read(&memcg->memory) <=
1929 READ_ONCE(memcg->memory.high))
1930 continue;
1931
1932 memcg_memory_event(memcg, MEMCG_HIGH);
1933
1934 psi_memstall_enter(&pflags);
1935 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
1936 gfp_mask,
1937 MEMCG_RECLAIM_MAY_SWAP,
1938 NULL);
1939 psi_memstall_leave(&pflags);
1940 } while ((memcg = parent_mem_cgroup(memcg)) &&
1941 !mem_cgroup_is_root(memcg));
1942
1943 return nr_reclaimed;
1944}
1945
1946static void high_work_func(struct work_struct *work)
1947{
1948 struct mem_cgroup *memcg;
1949
1950 memcg = container_of(work, struct mem_cgroup, high_work);
1951 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
1952}
1953
1954/*
1955 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
1956 * enough to cause a significant slowdown in most cases, while still
1957 * allowing diagnostics and tracing to proceed without becoming stuck.
1958 */
1959#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
1960
1961/*
1962 * When calculating the delay, we use these on either side of the exponentiation
1963 * to maintain precision and scale to a reasonable number of jiffies (see the
1964 * table below).
1965 *
1966 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
1967 * overage ratio to a delay.
1968 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
1969 * proposed penalty in order to reduce to a reasonable number of jiffies, and
1970 * to produce a reasonable delay curve.
1971 *
1972 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
1973 * reasonable delay curve compared to precision-adjusted overage, not
1974 * penalising heavily at first, but still making sure that growth beyond the
1975 * limit penalises misbehaving cgroups by slowing them down exponentially. For
1976 * example, with a high of 100 megabytes:
1977 *
1978 * +-------+------------------------+
1979 * | usage | time to allocate in ms |
1980 * +-------+------------------------+
1981 * | 100M | 0 |
1982 * | 101M | 6 |
1983 * | 102M | 25 |
1984 * | 103M | 57 |
1985 * | 104M | 102 |
1986 * | 105M | 159 |
1987 * | 106M | 230 |
1988 * | 107M | 313 |
1989 * | 108M | 409 |
1990 * | 109M | 518 |
1991 * | 110M | 639 |
1992 * | 111M | 774 |
1993 * | 112M | 921 |
1994 * | 113M | 1081 |
1995 * | 114M | 1254 |
1996 * | 115M | 1439 |
1997 * | 116M | 1638 |
1998 * | 117M | 1849 |
1999 * | 118M | 2000 |
2000 * | 119M | 2000 |
2001 * | 120M | 2000 |
2002 * +-------+------------------------+
2003 */
2004 #define MEMCG_DELAY_PRECISION_SHIFT 20
2005 #define MEMCG_DELAY_SCALING_SHIFT 14
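/*
 * Worked example for the 110M row in the table above (high = 100M):
 * overage = ((110M - 100M) << MEMCG_DELAY_PRECISION_SHIFT) / 100M ~= 104857,
 * penalty = overage^2 * HZ >> (20 + 14) ~= 0.64 * HZ jiffies, i.e. the ~639 ms
 * shown above, before calculate_high_delay() scales the result by
 * nr_pages / MEMCG_CHARGE_BATCH.
 */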
2006
2007static u64 calculate_overage(unsigned long usage, unsigned long high)
2008{
2009 u64 overage;
2010
2011 if (usage <= high)
2012 return 0;
2013
2014 /*
2015 * Prevent division by 0 in overage calculation by acting as if
2016 * it was a threshold of 1 page
2017 */
2018 high = max(high, 1UL);
2019
2020 overage = usage - high;
2021 overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2022 return div64_u64(overage, high);
2023}
2024
2025static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2026{
2027 u64 overage, max_overage = 0;
2028
2029 do {
2030 overage = calculate_overage(page_counter_read(&memcg->memory),
2031 READ_ONCE(memcg->memory.high));
2032 max_overage = max(overage, max_overage);
2033 } while ((memcg = parent_mem_cgroup(memcg)) &&
2034 !mem_cgroup_is_root(memcg));
2035
2036 return max_overage;
2037}
2038
2039static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2040{
2041 u64 overage, max_overage = 0;
2042
2043 do {
2044 overage = calculate_overage(page_counter_read(&memcg->swap),
2045 READ_ONCE(memcg->swap.high));
2046 if (overage)
2047 memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2048 max_overage = max(overage, max_overage);
2049 } while ((memcg = parent_mem_cgroup(memcg)) &&
2050 !mem_cgroup_is_root(memcg));
2051
2052 return max_overage;
2053}
2054
2055/*
2056 * Get the number of jiffies that we should penalise a mischievous cgroup which
2057 * is exceeding its memory.high by checking both it and its ancestors.
2058 */
2059static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2060 unsigned int nr_pages,
2061 u64 max_overage)
2062{
2063 unsigned long penalty_jiffies;
2064
2065 if (!max_overage)
2066 return 0;
2067
2068 /*
2069 * We use overage compared to memory.high to calculate the number of
2070 * jiffies to sleep (penalty_jiffies). Ideally this value should be
2071 * fairly lenient on small overages, and increasingly harsh when the
2072 * memcg in question makes it clear that it has no intention of stopping
2073 * its crazy behaviour, so we exponentially increase the delay based on
2074 * overage amount.
2075 */
2076 penalty_jiffies = max_overage * max_overage * HZ;
2077 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2078 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2079
2080 /*
2081 * Factor in the task's own contribution to the overage, such that four
2082 * N-sized allocations are throttled approximately the same as one
2083 * 4N-sized allocation.
2084 *
2085 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2086 * larger the current charge batch is than that.
2087 */
2088 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2089}
2090
2091/*
2092 * Reclaims memory over the high limit. Called directly from
2093 * try_charge() (context permitting), as well as from the userland
2094 * return path where reclaim is always able to block.
2095 */
2096void mem_cgroup_handle_over_high(gfp_t gfp_mask)
2097{
2098 unsigned long penalty_jiffies;
2099 unsigned long pflags;
2100 unsigned long nr_reclaimed;
2101 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2102 int nr_retries = MAX_RECLAIM_RETRIES;
2103 struct mem_cgroup *memcg;
2104 bool in_retry = false;
2105
2106 if (likely(!nr_pages))
2107 return;
2108
2109 memcg = get_mem_cgroup_from_mm(current->mm);
2110 current->memcg_nr_pages_over_high = 0;
2111
2112retry_reclaim:
2113 /*
2114 * Bail if the task is already exiting. Unlike memory.max,
2115 * memory.high enforcement isn't as strict, and there is no
2116 * OOM killer involved, which means the excess could already
2117 * be much bigger (and still growing) than it could for
2118 * memory.max; the dying task could get stuck in fruitless
2119 * reclaim for a long time, which isn't desirable.
2120 */
2121 if (task_is_dying())
2122 goto out;
2123
2124 /*
2125 * The allocating task should reclaim at least the batch size, but for
2126 * subsequent retries we only want to do what's necessary to prevent oom
2127 * or breaching resource isolation.
2128 *
2129 * This is distinct from memory.max or page allocator behaviour because
2130 * memory.high is currently batched, whereas memory.max and the page
2131 * allocator run every time an allocation is made.
2132 */
2133 nr_reclaimed = reclaim_high(memcg,
2134 in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2135 gfp_mask);
2136
2137 /*
2138 * memory.high is breached and reclaim is unable to keep up. Throttle
2139 * allocators proactively to slow down excessive growth.
2140 */
2141 penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2142 mem_find_max_overage(memcg));
2143
2144 penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2145 swap_find_max_overage(memcg));
2146
2147 /*
2148 * Clamp the max delay per usermode return so as to still keep the
2149 * application moving forwards and also permit diagnostics, albeit
2150 * extremely slowly.
2151 */
2152 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2153
2154 /*
2155 * Don't sleep if the amount of jiffies this memcg owes us is so low
2156 * that it's not even worth doing, in an attempt to be nice to those who
2157 * go only a small amount over their memory.high value and maybe haven't
2158 * been aggressively reclaimed enough yet.
2159 */
2160 if (penalty_jiffies <= HZ / 100)
2161 goto out;
2162
2163 /*
2164 * If reclaim is making forward progress but we're still over
2165 * memory.high, we want to encourage that rather than doing allocator
2166 * throttling.
2167 */
2168 if (nr_reclaimed || nr_retries--) {
2169 in_retry = true;
2170 goto retry_reclaim;
2171 }
2172
2173 /*
2174 * Reclaim didn't manage to push usage below the limit, slow
2175 * this allocating task down.
2176 *
2177 * If we exit early, we're guaranteed to die (since
2178 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2179 * need to account for any ill-begotten jiffies to pay them off later.
2180 */
2181 psi_memstall_enter(&pflags);
2182 schedule_timeout_killable(penalty_jiffies);
2183 psi_memstall_leave(&pflags);
2184
2185out:
2186 css_put(&memcg->css);
2187}
2188
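/*
 * Charge @nr_pages to @memcg: try the per-CPU stock first, then the page
 * counters (normally charging a full MEMCG_CHARGE_BATCH so the surplus can
 * refill the stock), then fall back to direct reclaim and finally the memcg
 * OOM killer. Returns 0 on success; returns -ENOMEM only when the charge may
 * fail, i.e. not for __GFP_NOFAIL/__GFP_HIGH or PF_MEMALLOC contexts, which
 * are allowed to overrun the limit.
 */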
2189int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2190 unsigned int nr_pages)
2191{
2192 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2193 int nr_retries = MAX_RECLAIM_RETRIES;
2194 struct mem_cgroup *mem_over_limit;
2195 struct page_counter *counter;
2196 unsigned long nr_reclaimed;
2197 bool passed_oom = false;
2198 unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
2199 bool drained = false;
2200 bool raised_max_event = false;
2201 unsigned long pflags;
2202
2203retry:
2204 if (consume_stock(memcg, nr_pages))
2205 return 0;
2206
2207 if (!do_memsw_account() ||
2208 page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2209 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2210 goto done_restock;
2211 if (do_memsw_account())
2212 page_counter_uncharge(&memcg->memsw, batch);
2213 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2214 } else {
2215 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2216 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
2217 }
2218
2219 if (batch > nr_pages) {
2220 batch = nr_pages;
2221 goto retry;
2222 }
2223
2224 /*
2225 * Prevent unbounded recursion when reclaim operations need to
2226 * allocate memory. This might exceed the limits temporarily,
2227 * but we prefer facilitating memory reclaim and getting back
2228 * under the limit over triggering OOM kills in these cases.
2229 */
2230 if (unlikely(current->flags & PF_MEMALLOC))
2231 goto force;
2232
2233 if (unlikely(task_in_memcg_oom(current)))
2234 goto nomem;
2235
2236 if (!gfpflags_allow_blocking(gfp_mask))
2237 goto nomem;
2238
2239 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2240 raised_max_event = true;
2241
2242 psi_memstall_enter(&pflags);
2243 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2244 gfp_mask, reclaim_options, NULL);
2245 psi_memstall_leave(&pflags);
2246
2247 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2248 goto retry;
2249
2250 if (!drained) {
2251 drain_all_stock(mem_over_limit);
2252 drained = true;
2253 goto retry;
2254 }
2255
2256 if (gfp_mask & __GFP_NORETRY)
2257 goto nomem;
2258 /*
2259 * Even though the limit is exceeded at this point, reclaim
2260 * may have been able to free some pages. Retry the charge
2261 * before killing the task.
2262 *
2263 * Only for regular pages, though: huge pages are rather
2264 * unlikely to succeed so close to the limit, and we fall back
2265 * to regular pages anyway in case of failure.
2266 */
2267 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2268 goto retry;
2269
2270 if (nr_retries--)
2271 goto retry;
2272
2273 if (gfp_mask & __GFP_RETRY_MAYFAIL)
2274 goto nomem;
2275
2276 /* Avoid endless loop for tasks bypassed by the oom killer */
2277 if (passed_oom && task_is_dying())
2278 goto nomem;
2279
2280 /*
2281	 * Keep retrying as long as the memcg OOM killer is able to make
2282	 * forward progress, or bypass the charge if the OOM killer
2283	 * couldn't make any progress.
2284 */
2285 if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2286 get_order(nr_pages * PAGE_SIZE))) {
2287 passed_oom = true;
2288 nr_retries = MAX_RECLAIM_RETRIES;
2289 goto retry;
2290 }
2291nomem:
2292 /*
2293 * Memcg doesn't have a dedicated reserve for atomic
2294 * allocations. But like the global atomic pool, we need to
2295 * put the burden of reclaim on regular allocation requests
2296 * and let these go through as privileged allocations.
2297 */
2298 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2299 return -ENOMEM;
2300force:
2301 /*
2302 * If the allocation has to be enforced, don't forget to raise
2303 * a MEMCG_MAX event.
2304 */
2305 if (!raised_max_event)
2306 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2307
2308 /*
2309 * The allocation either can't fail or will lead to more memory
2310	 * being freed very soon. Allow memory usage to go over the limit
2311 * temporarily by force charging it.
2312 */
2313 page_counter_charge(&memcg->memory, nr_pages);
2314 if (do_memsw_account())
2315 page_counter_charge(&memcg->memsw, nr_pages);
2316
2317 return 0;
2318
2319done_restock:
2320 if (batch > nr_pages)
2321 refill_stock(memcg, batch - nr_pages);
2322
2323 /*
2324 * If the hierarchy is above the normal consumption range, schedule
2325 * reclaim on returning to userland. We can perform reclaim here
2326 * if __GFP_RECLAIM but let's always punt for simplicity and so that
2327 * GFP_KERNEL can consistently be used during reclaim. @memcg is
2328 * not recorded as it most likely matches current's and won't
2329 * change in the meantime. As high limit is checked again before
2330 * reclaim, the cost of mismatch is negligible.
2331 */
2332 do {
2333 bool mem_high, swap_high;
2334
2335 mem_high = page_counter_read(&memcg->memory) >
2336 READ_ONCE(memcg->memory.high);
2337 swap_high = page_counter_read(&memcg->swap) >
2338 READ_ONCE(memcg->swap.high);
2339
2340 /* Don't bother a random interrupted task */
2341 if (!in_task()) {
2342 if (mem_high) {
2343 schedule_work(&memcg->high_work);
2344 break;
2345 }
2346 continue;
2347 }
2348
2349 if (mem_high || swap_high) {
2350 /*
2351 * The allocating tasks in this cgroup will need to do
2352 * reclaim or be throttled to prevent further growth
2353 * of the memory or swap footprints.
2354 *
2355 * Target some best-effort fairness between the tasks,
2356 * and distribute reclaim work and delay penalties
2357 * based on how much each task is actually allocating.
2358 */
2359 current->memcg_nr_pages_over_high += batch;
2360 set_notify_resume(current);
2361 break;
2362 }
2363 } while ((memcg = parent_mem_cgroup(memcg)));
2364
2365 /*
2366 * Reclaim is set up above to be called from the userland
2367 * return path. But also attempt synchronous reclaim to avoid
2368 * excessive overrun while the task is still inside the
2369 * kernel. If this is successful, the return path will see it
2370 * when it rechecks the overage and simply bail out.
2371 */
2372 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2373 !(current->flags & PF_MEMALLOC) &&
2374 gfpflags_allow_blocking(gfp_mask))
2375 mem_cgroup_handle_over_high(gfp_mask);
2376 return 0;
2377}
2378
2379/**
2380 * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
2381 * @memcg: memcg previously charged.
2382 * @nr_pages: number of pages previously charged.
2383 */
2384void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2385{
2386 if (mem_cgroup_is_root(memcg))
2387 return;
2388
2389 page_counter_uncharge(&memcg->memory, nr_pages);
2390 if (do_memsw_account())
2391 page_counter_uncharge(&memcg->memsw, nr_pages);
2392}
2393
2394static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2395{
2396 VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
2397 /*
2398 * Any of the following ensures page's memcg stability:
2399 *
2400 * - the page lock
2401 * - LRU isolation
2402 * - exclusive reference
2403 */
2404 folio->memcg_data = (unsigned long)memcg;
2405}
2406
2407/**
2408 * mem_cgroup_commit_charge - commit a previously successful try_charge().
2409 * @folio: folio to commit the charge to.
2410 * @memcg: memcg previously charged.
2411 */
2412void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2413{
2414 css_get(&memcg->css);
2415 commit_charge(folio, memcg);
2416 memcg1_commit_charge(folio, memcg);
2417}
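
/*
 * Illustrative sketch of the two-step charge protocol (hypothetical
 * caller, for documentation only): a successful try_charge_memcg() is
 * followed by either a commit or a cancel:
 *
 *	if (!try_charge_memcg(memcg, gfp, nr_pages)) {
 *		if (folio could be installed)
 *			mem_cgroup_commit_charge(folio, memcg);
 *		else
 *			mem_cgroup_cancel_charge(memcg, nr_pages);
 *	}
 */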
2418
2419static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
2420 struct pglist_data *pgdat,
2421 enum node_stat_item idx, int nr)
2422{
2423 struct mem_cgroup *memcg;
2424 struct lruvec *lruvec;
2425
2426 rcu_read_lock();
2427 memcg = obj_cgroup_memcg(objcg);
2428 lruvec = mem_cgroup_lruvec(memcg, pgdat);
2429 __mod_memcg_lruvec_state(lruvec, idx, nr);
2430 rcu_read_unlock();
2431}
2432
2433static __always_inline
2434struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
2435{
2436 /*
2437 * Slab objects are accounted individually, not per-page.
2438 * Memcg membership data for each individual object is saved in
2439 * slab->obj_exts.
2440 */
2441 if (folio_test_slab(folio)) {
2442 struct slabobj_ext *obj_exts;
2443 struct slab *slab;
2444 unsigned int off;
2445
2446 slab = folio_slab(folio);
2447 obj_exts = slab_obj_exts(slab);
2448 if (!obj_exts)
2449 return NULL;
2450
2451 off = obj_to_index(slab->slab_cache, slab, p);
2452 if (obj_exts[off].objcg)
2453 return obj_cgroup_memcg(obj_exts[off].objcg);
2454
2455 return NULL;
2456 }
2457
2458 /*
2459	 * folio_memcg_check() is used here because in theory we can encounter
2460	 * a folio where the slab flag has been cleared already, but
2461	 * slab->obj_exts has not been freed yet.
2462	 * folio_memcg_check() guarantees that either a proper memory
2463	 * cgroup pointer or NULL will be returned.
2464 */
2465 return folio_memcg_check(folio);
2466}
2467
2468/*
2469 * Returns a pointer to the memory cgroup to which the kernel object is charged.
2470 * It is not suitable for objects allocated using vmalloc().
2471 *
2472 * A passed kernel object must be a slab object or a generic kernel page.
2473 *
2474 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2475 * cgroup_mutex, etc.
2476 */
2477struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
2478{
2479 if (mem_cgroup_disabled())
2480 return NULL;
2481
2482 return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
2483}
2484
2485static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
2486{
2487 struct obj_cgroup *objcg = NULL;
2488
2489 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2490 objcg = rcu_dereference(memcg->objcg);
2491 if (likely(objcg && obj_cgroup_tryget(objcg)))
2492 break;
2493 objcg = NULL;
2494 }
2495 return objcg;
2496}
2497
2498static struct obj_cgroup *current_objcg_update(void)
2499{
2500 struct mem_cgroup *memcg;
2501 struct obj_cgroup *old, *objcg = NULL;
2502
2503 do {
2504 /* Atomically drop the update bit. */
2505		old = xchg(&current->objcg, NULL);
2506 if (old) {
2507 old = (struct obj_cgroup *)
2508 ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
2509 obj_cgroup_put(old);
2510
2511 old = NULL;
2512 }
2513
2514 /* If new objcg is NULL, no reason for the second atomic update. */
2515 if (!current->mm || (current->flags & PF_KTHREAD))
2516 return NULL;
2517
2518 /*
2519 * Release the objcg pointer from the previous iteration,
2520		 * if try_cmpxchg() below fails.
2521 */
2522 if (unlikely(objcg)) {
2523 obj_cgroup_put(objcg);
2524 objcg = NULL;
2525 }
2526
2527 /*
2528 * Obtain the new objcg pointer. The current task can be
2529 * asynchronously moved to another memcg and the previous
2530 * memcg can be offlined. So let's get the memcg pointer
2531		 * and try to get a reference to the objcg under an RCU read lock.
2532 */
2533
2534 rcu_read_lock();
2535 memcg = mem_cgroup_from_task(current);
2536 objcg = __get_obj_cgroup_from_memcg(memcg);
2537 rcu_read_unlock();
2538
2539 /*
2540		 * Try to set up a new objcg pointer atomically. If it
2541 * fails, it means the update flag was set concurrently, so
2542 * the whole procedure should be repeated.
2543 */
2544	} while (!try_cmpxchg(&current->objcg, &old, objcg));
2545
2546 return objcg;
2547}
2548
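/*
 * Return the objcg of the current context: the objcg of an active memcg
 * override when one is set (current->active_memcg in task context,
 * int_active_memcg in interrupt context), otherwise the task's cached
 * objcg in task context, or NULL in interrupt context. No reference is
 * taken; the returned pointer is only valid within the current scope.
 */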
2549__always_inline struct obj_cgroup *current_obj_cgroup(void)
2550{
2551 struct mem_cgroup *memcg;
2552 struct obj_cgroup *objcg;
2553
2554 if (in_task()) {
2555 memcg = current->active_memcg;
2556 if (unlikely(memcg))
2557 goto from_memcg;
2558
2559 objcg = READ_ONCE(current->objcg);
2560 if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
2561 objcg = current_objcg_update();
2562 /*
2563		 * The objcg reference is kept by the task, so it is safe
2564		 * for the current task to use the objcg.
2565 */
2566 return objcg;
2567 }
2568
2569 memcg = this_cpu_read(int_active_memcg);
2570 if (unlikely(memcg))
2571 goto from_memcg;
2572
2573 return NULL;
2574
2575from_memcg:
2576 objcg = NULL;
2577 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2578 /*
2579 * Memcg pointer is protected by scope (see set_active_memcg())
2580 * and is pinning the corresponding objcg, so objcg can't go
2581 * away and can be used within the scope without any additional
2582 * protection.
2583 */
2584 objcg = rcu_dereference_check(memcg->objcg, 1);
2585 if (likely(objcg))
2586 break;
2587 }
2588
2589 return objcg;
2590}
2591
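/*
 * Return an obj_cgroup reference for the memcg that @folio is charged
 * to, or NULL if kmem accounting is offline or the folio is uncharged.
 * The caller is responsible for dropping the reference with
 * obj_cgroup_put().
 */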
2592struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
2593{
2594 struct obj_cgroup *objcg;
2595
2596 if (!memcg_kmem_online())
2597 return NULL;
2598
2599 if (folio_memcg_kmem(folio)) {
2600 objcg = __folio_objcg(folio);
2601 obj_cgroup_get(objcg);
2602 } else {
2603 struct mem_cgroup *memcg;
2604
2605 rcu_read_lock();
2606 memcg = __folio_memcg(folio);
2607 if (memcg)
2608 objcg = __get_obj_cgroup_from_memcg(memcg);
2609 else
2610 objcg = NULL;
2611 rcu_read_unlock();
2612 }
2613 return objcg;
2614}
2615
2616/*
2617 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
2618 * @objcg: object cgroup to uncharge
2619 * @nr_pages: number of pages to uncharge
2620 */
2621static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
2622 unsigned int nr_pages)
2623{
2624 struct mem_cgroup *memcg;
2625
2626 memcg = get_mem_cgroup_from_objcg(objcg);
2627
2628 mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
2629 memcg1_account_kmem(memcg, -nr_pages);
2630 refill_stock(memcg, nr_pages);
2631
2632 css_put(&memcg->css);
2633}
2634
2635/*
2636 * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
2637 * @objcg: object cgroup to charge
2638 * @gfp: reclaim mode
2639 * @nr_pages: number of pages to charge
2640 *
2641 * Returns 0 on success, an error code on failure.
2642 */
2643static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
2644 unsigned int nr_pages)
2645{
2646 struct mem_cgroup *memcg;
2647 int ret;
2648
2649 memcg = get_mem_cgroup_from_objcg(objcg);
2650
2651 ret = try_charge_memcg(memcg, gfp, nr_pages);
2652 if (ret)
2653 goto out;
2654
2655 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
2656 memcg1_account_kmem(memcg, nr_pages);
2657out:
2658 css_put(&memcg->css);
2659
2660 return ret;
2661}
2662
2663/**
2664 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2665 * @page: page to charge
2666 * @gfp: reclaim mode
2667 * @order: allocation order
2668 *
2669 * Returns 0 on success, an error code on failure.
2670 */
2671int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2672{
2673 struct obj_cgroup *objcg;
2674 int ret = 0;
2675
2676 objcg = current_obj_cgroup();
2677 if (objcg) {
2678 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
2679 if (!ret) {
2680 obj_cgroup_get(objcg);
2681 page->memcg_data = (unsigned long)objcg |
2682 MEMCG_DATA_KMEM;
2683 return 0;
2684 }
2685 }
2686 return ret;
2687}
2688
2689/**
2690 * __memcg_kmem_uncharge_page: uncharge a kmem page
2691 * @page: page to uncharge
2692 * @order: allocation order
2693 */
2694void __memcg_kmem_uncharge_page(struct page *page, int order)
2695{
2696 struct folio *folio = page_folio(page);
2697 struct obj_cgroup *objcg;
2698 unsigned int nr_pages = 1 << order;
2699
2700 if (!folio_memcg_kmem(folio))
2701 return;
2702
2703 objcg = __folio_objcg(folio);
2704 obj_cgroup_uncharge_pages(objcg, nr_pages);
2705 folio->memcg_data = 0;
2706 obj_cgroup_put(objcg);
2707}
2708
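/*
 * Update a slab vmstat counter (NR_SLAB_*_B) on behalf of @objcg. The
 * delta is batched in the per-cpu memcg_stock and only flushed to the
 * memcg/lruvec counters once it exceeds a page worth of bytes or when
 * the cached objcg or pgdat changes.
 */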
2709static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
2710 enum node_stat_item idx, int nr)
2711{
2712 struct memcg_stock_pcp *stock;
2713 struct obj_cgroup *old = NULL;
2714 unsigned long flags;
2715 int *bytes;
2716
2717 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2718 stock = this_cpu_ptr(&memcg_stock);
2719
2720 /*
2721 * Save vmstat data in stock and skip vmstat array update unless
2722 * accumulating over a page of vmstat data or when pgdat or idx
2723 * changes.
2724 */
2725 if (READ_ONCE(stock->cached_objcg) != objcg) {
2726 old = drain_obj_stock(stock);
2727 obj_cgroup_get(objcg);
2728 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
2729 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
2730 WRITE_ONCE(stock->cached_objcg, objcg);
2731 stock->cached_pgdat = pgdat;
2732 } else if (stock->cached_pgdat != pgdat) {
2733 /* Flush the existing cached vmstat data */
2734 struct pglist_data *oldpg = stock->cached_pgdat;
2735
2736 if (stock->nr_slab_reclaimable_b) {
2737 __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
2738 stock->nr_slab_reclaimable_b);
2739 stock->nr_slab_reclaimable_b = 0;
2740 }
2741 if (stock->nr_slab_unreclaimable_b) {
2742 __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
2743 stock->nr_slab_unreclaimable_b);
2744 stock->nr_slab_unreclaimable_b = 0;
2745 }
2746 stock->cached_pgdat = pgdat;
2747 }
2748
2749 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
2750 : &stock->nr_slab_unreclaimable_b;
2751 /*
2752	 * Even for a large object (>= PAGE_SIZE), the vmstat data will still be
2753	 * cached locally at least once before being pushed out.
2754 */
2755 if (!*bytes) {
2756 *bytes = nr;
2757 nr = 0;
2758 } else {
2759 *bytes += nr;
2760 if (abs(*bytes) > PAGE_SIZE) {
2761 nr = *bytes;
2762 *bytes = 0;
2763 } else {
2764 nr = 0;
2765 }
2766 }
2767 if (nr)
2768 __mod_objcg_mlstate(objcg, pgdat, idx, nr);
2769
2770 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2771 obj_cgroup_put(old);
2772}
2773
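/*
 * Try to satisfy an @nr_bytes charge from this CPU's cached objcg stock.
 * Returns true if the bytes were consumed locally, false if the caller
 * has to fall back to charging full pages.
 */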
2774static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
2775{
2776 struct memcg_stock_pcp *stock;
2777 unsigned long flags;
2778 bool ret = false;
2779
2780 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2781
2782 stock = this_cpu_ptr(&memcg_stock);
2783 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
2784 stock->nr_bytes -= nr_bytes;
2785 ret = true;
2786 }
2787
2788 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2789
2790 return ret;
2791}
2792
2793static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
2794{
2795 struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
2796
2797 if (!old)
2798 return NULL;
2799
2800 if (stock->nr_bytes) {
2801 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
2802 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
2803
2804 if (nr_pages) {
2805 struct mem_cgroup *memcg;
2806
2807 memcg = get_mem_cgroup_from_objcg(old);
2808
2809 mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
2810 memcg1_account_kmem(memcg, -nr_pages);
2811 __refill_stock(memcg, nr_pages);
2812
2813 css_put(&memcg->css);
2814 }
2815
2816 /*
2817 * The leftover is flushed to the centralized per-memcg value.
2818 * On the next attempt to refill obj stock it will be moved
2819		 * to a per-cpu stock (probably on another CPU), see
2820 * refill_obj_stock().
2821 *
2822 * How often it's flushed is a trade-off between the memory
2823 * limit enforcement accuracy and potential CPU contention,
2824 * so it might be changed in the future.
2825 */
2826 atomic_add(nr_bytes, &old->nr_charged_bytes);
2827 stock->nr_bytes = 0;
2828 }
2829
2830 /*
2831 * Flush the vmstat data in current stock
2832 */
2833 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
2834 if (stock->nr_slab_reclaimable_b) {
2835 __mod_objcg_mlstate(old, stock->cached_pgdat,
2836 NR_SLAB_RECLAIMABLE_B,
2837 stock->nr_slab_reclaimable_b);
2838 stock->nr_slab_reclaimable_b = 0;
2839 }
2840 if (stock->nr_slab_unreclaimable_b) {
2841 __mod_objcg_mlstate(old, stock->cached_pgdat,
2842 NR_SLAB_UNRECLAIMABLE_B,
2843 stock->nr_slab_unreclaimable_b);
2844 stock->nr_slab_unreclaimable_b = 0;
2845 }
2846 stock->cached_pgdat = NULL;
2847 }
2848
2849 WRITE_ONCE(stock->cached_objcg, NULL);
2850 /*
2851	 * The `old' objcg needs to be released by the caller via
2852 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
2853 */
2854 return old;
2855}
2856
2857static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2858 struct mem_cgroup *root_memcg)
2859{
2860 struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
2861 struct mem_cgroup *memcg;
2862
2863 if (objcg) {
2864 memcg = obj_cgroup_memcg(objcg);
2865 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
2866 return true;
2867 }
2868
2869 return false;
2870}
2871
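/*
 * Return @nr_bytes to the per-cpu objcg stock. With @allow_uncharge set,
 * whole pages accumulated in the stock are uncharged from the memcg
 * counters right away instead of being kept as pre-charged bytes.
 */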
2872static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
2873 bool allow_uncharge)
2874{
2875 struct memcg_stock_pcp *stock;
2876 struct obj_cgroup *old = NULL;
2877 unsigned long flags;
2878 unsigned int nr_pages = 0;
2879
2880 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2881
2882 stock = this_cpu_ptr(&memcg_stock);
2883 if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
2884 old = drain_obj_stock(stock);
2885 obj_cgroup_get(objcg);
2886 WRITE_ONCE(stock->cached_objcg, objcg);
2887 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
2888 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
2889 allow_uncharge = true; /* Allow uncharge when objcg changes */
2890 }
2891 stock->nr_bytes += nr_bytes;
2892
2893 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
2894 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
2895 stock->nr_bytes &= (PAGE_SIZE - 1);
2896 }
2897
2898 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2899 obj_cgroup_put(old);
2900
2901 if (nr_pages)
2902 obj_cgroup_uncharge_pages(objcg, nr_pages);
2903}
2904
2905int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
2906{
2907 unsigned int nr_pages, nr_bytes;
2908 int ret;
2909
2910 if (consume_obj_stock(objcg, size))
2911 return 0;
2912
2913 /*
2914 * In theory, objcg->nr_charged_bytes can have enough
2915 * pre-charged bytes to satisfy the allocation. However,
2916 * flushing objcg->nr_charged_bytes requires two atomic
2917 * operations, and objcg->nr_charged_bytes can't be big.
2918 * The shared objcg->nr_charged_bytes can also become a
2919 * performance bottleneck if all tasks of the same memcg are
2920	 * trying to update it. So it's better to ignore it and try to
2921 * grab some new pages. The stock's nr_bytes will be flushed to
2922 * objcg->nr_charged_bytes later on when objcg changes.
2923 *
2924 * The stock's nr_bytes may contain enough pre-charged bytes
2925	 * to charge one less page, but we can't rely
2926 * on the pre-charged bytes not being changed outside of
2927 * consume_obj_stock() or refill_obj_stock(). So ignore those
2928 * pre-charged bytes as well when charging pages. To avoid a
2929 * page uncharge right after a page charge, we set the
2930 * allow_uncharge flag to false when calling refill_obj_stock()
2931 * to temporarily allow the pre-charged bytes to exceed the page
2932 * size limit. The maximum reachable value of the pre-charged
2933 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
2934 * race.
2935 */
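	/*
	 * Worked example (hypothetical numbers): charging a 700-byte
	 * object against an empty stock charges one full page below and
	 * then returns PAGE_SIZE - 700 leftover bytes to the stock, so
	 * the next small allocation can be satisfied from the stock
	 * without touching the page counters.
	 */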
2936 nr_pages = size >> PAGE_SHIFT;
2937 nr_bytes = size & (PAGE_SIZE - 1);
2938
2939 if (nr_bytes)
2940 nr_pages += 1;
2941
2942 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
2943 if (!ret && nr_bytes)
2944 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
2945
2946 return ret;
2947}
2948
2949void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
2950{
2951 refill_obj_stock(objcg, size, true);
2952}
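
/*
 * Illustrative usage sketch of the byte-granular charging API
 * (hypothetical caller, for documentation only):
 *
 *	objcg = current_obj_cgroup();
 *	if (objcg && obj_cgroup_charge(objcg, gfp, size))
 *		return NULL;
 *	...
 *	obj_cgroup_uncharge(objcg, size);
 *
 * A real caller would also take a reference on the objcg for the
 * lifetime of the object, as __memcg_slab_post_alloc_hook() does.
 */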
2953
2954static inline size_t obj_full_size(struct kmem_cache *s)
2955{
2956 /*
2957 * For each accounted object there is an extra space which is used
2958 * to store obj_cgroup membership. Charge it too.
2959 */
2960 return s->size + sizeof(struct obj_cgroup *);
2961}
2962
2963bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
2964 gfp_t flags, size_t size, void **p)
2965{
2966 struct obj_cgroup *objcg;
2967 struct slab *slab;
2968 unsigned long off;
2969 size_t i;
2970
2971 /*
2972 * The obtained objcg pointer is safe to use within the current scope,
2973	 * defined by the current task or a set_active_memcg() pair.
2974 * obj_cgroup_get() is used to get a permanent reference.
2975 */
2976 objcg = current_obj_cgroup();
2977 if (!objcg)
2978 return true;
2979
2980 /*
2981 * slab_alloc_node() avoids the NULL check, so we might be called with a
2982 * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill
2983 * the whole requested size.
2984	 * Return success as there's nothing to free back.
2985 */
2986 if (unlikely(*p == NULL))
2987 return true;
2988
2989 flags &= gfp_allowed_mask;
2990
2991 if (lru) {
2992 int ret;
2993 struct mem_cgroup *memcg;
2994
2995 memcg = get_mem_cgroup_from_objcg(objcg);
2996 ret = memcg_list_lru_alloc(memcg, lru, flags);
2997 css_put(&memcg->css);
2998
2999 if (ret)
3000 return false;
3001 }
3002
3003 if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
3004 return false;
3005
3006 for (i = 0; i < size; i++) {
3007 slab = virt_to_slab(p[i]);
3008
3009 if (!slab_obj_exts(slab) &&
3010 alloc_slab_obj_exts(slab, s, flags, false)) {
3011 obj_cgroup_uncharge(objcg, obj_full_size(s));
3012 continue;
3013 }
3014
3015 off = obj_to_index(s, slab, p[i]);
3016 obj_cgroup_get(objcg);
3017 slab_obj_exts(slab)[off].objcg = objcg;
3018 mod_objcg_state(objcg, slab_pgdat(slab),
3019 cache_vmstat_idx(s), obj_full_size(s));
3020 }
3021
3022 return true;
3023}
3024
3025void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
3026 void **p, int objects, struct slabobj_ext *obj_exts)
3027{
3028 for (int i = 0; i < objects; i++) {
3029 struct obj_cgroup *objcg;
3030 unsigned int off;
3031
3032 off = obj_to_index(s, slab, p[i]);
3033 objcg = obj_exts[off].objcg;
3034 if (!objcg)
3035 continue;
3036
3037 obj_exts[off].objcg = NULL;
3038 obj_cgroup_uncharge(objcg, obj_full_size(s));
3039 mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
3040 -obj_full_size(s));
3041 obj_cgroup_put(objcg);
3042 }
3043}
3044
3045/*
3046 * Because folio_memcg(head) is not set on tails, set it now.
3047 */
3048void split_page_memcg(struct page *head, int old_order, int new_order)
3049{
3050 struct folio *folio = page_folio(head);
3051 int i;
3052 unsigned int old_nr = 1 << old_order;
3053 unsigned int new_nr = 1 << new_order;
3054
3055 if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
3056 return;
3057
3058 for (i = new_nr; i < old_nr; i += new_nr)
3059 folio_page(folio, i)->memcg_data = folio->memcg_data;
3060
3061 if (folio_memcg_kmem(folio))
3062 obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
3063 else
3064 css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
3065}
3066
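/*
 * Return the current memory (or, with @swap, memory + swap) usage of
 * @memcg in pages. The root cgroup's usage is approximated from global
 * node and swap counters.
 */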
3067unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3068{
3069 unsigned long val;
3070
3071 if (mem_cgroup_is_root(memcg)) {
3072 /*
3073 * Approximate root's usage from global state. This isn't
3074 * perfect, but the root usage was always an approximation.
3075 */
3076 val = global_node_page_state(NR_FILE_PAGES) +
3077 global_node_page_state(NR_ANON_MAPPED);
3078 if (swap)
3079 val += total_swap_pages - get_nr_swap_pages();
3080 } else {
3081 if (!swap)
3082 val = page_counter_read(&memcg->memory);
3083 else
3084 val = page_counter_read(&memcg->memsw);
3085 }
3086 return val;
3087}
3088
3089static int memcg_online_kmem(struct mem_cgroup *memcg)
3090{
3091 struct obj_cgroup *objcg;
3092
3093 if (mem_cgroup_kmem_disabled())
3094 return 0;
3095
3096 if (unlikely(mem_cgroup_is_root(memcg)))
3097 return 0;
3098
3099 objcg = obj_cgroup_alloc();
3100 if (!objcg)
3101 return -ENOMEM;
3102
3103 objcg->memcg = memcg;
3104 rcu_assign_pointer(memcg->objcg, objcg);
3105 obj_cgroup_get(objcg);
3106 memcg->orig_objcg = objcg;
3107
3108 static_branch_enable(&memcg_kmem_online_key);
3109
3110 memcg->kmemcg_id = memcg->id.id;
3111
3112 return 0;
3113}
3114
3115static void memcg_offline_kmem(struct mem_cgroup *memcg)
3116{
3117 struct mem_cgroup *parent;
3118
3119 if (mem_cgroup_kmem_disabled())
3120 return;
3121
3122 if (unlikely(mem_cgroup_is_root(memcg)))
3123 return;
3124
3125 parent = parent_mem_cgroup(memcg);
3126 if (!parent)
3127 parent = root_mem_cgroup;
3128
3129 memcg_reparent_list_lrus(memcg, parent);
3130
3131 /*
3132	 * Objcg's reparenting must happen after list_lru's, to make sure list_lru
3133	 * helpers won't use the parent's list_lru until the child is drained.
3134 */
3135 memcg_reparent_objcgs(memcg, parent);
3136}
3137
3138#ifdef CONFIG_CGROUP_WRITEBACK
3139
3140#include <trace/events/writeback.h>
3141
3142static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3143{
3144 return wb_domain_init(&memcg->cgwb_domain, gfp);
3145}
3146
3147static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3148{
3149 wb_domain_exit(&memcg->cgwb_domain);
3150}
3151
3152static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3153{
3154 wb_domain_size_changed(&memcg->cgwb_domain);
3155}
3156
3157struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3158{
3159 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3160
3161 if (!memcg->css.parent)
3162 return NULL;
3163
3164 return &memcg->cgwb_domain;
3165}
3166
3167/**
3168 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3169 * @wb: bdi_writeback in question
3170 * @pfilepages: out parameter for number of file pages
3171 * @pheadroom: out parameter for number of allocatable pages according to memcg
3172 * @pdirty: out parameter for number of dirty pages
3173 * @pwriteback: out parameter for number of pages under writeback
3174 *
3175 * Determine the numbers of file, headroom, dirty, and writeback pages in
3176 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
3177 * is a bit more involved.
3178 *
3179 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
3180 * headroom is calculated as the lowest headroom of itself and the
3181 * ancestors. Note that this doesn't consider the actual amount of
3182 * available memory in the system. The caller should further cap
3183 * *@pheadroom accordingly.
3184 */
3185void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3186 unsigned long *pheadroom, unsigned long *pdirty,
3187 unsigned long *pwriteback)
3188{
3189 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3190 struct mem_cgroup *parent;
3191
3192 mem_cgroup_flush_stats_ratelimited(memcg);
3193
3194 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
3195 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
3196 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
3197 memcg_page_state(memcg, NR_ACTIVE_FILE);
3198
3199 *pheadroom = PAGE_COUNTER_MAX;
3200 while ((parent = parent_mem_cgroup(memcg))) {
3201 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
3202 READ_ONCE(memcg->memory.high));
3203 unsigned long used = page_counter_read(&memcg->memory);
3204
3205 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3206 memcg = parent;
3207 }
3208}
3209
3210/*
3211 * Foreign dirty flushing
3212 *
3213 * There's an inherent mismatch between memcg and writeback. The former
3214 * tracks ownership per-page while the latter per-inode. This was a
3215 * deliberate design decision because honoring per-page ownership in the
3216 * writeback path is complicated, may lead to higher CPU and IO overheads
3217 * and deemed unnecessary given that write-sharing an inode across
3218 * different cgroups isn't a common use-case.
3219 *
3220 * Combined with inode majority-writer ownership switching, this works well
3221 * enough in most cases but there are some pathological cases. For
3222 * example, let's say there are two cgroups A and B which keep writing to
3223 * different but confined parts of the same inode. B owns the inode and
3224 * A's memory is limited far below B's. A's dirty ratio can rise enough to
3225 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
3226 * triggering background writeback. A will be slowed down without a way to
3227 * make writeback of the dirty pages happen.
3228 *
3229 * Conditions like the above can lead to a cgroup getting repeatedly and
3230 * severely throttled after making some progress after each
3231 * dirty_expire_interval while the underlying IO device is almost
3232 * completely idle.
3233 *
3234 * Solving this problem completely requires matching the ownership tracking
3235 * granularities between memcg and writeback in either direction. However,
3236 * the more egregious behaviors can be avoided by simply remembering the
3237 * most recent foreign dirtying events and initiating remote flushes on
3238 * them when local writeback isn't enough to keep the memory clean enough.
3239 *
3240 * The following two functions implement such mechanism. When a foreign
3241 * page - a page whose memcg and writeback ownerships don't match - is
3242 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
3243 * bdi_writeback on the page owning memcg. When balance_dirty_pages()
3244 * decides that the memcg needs to sleep due to high dirty ratio, it calls
3245 * mem_cgroup_flush_foreign() which queues writeback on the recorded
3246 * foreign bdi_writebacks which haven't expired. Both the numbers of
3247 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
3248 * limited to MEMCG_CGWB_FRN_CNT.
3249 *
3250 * The mechanism only remembers IDs and doesn't hold any object references.
3251 * As being wrong occasionally doesn't matter, updates and accesses to the
3252 * records are lockless and racy.
3253 */
3254void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
3255 struct bdi_writeback *wb)
3256{
3257 struct mem_cgroup *memcg = folio_memcg(folio);
3258 struct memcg_cgwb_frn *frn;
3259 u64 now = get_jiffies_64();
3260 u64 oldest_at = now;
3261 int oldest = -1;
3262 int i;
3263
3264 trace_track_foreign_dirty(folio, wb);
3265
3266 /*
3267 * Pick the slot to use. If there is already a slot for @wb, keep
3268	 * using it. If not, replace the oldest one which isn't being
3269 * written out.
3270 */
3271 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
3272 frn = &memcg->cgwb_frn[i];
3273 if (frn->bdi_id == wb->bdi->id &&
3274 frn->memcg_id == wb->memcg_css->id)
3275 break;
3276 if (time_before64(frn->at, oldest_at) &&
3277 atomic_read(&frn->done.cnt) == 1) {
3278 oldest = i;
3279 oldest_at = frn->at;
3280 }
3281 }
3282
3283 if (i < MEMCG_CGWB_FRN_CNT) {
3284 /*
3285 * Re-using an existing one. Update timestamp lazily to
3286 * avoid making the cacheline hot. We want them to be
3287 * reasonably up-to-date and significantly shorter than
3288 * dirty_expire_interval as that's what expires the record.
3289 * Use the shorter of 1s and dirty_expire_interval / 8.
3290 */
3291 unsigned long update_intv =
3292 min_t(unsigned long, HZ,
3293 msecs_to_jiffies(dirty_expire_interval * 10) / 8);
3294
3295 if (time_before64(frn->at, now - update_intv))
3296 frn->at = now;
3297 } else if (oldest >= 0) {
3298 /* replace the oldest free one */
3299 frn = &memcg->cgwb_frn[oldest];
3300 frn->bdi_id = wb->bdi->id;
3301 frn->memcg_id = wb->memcg_css->id;
3302 frn->at = now;
3303 }
3304}
3305
3306/* issue foreign writeback flushes for recorded foreign dirtying events */
3307void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
3308{
3309 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3310 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
3311 u64 now = jiffies_64;
3312 int i;
3313
3314 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
3315 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
3316
3317 /*
3318 * If the record is older than dirty_expire_interval,
3319 * writeback on it has already started. No need to kick it
3320 * off again. Also, don't start a new one if there's
3321 * already one in flight.
3322 */
3323 if (time_after64(frn->at, now - intv) &&
3324 atomic_read(&frn->done.cnt) == 1) {
3325 frn->at = 0;
3326 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
3327 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
3328 WB_REASON_FOREIGN_FLUSH,
3329 &frn->done);
3330 }
3331 }
3332}
3333
3334#else /* CONFIG_CGROUP_WRITEBACK */
3335
3336static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3337{
3338 return 0;
3339}
3340
3341static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3342{
3343}
3344
3345static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3346{
3347}
3348
3349#endif /* CONFIG_CGROUP_WRITEBACK */
3350
3351/*
3352 * Private memory cgroup IDR
3353 *
3354 * Swap-out records and page cache shadow entries need to store memcg
3355 * references in constrained space, so we maintain an ID space that is
3356 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
3357 * memory-controlled cgroups to 64k.
3358 *
3359 * However, there usually are many references to the offline CSS after
3360 * the cgroup has been destroyed, such as page cache or reclaimable
3361 * slab objects, that don't need to hang on to the ID. We want to keep
3362 * those dead CSS from occupying IDs, or we might quickly exhaust the
3363 * relatively small ID space and prevent the creation of new cgroups
3364 * even when there are much fewer than 64k cgroups - possibly none.
3365 *
3366 * Maintain a private 16-bit ID space for memcg, and allow the ID to
3367 * be freed and recycled when it's no longer needed, which is usually
3368 * when the CSS is offlined.
3369 *
3370 * The only exception to that are records of swapped out tmpfs/shmem
3371 * pages that need to be attributed to live ancestors on swapin. But
3372 * those references are manageable from userspace.
3373 */
3374
3375#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
3376static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids);
3377
3378static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
3379{
3380 if (memcg->id.id > 0) {
3381 xa_erase(&mem_cgroup_ids, memcg->id.id);
3382 memcg->id.id = 0;
3383 }
3384}
3385
3386void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
3387 unsigned int n)
3388{
3389 refcount_add(n, &memcg->id.ref);
3390}
3391
3392void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
3393{
3394 if (refcount_sub_and_test(n, &memcg->id.ref)) {
3395 mem_cgroup_id_remove(memcg);
3396
3397 /* Memcg ID pins CSS */
3398 css_put(&memcg->css);
3399 }
3400}
3401
3402static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
3403{
3404 mem_cgroup_id_put_many(memcg, 1);
3405}
3406
3407/**
3408 * mem_cgroup_from_id - look up a memcg from a memcg id
3409 * @id: the memcg id to look up
3410 *
3411 * Caller must hold rcu_read_lock().
3412 */
3413struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
3414{
3415 WARN_ON_ONCE(!rcu_read_lock_held());
3416 return xa_load(&mem_cgroup_ids, id);
3417}
3418
3419#ifdef CONFIG_SHRINKER_DEBUG
3420struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
3421{
3422 struct cgroup *cgrp;
3423 struct cgroup_subsys_state *css;
3424 struct mem_cgroup *memcg;
3425
3426 cgrp = cgroup_get_from_id(ino);
3427 if (IS_ERR(cgrp))
3428 return ERR_CAST(cgrp);
3429
3430 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
3431 if (css)
3432 memcg = container_of(css, struct mem_cgroup, css);
3433 else
3434 memcg = ERR_PTR(-ENOENT);
3435
3436 cgroup_put(cgrp);
3437
3438 return memcg;
3439}
3440#endif
3441
3442static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
3443{
3444 struct mem_cgroup_per_node *pn;
3445
3446 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
3447 if (!pn)
3448 return false;
3449
3450 pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
3451 GFP_KERNEL_ACCOUNT, node);
3452 if (!pn->lruvec_stats)
3453 goto fail;
3454
3455 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
3456 GFP_KERNEL_ACCOUNT);
3457 if (!pn->lruvec_stats_percpu)
3458 goto fail;
3459
3460 lruvec_init(&pn->lruvec);
3461 pn->memcg = memcg;
3462
3463 memcg->nodeinfo[node] = pn;
3464 return true;
3465fail:
3466 kfree(pn->lruvec_stats);
3467 kfree(pn);
3468 return false;
3469}
3470
3471static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
3472{
3473 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3474
3475 if (!pn)
3476 return;
3477
3478 free_percpu(pn->lruvec_stats_percpu);
3479 kfree(pn->lruvec_stats);
3480 kfree(pn);
3481}
3482
3483static void __mem_cgroup_free(struct mem_cgroup *memcg)
3484{
3485 int node;
3486
3487 obj_cgroup_put(memcg->orig_objcg);
3488
3489 for_each_node(node)
3490 free_mem_cgroup_per_node_info(memcg, node);
3491 memcg1_free_events(memcg);
3492 kfree(memcg->vmstats);
3493 free_percpu(memcg->vmstats_percpu);
3494 kfree(memcg);
3495}
3496
3497static void mem_cgroup_free(struct mem_cgroup *memcg)
3498{
3499 lru_gen_exit_memcg(memcg);
3500 memcg_wb_domain_exit(memcg);
3501 __mem_cgroup_free(memcg);
3502}
3503
3504static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
3505{
3506 struct memcg_vmstats_percpu *statc, *pstatc;
3507 struct mem_cgroup *memcg;
3508 int node, cpu;
3509 int __maybe_unused i;
3510 long error;
3511
3512 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
3513 if (!memcg)
3514 return ERR_PTR(-ENOMEM);
3515
3516 error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL,
3517 XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL);
3518 if (error)
3519 goto fail;
3520 error = -ENOMEM;
3521
3522 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
3523 GFP_KERNEL_ACCOUNT);
3524 if (!memcg->vmstats)
3525 goto fail;
3526
3527 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
3528 GFP_KERNEL_ACCOUNT);
3529 if (!memcg->vmstats_percpu)
3530 goto fail;
3531
3532 if (!memcg1_alloc_events(memcg))
3533 goto fail;
3534
3535 for_each_possible_cpu(cpu) {
3536 if (parent)
3537 pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
3538 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
3539 statc->parent = parent ? pstatc : NULL;
3540 statc->vmstats = memcg->vmstats;
3541 }
3542
3543 for_each_node(node)
3544 if (!alloc_mem_cgroup_per_node_info(memcg, node))
3545 goto fail;
3546
3547 if (memcg_wb_domain_init(memcg, GFP_KERNEL))
3548 goto fail;
3549
3550 INIT_WORK(&memcg->high_work, high_work_func);
3551 vmpressure_init(&memcg->vmpressure);
3552 INIT_LIST_HEAD(&memcg->memory_peaks);
3553 INIT_LIST_HEAD(&memcg->swap_peaks);
3554 spin_lock_init(&memcg->peaks_lock);
3555 memcg->socket_pressure = jiffies;
3556 memcg1_memcg_init(memcg);
3557 memcg->kmemcg_id = -1;
3558 INIT_LIST_HEAD(&memcg->objcg_list);
3559#ifdef CONFIG_CGROUP_WRITEBACK
3560 INIT_LIST_HEAD(&memcg->cgwb_list);
3561 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3562 memcg->cgwb_frn[i].done =
3563 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
3564#endif
3565#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3566 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
3567 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
3568 memcg->deferred_split_queue.split_queue_len = 0;
3569#endif
3570 lru_gen_init_memcg(memcg);
3571 return memcg;
3572fail:
3573 mem_cgroup_id_remove(memcg);
3574 __mem_cgroup_free(memcg);
3575 return ERR_PTR(error);
3576}
3577
3578static struct cgroup_subsys_state * __ref
3579mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
3580{
3581 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
3582 struct mem_cgroup *memcg, *old_memcg;
3583
3584 old_memcg = set_active_memcg(parent);
3585 memcg = mem_cgroup_alloc(parent);
3586 set_active_memcg(old_memcg);
3587 if (IS_ERR(memcg))
3588 return ERR_CAST(memcg);
3589
3590 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3591 memcg1_soft_limit_reset(memcg);
3592#ifdef CONFIG_ZSWAP
3593 memcg->zswap_max = PAGE_COUNTER_MAX;
3594 WRITE_ONCE(memcg->zswap_writeback, true);
3595#endif
3596 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3597 if (parent) {
3598 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
3599
3600 page_counter_init(&memcg->memory, &parent->memory, true);
3601 page_counter_init(&memcg->swap, &parent->swap, false);
3602#ifdef CONFIG_MEMCG_V1
3603 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
3604 page_counter_init(&memcg->kmem, &parent->kmem, false);
3605 page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
3606#endif
3607 } else {
3608 init_memcg_stats();
3609 init_memcg_events();
3610 page_counter_init(&memcg->memory, NULL, true);
3611 page_counter_init(&memcg->swap, NULL, false);
3612#ifdef CONFIG_MEMCG_V1
3613 page_counter_init(&memcg->kmem, NULL, false);
3614 page_counter_init(&memcg->tcpmem, NULL, false);
3615#endif
3616 root_mem_cgroup = memcg;
3617 return &memcg->css;
3618 }
3619
3620 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
3621 static_branch_inc(&memcg_sockets_enabled_key);
3622
3623 if (!cgroup_memory_nobpf)
3624 static_branch_inc(&memcg_bpf_enabled_key);
3625
3626 return &memcg->css;
3627}
3628
3629static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
3630{
3631 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3632
3633 if (memcg_online_kmem(memcg))
3634 goto remove_id;
3635
3636 /*
3637 * A memcg must be visible for expand_shrinker_info()
3638 * by the time the maps are allocated. So, we allocate maps
3639 * here, when for_each_mem_cgroup() can't skip it.
3640 */
3641 if (alloc_shrinker_info(memcg))
3642 goto offline_kmem;
3643
3644 if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
3645 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
3646 FLUSH_TIME);
3647 lru_gen_online_memcg(memcg);
3648
3649 /* Online state pins memcg ID, memcg ID pins CSS */
3650 refcount_set(&memcg->id.ref, 1);
3651 css_get(css);
3652
3653 /*
3654 * Ensure mem_cgroup_from_id() works once we're fully online.
3655 *
3656 * We could do this earlier and require callers to filter with
3657 * css_tryget_online(). But right now there are no users that
3658 * need earlier access, and the workingset code relies on the
3659 * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
3660 * publish it here at the end of onlining. This matches the
3661 * regular ID destruction during offlining.
3662 */
3663 xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL);
3664
3665 return 0;
3666offline_kmem:
3667 memcg_offline_kmem(memcg);
3668remove_id:
3669 mem_cgroup_id_remove(memcg);
3670 return -ENOMEM;
3671}
3672
3673static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
3674{
3675 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3676
3677 memcg1_css_offline(memcg);
3678
3679 page_counter_set_min(&memcg->memory, 0);
3680 page_counter_set_low(&memcg->memory, 0);
3681
3682 zswap_memcg_offline_cleanup(memcg);
3683
3684 memcg_offline_kmem(memcg);
3685 reparent_shrinker_deferred(memcg);
3686 wb_memcg_offline(memcg);
3687 lru_gen_offline_memcg(memcg);
3688
3689 drain_all_stock(memcg);
3690
3691 mem_cgroup_id_put(memcg);
3692}
3693
3694static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
3695{
3696 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3697
3698 invalidate_reclaim_iterators(memcg);
3699 lru_gen_release_memcg(memcg);
3700}
3701
3702static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
3703{
3704 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3705 int __maybe_unused i;
3706
3707#ifdef CONFIG_CGROUP_WRITEBACK
3708 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3709 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
3710#endif
3711 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
3712 static_branch_dec(&memcg_sockets_enabled_key);
3713
3714 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg1_tcpmem_active(memcg))
3715 static_branch_dec(&memcg_sockets_enabled_key);
3716
3717 if (!cgroup_memory_nobpf)
3718 static_branch_dec(&memcg_bpf_enabled_key);
3719
3720 vmpressure_cleanup(&memcg->vmpressure);
3721 cancel_work_sync(&memcg->high_work);
3722 memcg1_remove_from_trees(memcg);
3723 free_shrinker_info(memcg);
3724 mem_cgroup_free(memcg);
3725}
3726
3727/**
3728 * mem_cgroup_css_reset - reset the states of a mem_cgroup
3729 * @css: the target css
3730 *
3731 * Reset the states of the mem_cgroup associated with @css. This is
3732 * invoked when the userland requests disabling on the default hierarchy
3733 * but the memcg is pinned through dependency. The memcg should stop
3734 * applying policies and should revert to the vanilla state as it may be
3735 * made visible again.
3736 *
3737 * The current implementation only resets the essential configurations.
3738 * This needs to be expanded to cover all the visible parts.
3739 */
3740static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
3741{
3742 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3743
3744 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
3745 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
3746#ifdef CONFIG_MEMCG_V1
3747 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
3748 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
3749#endif
3750 page_counter_set_min(&memcg->memory, 0);
3751 page_counter_set_low(&memcg->memory, 0);
3752 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3753 memcg1_soft_limit_reset(memcg);
3754 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3755 memcg_wb_domain_size_changed(memcg);
3756}
3757
3758struct aggregate_control {
3759 /* pointer to the aggregated (CPU and subtree aggregated) counters */
3760 long *aggregate;
3761	/* pointer to the non-hierarchical (CPU aggregated) counters */
3762 long *local;
3763 /* pointer to the pending child counters during tree propagation */
3764 long *pending;
3765 /* pointer to the parent's pending counters, could be NULL */
3766 long *ppending;
3767 /* pointer to the percpu counters to be aggregated */
3768 long *cstat;
3769	/* pointer to the percpu counters of the last aggregation */
3770 long *cstat_prev;
3771 /* size of the above counters */
3772 int size;
3773};
3774
3775static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
3776{
3777 int i;
3778 long delta, delta_cpu, v;
3779
3780 for (i = 0; i < ac->size; i++) {
3781 /*
3782 * Collect the aggregated propagation counts of groups
3783 * below us. We're in a per-cpu loop here and this is
3784 * a global counter, so the first cycle will get them.
3785 */
3786 delta = ac->pending[i];
3787 if (delta)
3788 ac->pending[i] = 0;
3789
3790 /* Add CPU changes on this level since the last flush */
3791 delta_cpu = 0;
3792 v = READ_ONCE(ac->cstat[i]);
3793 if (v != ac->cstat_prev[i]) {
3794 delta_cpu = v - ac->cstat_prev[i];
3795 delta += delta_cpu;
3796 ac->cstat_prev[i] = v;
3797 }
3798
3799 /* Aggregate counts on this level and propagate upwards */
3800 if (delta_cpu)
3801 ac->local[i] += delta_cpu;
3802
3803 if (delta) {
3804 ac->aggregate[i] += delta;
3805 if (ac->ppending)
3806 ac->ppending[i] += delta;
3807 }
3808 }
3809}
3810
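/*
 * rstat flush callback for the memory controller: fold this CPU's state,
 * event and per-node lruvec deltas into the memcg's aggregated counters,
 * and accumulate them into the parent's pending counters so they are
 * propagated upwards on the parent's own flush.
 */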
3811static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
3812{
3813 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3814 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
3815 struct memcg_vmstats_percpu *statc;
3816 struct aggregate_control ac;
3817 int nid;
3818
3819 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
3820
3821 ac = (struct aggregate_control) {
3822 .aggregate = memcg->vmstats->state,
3823 .local = memcg->vmstats->state_local,
3824 .pending = memcg->vmstats->state_pending,
3825 .ppending = parent ? parent->vmstats->state_pending : NULL,
3826 .cstat = statc->state,
3827 .cstat_prev = statc->state_prev,
3828 .size = MEMCG_VMSTAT_SIZE,
3829 };
3830 mem_cgroup_stat_aggregate(&ac);
3831
3832 ac = (struct aggregate_control) {
3833 .aggregate = memcg->vmstats->events,
3834 .local = memcg->vmstats->events_local,
3835 .pending = memcg->vmstats->events_pending,
3836 .ppending = parent ? parent->vmstats->events_pending : NULL,
3837 .cstat = statc->events,
3838 .cstat_prev = statc->events_prev,
3839 .size = NR_MEMCG_EVENTS,
3840 };
3841 mem_cgroup_stat_aggregate(&ac);
3842
3843 for_each_node_state(nid, N_MEMORY) {
3844 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
3845 struct lruvec_stats *lstats = pn->lruvec_stats;
3846 struct lruvec_stats *plstats = NULL;
3847 struct lruvec_stats_percpu *lstatc;
3848
3849 if (parent)
3850 plstats = parent->nodeinfo[nid]->lruvec_stats;
3851
3852 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
3853
3854 ac = (struct aggregate_control) {
3855 .aggregate = lstats->state,
3856 .local = lstats->state_local,
3857 .pending = lstats->state_pending,
3858 .ppending = plstats ? plstats->state_pending : NULL,
3859 .cstat = lstatc->state,
3860 .cstat_prev = lstatc->state_prev,
3861 .size = NR_MEMCG_NODE_STAT_ITEMS,
3862 };
3863 mem_cgroup_stat_aggregate(&ac);
3864
3865 }
3866 WRITE_ONCE(statc->stats_updates, 0);
3867 /* We are in a per-cpu loop here, only do the atomic write once */
3868 if (atomic64_read(&memcg->vmstats->stats_updates))
3869 atomic64_set(&memcg->vmstats->stats_updates, 0);
3870}
3871
3872static void mem_cgroup_fork(struct task_struct *task)
3873{
3874 /*
3875 * Set the update flag to cause task->objcg to be initialized lazily
3876 * on the first allocation. It can be done without any synchronization
3877	 * because it's always performed on the current task, as is
3878 * current_objcg_update().
3879 */
3880 task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
3881}
3882
3883static void mem_cgroup_exit(struct task_struct *task)
3884{
3885 struct obj_cgroup *objcg = task->objcg;
3886
3887 objcg = (struct obj_cgroup *)
3888 ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
3889 obj_cgroup_put(objcg);
3890
3891 /*
3892 * Some kernel allocations can happen after this point,
3893 * but let's ignore them. It can be done without any synchronization
3894	 * because it's always performed on the current task, as is
3895 * current_objcg_update().
3896 */
3897 task->objcg = NULL;
3898}
3899
3900#ifdef CONFIG_LRU_GEN
3901static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
3902{
3903 struct task_struct *task;
3904 struct cgroup_subsys_state *css;
3905
3906 /* find the first leader if there is any */
3907 cgroup_taskset_for_each_leader(task, css, tset)
3908 break;
3909
3910 if (!task)
3911 return;
3912
3913 task_lock(task);
3914 if (task->mm && READ_ONCE(task->mm->owner) == task)
3915 lru_gen_migrate_mm(task->mm);
3916 task_unlock(task);
3917}
3918#else
3919static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
3920#endif /* CONFIG_LRU_GEN */
3921
3922static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
3923{
3924 struct task_struct *task;
3925 struct cgroup_subsys_state *css;
3926
3927 cgroup_taskset_for_each(task, css, tset) {
3928 /* atomically set the update bit */
3929 set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
3930 }
3931}
3932
3933static void mem_cgroup_attach(struct cgroup_taskset *tset)
3934{
3935 mem_cgroup_lru_gen_attach(tset);
3936 mem_cgroup_kmem_attach(tset);
3937}
3938
3939static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
3940{
3941 if (value == PAGE_COUNTER_MAX)
3942 seq_puts(m, "max\n");
3943 else
3944 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
3945
3946 return 0;
3947}
3948
3949static u64 memory_current_read(struct cgroup_subsys_state *css,
3950 struct cftype *cft)
3951{
3952 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3953
3954 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
3955}
3956
3957#define OFP_PEAK_UNSET (((-1UL)))
3958
3959static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
3960{
3961 struct cgroup_of_peak *ofp = of_peak(sf->private);
3962 u64 fd_peak = READ_ONCE(ofp->value), peak;
3963
3964 /* User wants global or local peak? */
3965 if (fd_peak == OFP_PEAK_UNSET)
3966 peak = pc->watermark;
3967 else
3968 peak = max(fd_peak, READ_ONCE(pc->local_watermark));
3969
3970 seq_printf(sf, "%llu\n", peak * PAGE_SIZE);
3971 return 0;
3972}
3973
3974static int memory_peak_show(struct seq_file *sf, void *v)
3975{
3976 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3977
3978 return peak_show(sf, v, &memcg->memory);
3979}
3980
3981static int peak_open(struct kernfs_open_file *of)
3982{
3983 struct cgroup_of_peak *ofp = of_peak(of);
3984
3985 ofp->value = OFP_PEAK_UNSET;
3986 return 0;
3987}
3988
3989static void peak_release(struct kernfs_open_file *of)
3990{
3991 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3992 struct cgroup_of_peak *ofp = of_peak(of);
3993
3994 if (ofp->value == OFP_PEAK_UNSET) {
3995 /* fast path (no writes on this fd) */
3996 return;
3997 }
3998 spin_lock(&memcg->peaks_lock);
3999 list_del(&ofp->list);
4000 spin_unlock(&memcg->peaks_lock);
4001}
4002
4003static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
4004 loff_t off, struct page_counter *pc,
4005 struct list_head *watchers)
4006{
4007 unsigned long usage;
4008 struct cgroup_of_peak *peer_ctx;
4009 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4010 struct cgroup_of_peak *ofp = of_peak(of);
4011
4012 spin_lock(&memcg->peaks_lock);
4013
4014 usage = page_counter_read(pc);
4015 WRITE_ONCE(pc->local_watermark, usage);
4016
4017 list_for_each_entry(peer_ctx, watchers, list)
4018 if (usage > peer_ctx->value)
4019 WRITE_ONCE(peer_ctx->value, usage);
4020
4021 /* initial write, register watcher */
4022 if (ofp->value == -1)
4023 list_add(&ofp->list, watchers);
4024
4025 WRITE_ONCE(ofp->value, usage);
4026 spin_unlock(&memcg->peaks_lock);
4027
4028 return nbytes;
4029}
4030
4031static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
4032 size_t nbytes, loff_t off)
4033{
4034 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4035
4036 return peak_write(of, buf, nbytes, off, &memcg->memory,
4037 &memcg->memory_peaks);
4038}
4039
4040#undef OFP_PEAK_UNSET
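
/*
 * A minimal sketch of the fd-local peak semantics implemented above; the
 * cgroup path and the written string are illustrative. Before the first
 * write on a given open file, reads return the cgroup's lifetime high
 * watermark; after a write (the contents are ignored), reads on that same
 * file return the maximum usage observed since the write. The same
 * machinery backs both memory.peak and swap.peak:
 *
 *	fd = open("/sys/fs/cgroup/foo/memory.peak", O_RDWR);
 *	pread(fd, buf, sizeof(buf), 0);		(lifetime high watermark)
 *	write(fd, "reset", 5);			(arm a per-fd watermark)
 *	pread(fd, buf, sizeof(buf), 0);		(peak since the write, this fd only)
 */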
4041
4042static int memory_min_show(struct seq_file *m, void *v)
4043{
4044 return seq_puts_memcg_tunable(m,
4045 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
4046}
4047
4048static ssize_t memory_min_write(struct kernfs_open_file *of,
4049 char *buf, size_t nbytes, loff_t off)
4050{
4051 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4052 unsigned long min;
4053 int err;
4054
4055 buf = strstrip(buf);
4056 err = page_counter_memparse(buf, "max", &min);
4057 if (err)
4058 return err;
4059
4060 page_counter_set_min(&memcg->memory, min);
4061
4062 return nbytes;
4063}
4064
4065static int memory_low_show(struct seq_file *m, void *v)
4066{
4067 return seq_puts_memcg_tunable(m,
4068 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
4069}
4070
4071static ssize_t memory_low_write(struct kernfs_open_file *of,
4072 char *buf, size_t nbytes, loff_t off)
4073{
4074 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4075 unsigned long low;
4076 int err;
4077
4078 buf = strstrip(buf);
4079 err = page_counter_memparse(buf, "max", &low);
4080 if (err)
4081 return err;
4082
4083 page_counter_set_low(&memcg->memory, low);
4084
4085 return nbytes;
4086}
4087
4088static int memory_high_show(struct seq_file *m, void *v)
4089{
4090 return seq_puts_memcg_tunable(m,
4091 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
4092}
4093
4094static ssize_t memory_high_write(struct kernfs_open_file *of,
4095 char *buf, size_t nbytes, loff_t off)
4096{
4097 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4098 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
4099 bool drained = false;
4100 unsigned long high;
4101 int err;
4102
4103 buf = strstrip(buf);
4104 err = page_counter_memparse(buf, "max", &high);
4105 if (err)
4106 return err;
4107
4108 page_counter_set_high(&memcg->memory, high);
4109
4110 for (;;) {
4111 unsigned long nr_pages = page_counter_read(&memcg->memory);
4112 unsigned long reclaimed;
4113
4114 if (nr_pages <= high)
4115 break;
4116
4117 if (signal_pending(current))
4118 break;
4119
4120 if (!drained) {
4121 drain_all_stock(memcg);
4122 drained = true;
4123 continue;
4124 }
4125
4126 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
4127 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
4128
4129 if (!reclaimed && !nr_retries--)
4130 break;
4131 }
4132
4133 memcg_wb_domain_size_changed(memcg);
4134 return nbytes;
4135}
4136
4137static int memory_max_show(struct seq_file *m, void *v)
4138{
4139 return seq_puts_memcg_tunable(m,
4140 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
4141}
4142
4143static ssize_t memory_max_write(struct kernfs_open_file *of,
4144 char *buf, size_t nbytes, loff_t off)
4145{
4146 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4147 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
4148 bool drained = false;
4149 unsigned long max;
4150 int err;
4151
4152 buf = strstrip(buf);
4153 err = page_counter_memparse(buf, "max", &max);
4154 if (err)
4155 return err;
4156
4157 xchg(&memcg->memory.max, max);
4158
4159 for (;;) {
4160 unsigned long nr_pages = page_counter_read(&memcg->memory);
4161
4162 if (nr_pages <= max)
4163 break;
4164
4165 if (signal_pending(current))
4166 break;
4167
4168 if (!drained) {
4169 drain_all_stock(memcg);
4170 drained = true;
4171 continue;
4172 }
4173
4174 if (nr_reclaims) {
4175 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
4176 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
4177 nr_reclaims--;
4178 continue;
4179 }
4180
4181 memcg_memory_event(memcg, MEMCG_OOM);
4182 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
4183 break;
4184 }
4185
4186 memcg_wb_domain_size_changed(memcg);
4187 return nbytes;
4188}
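
/*
 * How the two writable limits above differ in practice; the cgroup path
 * below is illustrative. Lowering memory.high only reclaims (and gives up
 * after MAX_RECLAIM_RETRIES without killing anything), while lowering
 * memory.max keeps reclaiming and, as a last resort, raises MEMCG_OOM and
 * invokes the OOM killer until usage fits or the writer is signalled:
 *
 *	echo 512M > /sys/fs/cgroup/foo/memory.high
 *	echo 1G   > /sys/fs/cgroup/foo/memory.max
 *	echo max  > /sys/fs/cgroup/foo/memory.max	(remove the hard cap)
 */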
4189
4190/*
4191 * Note: don't forget to update the 'samples/cgroup/memcg_event_listener'
4192 * if any new events become available.
4193 */
4194static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
4195{
4196 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
4197 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
4198 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
4199 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
4200 seq_printf(m, "oom_kill %lu\n",
4201 atomic_long_read(&events[MEMCG_OOM_KILL]));
4202 seq_printf(m, "oom_group_kill %lu\n",
4203 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
4204}
4205
4206static int memory_events_show(struct seq_file *m, void *v)
4207{
4208 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4209
4210 __memory_events_show(m, memcg->memory_events);
4211 return 0;
4212}
4213
4214static int memory_events_local_show(struct seq_file *m, void *v)
4215{
4216 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4217
4218 __memory_events_show(m, memcg->memory_events_local);
4219 return 0;
4220}
4221
4222int memory_stat_show(struct seq_file *m, void *v)
4223{
4224 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4225 char *buf = kmalloc(SEQ_BUF_SIZE, GFP_KERNEL);
4226 struct seq_buf s;
4227
4228 if (!buf)
4229 return -ENOMEM;
4230 seq_buf_init(&s, buf, SEQ_BUF_SIZE);
4231 memory_stat_format(memcg, &s);
4232 seq_puts(m, buf);
4233 kfree(buf);
4234 return 0;
4235}
4236
4237#ifdef CONFIG_NUMA
4238static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
4239 int item)
4240{
4241 return lruvec_page_state(lruvec, item) *
4242 memcg_page_state_output_unit(item);
4243}
4244
4245static int memory_numa_stat_show(struct seq_file *m, void *v)
4246{
4247 int i;
4248 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4249
4250 mem_cgroup_flush_stats(memcg);
4251
4252 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
4253 int nid;
4254
4255 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
4256 continue;
4257
4258 seq_printf(m, "%s", memory_stats[i].name);
4259 for_each_node_state(nid, N_MEMORY) {
4260 u64 size;
4261 struct lruvec *lruvec;
4262
4263 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4264 size = lruvec_page_state_output(lruvec,
4265 memory_stats[i].idx);
4266 seq_printf(m, " N%d=%llu", nid, size);
4267 }
4268 seq_putc(m, '\n');
4269 }
4270
4271 return 0;
4272}
4273#endif
4274
4275static int memory_oom_group_show(struct seq_file *m, void *v)
4276{
4277 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4278
4279 seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
4280
4281 return 0;
4282}
4283
4284static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
4285 char *buf, size_t nbytes, loff_t off)
4286{
4287 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4288 int ret, oom_group;
4289
4290 buf = strstrip(buf);
4291 if (!buf)
4292 return -EINVAL;
4293
4294 ret = kstrtoint(buf, 0, &oom_group);
4295 if (ret)
4296 return ret;
4297
4298 if (oom_group != 0 && oom_group != 1)
4299 return -EINVAL;
4300
4301 WRITE_ONCE(memcg->oom_group, oom_group);
4302
4303 return nbytes;
4304}
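
/*
 * Example (cgroup path illustrative): with memory.oom.group set to 1, an
 * OOM kill in this cgroup takes out the whole workload as one indivisible
 * unit instead of picking off individual tasks:
 *
 *	echo 1 > /sys/fs/cgroup/foo/memory.oom.group
 */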
4305
4306enum {
4307 MEMORY_RECLAIM_SWAPPINESS = 0,
4308 MEMORY_RECLAIM_NULL,
4309};
4310
4311static const match_table_t tokens = {
4312 { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
4313 { MEMORY_RECLAIM_NULL, NULL },
4314};
4315
4316static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
4317 size_t nbytes, loff_t off)
4318{
4319 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4320 unsigned int nr_retries = MAX_RECLAIM_RETRIES;
4321 unsigned long nr_to_reclaim, nr_reclaimed = 0;
4322 int swappiness = -1;
4323 unsigned int reclaim_options;
4324 char *old_buf, *start;
4325 substring_t args[MAX_OPT_ARGS];
4326
4327 buf = strstrip(buf);
4328
4329 old_buf = buf;
4330 nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
4331 if (buf == old_buf)
4332 return -EINVAL;
4333
4334 buf = strstrip(buf);
4335
4336 while ((start = strsep(&buf, " ")) != NULL) {
4337 if (!strlen(start))
4338 continue;
4339 switch (match_token(start, tokens, args)) {
4340 case MEMORY_RECLAIM_SWAPPINESS:
4341 if (match_int(&args[0], &swappiness))
4342 return -EINVAL;
4343 if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
4344 return -EINVAL;
4345 break;
4346 default:
4347 return -EINVAL;
4348 }
4349 }
4350
4351 reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
4352 while (nr_reclaimed < nr_to_reclaim) {
4353 /* Will converge on zero, but reclaim enforces a minimum */
4354 unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
4355 unsigned long reclaimed;
4356
4357 if (signal_pending(current))
4358 return -EINTR;
4359
4360 /*
4361		 * This is the final attempt; drain the percpu LRU caches in the
4362 * hope of introducing more evictable pages for
4363 * try_to_free_mem_cgroup_pages().
4364 */
4365 if (!nr_retries)
4366 lru_add_drain_all();
4367
4368 reclaimed = try_to_free_mem_cgroup_pages(memcg,
4369 batch_size, GFP_KERNEL,
4370 reclaim_options,
4371 swappiness == -1 ? NULL : &swappiness);
4372
4373 if (!reclaimed && !nr_retries--)
4374 return -EAGAIN;
4375
4376 nr_reclaimed += reclaimed;
4377 }
4378
4379 return nbytes;
4380}
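
/*
 * Illustrative use of the proactive reclaim interface parsed above (the
 * cgroup path is hypothetical). The first argument is a byte count fed to
 * memparse(); an optional swappiness= key must fall between MIN_SWAPPINESS
 * and MAX_SWAPPINESS. If the retry budget runs out before the target is
 * met, the write fails with -EAGAIN:
 *
 *	echo "1G" > /sys/fs/cgroup/foo/memory.reclaim
 *	echo "512M swappiness=0" > /sys/fs/cgroup/foo/memory.reclaim
 */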
4381
4382static struct cftype memory_files[] = {
4383 {
4384 .name = "current",
4385 .flags = CFTYPE_NOT_ON_ROOT,
4386 .read_u64 = memory_current_read,
4387 },
4388 {
4389 .name = "peak",
4390 .flags = CFTYPE_NOT_ON_ROOT,
4391 .open = peak_open,
4392 .release = peak_release,
4393 .seq_show = memory_peak_show,
4394 .write = memory_peak_write,
4395 },
4396 {
4397 .name = "min",
4398 .flags = CFTYPE_NOT_ON_ROOT,
4399 .seq_show = memory_min_show,
4400 .write = memory_min_write,
4401 },
4402 {
4403 .name = "low",
4404 .flags = CFTYPE_NOT_ON_ROOT,
4405 .seq_show = memory_low_show,
4406 .write = memory_low_write,
4407 },
4408 {
4409 .name = "high",
4410 .flags = CFTYPE_NOT_ON_ROOT,
4411 .seq_show = memory_high_show,
4412 .write = memory_high_write,
4413 },
4414 {
4415 .name = "max",
4416 .flags = CFTYPE_NOT_ON_ROOT,
4417 .seq_show = memory_max_show,
4418 .write = memory_max_write,
4419 },
4420 {
4421 .name = "events",
4422 .flags = CFTYPE_NOT_ON_ROOT,
4423 .file_offset = offsetof(struct mem_cgroup, events_file),
4424 .seq_show = memory_events_show,
4425 },
4426 {
4427 .name = "events.local",
4428 .flags = CFTYPE_NOT_ON_ROOT,
4429 .file_offset = offsetof(struct mem_cgroup, events_local_file),
4430 .seq_show = memory_events_local_show,
4431 },
4432 {
4433 .name = "stat",
4434 .seq_show = memory_stat_show,
4435 },
4436#ifdef CONFIG_NUMA
4437 {
4438 .name = "numa_stat",
4439 .seq_show = memory_numa_stat_show,
4440 },
4441#endif
4442 {
4443 .name = "oom.group",
4444 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
4445 .seq_show = memory_oom_group_show,
4446 .write = memory_oom_group_write,
4447 },
4448 {
4449 .name = "reclaim",
4450 .flags = CFTYPE_NS_DELEGATABLE,
4451 .write = memory_reclaim,
4452 },
4453 { } /* terminate */
4454};
4455
4456struct cgroup_subsys memory_cgrp_subsys = {
4457 .css_alloc = mem_cgroup_css_alloc,
4458 .css_online = mem_cgroup_css_online,
4459 .css_offline = mem_cgroup_css_offline,
4460 .css_released = mem_cgroup_css_released,
4461 .css_free = mem_cgroup_css_free,
4462 .css_reset = mem_cgroup_css_reset,
4463 .css_rstat_flush = mem_cgroup_css_rstat_flush,
4464 .attach = mem_cgroup_attach,
4465 .fork = mem_cgroup_fork,
4466 .exit = mem_cgroup_exit,
4467 .dfl_cftypes = memory_files,
4468#ifdef CONFIG_MEMCG_V1
4469 .legacy_cftypes = mem_cgroup_legacy_files,
4470#endif
4471 .early_init = 0,
4472};
4473
4474/**
4475 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
4476 * @root: the top ancestor of the sub-tree being checked
4477 * @memcg: the memory cgroup to check
4478 *
4479 * WARNING: This function is not stateless! It can only be used as part
4480 * of a top-down tree iteration, not for isolated queries.
4481 */
4482void mem_cgroup_calculate_protection(struct mem_cgroup *root,
4483 struct mem_cgroup *memcg)
4484{
4485 bool recursive_protection =
4486 cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT;
4487
4488 if (mem_cgroup_disabled())
4489 return;
4490
4491 if (!root)
4492 root = root_mem_cgroup;
4493
4494 page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
4495}
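
/*
 * A rough sketch of the intended calling pattern, loosely mirroring the
 * reclaim loop in mm/vmscan.c (details elided): the protection values are
 * only valid because mem_cgroup_iter() visits every ancestor before its
 * descendants, satisfying the top-down requirement noted above:
 *
 *	memcg = mem_cgroup_iter(root, NULL, NULL);
 *	do {
 *		mem_cgroup_calculate_protection(root, memcg);
 *		if (mem_cgroup_below_min(root, memcg))
 *			continue;	(hard protection, skip this memcg)
 *		... reclaim from memcg ...
 *	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
 */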
4496
4497static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
4498 gfp_t gfp)
4499{
4500 int ret;
4501
4502 ret = try_charge(memcg, gfp, folio_nr_pages(folio));
4503 if (ret)
4504 goto out;
4505
4506 mem_cgroup_commit_charge(folio, memcg);
4507out:
4508 return ret;
4509}
4510
4511int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
4512{
4513 struct mem_cgroup *memcg;
4514 int ret;
4515
4516 memcg = get_mem_cgroup_from_mm(mm);
4517 ret = charge_memcg(folio, memcg, gfp);
4518 css_put(&memcg->css);
4519
4520 return ret;
4521}
4522
4523/**
4524 * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
4525 * @memcg: memcg to charge.
4526 * @gfp: reclaim mode.
4527 * @nr_pages: number of pages to charge.
4528 *
4529 * This function is called when allocating a huge page folio to determine if
4530 * the memcg has the capacity for it. It does not commit the charge yet,
4531 * as the hugetlb folio itself has not been obtained from the hugetlb pool.
4532 *
4533 * Once we have obtained the hugetlb folio, we can call
4534 * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
4535 * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
4536 * of try_charge().
4537 *
4538 * Returns 0 on success. Otherwise, an error code is returned.
4539 */
4540int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
4541 long nr_pages)
4542{
4543 /*
4544 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
4545 * but do not attempt to commit charge later (or cancel on error) either.
4546 */
4547 if (mem_cgroup_disabled() || !memcg ||
4548 !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
4549 !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
4550 return -EOPNOTSUPP;
4551
4552 if (try_charge(memcg, gfp, nr_pages))
4553 return -ENOMEM;
4554
4555 return 0;
4556}
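
/*
 * Sketch of the call sequence expected from a hugetlb allocation site,
 * simplified from what mm/hugetlb.c does (error unwinding elided):
 *
 *	memcg = get_mem_cgroup_from_current();
 *	ret = mem_cgroup_hugetlb_try_charge(memcg, GFP_KERNEL, nr_pages);
 *	if (ret == -ENOMEM)
 *		goto fail;
 *	folio = ... dequeue or allocate the hugetlb folio ...;
 *	if (!ret) {
 *		if (folio)
 *			mem_cgroup_commit_charge(folio, memcg);
 *		else
 *			mem_cgroup_cancel_charge(memcg, nr_pages);
 *	}
 *	mem_cgroup_put(memcg);
 */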
4557
4558/**
4559 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
4560 * @folio: folio to charge.
4561 * @mm: mm context of the victim
4562 * @gfp: reclaim mode
4563 * @entry: swap entry for which the folio is allocated
4564 *
4565 * This function charges a folio allocated for swapin. Please call this before
4566 * adding the folio to the swapcache.
4567 *
4568 * Returns 0 on success. Otherwise, an error code is returned.
4569 */
4570int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
4571 gfp_t gfp, swp_entry_t entry)
4572{
4573 struct mem_cgroup *memcg;
4574 unsigned short id;
4575 int ret;
4576
4577 if (mem_cgroup_disabled())
4578 return 0;
4579
4580 id = lookup_swap_cgroup_id(entry);
4581 rcu_read_lock();
4582 memcg = mem_cgroup_from_id(id);
4583 if (!memcg || !css_tryget_online(&memcg->css))
4584 memcg = get_mem_cgroup_from_mm(mm);
4585 rcu_read_unlock();
4586
4587 ret = charge_memcg(folio, memcg, gfp);
4588
4589 css_put(&memcg->css);
4590 return ret;
4591}
4592
4593/*
4594 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
4595 * @entry: the first swap entry for which the pages are charged
4596 * @nr_pages: number of pages which will be uncharged
4597 *
4598 * Call this function after successfully adding the charged page to swapcache.
4599 *
4600 * Note: This function assumes the page for which the swap slot is being
4601 * uncharged is an order-0 page.
4602 */
4603void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
4604{
4605 /*
4606 * Cgroup1's unified memory+swap counter has been charged with the
4607 * new swapcache page, finish the transfer by uncharging the swap
4608 * slot. The swap slot would also get uncharged when it dies, but
4609 * it can stick around indefinitely and we'd count the page twice
4610 * the entire time.
4611 *
4612 * Cgroup2 has separate resource counters for memory and swap,
4613 * so this is a non-issue here. Memory and swap charge lifetimes
4614 * correspond 1:1 to page and swap slot lifetimes: we charge the
4615 * page to memory here, and uncharge swap when the slot is freed.
4616 */
4617 if (!mem_cgroup_disabled() && do_memsw_account()) {
4618 /*
4619 * The swap entry might not get freed for a long time,
4620 * let's not wait for it. The page already received a
4621 * memory+swap charge, drop the swap entry duplicate.
4622 */
4623 mem_cgroup_uncharge_swap(entry, nr_pages);
4624 }
4625}
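
/*
 * Sketch of the ordering the two swapin helpers above expect from a
 * swapin path (simplified): charge the freshly allocated folio first,
 * then add it to the swapcache, then drop the swap entry's duplicate
 * charge:
 *
 *	folio = ... allocate folio for entry ...;
 *	if (mem_cgroup_swapin_charge_folio(folio, mm, gfp, entry))
 *		goto fail;
 *	... add folio to the swapcache ...;
 *	mem_cgroup_swapin_uncharge_swap(entry, folio_nr_pages(folio));
 */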
4626
4627struct uncharge_gather {
4628 struct mem_cgroup *memcg;
4629 unsigned long nr_memory;
4630 unsigned long pgpgout;
4631 unsigned long nr_kmem;
4632 int nid;
4633};
4634
4635static inline void uncharge_gather_clear(struct uncharge_gather *ug)
4636{
4637 memset(ug, 0, sizeof(*ug));
4638}
4639
4640static void uncharge_batch(const struct uncharge_gather *ug)
4641{
4642 if (ug->nr_memory) {
4643 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
4644 if (do_memsw_account())
4645 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
4646 if (ug->nr_kmem) {
4647 mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
4648 memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
4649 }
4650 memcg1_oom_recover(ug->memcg);
4651 }
4652
4653 memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
4654
4655 /* drop reference from uncharge_folio */
4656 css_put(&ug->memcg->css);
4657}
4658
4659static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
4660{
4661 long nr_pages;
4662 struct mem_cgroup *memcg;
4663 struct obj_cgroup *objcg;
4664
4665 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
4666
4667 /*
4668 * Nobody should be changing or seriously looking at
4669	 * folio memcg or objcg at this point; we have fully
4670 * exclusive access to the folio.
4671 */
4672 if (folio_memcg_kmem(folio)) {
4673 objcg = __folio_objcg(folio);
4674 /*
4675		 * This get matches the put at the end of the function; it is
4676		 * needed because kmem pages do not hold memcg references anymore.
4677 */
4678 memcg = get_mem_cgroup_from_objcg(objcg);
4679 } else {
4680 memcg = __folio_memcg(folio);
4681 }
4682
4683 if (!memcg)
4684 return;
4685
4686 if (ug->memcg != memcg) {
4687 if (ug->memcg) {
4688 uncharge_batch(ug);
4689 uncharge_gather_clear(ug);
4690 }
4691 ug->memcg = memcg;
4692 ug->nid = folio_nid(folio);
4693
4694 /* pairs with css_put in uncharge_batch */
4695 css_get(&memcg->css);
4696 }
4697
4698 nr_pages = folio_nr_pages(folio);
4699
4700 if (folio_memcg_kmem(folio)) {
4701 ug->nr_memory += nr_pages;
4702 ug->nr_kmem += nr_pages;
4703
4704 folio->memcg_data = 0;
4705 obj_cgroup_put(objcg);
4706 } else {
4707 /* LRU pages aren't accounted at the root level */
4708 if (!mem_cgroup_is_root(memcg))
4709 ug->nr_memory += nr_pages;
4710 ug->pgpgout++;
4711
4712 WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
4713 folio->memcg_data = 0;
4714 }
4715
4716 css_put(&memcg->css);
4717}
4718
4719void __mem_cgroup_uncharge(struct folio *folio)
4720{
4721 struct uncharge_gather ug;
4722
4723 /* Don't touch folio->lru of any random page, pre-check: */
4724 if (!folio_memcg_charged(folio))
4725 return;
4726
4727 uncharge_gather_clear(&ug);
4728 uncharge_folio(folio, &ug);
4729 uncharge_batch(&ug);
4730}
4731
4732void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
4733{
4734 struct uncharge_gather ug;
4735 unsigned int i;
4736
4737 uncharge_gather_clear(&ug);
4738 for (i = 0; i < folios->nr; i++)
4739 uncharge_folio(folios->folios[i], &ug);
4740 if (ug.memcg)
4741 uncharge_batch(&ug);
4742}
4743
4744/**
4745 * mem_cgroup_replace_folio - Charge a folio's replacement.
4746 * @old: Currently circulating folio.
4747 * @new: Replacement folio.
4748 *
4749 * Charge @new as a replacement folio for @old. @old will
4750 * be uncharged upon free.
4751 *
4752 * Both folios must be locked, @new->mapping must be set up.
4753 */
4754void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
4755{
4756 struct mem_cgroup *memcg;
4757 long nr_pages = folio_nr_pages(new);
4758
4759 VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
4760 VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
4761 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
4762 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
4763
4764 if (mem_cgroup_disabled())
4765 return;
4766
4767 /* Page cache replacement: new folio already charged? */
4768 if (folio_memcg_charged(new))
4769 return;
4770
4771 memcg = folio_memcg(old);
4772 VM_WARN_ON_ONCE_FOLIO(!memcg, old);
4773 if (!memcg)
4774 return;
4775
4776 /* Force-charge the new page. The old one will be freed soon */
4777 if (!mem_cgroup_is_root(memcg)) {
4778 page_counter_charge(&memcg->memory, nr_pages);
4779 if (do_memsw_account())
4780 page_counter_charge(&memcg->memsw, nr_pages);
4781 }
4782
4783 css_get(&memcg->css);
4784 commit_charge(new, memcg);
4785 memcg1_commit_charge(new, memcg);
4786}
4787
4788/**
4789 * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
4790 * @old: Currently circulating folio.
4791 * @new: Replacement folio.
4792 *
4793 * Transfer the memcg data from the old folio to the new folio for migration.
4794 * The old folio's memcg data will be cleared. Note that the memory counters
4795 * will remain unchanged throughout the process.
4796 *
4797 * Both folios must be locked, @new->mapping must be set up.
4798 */
4799void mem_cgroup_migrate(struct folio *old, struct folio *new)
4800{
4801 struct mem_cgroup *memcg;
4802
4803 VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
4804 VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
4805 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
4806 VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
4807 VM_BUG_ON_FOLIO(folio_test_lru(old), old);
4808
4809 if (mem_cgroup_disabled())
4810 return;
4811
4812 memcg = folio_memcg(old);
4813 /*
4814 * Note that it is normal to see !memcg for a hugetlb folio.
4815	 * For example, it could have been allocated when memory_hugetlb_accounting
4816 * was not selected.
4817 */
4818 VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
4819 if (!memcg)
4820 return;
4821
4822 /* Transfer the charge and the css ref */
4823 commit_charge(new, memcg);
4824
4825	/* The warning should never trigger, so don't worry about a non-zero refcount */
4826 WARN_ON_ONCE(folio_unqueue_deferred_split(old));
4827 old->memcg_data = 0;
4828}
4829
4830DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
4831EXPORT_SYMBOL(memcg_sockets_enabled_key);
4832
4833void mem_cgroup_sk_alloc(struct sock *sk)
4834{
4835 struct mem_cgroup *memcg;
4836
4837 if (!mem_cgroup_sockets_enabled)
4838 return;
4839
4840	/* Do not associate the sock with an unrelated interrupted task's memcg. */
4841 if (!in_task())
4842 return;
4843
4844 rcu_read_lock();
4845 memcg = mem_cgroup_from_task(current);
4846 if (mem_cgroup_is_root(memcg))
4847 goto out;
4848 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
4849 goto out;
4850 if (css_tryget(&memcg->css))
4851 sk->sk_memcg = memcg;
4852out:
4853 rcu_read_unlock();
4854}
4855
4856void mem_cgroup_sk_free(struct sock *sk)
4857{
4858 if (sk->sk_memcg)
4859 css_put(&sk->sk_memcg->css);
4860}
4861
4862/**
4863 * mem_cgroup_charge_skmem - charge socket memory
4864 * @memcg: memcg to charge
4865 * @nr_pages: number of pages to charge
4866 * @gfp_mask: reclaim mode
4867 *
4868 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
4869 * @memcg's configured limit, %false if it doesn't.
4870 */
4871bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
4872 gfp_t gfp_mask)
4873{
4874 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
4875 return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
4876
4877 if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
4878 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
4879 return true;
4880 }
4881
4882 return false;
4883}
4884
4885/**
4886 * mem_cgroup_uncharge_skmem - uncharge socket memory
4887 * @memcg: memcg to uncharge
4888 * @nr_pages: number of pages to uncharge
4889 */
4890void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
4891{
4892 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
4893 memcg1_uncharge_skmem(memcg, nr_pages);
4894 return;
4895 }
4896
4897 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
4898
4899 refill_stock(memcg, nr_pages);
4900}
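
/*
 * Rough sketch of how the networking layer is expected to use this pair
 * of functions (the real call sites live in net/core/sock.c); the socket
 * pins its memcg via sk->sk_memcg, set up in mem_cgroup_sk_alloc() above:
 *
 *	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 *	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages, gfp))
 *		goto suppress_allocation;
 *	...
 *	mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
 */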
4901
4902static int __init cgroup_memory(char *s)
4903{
4904 char *token;
4905
4906 while ((token = strsep(&s, ",")) != NULL) {
4907 if (!*token)
4908 continue;
4909 if (!strcmp(token, "nosocket"))
4910 cgroup_memory_nosocket = true;
4911 if (!strcmp(token, "nokmem"))
4912 cgroup_memory_nokmem = true;
4913 if (!strcmp(token, "nobpf"))
4914 cgroup_memory_nobpf = true;
4915 }
4916 return 1;
4917}
4918__setup("cgroup.memory=", cgroup_memory);
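
/*
 * Example kernel command line consumed by the parser above; any subset of
 * the recognized tokens may be given, comma-separated:
 *
 *	cgroup.memory=nosocket,nokmem,nobpf
 */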
4919
4920/*
4921 * subsys_initcall() for memory controller.
4922 *
4923 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
4924 * context because of lock dependencies (cgroup_lock -> cpu hotplug), but
4925 * basically everything that doesn't depend on a specific mem_cgroup structure
4926 * should be initialized from here.
4927 */
4928static int __init mem_cgroup_init(void)
4929{
4930 int cpu;
4931
4932 /*
4933	 * An s32 type (see struct batched_lruvec_stat) is currently used for
4934	 * per-memcg-per-cpu caching of per-node statistics. For this to work
4935	 * correctly, the overfill threshold must not exceed
4936	 * S32_MAX / PAGE_SIZE.
4937 */
4938 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
4939
4940 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
4941 memcg_hotplug_cpu_dead);
4942
4943 for_each_possible_cpu(cpu)
4944 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
4945 drain_local_stock);
4946
4947 return 0;
4948}
4949subsys_initcall(mem_cgroup_init);
4950
4951#ifdef CONFIG_SWAP
4952static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
4953{
4954 while (!refcount_inc_not_zero(&memcg->id.ref)) {
4955 /*
4956		 * The root cgroup cannot be destroyed, so its refcount must
4957 * always be >= 1.
4958 */
4959 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
4960 VM_BUG_ON(1);
4961 break;
4962 }
4963 memcg = parent_mem_cgroup(memcg);
4964 if (!memcg)
4965 memcg = root_mem_cgroup;
4966 }
4967 return memcg;
4968}
4969
4970/**
4971 * mem_cgroup_swapout - transfer a memsw charge to swap
4972 * @folio: folio whose memsw charge to transfer
4973 * @entry: swap entry to move the charge to
4974 *
4975 * Transfer the memsw charge of @folio to @entry.
4976 */
4977void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
4978{
4979 struct mem_cgroup *memcg, *swap_memcg;
4980 unsigned int nr_entries;
4981 unsigned short oldid;
4982
4983 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
4984 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
4985
4986 if (mem_cgroup_disabled())
4987 return;
4988
4989 if (!do_memsw_account())
4990 return;
4991
4992 memcg = folio_memcg(folio);
4993
4994 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
4995 if (!memcg)
4996 return;
4997
4998 /*
4999 * In case the memcg owning these pages has been offlined and doesn't
5000 * have an ID allocated to it anymore, charge the closest online
5001 * ancestor for the swap instead and transfer the memory+swap charge.
5002 */
5003 swap_memcg = mem_cgroup_id_get_online(memcg);
5004 nr_entries = folio_nr_pages(folio);
5005 /* Get references for the tail pages, too */
5006 if (nr_entries > 1)
5007 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
5008 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
5009 nr_entries);
5010 VM_BUG_ON_FOLIO(oldid, folio);
5011 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
5012
5013 folio_unqueue_deferred_split(folio);
5014 folio->memcg_data = 0;
5015
5016 if (!mem_cgroup_is_root(memcg))
5017 page_counter_uncharge(&memcg->memory, nr_entries);
5018
5019 if (memcg != swap_memcg) {
5020 if (!mem_cgroup_is_root(swap_memcg))
5021 page_counter_charge(&swap_memcg->memsw, nr_entries);
5022 page_counter_uncharge(&memcg->memsw, nr_entries);
5023 }
5024
5025 memcg1_swapout(folio, memcg);
5026 css_put(&memcg->css);
5027}
5028
5029/**
5030 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
5031 * @folio: folio being added to swap
5032 * @entry: swap entry to charge
5033 *
5034 * Try to charge @folio's memcg for the swap space at @entry.
5035 *
5036 * Returns 0 on success, -ENOMEM on failure.
5037 */
5038int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
5039{
5040 unsigned int nr_pages = folio_nr_pages(folio);
5041 struct page_counter *counter;
5042 struct mem_cgroup *memcg;
5043 unsigned short oldid;
5044
5045 if (do_memsw_account())
5046 return 0;
5047
5048 memcg = folio_memcg(folio);
5049
5050 VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
5051 if (!memcg)
5052 return 0;
5053
5054 if (!entry.val) {
5055 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
5056 return 0;
5057 }
5058
5059 memcg = mem_cgroup_id_get_online(memcg);
5060
5061 if (!mem_cgroup_is_root(memcg) &&
5062 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
5063 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
5064 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
5065 mem_cgroup_id_put(memcg);
5066 return -ENOMEM;
5067 }
5068
5069 /* Get references for the tail pages, too */
5070 if (nr_pages > 1)
5071 mem_cgroup_id_get_many(memcg, nr_pages - 1);
5072 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
5073 VM_BUG_ON_FOLIO(oldid, folio);
5074 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
5075
5076 return 0;
5077}
5078
5079/**
5080 * __mem_cgroup_uncharge_swap - uncharge swap space
5081 * @entry: swap entry to uncharge
5082 * @nr_pages: the amount of swap space to uncharge
5083 */
5084void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
5085{
5086 struct mem_cgroup *memcg;
5087 unsigned short id;
5088
5089 id = swap_cgroup_record(entry, 0, nr_pages);
5090 rcu_read_lock();
5091 memcg = mem_cgroup_from_id(id);
5092 if (memcg) {
5093 if (!mem_cgroup_is_root(memcg)) {
5094 if (do_memsw_account())
5095 page_counter_uncharge(&memcg->memsw, nr_pages);
5096 else
5097 page_counter_uncharge(&memcg->swap, nr_pages);
5098 }
5099 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
5100 mem_cgroup_id_put_many(memcg, nr_pages);
5101 }
5102 rcu_read_unlock();
5103}
5104
5105long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5106{
5107 long nr_swap_pages = get_nr_swap_pages();
5108
5109 if (mem_cgroup_disabled() || do_memsw_account())
5110 return nr_swap_pages;
5111 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
5112 nr_swap_pages = min_t(long, nr_swap_pages,
5113 READ_ONCE(memcg->swap.max) -
5114 page_counter_read(&memcg->swap));
5115 return nr_swap_pages;
5116}
5117
5118bool mem_cgroup_swap_full(struct folio *folio)
5119{
5120 struct mem_cgroup *memcg;
5121
5122 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
5123
5124 if (vm_swap_full())
5125 return true;
5126 if (do_memsw_account())
5127 return false;
5128
5129 memcg = folio_memcg(folio);
5130 if (!memcg)
5131 return false;
5132
5133 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
5134 unsigned long usage = page_counter_read(&memcg->swap);
5135
5136 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
5137 usage * 2 >= READ_ONCE(memcg->swap.max))
5138 return true;
5139 }
5140
5141 return false;
5142}
5143
5144static int __init setup_swap_account(char *s)
5145{
5146 bool res;
5147
5148 if (!kstrtobool(s, &res) && !res)
5149 pr_warn_once("The swapaccount=0 commandline option is deprecated "
5150 "in favor of configuring swap control via cgroupfs. "
5151 "Please report your usecase to linux-mm@kvack.org if you "
5152 "depend on this functionality.\n");
5153 return 1;
5154}
5155__setup("swapaccount=", setup_swap_account);
5156
5157static u64 swap_current_read(struct cgroup_subsys_state *css,
5158 struct cftype *cft)
5159{
5160 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5161
5162 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
5163}
5164
5165static int swap_peak_show(struct seq_file *sf, void *v)
5166{
5167 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5168
5169 return peak_show(sf, v, &memcg->swap);
5170}
5171
5172static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
5173 size_t nbytes, loff_t off)
5174{
5175 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5176
5177 return peak_write(of, buf, nbytes, off, &memcg->swap,
5178 &memcg->swap_peaks);
5179}
5180
5181static int swap_high_show(struct seq_file *m, void *v)
5182{
5183 return seq_puts_memcg_tunable(m,
5184 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
5185}
5186
5187static ssize_t swap_high_write(struct kernfs_open_file *of,
5188 char *buf, size_t nbytes, loff_t off)
5189{
5190 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5191 unsigned long high;
5192 int err;
5193
5194 buf = strstrip(buf);
5195 err = page_counter_memparse(buf, "max", &high);
5196 if (err)
5197 return err;
5198
5199 page_counter_set_high(&memcg->swap, high);
5200
5201 return nbytes;
5202}
5203
5204static int swap_max_show(struct seq_file *m, void *v)
5205{
5206 return seq_puts_memcg_tunable(m,
5207 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
5208}
5209
5210static ssize_t swap_max_write(struct kernfs_open_file *of,
5211 char *buf, size_t nbytes, loff_t off)
5212{
5213 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5214 unsigned long max;
5215 int err;
5216
5217 buf = strstrip(buf);
5218 err = page_counter_memparse(buf, "max", &max);
5219 if (err)
5220 return err;
5221
5222 xchg(&memcg->swap.max, max);
5223
5224 return nbytes;
5225}
5226
5227static int swap_events_show(struct seq_file *m, void *v)
5228{
5229 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5230
5231 seq_printf(m, "high %lu\n",
5232 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
5233 seq_printf(m, "max %lu\n",
5234 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
5235 seq_printf(m, "fail %lu\n",
5236 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
5237
5238 return 0;
5239}
5240
5241static struct cftype swap_files[] = {
5242 {
5243 .name = "swap.current",
5244 .flags = CFTYPE_NOT_ON_ROOT,
5245 .read_u64 = swap_current_read,
5246 },
5247 {
5248 .name = "swap.high",
5249 .flags = CFTYPE_NOT_ON_ROOT,
5250 .seq_show = swap_high_show,
5251 .write = swap_high_write,
5252 },
5253 {
5254 .name = "swap.max",
5255 .flags = CFTYPE_NOT_ON_ROOT,
5256 .seq_show = swap_max_show,
5257 .write = swap_max_write,
5258 },
5259 {
5260 .name = "swap.peak",
5261 .flags = CFTYPE_NOT_ON_ROOT,
5262 .open = peak_open,
5263 .release = peak_release,
5264 .seq_show = swap_peak_show,
5265 .write = swap_peak_write,
5266 },
5267 {
5268 .name = "swap.events",
5269 .flags = CFTYPE_NOT_ON_ROOT,
5270 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
5271 .seq_show = swap_events_show,
5272 },
5273 { } /* terminate */
5274};
5275
5276#ifdef CONFIG_ZSWAP
5277/**
5278 * obj_cgroup_may_zswap - check if this cgroup can zswap
5279 * @objcg: the object cgroup
5280 *
5281 * Check if the hierarchical zswap limit has been reached.
5282 *
5283 * This doesn't check for specific headroom, and it is not atomic
5284 * either. But with zswap, the size of the allocation is only known
5285 * once compression has occurred, and this optimistic pre-check avoids
5286 * spending cycles on compression when there is already no room left
5287 * or zswap is disabled altogether somewhere in the hierarchy.
5288 */
5289bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
5290{
5291 struct mem_cgroup *memcg, *original_memcg;
5292 bool ret = true;
5293
5294 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5295 return true;
5296
5297 original_memcg = get_mem_cgroup_from_objcg(objcg);
5298 for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
5299 memcg = parent_mem_cgroup(memcg)) {
5300 unsigned long max = READ_ONCE(memcg->zswap_max);
5301 unsigned long pages;
5302
5303 if (max == PAGE_COUNTER_MAX)
5304 continue;
5305 if (max == 0) {
5306 ret = false;
5307 break;
5308 }
5309
5310 /* Force flush to get accurate stats for charging */
5311 __mem_cgroup_flush_stats(memcg, true);
5312 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
5313 if (pages < max)
5314 continue;
5315 ret = false;
5316 break;
5317 }
5318 mem_cgroup_put(original_memcg);
5319 return ret;
5320}
5321
5322/**
5323 * obj_cgroup_charge_zswap - charge compression backend memory
5324 * @objcg: the object cgroup
5325 * @size: size of compressed object
5326 *
5327 * This forces the charge after obj_cgroup_may_zswap() allowed
5328 * compression and storage in zswap for this cgroup to go ahead.
5329 */
5330void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
5331{
5332 struct mem_cgroup *memcg;
5333
5334 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5335 return;
5336
5337 VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
5338
5339 /* PF_MEMALLOC context, charging must succeed */
5340 if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
5341 VM_WARN_ON_ONCE(1);
5342
5343 rcu_read_lock();
5344 memcg = obj_cgroup_memcg(objcg);
5345 mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
5346 mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
5347 rcu_read_unlock();
5348}
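
/*
 * Sketch of how a zswap store is expected to drive the hooks in this
 * block (simplified from mm/zswap.c): the inexpensive hierarchical check
 * runs before compression, the charge is taken afterwards with the
 * compressed size, and obj_cgroup_uncharge_zswap() below undoes it when
 * the entry is loaded or freed:
 *
 *	if (!obj_cgroup_may_zswap(objcg))
 *		goto reject;
 *	... compress the page, producing 'size' bytes ...;
 *	obj_cgroup_charge_zswap(objcg, size);
 */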
5349
5350/**
5351 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
5352 * @objcg: the object cgroup
5353 * @size: size of compressed object
5354 *
5355 * Uncharges zswap memory on page in.
5356 */
5357void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
5358{
5359 struct mem_cgroup *memcg;
5360
5361 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5362 return;
5363
5364 obj_cgroup_uncharge(objcg, size);
5365
5366 rcu_read_lock();
5367 memcg = obj_cgroup_memcg(objcg);
5368 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
5369 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
5370 rcu_read_unlock();
5371}
5372
5373bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
5374{
5375 /* if zswap is disabled, do not block pages going to the swapping device */
5376 if (!zswap_is_enabled())
5377 return true;
5378
5379 for (; memcg; memcg = parent_mem_cgroup(memcg))
5380 if (!READ_ONCE(memcg->zswap_writeback))
5381 return false;
5382
5383 return true;
5384}
5385
5386static u64 zswap_current_read(struct cgroup_subsys_state *css,
5387 struct cftype *cft)
5388{
5389 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5390
5391 mem_cgroup_flush_stats(memcg);
5392 return memcg_page_state(memcg, MEMCG_ZSWAP_B);
5393}
5394
5395static int zswap_max_show(struct seq_file *m, void *v)
5396{
5397 return seq_puts_memcg_tunable(m,
5398 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
5399}
5400
5401static ssize_t zswap_max_write(struct kernfs_open_file *of,
5402 char *buf, size_t nbytes, loff_t off)
5403{
5404 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5405 unsigned long max;
5406 int err;
5407
5408 buf = strstrip(buf);
5409 err = page_counter_memparse(buf, "max", &max);
5410 if (err)
5411 return err;
5412
5413 xchg(&memcg->zswap_max, max);
5414
5415 return nbytes;
5416}
5417
5418static int zswap_writeback_show(struct seq_file *m, void *v)
5419{
5420 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5421
5422 seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback));
5423 return 0;
5424}
5425
5426static ssize_t zswap_writeback_write(struct kernfs_open_file *of,
5427 char *buf, size_t nbytes, loff_t off)
5428{
5429 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5430 int zswap_writeback;
5431 ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback);
5432
5433 if (parse_ret)
5434 return parse_ret;
5435
5436 if (zswap_writeback != 0 && zswap_writeback != 1)
5437 return -EINVAL;
5438
5439 WRITE_ONCE(memcg->zswap_writeback, zswap_writeback);
5440 return nbytes;
5441}
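
/*
 * Example (cgroup path illustrative): writing 0 prevents pages owned by
 * this cgroup from being written back from zswap to the backing swap
 * device, per the hierarchical check in
 * mem_cgroup_zswap_writeback_enabled() above:
 *
 *	echo 0 > /sys/fs/cgroup/foo/memory.zswap.writeback
 */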
5442
5443static struct cftype zswap_files[] = {
5444 {
5445 .name = "zswap.current",
5446 .flags = CFTYPE_NOT_ON_ROOT,
5447 .read_u64 = zswap_current_read,
5448 },
5449 {
5450 .name = "zswap.max",
5451 .flags = CFTYPE_NOT_ON_ROOT,
5452 .seq_show = zswap_max_show,
5453 .write = zswap_max_write,
5454 },
5455 {
5456 .name = "zswap.writeback",
5457 .seq_show = zswap_writeback_show,
5458 .write = zswap_writeback_write,
5459 },
5460 { } /* terminate */
5461};
5462#endif /* CONFIG_ZSWAP */
5463
5464static int __init mem_cgroup_swap_init(void)
5465{
5466 if (mem_cgroup_disabled())
5467 return 0;
5468
5469 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
5470#ifdef CONFIG_MEMCG_V1
5471 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
5472#endif
5473#ifdef CONFIG_ZSWAP
5474 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
5475#endif
5476 return 0;
5477}
5478subsys_initcall(mem_cgroup_swap_init);
5479
5480#endif /* CONFIG_SWAP */