Linux v4.6: mm/memcontrol.c
   1/* memcontrol.c - Memory Controller
   2 *
   3 * Copyright IBM Corporation, 2007
   4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5 *
   6 * Copyright 2007 OpenVZ SWsoft Inc
   7 * Author: Pavel Emelianov <xemul@openvz.org>
   8 *
   9 * Memory thresholds
  10 * Copyright (C) 2009 Nokia Corporation
  11 * Author: Kirill A. Shutemov
  12 *
  13 * Kernel Memory Controller
  14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15 * Authors: Glauber Costa and Suleiman Souhlal
  16 *
  17 * Native page reclaim
  18 * Charge lifetime sanitation
  19 * Lockless page tracking & accounting
  20 * Unified hierarchy configuration model
  21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  22 *
  23 * This program is free software; you can redistribute it and/or modify
  24 * it under the terms of the GNU General Public License as published by
  25 * the Free Software Foundation; either version 2 of the License, or
  26 * (at your option) any later version.
  27 *
  28 * This program is distributed in the hope that it will be useful,
  29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  31 * GNU General Public License for more details.
  32 */
  33
  34#include <linux/page_counter.h>
  35#include <linux/memcontrol.h>
  36#include <linux/cgroup.h>
  37#include <linux/mm.h>
  38#include <linux/hugetlb.h>
  39#include <linux/pagemap.h>
  40#include <linux/smp.h>
  41#include <linux/page-flags.h>
  42#include <linux/backing-dev.h>
  43#include <linux/bit_spinlock.h>
  44#include <linux/rcupdate.h>
  45#include <linux/limits.h>
  46#include <linux/export.h>
  47#include <linux/mutex.h>
  48#include <linux/rbtree.h>
  49#include <linux/slab.h>
  50#include <linux/swap.h>
  51#include <linux/swapops.h>
  52#include <linux/spinlock.h>
  53#include <linux/eventfd.h>
  54#include <linux/poll.h>
  55#include <linux/sort.h>
  56#include <linux/fs.h>
  57#include <linux/seq_file.h>
  58#include <linux/vmpressure.h>
  59#include <linux/mm_inline.h>
  60#include <linux/swap_cgroup.h>
  61#include <linux/cpu.h>
  62#include <linux/oom.h>
  63#include <linux/lockdep.h>
  64#include <linux/file.h>
  65#include <linux/tracehook.h>
  66#include "internal.h"
  67#include <net/sock.h>
  68#include <net/ip.h>
  69#include "slab.h"
  70
  71#include <asm/uaccess.h>
  72
  73#include <trace/events/vmscan.h>
  74
  75struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  76EXPORT_SYMBOL(memory_cgrp_subsys);
  77
  78struct mem_cgroup *root_mem_cgroup __read_mostly;
  79
  80#define MEM_CGROUP_RECLAIM_RETRIES	5
  81
  82/* Socket memory accounting disabled? */
  83static bool cgroup_memory_nosocket;
  84
  85/* Kernel memory accounting disabled? */
  86static bool cgroup_memory_nokmem;
  87
  88/* Whether the swap controller is active */
  89#ifdef CONFIG_MEMCG_SWAP
  90int do_swap_account __read_mostly;
  91#else
  92#define do_swap_account		0
  93#endif
  94
  95/* Whether legacy memory+swap accounting is active */
  96static bool do_memsw_account(void)
  97{
  98	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
  99}
 100
 101static const char * const mem_cgroup_stat_names[] = {
 102	"cache",
 103	"rss",
 104	"rss_huge",
 105	"mapped_file",
 106	"dirty",
 107	"writeback",
 108	"swap",
 109};
 110
 111static const char * const mem_cgroup_events_names[] = {
 112	"pgpgin",
 113	"pgpgout",
 114	"pgfault",
 115	"pgmajfault",
 116};
 117
 118static const char * const mem_cgroup_lru_names[] = {
 119	"inactive_anon",
 120	"active_anon",
 121	"inactive_file",
 122	"active_file",
 123	"unevictable",
 124};
 125
 126#define THRESHOLDS_EVENTS_TARGET 128
 127#define SOFTLIMIT_EVENTS_TARGET 1024
 128#define NUMAINFO_EVENTS_TARGET	1024
 129
 130/*
 131 * Cgroups above their limits are maintained in a RB-Tree, independent of
 132 * their hierarchy representation
 133 */
 134
 135struct mem_cgroup_tree_per_zone {
 136	struct rb_root rb_root;
 137	spinlock_t lock;
 138};
 139
 140struct mem_cgroup_tree_per_node {
 141	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
 142};
 143
 144struct mem_cgroup_tree {
 145	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 146};
 147
 148static struct mem_cgroup_tree soft_limit_tree __read_mostly;
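/*
 * A per-zone tree is reached by indexing with the node and zone ids:
 * soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; see
 * soft_limit_tree_node_zone() further down.
 */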
 149
 150/* for OOM */
 151struct mem_cgroup_eventfd_list {
 152	struct list_head list;
 153	struct eventfd_ctx *eventfd;
 154};
 155
 156/*
  157 * cgroup_event represents events which userspace wants to receive.
 158 */
 159struct mem_cgroup_event {
 160	/*
 161	 * memcg which the event belongs to.
 162	 */
 163	struct mem_cgroup *memcg;
 164	/*
 165	 * eventfd to signal userspace about the event.
 166	 */
 167	struct eventfd_ctx *eventfd;
 168	/*
  169	 * Each of these is stored in a list by the cgroup.
 170	 */
 171	struct list_head list;
 172	/*
  173	 * register_event() callback will be used to add a new userspace
 174	 * waiter for changes related to this event.  Use eventfd_signal()
 175	 * on eventfd to send notification to userspace.
 176	 */
 177	int (*register_event)(struct mem_cgroup *memcg,
 178			      struct eventfd_ctx *eventfd, const char *args);
 179	/*
 180	 * unregister_event() callback will be called when userspace closes
  181	 * the eventfd or when the cgroup is removed.  This callback must be
  182	 * set if you want to provide notification functionality.
 183	 */
 184	void (*unregister_event)(struct mem_cgroup *memcg,
 185				 struct eventfd_ctx *eventfd);
 186	/*
 187	 * All fields below needed to unregister event when
 188	 * userspace closes eventfd.
 189	 */
 190	poll_table pt;
 191	wait_queue_head_t *wqh;
 192	wait_queue_t wait;
 193	struct work_struct remove;
 194};
 195
 196static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 197static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 198
  199/* Stuff for moving charges at task migration. */
 200/*
 201 * Types of charges to be moved.
 202 */
 203#define MOVE_ANON	0x1U
 204#define MOVE_FILE	0x2U
 205#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
 206
 207/* "mc" and its members are protected by cgroup_mutex */
 208static struct move_charge_struct {
 209	spinlock_t	  lock; /* for from, to */
 210	struct mm_struct  *mm;
 211	struct mem_cgroup *from;
 212	struct mem_cgroup *to;
 213	unsigned long flags;
 214	unsigned long precharge;
 215	unsigned long moved_charge;
 216	unsigned long moved_swap;
 217	struct task_struct *moving_task;	/* a task moving charges */
 218	wait_queue_head_t waitq;		/* a waitq for other context */
 219} mc = {
 220	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 221	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 222};
 223
 224/*
 225 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 226 * limit reclaim to prevent infinite loops, if they ever occur.
 227 */
 228#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 229#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 230
 231enum charge_type {
 232	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 233	MEM_CGROUP_CHARGE_TYPE_ANON,
 234	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
 235	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
 236	NR_CHARGE_TYPE,
 237};
 238
 239/* for encoding cft->private value on file */
 240enum res_type {
 241	_MEM,
 242	_MEMSWAP,
 243	_OOM_TYPE,
 244	_KMEM,
 245	_TCP,
 246};
 247
 248#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 249#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 250#define MEMFILE_ATTR(val)	((val) & 0xffff)
  251/* Used for OOM notifier */
 252#define OOM_CONTROL		(0)
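/*
 * For example, a cft->private of MEMFILE_PRIVATE(_MEMSWAP, OOM_CONTROL)
 * packs the resource type into bits 16..31 and the attribute into bits
 * 0..15, so the file handler can recover them with MEMFILE_TYPE() and
 * MEMFILE_ATTR().
 */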
 253
 254/* Some nice accessors for the vmpressure. */
 255struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 256{
 257	if (!memcg)
 258		memcg = root_mem_cgroup;
 259	return &memcg->vmpressure;
 260}
 261
 262struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 263{
 264	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 265}
 266
 267static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 268{
 269	return (memcg == root_mem_cgroup);
 270}
 271
 272#ifndef CONFIG_SLOB
 273/*
 274 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  275 * The main reason for not using the cgroup id for this:
  276 *  this works better in sparse environments, where we have a lot of memcgs
  277 *  but only a few of them are kmem-limited. If we had, for instance, 200
  278 *  memcgs and only the 200th were kmem-limited, using the cgroup id would
  279 *  require a 200-entry array.
 280 *
 281 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 282 * will double each time we have to increase it.
 283 */
 284static DEFINE_IDA(memcg_cache_ida);
 285int memcg_nr_cache_ids;
 286
 287/* Protects memcg_nr_cache_ids */
 288static DECLARE_RWSEM(memcg_cache_ids_sem);
 289
 290void memcg_get_cache_ids(void)
 291{
 292	down_read(&memcg_cache_ids_sem);
 293}
 294
 295void memcg_put_cache_ids(void)
 296{
 297	up_read(&memcg_cache_ids_sem);
 298}
 299
 300/*
  301 * MIN_SIZE is different from 1, because we would like to avoid going through
 302 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 303 * cgroups is a reasonable guess. In the future, it could be a parameter or
  304 * tunable, but that is not strictly necessary.
 305 *
 306 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 307 * this constant directly from cgroup, but it is understandable that this is
 308 * better kept as an internal representation in cgroup.c. In any case, the
  309 * cgrp_id space is not getting any smaller, and we don't necessarily have to
 310 * increase ours as well if it increases.
 311 */
 312#define MEMCG_CACHES_MIN_SIZE 4
 313#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 314
 315/*
 316 * A lot of the calls to the cache allocation functions are expected to be
 317 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
  318 * conditional on this static branch, we have to allow modules that do
  319 * kmem_cache_alloc and the like to see this symbol as well.
 320 */
 321DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 322EXPORT_SYMBOL(memcg_kmem_enabled_key);
 323
 324#endif /* !CONFIG_SLOB */
 325
 326static struct mem_cgroup_per_zone *
 327mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
 328{
 329	int nid = zone_to_nid(zone);
 330	int zid = zone_idx(zone);
 331
 332	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 333}
 334
 335/**
 336 * mem_cgroup_css_from_page - css of the memcg associated with a page
 337 * @page: page of interest
 338 *
 339 * If memcg is bound to the default hierarchy, css of the memcg associated
 340 * with @page is returned.  The returned css remains associated with @page
 341 * until it is released.
 342 *
 343 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 344 * is returned.
 345 */
 346struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 347{
 348	struct mem_cgroup *memcg;
 349
 350	memcg = page->mem_cgroup;
 351
 352	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 353		memcg = root_mem_cgroup;
 354
 355	return &memcg->css;
 356}
 357
 358/**
 359 * page_cgroup_ino - return inode number of the memcg a page is charged to
 360 * @page: the page
 361 *
 362 * Look up the closest online ancestor of the memory cgroup @page is charged to
 363 * and return its inode number or 0 if @page is not charged to any cgroup. It
 364 * is safe to call this function without holding a reference to @page.
 365 *
 366 * Note, this function is inherently racy, because there is nothing to prevent
 367 * the cgroup inode from getting torn down and potentially reallocated a moment
  368 * after page_cgroup_ino() returns, so it should only be used by callers that
 369 * do not care (such as procfs interfaces).
 370 */
 371ino_t page_cgroup_ino(struct page *page)
 372{
 373	struct mem_cgroup *memcg;
 374	unsigned long ino = 0;
 375
 376	rcu_read_lock();
 377	memcg = READ_ONCE(page->mem_cgroup);
 378	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 379		memcg = parent_mem_cgroup(memcg);
 380	if (memcg)
 381		ino = cgroup_ino(memcg->css.cgroup);
 382	rcu_read_unlock();
 383	return ino;
 384}
 385
 386static struct mem_cgroup_per_zone *
 387mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 388{
 389	int nid = page_to_nid(page);
 390	int zid = page_zonenum(page);
 391
 392	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 393}
 394
 395static struct mem_cgroup_tree_per_zone *
 396soft_limit_tree_node_zone(int nid, int zid)
 397{
 398	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 399}
 400
 401static struct mem_cgroup_tree_per_zone *
 402soft_limit_tree_from_page(struct page *page)
 403{
 404	int nid = page_to_nid(page);
 405	int zid = page_zonenum(page);
 406
 407	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 408}
 409
 410static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
 411					 struct mem_cgroup_tree_per_zone *mctz,
 412					 unsigned long new_usage_in_excess)
 413{
 414	struct rb_node **p = &mctz->rb_root.rb_node;
 415	struct rb_node *parent = NULL;
 416	struct mem_cgroup_per_zone *mz_node;
 417
 418	if (mz->on_tree)
 419		return;
 420
 421	mz->usage_in_excess = new_usage_in_excess;
 422	if (!mz->usage_in_excess)
 423		return;
 424	while (*p) {
 425		parent = *p;
 426		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
 427					tree_node);
 428		if (mz->usage_in_excess < mz_node->usage_in_excess)
 429			p = &(*p)->rb_left;
 430		/*
 431		 * We can't avoid mem cgroups that are over their soft
 432		 * limit by the same amount
 433		 */
 434		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 435			p = &(*p)->rb_right;
 436	}
 437	rb_link_node(&mz->tree_node, parent, p);
 438	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 439	mz->on_tree = true;
 440}
 441
 442static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
 443					 struct mem_cgroup_tree_per_zone *mctz)
 444{
 445	if (!mz->on_tree)
 446		return;
 447	rb_erase(&mz->tree_node, &mctz->rb_root);
 448	mz->on_tree = false;
 449}
 450
 451static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
 452				       struct mem_cgroup_tree_per_zone *mctz)
 453{
 454	unsigned long flags;
 455
 456	spin_lock_irqsave(&mctz->lock, flags);
 457	__mem_cgroup_remove_exceeded(mz, mctz);
 458	spin_unlock_irqrestore(&mctz->lock, flags);
 459}
 460
 461static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 462{
 463	unsigned long nr_pages = page_counter_read(&memcg->memory);
 464	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 465	unsigned long excess = 0;
 466
 467	if (nr_pages > soft_limit)
 468		excess = nr_pages - soft_limit;
 469
 470	return excess;
 471}
 472
 473static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 474{
 475	unsigned long excess;
 476	struct mem_cgroup_per_zone *mz;
 477	struct mem_cgroup_tree_per_zone *mctz;
 478
 479	mctz = soft_limit_tree_from_page(page);
 480	/*
  481	 * Necessary to update all ancestors when hierarchy is used,
 482	 * because their event counter is not touched.
 483	 */
 484	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 485		mz = mem_cgroup_page_zoneinfo(memcg, page);
 486		excess = soft_limit_excess(memcg);
 487		/*
  488		 * We have to update the tree if mz is on the RB-tree or
  489		 * memcg is over its soft limit.
 490		 */
 491		if (excess || mz->on_tree) {
 492			unsigned long flags;
 493
 494			spin_lock_irqsave(&mctz->lock, flags);
 495			/* if on-tree, remove it */
 496			if (mz->on_tree)
 497				__mem_cgroup_remove_exceeded(mz, mctz);
 498			/*
 499			 * Insert again. mz->usage_in_excess will be updated.
 500			 * If excess is 0, no tree ops.
 501			 */
 502			__mem_cgroup_insert_exceeded(mz, mctz, excess);
 503			spin_unlock_irqrestore(&mctz->lock, flags);
 504		}
 505	}
 506}
 507
 508static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 509{
 510	struct mem_cgroup_tree_per_zone *mctz;
 511	struct mem_cgroup_per_zone *mz;
 512	int nid, zid;
 513
 514	for_each_node(nid) {
 515		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 516			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
 517			mctz = soft_limit_tree_node_zone(nid, zid);
 518			mem_cgroup_remove_exceeded(mz, mctz);
 519		}
 520	}
 521}
 522
 523static struct mem_cgroup_per_zone *
 524__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 525{
 526	struct rb_node *rightmost = NULL;
 527	struct mem_cgroup_per_zone *mz;
 528
 529retry:
 530	mz = NULL;
 531	rightmost = rb_last(&mctz->rb_root);
 532	if (!rightmost)
 533		goto done;		/* Nothing to reclaim from */
 534
 535	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
 536	/*
 537	 * Remove the node now but someone else can add it back,
  538	 * we will add it back at the end of reclaim to its correct
 539	 * position in the tree.
 540	 */
 541	__mem_cgroup_remove_exceeded(mz, mctz);
 542	if (!soft_limit_excess(mz->memcg) ||
 543	    !css_tryget_online(&mz->memcg->css))
 544		goto retry;
 545done:
 546	return mz;
 547}
 548
 549static struct mem_cgroup_per_zone *
 550mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 551{
 552	struct mem_cgroup_per_zone *mz;
 553
 554	spin_lock_irq(&mctz->lock);
 555	mz = __mem_cgroup_largest_soft_limit_node(mctz);
 556	spin_unlock_irq(&mctz->lock);
 557	return mz;
 558}
 559
 560/*
  561 * Return page count for single (non-recursive) @memcg.
 562 *
 563 * Implementation Note: reading percpu statistics for memcg.
 564 *
  565 * Both vmstat[] and percpu_counter use thresholds and periodic
  566 * synchronization to implement a "quick" read. There is a trade-off
  567 * between reading cost and precision of the value, so we could likewise
  568 * implement a periodic synchronization of memcg's counters.
  569 *
  570 * But this _read() function is used for the user interface now. The user
  571 * accounts memory usage by memory cgroup and _always_ requires an exact
  572 * value for that accounting. Even if we provided a quick-and-fuzzy read,
  573 * we would still have to visit all online cpus and sum the values. So,
  574 * for now, no extra synchronization is implemented (only for cpu hotplug).
  575 *
  576 * If there are kernel-internal users which can make use of an inexact
  577 * value, and reading all cpu values becomes a performance bottleneck in
  578 * some common workload, thresholds and synchronization as in vmstat[]
  579 * should be implemented.
 580 */
 581static unsigned long
 582mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 583{
 584	long val = 0;
 585	int cpu;
 586
 587	/* Per-cpu values can be negative, use a signed accumulator */
 588	for_each_possible_cpu(cpu)
 589		val += per_cpu(memcg->stat->count[idx], cpu);
 590	/*
 591	 * Summing races with updates, so val may be negative.  Avoid exposing
 592	 * transient negative values.
 593	 */
 594	if (val < 0)
 595		val = 0;
 596	return val;
 597}
 598
 599static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 600					    enum mem_cgroup_events_index idx)
 601{
 602	unsigned long val = 0;
 603	int cpu;
 604
 605	for_each_possible_cpu(cpu)
 606		val += per_cpu(memcg->stat->events[idx], cpu);
 607	return val;
 608}
 609
 610static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 611					 struct page *page,
 612					 bool compound, int nr_pages)
 613{
 614	/*
 615	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 616	 * counted as CACHE even if it's on ANON LRU.
 617	 */
 618	if (PageAnon(page))
 619		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 620				nr_pages);
 621	else
 622		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 623				nr_pages);
 624
 625	if (compound) {
 626		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 627		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 628				nr_pages);
 629	}
 630
 631	/* pagein of a big page is an event. So, ignore page size */
 632	if (nr_pages > 0)
 633		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 634	else {
 635		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 636		nr_pages = -nr_pages; /* for event */
 637	}
 638
 639	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 640}
 641
 642unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 643					   int nid, unsigned int lru_mask)
 644{
 645	unsigned long nr = 0;
 646	int zid;
 647
 648	VM_BUG_ON((unsigned)nid >= nr_node_ids);
 649
 650	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 651		struct mem_cgroup_per_zone *mz;
 652		enum lru_list lru;
 653
 654		for_each_lru(lru) {
 655			if (!(BIT(lru) & lru_mask))
 656				continue;
 657			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
 658			nr += mz->lru_size[lru];
 659		}
 660	}
 661	return nr;
 662}
 663
 664static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 665			unsigned int lru_mask)
 666{
 667	unsigned long nr = 0;
 668	int nid;
 669
 670	for_each_node_state(nid, N_MEMORY)
 671		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 672	return nr;
 673}
 674
 675static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 676				       enum mem_cgroup_events_target target)
 677{
 678	unsigned long val, next;
 679
 680	val = __this_cpu_read(memcg->stat->nr_page_events);
 681	next = __this_cpu_read(memcg->stat->targets[target]);
 682	/* from time_after() in jiffies.h */
 683	if ((long)next - (long)val < 0) {
 684		switch (target) {
 685		case MEM_CGROUP_TARGET_THRESH:
 686			next = val + THRESHOLDS_EVENTS_TARGET;
 687			break;
 688		case MEM_CGROUP_TARGET_SOFTLIMIT:
 689			next = val + SOFTLIMIT_EVENTS_TARGET;
 690			break;
 691		case MEM_CGROUP_TARGET_NUMAINFO:
 692			next = val + NUMAINFO_EVENTS_TARGET;
 693			break;
 694		default:
 695			break;
 696		}
 697		__this_cpu_write(memcg->stat->targets[target], next);
 698		return true;
 699	}
 700	return false;
 701}
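/*
 * With the targets above, mem_cgroup_threshold() is run roughly every
 * THRESHOLDS_EVENTS_TARGET (128) page events per cpu, while the soft
 * limit tree update and the NUMA info event bump happen roughly once
 * every 1024 page events per cpu; see memcg_check_events() below.
 */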
 702
 703/*
 704 * Check events in order.
 705 *
 706 */
 707static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 708{
 709	/* threshold event is triggered in finer grain than soft limit */
 710	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 711						MEM_CGROUP_TARGET_THRESH))) {
 712		bool do_softlimit;
 713		bool do_numainfo __maybe_unused;
 714
 715		do_softlimit = mem_cgroup_event_ratelimit(memcg,
 716						MEM_CGROUP_TARGET_SOFTLIMIT);
 717#if MAX_NUMNODES > 1
 718		do_numainfo = mem_cgroup_event_ratelimit(memcg,
 719						MEM_CGROUP_TARGET_NUMAINFO);
 720#endif
 721		mem_cgroup_threshold(memcg);
 722		if (unlikely(do_softlimit))
 723			mem_cgroup_update_tree(memcg, page);
 724#if MAX_NUMNODES > 1
 725		if (unlikely(do_numainfo))
 726			atomic_inc(&memcg->numainfo_events);
 727#endif
 728	}
 729}
 730
 731struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 732{
 733	/*
 734	 * mm_update_next_owner() may clear mm->owner to NULL
 735	 * if it races with swapoff, page migration, etc.
 736	 * So this can be called with p == NULL.
 737	 */
 738	if (unlikely(!p))
 739		return NULL;
 740
 741	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 742}
 743EXPORT_SYMBOL(mem_cgroup_from_task);
 744
 745static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 746{
 747	struct mem_cgroup *memcg = NULL;
 748
 749	rcu_read_lock();
 750	do {
 751		/*
  752		 * Page cache insertions can happen without an
 753		 * actual mm context, e.g. during disk probing
 754		 * on boot, loopback IO, acct() writes etc.
 755		 */
 756		if (unlikely(!mm))
 757			memcg = root_mem_cgroup;
 758		else {
 759			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 760			if (unlikely(!memcg))
 761				memcg = root_mem_cgroup;
 762		}
 763	} while (!css_tryget_online(&memcg->css));
 764	rcu_read_unlock();
 765	return memcg;
 766}
 767
 768/**
 769 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 770 * @root: hierarchy root
 771 * @prev: previously returned memcg, NULL on first invocation
 772 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 773 *
 774 * Returns references to children of the hierarchy below @root, or
 775 * @root itself, or %NULL after a full round-trip.
 776 *
 777 * Caller must pass the return value in @prev on subsequent
 778 * invocations for reference counting, or use mem_cgroup_iter_break()
 779 * to cancel a hierarchy walk before the round-trip is complete.
 780 *
 781 * Reclaimers can specify a zone and a priority level in @reclaim to
 782 * divide up the memcgs in the hierarchy among all concurrent
 783 * reclaimers operating on the same zone and priority.
 784 */
 785struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 786				   struct mem_cgroup *prev,
 787				   struct mem_cgroup_reclaim_cookie *reclaim)
 788{
 789	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 790	struct cgroup_subsys_state *css = NULL;
 791	struct mem_cgroup *memcg = NULL;
 792	struct mem_cgroup *pos = NULL;
 793
 794	if (mem_cgroup_disabled())
 795		return NULL;
 796
 797	if (!root)
 798		root = root_mem_cgroup;
 799
 800	if (prev && !reclaim)
 801		pos = prev;
 802
 803	if (!root->use_hierarchy && root != root_mem_cgroup) {
 804		if (prev)
 805			goto out;
 806		return root;
 807	}
 808
 809	rcu_read_lock();
 810
 811	if (reclaim) {
 812		struct mem_cgroup_per_zone *mz;
 813
 814		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
 815		iter = &mz->iter[reclaim->priority];
 816
 817		if (prev && reclaim->generation != iter->generation)
 818			goto out_unlock;
 819
 820		while (1) {
 821			pos = READ_ONCE(iter->position);
 822			if (!pos || css_tryget(&pos->css))
 823				break;
 824			/*
 825			 * css reference reached zero, so iter->position will
 826			 * be cleared by ->css_released. However, we should not
 827			 * rely on this happening soon, because ->css_released
 828			 * is called from a work queue, and by busy-waiting we
 829			 * might block it. So we clear iter->position right
 830			 * away.
 831			 */
 832			(void)cmpxchg(&iter->position, pos, NULL);
 833		}
 834	}
 835
 836	if (pos)
 837		css = &pos->css;
 838
 839	for (;;) {
 840		css = css_next_descendant_pre(css, &root->css);
 841		if (!css) {
 842			/*
 843			 * Reclaimers share the hierarchy walk, and a
 844			 * new one might jump in right at the end of
 845			 * the hierarchy - make sure they see at least
 846			 * one group and restart from the beginning.
 847			 */
 848			if (!prev)
 849				continue;
 850			break;
 851		}
 852
 853		/*
 854		 * Verify the css and acquire a reference.  The root
 855		 * is provided by the caller, so we know it's alive
 856		 * and kicking, and don't take an extra reference.
 857		 */
 858		memcg = mem_cgroup_from_css(css);
 859
 860		if (css == &root->css)
 861			break;
 862
 863		if (css_tryget(css))
 864			break;
 865
 866		memcg = NULL;
 867	}
 868
 869	if (reclaim) {
 870		/*
 871		 * The position could have already been updated by a competing
 872		 * thread, so check that the value hasn't changed since we read
 873		 * it to avoid reclaiming from the same cgroup twice.
 874		 */
 875		(void)cmpxchg(&iter->position, pos, memcg);
 876
 877		if (pos)
 878			css_put(&pos->css);
 879
 880		if (!memcg)
 881			iter->generation++;
 882		else if (!prev)
 883			reclaim->generation = iter->generation;
 884	}
 885
 886out_unlock:
 887	rcu_read_unlock();
 888out:
 889	if (prev && prev != root)
 890		css_put(&prev->css);
 891
 892	return memcg;
 893}
 894
 895/**
 896 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 897 * @root: hierarchy root
 898 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 899 */
 900void mem_cgroup_iter_break(struct mem_cgroup *root,
 901			   struct mem_cgroup *prev)
 902{
 903	if (!root)
 904		root = root_mem_cgroup;
 905	if (prev && prev != root)
 906		css_put(&prev->css);
 907}
 908
 909static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 910{
 911	struct mem_cgroup *memcg = dead_memcg;
 912	struct mem_cgroup_reclaim_iter *iter;
 913	struct mem_cgroup_per_zone *mz;
 914	int nid, zid;
 915	int i;
 916
 917	while ((memcg = parent_mem_cgroup(memcg))) {
 918		for_each_node(nid) {
 919			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 920				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
 921				for (i = 0; i <= DEF_PRIORITY; i++) {
 922					iter = &mz->iter[i];
 923					cmpxchg(&iter->position,
 924						dead_memcg, NULL);
 925				}
 926			}
 927		}
 928	}
 929}
 930
 931/*
 932 * Iteration constructs for visiting all cgroups (under a tree).  If
 933 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 934 * be used for reference counting.
 935 */
 936#define for_each_mem_cgroup_tree(iter, root)		\
 937	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
 938	     iter != NULL;				\
 939	     iter = mem_cgroup_iter(root, iter, NULL))
 940
 941#define for_each_mem_cgroup(iter)			\
 942	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
 943	     iter != NULL;				\
 944	     iter = mem_cgroup_iter(NULL, iter, NULL))
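/*
 * A typical walk looks like this (should_stop() stands in for whatever
 * condition ends the walk early):
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, memcg) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(memcg, iter);
 *			break;
 *		}
 *	}
 */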
 945
 946/**
 947 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 948 * @zone: zone of the wanted lruvec
 949 * @memcg: memcg of the wanted lruvec
 950 *
 951 * Returns the lru list vector holding pages for the given @zone and
  952 * @memcg.  This can be the global zone lruvec, if the memory controller
 953 * is disabled.
 954 */
 955struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 956				      struct mem_cgroup *memcg)
 957{
 958	struct mem_cgroup_per_zone *mz;
 959	struct lruvec *lruvec;
 960
 961	if (mem_cgroup_disabled()) {
 962		lruvec = &zone->lruvec;
 963		goto out;
 964	}
 965
 966	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
 967	lruvec = &mz->lruvec;
 968out:
 969	/*
 970	 * Since a node can be onlined after the mem_cgroup was created,
 971	 * we have to be prepared to initialize lruvec->zone here;
 972	 * and if offlined then reonlined, we need to reinitialize it.
 973	 */
 974	if (unlikely(lruvec->zone != zone))
 975		lruvec->zone = zone;
 976	return lruvec;
 977}
 978
 979/**
 980 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 981 * @page: the page
 982 * @zone: zone of the page
 983 *
 984 * This function is only safe when following the LRU page isolation
 985 * and putback protocol: the LRU lock must be held, and the page must
 986 * either be PageLRU() or the caller must have isolated/allocated it.
 987 */
 988struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 989{
 990	struct mem_cgroup_per_zone *mz;
 991	struct mem_cgroup *memcg;
 992	struct lruvec *lruvec;
 993
 994	if (mem_cgroup_disabled()) {
 995		lruvec = &zone->lruvec;
 996		goto out;
 997	}
 998
 999	memcg = page->mem_cgroup;
1000	/*
1001	 * Swapcache readahead pages are added to the LRU - and
1002	 * possibly migrated - before they are charged.
1003	 */
1004	if (!memcg)
1005		memcg = root_mem_cgroup;
1006
1007	mz = mem_cgroup_page_zoneinfo(memcg, page);
1008	lruvec = &mz->lruvec;
1009out:
1010	/*
1011	 * Since a node can be onlined after the mem_cgroup was created,
1012	 * we have to be prepared to initialize lruvec->zone here;
1013	 * and if offlined then reonlined, we need to reinitialize it.
1014	 */
1015	if (unlikely(lruvec->zone != zone))
1016		lruvec->zone = zone;
1017	return lruvec;
1018}
1019
1020/**
1021 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1022 * @lruvec: mem_cgroup per zone lru vector
1023 * @lru: index of lru list the page is sitting on
1024 * @nr_pages: positive when adding or negative when removing
1025 *
1026 * This function must be called when a page is added to or removed from an
1027 * lru list.
1028 */
1029void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1030				int nr_pages)
1031{
1032	struct mem_cgroup_per_zone *mz;
1033	unsigned long *lru_size;
1034
1035	if (mem_cgroup_disabled())
1036		return;
1037
1038	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1039	lru_size = mz->lru_size + lru;
1040	*lru_size += nr_pages;
1041	VM_BUG_ON((long)(*lru_size) < 0);
1042}
1043
1044bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1045{
1046	struct mem_cgroup *task_memcg;
1047	struct task_struct *p;
1048	bool ret;
1049
1050	p = find_lock_task_mm(task);
1051	if (p) {
1052		task_memcg = get_mem_cgroup_from_mm(p->mm);
1053		task_unlock(p);
1054	} else {
1055		/*
1056		 * All threads may have already detached their mm's, but the oom
1057		 * killer still needs to detect if they have already been oom
1058		 * killed to prevent needlessly killing additional tasks.
1059		 */
1060		rcu_read_lock();
1061		task_memcg = mem_cgroup_from_task(task);
1062		css_get(&task_memcg->css);
1063		rcu_read_unlock();
1064	}
1065	ret = mem_cgroup_is_descendant(task_memcg, memcg);
1066	css_put(&task_memcg->css);
1067	return ret;
1068}
1069
1070/**
1071 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1072 * @memcg: the memory cgroup
1073 *
 1074 * Returns the maximum amount of memory @memcg can be charged with, in
1075 * pages.
1076 */
1077static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1078{
1079	unsigned long margin = 0;
1080	unsigned long count;
1081	unsigned long limit;
1082
1083	count = page_counter_read(&memcg->memory);
1084	limit = READ_ONCE(memcg->memory.limit);
1085	if (count < limit)
1086		margin = limit - count;
1087
1088	if (do_memsw_account()) {
1089		count = page_counter_read(&memcg->memsw);
1090		limit = READ_ONCE(memcg->memsw.limit);
1091		if (count <= limit)
1092			margin = min(margin, limit - count);
1093	}
1094
1095	return margin;
1096}
1097
1098/*
 1099 * A routine for checking whether "memcg" is under move_account() or not.
 1100 *
 1101 * Checks whether a cgroup is mc.from, mc.to, or under the hierarchy of
 1102 * the moving cgroups. This is used for waiting at high memory pressure
 1103 * caused by "move".
1104 */
1105static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1106{
1107	struct mem_cgroup *from;
1108	struct mem_cgroup *to;
1109	bool ret = false;
1110	/*
1111	 * Unlike task_move routines, we access mc.to, mc.from not under
1112	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1113	 */
1114	spin_lock(&mc.lock);
1115	from = mc.from;
1116	to = mc.to;
1117	if (!from)
1118		goto unlock;
1119
1120	ret = mem_cgroup_is_descendant(from, memcg) ||
1121		mem_cgroup_is_descendant(to, memcg);
1122unlock:
1123	spin_unlock(&mc.lock);
1124	return ret;
1125}
1126
1127static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1128{
1129	if (mc.moving_task && current != mc.moving_task) {
1130		if (mem_cgroup_under_move(memcg)) {
1131			DEFINE_WAIT(wait);
1132			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1133			/* moving charge context might have finished. */
1134			if (mc.moving_task)
1135				schedule();
1136			finish_wait(&mc.waitq, &wait);
1137			return true;
1138		}
1139	}
1140	return false;
1141}
1142
1143#define K(x) ((x) << (PAGE_SHIFT-10))
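/* K(x) converts a number of pages into kilobytes for the messages below. */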
1144/**
1145 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1146 * @memcg: The memory cgroup that went over limit
1147 * @p: Task that is going to be killed
1148 *
1149 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 1150 * enabled.
1151 */
1152void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1153{
1154	struct mem_cgroup *iter;
1155	unsigned int i;
1156
1157	rcu_read_lock();
1158
1159	if (p) {
1160		pr_info("Task in ");
1161		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1162		pr_cont(" killed as a result of limit of ");
1163	} else {
1164		pr_info("Memory limit reached of cgroup ");
1165	}
1166
1167	pr_cont_cgroup_path(memcg->css.cgroup);
1168	pr_cont("\n");
1169
1170	rcu_read_unlock();
1171
1172	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1173		K((u64)page_counter_read(&memcg->memory)),
1174		K((u64)memcg->memory.limit), memcg->memory.failcnt);
1175	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1176		K((u64)page_counter_read(&memcg->memsw)),
1177		K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1178	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1179		K((u64)page_counter_read(&memcg->kmem)),
1180		K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1181
1182	for_each_mem_cgroup_tree(iter, memcg) {
1183		pr_info("Memory cgroup stats for ");
1184		pr_cont_cgroup_path(iter->css.cgroup);
1185		pr_cont(":");
1186
1187		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1188			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1189				continue;
1190			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
1191				K(mem_cgroup_read_stat(iter, i)));
1192		}
1193
1194		for (i = 0; i < NR_LRU_LISTS; i++)
1195			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1196				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1197
1198		pr_cont("\n");
1199	}
1200}
1201
1202/*
 1203 * This function returns the number of memcgs in the hierarchy tree. Returns
 1204 * 1 (self count) if there are no children.
1205 */
1206static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1207{
1208	int num = 0;
1209	struct mem_cgroup *iter;
1210
1211	for_each_mem_cgroup_tree(iter, memcg)
1212		num++;
1213	return num;
1214}
1215
1216/*
1217 * Return the memory (and swap, if configured) limit for a memcg.
1218 */
1219static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1220{
1221	unsigned long limit;
1222
1223	limit = memcg->memory.limit;
1224	if (mem_cgroup_swappiness(memcg)) {
1225		unsigned long memsw_limit;
1226		unsigned long swap_limit;
1227
1228		memsw_limit = memcg->memsw.limit;
1229		swap_limit = memcg->swap.limit;
1230		swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
1231		limit = min(limit + swap_limit, memsw_limit);
1232	}
1233	return limit;
1234}
1235
1236static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1237				     int order)
1238{
1239	struct oom_control oc = {
1240		.zonelist = NULL,
1241		.nodemask = NULL,
1242		.gfp_mask = gfp_mask,
1243		.order = order,
1244	};
1245	struct mem_cgroup *iter;
1246	unsigned long chosen_points = 0;
1247	unsigned long totalpages;
1248	unsigned int points = 0;
1249	struct task_struct *chosen = NULL;
1250
1251	mutex_lock(&oom_lock);
1252
1253	/*
1254	 * If current has a pending SIGKILL or is exiting, then automatically
1255	 * select it.  The goal is to allow it to allocate so that it may
1256	 * quickly exit and free its memory.
1257	 */
1258	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1259		mark_oom_victim(current);
1260		goto unlock;
1261	}
1262
1263	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
1264	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1265	for_each_mem_cgroup_tree(iter, memcg) {
1266		struct css_task_iter it;
1267		struct task_struct *task;
1268
1269		css_task_iter_start(&iter->css, &it);
1270		while ((task = css_task_iter_next(&it))) {
1271			switch (oom_scan_process_thread(&oc, task, totalpages)) {
1272			case OOM_SCAN_SELECT:
1273				if (chosen)
1274					put_task_struct(chosen);
1275				chosen = task;
1276				chosen_points = ULONG_MAX;
1277				get_task_struct(chosen);
1278				/* fall through */
1279			case OOM_SCAN_CONTINUE:
1280				continue;
1281			case OOM_SCAN_ABORT:
1282				css_task_iter_end(&it);
1283				mem_cgroup_iter_break(memcg, iter);
1284				if (chosen)
1285					put_task_struct(chosen);
1286				goto unlock;
1287			case OOM_SCAN_OK:
1288				break;
1289			};
1290			points = oom_badness(task, memcg, NULL, totalpages);
1291			if (!points || points < chosen_points)
1292				continue;
1293			/* Prefer thread group leaders for display purposes */
1294			if (points == chosen_points &&
1295			    thread_group_leader(chosen))
1296				continue;
1297
1298			if (chosen)
1299				put_task_struct(chosen);
1300			chosen = task;
1301			chosen_points = points;
1302			get_task_struct(chosen);
1303		}
1304		css_task_iter_end(&it);
1305	}
1306
1307	if (chosen) {
1308		points = chosen_points * 1000 / totalpages;
1309		oom_kill_process(&oc, chosen, points, totalpages, memcg,
1310				 "Memory cgroup out of memory");
1311	}
1312unlock:
1313	mutex_unlock(&oom_lock);
1314	return chosen;
1315}
1316
1317#if MAX_NUMNODES > 1
1318
1319/**
1320 * test_mem_cgroup_node_reclaimable
1321 * @memcg: the target memcg
1322 * @nid: the node ID to be checked.
 1323 * @noswap: specify true here if the user wants file-only information.
1324 *
1325 * This function returns whether the specified memcg contains any
1326 * reclaimable pages on a node. Returns true if there are any reclaimable
1327 * pages in the node.
1328 */
1329static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1330		int nid, bool noswap)
1331{
1332	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1333		return true;
1334	if (noswap || !total_swap_pages)
1335		return false;
1336	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1337		return true;
1338	return false;
1339
1340}
1341
1342/*
1343 * Always updating the nodemask is not very good - even if we have an empty
1344 * list or the wrong list here, we can start from some node and traverse all
1345 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1346 *
1347 */
1348static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1349{
1350	int nid;
1351	/*
1352	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1353	 * pagein/pageout changes since the last update.
1354	 */
1355	if (!atomic_read(&memcg->numainfo_events))
1356		return;
1357	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1358		return;
1359
1360	/* make a nodemask where this memcg uses memory from */
1361	memcg->scan_nodes = node_states[N_MEMORY];
1362
1363	for_each_node_mask(nid, node_states[N_MEMORY]) {
1364
1365		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1366			node_clear(nid, memcg->scan_nodes);
1367	}
1368
1369	atomic_set(&memcg->numainfo_events, 0);
1370	atomic_set(&memcg->numainfo_updating, 0);
1371}
1372
1373/*
 1374 * Select a node to start reclaim from. Because all we need is to reduce
 1375 * the usage counter, starting from anywhere is OK. Reclaiming from the
 1376 * current node has both pros and cons:
 1377 *
 1378 * Freeing memory from the current node means freeing memory from a node
 1379 * which we'll use or have used, so it may hurt that node's LRU. And if
 1380 * several threads hit their limits, they will contend on that node. But
 1381 * freeing from a remote node costs more because of memory latency.
 1382 *
 1383 * For now we use round-robin. A better algorithm is welcome.
1384 */
1385int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1386{
1387	int node;
1388
1389	mem_cgroup_may_update_nodemask(memcg);
1390	node = memcg->last_scanned_node;
1391
1392	node = next_node(node, memcg->scan_nodes);
1393	if (node == MAX_NUMNODES)
1394		node = first_node(memcg->scan_nodes);
1395	/*
 1396	 * We call this when we hit the limit, not when pages are added to the
 1397	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE, or the
 1398	 * memcg is too small and its pages are not on any LRU. In that case,
 1399	 * we use the current node.
1400	 */
1401	if (unlikely(node == MAX_NUMNODES))
1402		node = numa_node_id();
1403
1404	memcg->last_scanned_node = node;
1405	return node;
1406}
1407#else
1408int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1409{
1410	return 0;
1411}
1412#endif
1413
1414static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1415				   struct zone *zone,
1416				   gfp_t gfp_mask,
1417				   unsigned long *total_scanned)
1418{
1419	struct mem_cgroup *victim = NULL;
1420	int total = 0;
1421	int loop = 0;
1422	unsigned long excess;
1423	unsigned long nr_scanned;
1424	struct mem_cgroup_reclaim_cookie reclaim = {
1425		.zone = zone,
1426		.priority = 0,
1427	};
1428
1429	excess = soft_limit_excess(root_memcg);
1430
1431	while (1) {
1432		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1433		if (!victim) {
1434			loop++;
1435			if (loop >= 2) {
1436				/*
1437				 * If we have not been able to reclaim
 1438				 * anything, it might be because there are
1439				 * no reclaimable pages under this hierarchy
1440				 */
1441				if (!total)
1442					break;
1443				/*
1444				 * We want to do more targeted reclaim.
 1445				 * excess >> 2 is not too excessive, so we
 1446				 * neither reclaim too much nor keep coming
 1447				 * back to reclaim from this cgroup
1448				 */
1449				if (total >= (excess >> 2) ||
1450					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1451					break;
1452			}
1453			continue;
1454		}
1455		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1456						     zone, &nr_scanned);
1457		*total_scanned += nr_scanned;
1458		if (!soft_limit_excess(root_memcg))
1459			break;
1460	}
1461	mem_cgroup_iter_break(root_memcg, victim);
1462	return total;
1463}
1464
1465#ifdef CONFIG_LOCKDEP
1466static struct lockdep_map memcg_oom_lock_dep_map = {
1467	.name = "memcg_oom_lock",
1468};
1469#endif
1470
1471static DEFINE_SPINLOCK(memcg_oom_lock);
1472
1473/*
 1474 * Check whether the OOM killer is already running under our hierarchy.
 1475 * If someone is already running it, return false.
1476 */
1477static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1478{
1479	struct mem_cgroup *iter, *failed = NULL;
1480
1481	spin_lock(&memcg_oom_lock);
1482
1483	for_each_mem_cgroup_tree(iter, memcg) {
1484		if (iter->oom_lock) {
1485			/*
 1486			 * This subtree of our hierarchy is already locked,
 1487			 * so we cannot take the lock.
1488			 */
1489			failed = iter;
1490			mem_cgroup_iter_break(memcg, iter);
1491			break;
1492		} else
1493			iter->oom_lock = true;
1494	}
1495
1496	if (failed) {
1497		/*
1498		 * OK, we failed to lock the whole subtree so we have
 1499		 * to clean up what we set up, up to the failing subtree
1500		 */
1501		for_each_mem_cgroup_tree(iter, memcg) {
1502			if (iter == failed) {
1503				mem_cgroup_iter_break(memcg, iter);
1504				break;
1505			}
1506			iter->oom_lock = false;
1507		}
1508	} else
1509		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1510
1511	spin_unlock(&memcg_oom_lock);
1512
1513	return !failed;
1514}
1515
1516static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1517{
1518	struct mem_cgroup *iter;
1519
1520	spin_lock(&memcg_oom_lock);
1521	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1522	for_each_mem_cgroup_tree(iter, memcg)
1523		iter->oom_lock = false;
1524	spin_unlock(&memcg_oom_lock);
1525}
1526
1527static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1528{
1529	struct mem_cgroup *iter;
1530
1531	spin_lock(&memcg_oom_lock);
1532	for_each_mem_cgroup_tree(iter, memcg)
1533		iter->under_oom++;
1534	spin_unlock(&memcg_oom_lock);
1535}
1536
1537static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1538{
1539	struct mem_cgroup *iter;
1540
1541	/*
1542	 * When a new child is created while the hierarchy is under oom,
1543	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1544	 */
1545	spin_lock(&memcg_oom_lock);
1546	for_each_mem_cgroup_tree(iter, memcg)
1547		if (iter->under_oom > 0)
1548			iter->under_oom--;
1549	spin_unlock(&memcg_oom_lock);
1550}
1551
1552static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1553
1554struct oom_wait_info {
1555	struct mem_cgroup *memcg;
1556	wait_queue_t	wait;
1557};
1558
1559static int memcg_oom_wake_function(wait_queue_t *wait,
1560	unsigned mode, int sync, void *arg)
1561{
1562	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1563	struct mem_cgroup *oom_wait_memcg;
1564	struct oom_wait_info *oom_wait_info;
1565
1566	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1567	oom_wait_memcg = oom_wait_info->memcg;
1568
1569	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1570	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1571		return 0;
1572	return autoremove_wake_function(wait, mode, sync, arg);
1573}
1574
1575static void memcg_oom_recover(struct mem_cgroup *memcg)
1576{
1577	/*
1578	 * For the following lockless ->under_oom test, the only required
1579	 * guarantee is that it must see the state asserted by an OOM when
1580	 * this function is called as a result of userland actions
1581	 * triggered by the notification of the OOM.  This is trivially
1582	 * achieved by invoking mem_cgroup_mark_under_oom() before
1583	 * triggering notification.
1584	 */
1585	if (memcg && memcg->under_oom)
1586		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1587}
1588
1589static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1590{
1591	if (!current->memcg_may_oom)
1592		return;
1593	/*
1594	 * We are in the middle of the charge context here, so we
1595	 * don't want to block when potentially sitting on a callstack
1596	 * that holds all kinds of filesystem and mm locks.
1597	 *
1598	 * Also, the caller may handle a failed allocation gracefully
1599	 * (like optional page cache readahead) and so an OOM killer
1600	 * invocation might not even be necessary.
1601	 *
1602	 * That's why we don't do anything here except remember the
1603	 * OOM context and then deal with it at the end of the page
1604	 * fault when the stack is unwound, the locks are released,
1605	 * and when we know whether the fault was overall successful.
1606	 */
1607	css_get(&memcg->css);
1608	current->memcg_in_oom = memcg;
1609	current->memcg_oom_gfp_mask = mask;
1610	current->memcg_oom_order = order;
1611}
1612
1613/**
1614 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1615 * @handle: actually kill/wait or just clean up the OOM state
1616 *
1617 * This has to be called at the end of a page fault if the memcg OOM
1618 * handler was enabled.
1619 *
1620 * Memcg supports userspace OOM handling where failed allocations must
1621 * sleep on a waitqueue until the userspace task resolves the
1622 * situation.  Sleeping directly in the charge context with all kinds
1623 * of locks held is not a good idea, instead we remember an OOM state
1624 * in the task and mem_cgroup_oom_synchronize() has to be called at
1625 * the end of the page fault to complete the OOM handling.
1626 *
1627 * Returns %true if an ongoing memcg OOM situation was detected and
1628 * completed, %false otherwise.
1629 */
1630bool mem_cgroup_oom_synchronize(bool handle)
1631{
1632	struct mem_cgroup *memcg = current->memcg_in_oom;
1633	struct oom_wait_info owait;
1634	bool locked;
1635
1636	/* OOM is global, do not handle */
1637	if (!memcg)
1638		return false;
1639
1640	if (!handle || oom_killer_disabled)
1641		goto cleanup;
1642
1643	owait.memcg = memcg;
1644	owait.wait.flags = 0;
1645	owait.wait.func = memcg_oom_wake_function;
1646	owait.wait.private = current;
1647	INIT_LIST_HEAD(&owait.wait.task_list);
1648
1649	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1650	mem_cgroup_mark_under_oom(memcg);
1651
1652	locked = mem_cgroup_oom_trylock(memcg);
1653
1654	if (locked)
1655		mem_cgroup_oom_notify(memcg);
1656
1657	if (locked && !memcg->oom_kill_disable) {
1658		mem_cgroup_unmark_under_oom(memcg);
1659		finish_wait(&memcg_oom_waitq, &owait.wait);
1660		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1661					 current->memcg_oom_order);
1662	} else {
1663		schedule();
1664		mem_cgroup_unmark_under_oom(memcg);
1665		finish_wait(&memcg_oom_waitq, &owait.wait);
1666	}
1667
1668	if (locked) {
1669		mem_cgroup_oom_unlock(memcg);
1670		/*
1671		 * There is no guarantee that an OOM-lock contender
1672		 * sees the wakeups triggered by the OOM kill
 1673		 * uncharges.  Wake any sleepers explicitly.
1674		 */
1675		memcg_oom_recover(memcg);
1676	}
1677cleanup:
1678	current->memcg_in_oom = NULL;
1679	css_put(&memcg->css);
1680	return true;
1681}
1682
1683/**
1684 * lock_page_memcg - lock a page->mem_cgroup binding
1685 * @page: the page
1686 *
1687 * This function protects unlocked LRU pages from being moved to
1688 * another cgroup and stabilizes their page->mem_cgroup binding.
1689 */
1690void lock_page_memcg(struct page *page)
1691{
1692	struct mem_cgroup *memcg;
1693	unsigned long flags;
1694
1695	/*
1696	 * The RCU lock is held throughout the transaction.  The fast
1697	 * path can get away without acquiring the memcg->move_lock
1698	 * because page moving starts with an RCU grace period.
1699	 */
1700	rcu_read_lock();
1701
1702	if (mem_cgroup_disabled())
1703		return;
1704again:
1705	memcg = page->mem_cgroup;
1706	if (unlikely(!memcg))
1707		return;
1708
1709	if (atomic_read(&memcg->moving_account) <= 0)
1710		return;
1711
1712	spin_lock_irqsave(&memcg->move_lock, flags);
1713	if (memcg != page->mem_cgroup) {
1714		spin_unlock_irqrestore(&memcg->move_lock, flags);
1715		goto again;
1716	}
1717
1718	/*
1719	 * When charge migration first begins, we can have locked and
1720	 * unlocked page stat updates happening concurrently.  Track
1721	 * the task who has the lock for unlock_page_memcg().
1722	 */
1723	memcg->move_lock_task = current;
1724	memcg->move_lock_flags = flags;
1725
1726	return;
1727}
1728EXPORT_SYMBOL(lock_page_memcg);
1729
1730/**
1731 * unlock_page_memcg - unlock a page->mem_cgroup binding
1732 * @page: the page
1733 */
1734void unlock_page_memcg(struct page *page)
1735{
1736	struct mem_cgroup *memcg = page->mem_cgroup;
1737
1738	if (memcg && memcg->move_lock_task == current) {
1739		unsigned long flags = memcg->move_lock_flags;
1740
1741		memcg->move_lock_task = NULL;
1742		memcg->move_lock_flags = 0;
1743
1744		spin_unlock_irqrestore(&memcg->move_lock, flags);
1745	}
1746
1747	rcu_read_unlock();
1748}
1749EXPORT_SYMBOL(unlock_page_memcg);
1750
1751/*
1752 * size of first charge trial. "32" comes from vmscan.c's magic value.
 1753 * TODO: it may be necessary to use bigger numbers on big iron.
1754 */
1755#define CHARGE_BATCH	32U
1756struct memcg_stock_pcp {
 1757	struct mem_cgroup *cached; /* this is never the root cgroup */
1758	unsigned int nr_pages;
1759	struct work_struct work;
1760	unsigned long flags;
1761#define FLUSHING_CACHED_CHARGE	0
1762};
1763static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1764static DEFINE_MUTEX(percpu_charge_mutex);
1765
1766/**
1767 * consume_stock: Try to consume stocked charge on this cpu.
1768 * @memcg: memcg to consume from.
1769 * @nr_pages: how many pages to charge.
1770 *
1771 * The charges will only happen if @memcg matches the current cpu's memcg
1772 * stock, and at least @nr_pages are available in that stock.  Failure to
1773 * service an allocation will refill the stock.
1774 *
1775 * returns true if successful, false otherwise.
1776 */
1777static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1778{
1779	struct memcg_stock_pcp *stock;
1780	bool ret = false;
1781
1782	if (nr_pages > CHARGE_BATCH)
1783		return ret;
1784
1785	stock = &get_cpu_var(memcg_stock);
1786	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
1787		stock->nr_pages -= nr_pages;
1788		ret = true;
1789	}
1790	put_cpu_var(memcg_stock);
1791	return ret;
1792}
1793
1794/*
 1795 * Return the stock cached in the percpu area and reset the cached information.
1796 */
1797static void drain_stock(struct memcg_stock_pcp *stock)
1798{
1799	struct mem_cgroup *old = stock->cached;
1800
1801	if (stock->nr_pages) {
1802		page_counter_uncharge(&old->memory, stock->nr_pages);
1803		if (do_memsw_account())
1804			page_counter_uncharge(&old->memsw, stock->nr_pages);
1805		css_put_many(&old->css, stock->nr_pages);
1806		stock->nr_pages = 0;
1807	}
 
 
1808	stock->cached = NULL;
1809}
1810
1811/*
1812 * This must be called with preemption disabled, or from a thread
1813 * that is pinned to the local cpu.
1814 */
1815static void drain_local_stock(struct work_struct *dummy)
1816{
1817	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
1818	drain_stock(stock);
1819	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
 
1820}
1821
1822/*
1823 * Cache charges (nr_pages) in the local per-CPU area.
1824 * They will be consumed by consume_stock() later.
1825 */
1826static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1827{
1828	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
 
1829
1830	if (stock->cached != memcg) { /* reset if necessary */
1831		drain_stock(stock);
 
1832		stock->cached = memcg;
1833	}
1834	stock->nr_pages += nr_pages;
1835	put_cpu_var(memcg_stock);
1836}
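/*
 * Illustrative sketch, not part of the original file: a simplified outline
 * of how a charge path is meant to combine the helpers above (try_charge()
 * below is the real consumer).  Assuming nr_pages <= CHARGE_BATCH, and with
 * memsw and css reference handling omitted for brevity:
 *
 *	if (consume_stock(memcg, nr_pages))
 *		return 0;	// fast path, no page counters touched
 *	if (page_counter_try_charge(&memcg->memory, CHARGE_BATCH, &counter)) {
 *		refill_stock(memcg, CHARGE_BATCH - nr_pages);	// cache the rest
 *		return 0;
 *	}
 *	// otherwise fall back to reclaim or a smaller batch
 */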
1837
1838/*
1839 * Drains all per-CPU charge caches for given root_memcg resp. subtree
1840 * of the hierarchy under it.
1841 */
1842static void drain_all_stock(struct mem_cgroup *root_memcg)
1843{
1844	int cpu, curcpu;
1845
1846	/* If someone's already draining, avoid running more workers. */
1847	if (!mutex_trylock(&percpu_charge_mutex))
1848		return;
1849	/* Notify other cpus that system-wide "drain" is running */
1850	get_online_cpus();
1851	curcpu = get_cpu();
1852	for_each_online_cpu(cpu) {
1853		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1854		struct mem_cgroup *memcg;
 
1855
 
1856		memcg = stock->cached;
1857		if (!memcg || !stock->nr_pages)
1858			continue;
1859		if (!mem_cgroup_is_descendant(memcg, root_memcg))
1860			continue;
1861		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1862			if (cpu == curcpu)
1863				drain_local_stock(&stock->work);
1864			else
1865				schedule_work_on(cpu, &stock->work);
1866		}
1867	}
1868	put_cpu();
1869	put_online_cpus();
1870	mutex_unlock(&percpu_charge_mutex);
1871}
1872
1873static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
1874					unsigned long action,
1875					void *hcpu)
1876{
1877	int cpu = (unsigned long)hcpu;
1878	struct memcg_stock_pcp *stock;
1879
1880	if (action == CPU_ONLINE)
1881		return NOTIFY_OK;
1882
1883	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1884		return NOTIFY_OK;
1885
1886	stock = &per_cpu(memcg_stock, cpu);
1887	drain_stock(stock);
1888	return NOTIFY_OK;
1889}
1890
1891static void reclaim_high(struct mem_cgroup *memcg,
1892			 unsigned int nr_pages,
1893			 gfp_t gfp_mask)
1894{
 
 
1895	do {
1896		if (page_counter_read(&memcg->memory) <= memcg->high)
1897			continue;
1898		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
1899		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
1900	} while ((memcg = parent_mem_cgroup(memcg)));
1901}
1902
1903static void high_work_func(struct work_struct *work)
1904{
1905	struct mem_cgroup *memcg;
1906
1907	memcg = container_of(work, struct mem_cgroup, high_work);
1908	reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
1909}
1910
1911/*
1912 * Scheduled by try_charge() to be executed from the userland return path
1913 * and reclaims memory over the high limit.
1914 */
1915void mem_cgroup_handle_over_high(void)
1916{
1917	unsigned int nr_pages = current->memcg_nr_pages_over_high;
 
1918	struct mem_cgroup *memcg;
 
1919
1920	if (likely(!nr_pages))
1921		return;
1922
1923	memcg = get_mem_cgroup_from_mm(current->mm);
1924	reclaim_high(memcg, nr_pages, GFP_KERNEL);
1925	css_put(&memcg->css);
1926	current->memcg_nr_pages_over_high = 0;
1927}
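/*
 * Illustrative sketch, not part of the original file: the function above is
 * expected to be driven from the return-to-userspace path, roughly:
 *
 *	// in the resume-to-user path (simplified; the real hook may simply
 *	// call it unconditionally, since it bails out early when there is
 *	// nothing to do)
 *	if (unlikely(current->memcg_nr_pages_over_high))
 *		mem_cgroup_handle_over_high();
 *
 * try_charge() only records the excess in current->memcg_nr_pages_over_high
 * and sets TIF_NOTIFY_RESUME via set_notify_resume(); the reclaim itself
 * then runs in a context where GFP_KERNEL is safe.
 */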
1928
1929static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
1930		      unsigned int nr_pages)
1931{
1932	unsigned int batch = max(CHARGE_BATCH, nr_pages);
1933	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1934	struct mem_cgroup *mem_over_limit;
1935	struct page_counter *counter;
 
1936	unsigned long nr_reclaimed;
1937	bool may_swap = true;
1938	bool drained = false;
 
1939
1940	if (mem_cgroup_is_root(memcg))
1941		return 0;
1942retry:
1943	if (consume_stock(memcg, nr_pages))
1944		return 0;
1945
1946	if (!do_memsw_account() ||
1947	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
1948		if (page_counter_try_charge(&memcg->memory, batch, &counter))
1949			goto done_restock;
1950		if (do_memsw_account())
1951			page_counter_uncharge(&memcg->memsw, batch);
1952		mem_over_limit = mem_cgroup_from_counter(counter, memory);
1953	} else {
1954		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
1955		may_swap = false;
1956	}
1957
1958	if (batch > nr_pages) {
1959		batch = nr_pages;
1960		goto retry;
1961	}
1962
1963	/*
1964	 * Unlike in global OOM situations, memcg is not in a physical
1965	 * memory shortage.  Allow dying and OOM-killed tasks to
1966	 * bypass the last charges so that they can exit quickly and
1967	 * free their memory.
1968	 */
1969	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
1970		     fatal_signal_pending(current) ||
1971		     current->flags & PF_EXITING))
1972		goto force;
1973
1974	if (unlikely(task_in_memcg_oom(current)))
1975		goto nomem;
1976
1977	if (!gfpflags_allow_blocking(gfp_mask))
1978		goto nomem;
1979
1980	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
1981
 
1982	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
1983						    gfp_mask, may_swap);
 
1984
1985	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1986		goto retry;
1987
1988	if (!drained) {
1989		drain_all_stock(mem_over_limit);
1990		drained = true;
1991		goto retry;
1992	}
1993
1994	if (gfp_mask & __GFP_NORETRY)
1995		goto nomem;
1996	/*
1997	 * Even though the limit is exceeded at this point, reclaim
1998	 * may have been able to free some pages.  Retry the charge
1999	 * before killing the task.
2000	 *
2001	 * Only for regular pages, though: huge pages are rather
2002	 * unlikely to succeed so close to the limit, and we fall back
2003	 * to regular pages anyway in case of failure.
2004	 */
2005	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2006		goto retry;
2007	/*
2008	 * During task move, charge accounts can be double-counted. So it's
2009	 * better to wait until the move is finished if one is in progress.
2010	 */
2011	if (mem_cgroup_wait_acct_move(mem_over_limit))
2012		goto retry;
2013
2014	if (nr_retries--)
2015		goto retry;
2016
2017	if (gfp_mask & __GFP_NOFAIL)
2018		goto force;
2019
2020	if (fatal_signal_pending(current))
2021		goto force;
2022
2023	mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2024
2025	mem_cgroup_oom(mem_over_limit, gfp_mask,
2026		       get_order(nr_pages * PAGE_SIZE));
2027nomem:
2028	if (!(gfp_mask & __GFP_NOFAIL))
2029		return -ENOMEM;
2030force:
2031	/*
2032	 * The allocation either can't fail or will lead to more memory
2033	 * being freed very soon.  Allow memory usage go over the limit
2034	 * temporarily by force charging it.
2035	 */
2036	page_counter_charge(&memcg->memory, nr_pages);
2037	if (do_memsw_account())
2038		page_counter_charge(&memcg->memsw, nr_pages);
2039	css_get_many(&memcg->css, nr_pages);
2040
2041	return 0;
2042
2043done_restock:
2044	css_get_many(&memcg->css, batch);
2045	if (batch > nr_pages)
2046		refill_stock(memcg, batch - nr_pages);
2047
2048	/*
2049	 * If the hierarchy is above the normal consumption range, schedule
2050	 * reclaim on returning to userland.  We can perform reclaim here
2051	 * if __GFP_RECLAIM is set, but let's always punt for simplicity and so that
2052	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2053	 * not recorded as it most likely matches current's and won't
2054	 * change in the meantime.  As high limit is checked again before
2055	 * reclaim, the cost of mismatch is negligible.
2056	 */
2057	do {
2058		if (page_counter_read(&memcg->memory) > memcg->high) {
2059			/* Don't bother a random interrupted task */
2060			if (in_interrupt()) {
2061				schedule_work(&memcg->high_work);
2062				break;
2063			}
2064			current->memcg_nr_pages_over_high += batch;
2065			set_notify_resume(current);
2066			break;
2067		}
2068	} while ((memcg = parent_mem_cgroup(memcg)));
2069
2070	return 0;
2071}
2072
 
2073static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2074{
2075	if (mem_cgroup_is_root(memcg))
2076		return;
2077
2078	page_counter_uncharge(&memcg->memory, nr_pages);
2079	if (do_memsw_account())
2080		page_counter_uncharge(&memcg->memsw, nr_pages);
2081
2082	css_put_many(&memcg->css, nr_pages);
2083}
 
2084
2085static void lock_page_lru(struct page *page, int *isolated)
2086{
2087	struct zone *zone = page_zone(page);
2088
2089	spin_lock_irq(&zone->lru_lock);
2090	if (PageLRU(page)) {
2091		struct lruvec *lruvec;
2092
2093		lruvec = mem_cgroup_page_lruvec(page, zone);
2094		ClearPageLRU(page);
2095		del_page_from_lru_list(page, lruvec, page_lru(page));
2096		*isolated = 1;
2097	} else
2098		*isolated = 0;
 
2099}
2100
2101static void unlock_page_lru(struct page *page, int isolated)
2102{
2103	struct zone *zone = page_zone(page);
2104
2105	if (isolated) {
2106		struct lruvec *lruvec;
2107
2108		lruvec = mem_cgroup_page_lruvec(page, zone);
2109		VM_BUG_ON_PAGE(PageLRU(page), page);
2110		SetPageLRU(page);
2111		add_page_to_lru_list(page, lruvec, page_lru(page));
2112	}
2113	spin_unlock_irq(&zone->lru_lock);
 
 
2114}
2115
2116static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2117			  bool lrucare)
2118{
2119	int isolated;
 
2120
2121	VM_BUG_ON_PAGE(page->mem_cgroup, page);
 
2122
2123	/*
2124	 * In some cases, such as SwapCache and FUSE (splice_buf->radixtree), the
2125	 * page may already be on some other mem_cgroup's LRU.  Take care of it.
2126	 */
2127	if (lrucare)
2128		lock_page_lru(page, &isolated);
2129
2130	/*
2131	 * Nobody should be changing or seriously looking at
2132	 * page->mem_cgroup at this point:
2133	 *
2134	 * - the page is uncharged
2135	 *
2136	 * - the page is off-LRU
2137	 *
2138	 * - an anonymous fault has exclusive page access, except for
2139	 *   a locked page table
2140	 *
2141	 * - a page cache insertion, a swapin fault, or a migration
2142	 *   have the page locked
2143	 */
2144	page->mem_cgroup = memcg;
2145
2146	if (lrucare)
2147		unlock_page_lru(page, isolated);
2148}
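/*
 * Illustrative sketch, not part of the original file: the public charging
 * API built on top of try_charge()/commit_charge()/cancel_charge() is
 * typically used in a three-step pattern by callers, roughly:
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge(page, mm, gfp_mask, &memcg, false))
 *		return -ENOMEM;
 *	if (insert_page_somewhere(page)) {	// hypothetical failure point
 *		mem_cgroup_cancel_charge(page, memcg, false);
 *		return -EEXIST;
 *	}
 *	mem_cgroup_commit_charge(page, memcg, false, false);
 *
 * The prototypes shown follow the public API as best recalled for this
 * kernel version; insert_page_somewhere() stands in for the caller's work.
 */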
2149
2150#ifndef CONFIG_SLOB
2151static int memcg_alloc_cache_id(void)
2152{
2153	int id, size;
2154	int err;
2155
2156	id = ida_simple_get(&memcg_cache_ida,
2157			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2158	if (id < 0)
2159		return id;
2160
2161	if (id < memcg_nr_cache_ids)
2162		return id;
2163
2164	/*
2165	 * There's no space for the new id in memcg_caches arrays,
2166	 * so we have to grow them.
2167	 */
2168	down_write(&memcg_cache_ids_sem);
2169
2170	size = 2 * (id + 1);
2171	if (size < MEMCG_CACHES_MIN_SIZE)
2172		size = MEMCG_CACHES_MIN_SIZE;
2173	else if (size > MEMCG_CACHES_MAX_SIZE)
2174		size = MEMCG_CACHES_MAX_SIZE;
2175
2176	err = memcg_update_all_caches(size);
2177	if (!err)
2178		err = memcg_update_all_list_lrus(size);
2179	if (!err)
2180		memcg_nr_cache_ids = size;
2181
2182	up_write(&memcg_cache_ids_sem);
2183
2184	if (err) {
2185		ida_simple_remove(&memcg_cache_ida, id);
2186		return err;
2187	}
2188	return id;
2189}
2190
2191static void memcg_free_cache_id(int id)
2192{
2193	ida_simple_remove(&memcg_cache_ida, id);
2194}
2195
2196struct memcg_kmem_cache_create_work {
2197	struct mem_cgroup *memcg;
2198	struct kmem_cache *cachep;
2199	struct work_struct work;
2200};
2201
2202static void memcg_kmem_cache_create_func(struct work_struct *w)
2203{
2204	struct memcg_kmem_cache_create_work *cw =
2205		container_of(w, struct memcg_kmem_cache_create_work, work);
2206	struct mem_cgroup *memcg = cw->memcg;
2207	struct kmem_cache *cachep = cw->cachep;
2208
2209	memcg_create_kmem_cache(memcg, cachep);
 
 
2210
2211	css_put(&memcg->css);
2212	kfree(cw);
2213}
2214
2215/*
2216 * Enqueue the creation of a per-memcg kmem_cache.
 
 
2217 */
2218static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2219					       struct kmem_cache *cachep)
2220{
2221	struct memcg_kmem_cache_create_work *cw;
 
2222
2223	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2224	if (!cw)
2225		return;
 
2226
2227	css_get(&memcg->css);
2228
2229	cw->memcg = memcg;
2230	cw->cachep = cachep;
2231	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2232
2233	schedule_work(&cw->work);
2234}
2235
2236static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2237					     struct kmem_cache *cachep)
2238{
2239	/*
2240	 * We need to stop accounting when we kmalloc, because if the
2241	 * corresponding kmalloc cache is not yet created, the first allocation
2242	 * in __memcg_schedule_kmem_cache_create will recurse.
2243	 *
2244	 * However, it is better to enclose the whole function. Depending on
2245	 * the debugging options enabled, INIT_WORK(), for instance, can
2246	 * trigger an allocation. This, too, will make us recurse. Because at
2247	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
2248	 * the safest choice is to do it like this, wrapping the whole function.
2249	 */
2250	current->memcg_kmem_skip_account = 1;
2251	__memcg_schedule_kmem_cache_create(memcg, cachep);
2252	current->memcg_kmem_skip_account = 0;
2253}
2254
2255/*
2256 * Return the kmem_cache we're supposed to use for a slab allocation.
2257 * We try to use the current memcg's version of the cache.
2258 *
2259 * If the cache does not exist yet, i.e. we are the first user of it,
2260 * we either create it immediately, if possible, or create it asynchronously
2261 * in a workqueue.
2262 * In the latter case, we will let the current allocation go through with
2263 * the original cache.
2264 *
2265 * Can't be called in interrupt context or from kernel threads.
2266 * This function needs to be called with rcu_read_lock() held.
2267 */
2268struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
2269{
2270	struct mem_cgroup *memcg;
2271	struct kmem_cache *memcg_cachep;
2272	int kmemcg_id;
2273
2274	VM_BUG_ON(!is_root_cache(cachep));
2275
2276	if (cachep->flags & SLAB_ACCOUNT)
2277		gfp |= __GFP_ACCOUNT;
2278
2279	if (!(gfp & __GFP_ACCOUNT))
2280		return cachep;
2281
2282	if (current->memcg_kmem_skip_account)
2283		return cachep;
 
2284
2285	memcg = get_mem_cgroup_from_mm(current->mm);
2286	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2287	if (kmemcg_id < 0)
2288		goto out;
2289
2290	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2291	if (likely(memcg_cachep))
2292		return memcg_cachep;
2293
2294	/*
2295	 * If we are in a safe context (can wait, and not in interrupt
2296	 * context), we could be predictable and return right away.
2297	 * This would guarantee that the allocation being performed
2298	 * already belongs in the new cache.
2299	 *
2300	 * However, there are some clashes that can arise from locking.
2301	 * For instance, because we acquire the slab_mutex while doing
2302	 * memcg_create_kmem_cache, this means no further allocation
2303	 * could happen with the slab_mutex held. So it's better to
2304	 * defer everything.
2305	 */
2306	memcg_schedule_kmem_cache_create(memcg, cachep);
2307out:
2308	css_put(&memcg->css);
2309	return cachep;
2310}
2311
2312void __memcg_kmem_put_cache(struct kmem_cache *cachep)
 
2313{
2314	if (!is_root_cache(cachep))
2315		css_put(&cachep->memcg_params.memcg->css);
2316}
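/*
 * Illustrative sketch, not part of the original file: the slab allocator is
 * expected to bracket an accounted allocation with the get/put pair above,
 * via wrappers along the lines of memcg_kmem_get_cache() and
 * memcg_kmem_put_cache() (names assumed from the companion header):
 *
 *	cachep = memcg_kmem_get_cache(cachep, gfp); // may return a per-memcg clone
 *	objp = allocate_from(cachep, gfp);	    // hypothetical allocation step
 *	memcg_kmem_put_cache(cachep);		    // drops the css reference
 *
 * Only allocations with __GFP_ACCOUNT (or from SLAB_ACCOUNT caches) are
 * redirected; everything else keeps using the root cache.
 */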
2317
2318int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2319			      struct mem_cgroup *memcg)
2320{
2321	unsigned int nr_pages = 1 << order;
2322	struct page_counter *counter;
2323	int ret;
2324
2325	ret = try_charge(memcg, gfp, nr_pages);
2326	if (ret)
2327		return ret;
2328
2329	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2330	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2331		cancel_charge(memcg, nr_pages);
2332		return -ENOMEM;
 
 
2333	}
 
2334
2335	page->mem_cgroup = memcg;
 
2336
2337	return 0;
2338}
2339
2340int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2341{
2342	struct mem_cgroup *memcg;
2343	int ret = 0;
 
2344
2345	memcg = get_mem_cgroup_from_mm(current->mm);
2346	if (!mem_cgroup_is_root(memcg))
2347		ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
2348	css_put(&memcg->css);
2349	return ret;
2350}
2351
2352void __memcg_kmem_uncharge(struct page *page, int order)
2353{
2354	struct mem_cgroup *memcg = page->mem_cgroup;
2355	unsigned int nr_pages = 1 << order;
2356
2357	if (!memcg)
2358		return;
2359
2360	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
 
2361
2362	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2363		page_counter_uncharge(&memcg->kmem, nr_pages);
 
2364
2365	page_counter_uncharge(&memcg->memory, nr_pages);
2366	if (do_memsw_account())
2367		page_counter_uncharge(&memcg->memsw, nr_pages);
2368
2369	page->mem_cgroup = NULL;
2370	css_put_many(&memcg->css, nr_pages);
 
2371}
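/*
 * Illustrative sketch, not part of the original file: the page allocator is
 * expected to pair the charge/uncharge above per allocation order, through
 * wrappers along the lines of memcg_kmem_charge() and memcg_kmem_uncharge()
 * (names assumed from the companion header):
 *
 *	if (memcg_kmem_charge(page, gfp, order))
 *		goto failed;		// charge against current's memcg failed
 *	...
 *	memcg_kmem_uncharge(page, order);	// on free
 *
 * The charge is recorded in page->mem_cgroup until the uncharge.
 */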
2372#endif /* !CONFIG_SLOB */
 
2373
2374#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2375
2376/*
2377 * Because tail pages are not marked as "used", set them.  We're under
2378 * zone->lru_lock, and migration entries are set up in all page mappings.
2379 */
2380void mem_cgroup_split_huge_fixup(struct page *head)
2381{
 
2382	int i;
2383
2384	if (mem_cgroup_disabled())
2385		return;
2386
2387	for (i = 1; i < HPAGE_PMD_NR; i++)
2388		head[i].mem_cgroup = head->mem_cgroup;
2389
2390	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
2391		       HPAGE_PMD_NR);
2392}
2393#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2394
2395#ifdef CONFIG_MEMCG_SWAP
2396static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2397					 bool charge)
2398{
2399	int val = (charge) ? 1 : -1;
2400	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
2401}
2402
2403/**
2404 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2405 * @entry: swap entry to be moved
2406 * @from:  mem_cgroup which the entry is moved from
2407 * @to:  mem_cgroup which the entry is moved to
2408 *
2409 * It succeeds only when the swap_cgroup's record for this entry is the same
2410 * as the mem_cgroup's id of @from.
2411 *
2412 * Returns 0 on success, -EINVAL on failure.
2413 *
2414 * The caller must have charged to @to, IOW, called page_counter_charge() about
2415 * both res and memsw, and called css_get().
2416 */
2417static int mem_cgroup_move_swap_account(swp_entry_t entry,
2418				struct mem_cgroup *from, struct mem_cgroup *to)
2419{
2420	unsigned short old_id, new_id;
2421
2422	old_id = mem_cgroup_id(from);
2423	new_id = mem_cgroup_id(to);
2424
2425	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2426		mem_cgroup_swap_statistics(from, false);
2427		mem_cgroup_swap_statistics(to, true);
2428		return 0;
2429	}
2430	return -EINVAL;
2431}
2432#else
2433static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2434				struct mem_cgroup *from, struct mem_cgroup *to)
2435{
2436	return -EINVAL;
2437}
2438#endif
2439
2440static DEFINE_MUTEX(memcg_limit_mutex);
2441
2442static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2443				   unsigned long limit)
2444{
2445	unsigned long curusage;
2446	unsigned long oldusage;
2447	bool enlarge = false;
2448	int retry_count;
2449	int ret;
2450
2451	/*
2452	 * To keep hierarchical_reclaim simple, how long we should retry
2453	 * depends on the caller. We set our retry-count to be a function
2454	 * of the number of children we should visit in this loop.
2455	 */
2456	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2457		      mem_cgroup_count_children(memcg);
2458
2459	oldusage = page_counter_read(&memcg->memory);
2460
2461	do {
2462		if (signal_pending(current)) {
2463			ret = -EINTR;
2464			break;
2465		}
2466
2467		mutex_lock(&memcg_limit_mutex);
2468		if (limit > memcg->memsw.limit) {
2469			mutex_unlock(&memcg_limit_mutex);
2470			ret = -EINVAL;
2471			break;
2472		}
2473		if (limit > memcg->memory.limit)
2474			enlarge = true;
2475		ret = page_counter_limit(&memcg->memory, limit);
2476		mutex_unlock(&memcg_limit_mutex);
2477
2478		if (!ret)
2479			break;
2480
2481		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
2482
2483		curusage = page_counter_read(&memcg->memory);
2484		/* Usage is reduced ? */
2485		if (curusage >= oldusage)
2486			retry_count--;
2487		else
2488			oldusage = curusage;
2489	} while (retry_count);
2490
2491	if (!ret && enlarge)
2492		memcg_oom_recover(memcg);
2493
2494	return ret;
2495}
2496
2497static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2498					 unsigned long limit)
2499{
2500	unsigned long curusage;
2501	unsigned long oldusage;
2502	bool enlarge = false;
2503	int retry_count;
2504	int ret;
2505
2506	/* see mem_cgroup_resize_limit */
2507	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2508		      mem_cgroup_count_children(memcg);
2509
2510	oldusage = page_counter_read(&memcg->memsw);
2511
2512	do {
2513		if (signal_pending(current)) {
2514			ret = -EINTR;
2515			break;
2516		}
2517
2518		mutex_lock(&memcg_limit_mutex);
2519		if (limit < memcg->memory.limit) {
2520			mutex_unlock(&memcg_limit_mutex);
2521			ret = -EINVAL;
2522			break;
2523		}
2524		if (limit > memcg->memsw.limit)
2525			enlarge = true;
2526		ret = page_counter_limit(&memcg->memsw, limit);
2527		mutex_unlock(&memcg_limit_mutex);
2528
2529		if (!ret)
2530			break;
2531
2532		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
2533
2534		curusage = page_counter_read(&memcg->memsw);
2535		/* Usage is reduced ? */
2536		if (curusage >= oldusage)
2537			retry_count--;
2538		else
2539			oldusage = curusage;
2540	} while (retry_count);
2541
2542	if (!ret && enlarge)
2543		memcg_oom_recover(memcg);
2544
2545	return ret;
2546}
2547
2548unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2549					    gfp_t gfp_mask,
2550					    unsigned long *total_scanned)
2551{
2552	unsigned long nr_reclaimed = 0;
2553	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2554	unsigned long reclaimed;
2555	int loop = 0;
2556	struct mem_cgroup_tree_per_zone *mctz;
2557	unsigned long excess;
2558	unsigned long nr_scanned;
2559
2560	if (order > 0)
2561		return 0;
2562
2563	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2564	/*
2565	 * This loop can run for a while, especially if mem_cgroups continuously
2566	 * keep exceeding their soft limit and putting the system under
2567	 * pressure.
2568	 */
2569	do {
2570		if (next_mz)
2571			mz = next_mz;
2572		else
2573			mz = mem_cgroup_largest_soft_limit_node(mctz);
2574		if (!mz)
2575			break;
2576
2577		nr_scanned = 0;
2578		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
2579						    gfp_mask, &nr_scanned);
2580		nr_reclaimed += reclaimed;
2581		*total_scanned += nr_scanned;
2582		spin_lock_irq(&mctz->lock);
2583		__mem_cgroup_remove_exceeded(mz, mctz);
2584
2585		/*
2586		 * If we failed to reclaim anything from this memory cgroup,
2587		 * it is time to move on to the next cgroup.
2588		 */
2589		next_mz = NULL;
2590		if (!reclaimed)
2591			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2592
2593		excess = soft_limit_excess(mz->memcg);
2594		/*
2595		 * One school of thought says that we should not add
2596		 * back the node to the tree if reclaim returns 0.
2597		 * But our reclaim could return 0 simply because, due
2598		 * to priority, we are exposing a smaller subset of
2599		 * memory to reclaim from. Consider this as a longer
2600		 * term TODO.
2601		 */
2602		/* If excess == 0, no tree ops */
2603		__mem_cgroup_insert_exceeded(mz, mctz, excess);
2604		spin_unlock_irq(&mctz->lock);
2605		css_put(&mz->memcg->css);
2606		loop++;
2607		/*
2608		 * Could not reclaim anything and there are no more
2609		 * mem cgroups to try or we seem to be looping without
2610		 * reclaiming anything.
2611		 */
2612		if (!nr_reclaimed &&
2613			(next_mz == NULL ||
2614			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2615			break;
2616	} while (!nr_reclaimed);
2617	if (next_mz)
2618		css_put(&next_mz->memcg->css);
2619	return nr_reclaimed;
2620}
2621
2622/*
2623 * Test whether @memcg has children, dead or alive.  Note that this
2624 * function doesn't care whether @memcg has use_hierarchy enabled and
2625 * returns %true if there are child csses according to the cgroup
2626 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
2627 */
2628static inline bool memcg_has_children(struct mem_cgroup *memcg)
2629{
2630	bool ret;
2631
2632	rcu_read_lock();
2633	ret = css_next_child(NULL, &memcg->css);
2634	rcu_read_unlock();
2635	return ret;
2636}
2637
2638/*
2639 * Reclaims as many pages from the given memcg as possible and moves
2640 * the rest to the parent.
2641 *
2642 * Caller is responsible for holding css reference for memcg.
2643 */
2644static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2645{
2646	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2647
2648	/* we call try-to-free pages to make this cgroup empty */
2649	lru_add_drain_all();
2650	/* try to free all pages in this cgroup */
2651	while (nr_retries && page_counter_read(&memcg->memory)) {
2652		int progress;
2653
2654		if (signal_pending(current))
2655			return -EINTR;
2656
2657		progress = try_to_free_mem_cgroup_pages(memcg, 1,
2658							GFP_KERNEL, true);
2659		if (!progress) {
2660			nr_retries--;
2661			/* maybe some writeback is necessary */
2662			congestion_wait(BLK_RW_ASYNC, HZ/10);
2663		}
2664
2665	}
2666
2667	return 0;
2668}
2669
2670static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2671					    char *buf, size_t nbytes,
2672					    loff_t off)
2673{
2674	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2675
2676	if (mem_cgroup_is_root(memcg))
2677		return -EINVAL;
2678	return mem_cgroup_force_empty(memcg) ?: nbytes;
2679}
2680
2681static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2682				     struct cftype *cft)
2683{
2684	return mem_cgroup_from_css(css)->use_hierarchy;
2685}
2686
2687static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2688				      struct cftype *cft, u64 val)
2689{
2690	int retval = 0;
2691	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2692	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
2693
2694	if (memcg->use_hierarchy == val)
2695		return 0;
2696
2697	/*
2698	 * If parent's use_hierarchy is set, we can't make any modifications
2699	 * in the child subtrees. If it is unset, then the change can
2700	 * occur, provided the current cgroup has no children.
2701	 *
2702	 * For the root cgroup, parent_memcg is NULL; we allow the value to
2703	 * be set if there are no children.
2704	 */
2705	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
2706				(val == 1 || val == 0)) {
2707		if (!memcg_has_children(memcg))
2708			memcg->use_hierarchy = val;
2709		else
2710			retval = -EBUSY;
2711	} else
2712		retval = -EINVAL;
2713
2714	return retval;
2715}
2716
2717static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
2718{
2719	struct mem_cgroup *iter;
2720	int i;
2721
2722	memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
2723
2724	for_each_mem_cgroup_tree(iter, memcg) {
2725		for (i = 0; i < MEMCG_NR_STAT; i++)
2726			stat[i] += mem_cgroup_read_stat(iter, i);
2727	}
2728}
2729
2730static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
2731{
2732	struct mem_cgroup *iter;
2733	int i;
2734
2735	memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
2736
2737	for_each_mem_cgroup_tree(iter, memcg) {
2738		for (i = 0; i < MEMCG_NR_EVENTS; i++)
2739			events[i] += mem_cgroup_read_events(iter, i);
2740	}
2741}
2742
2743static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2744{
2745	unsigned long val = 0;
2746
2747	if (mem_cgroup_is_root(memcg)) {
2748		struct mem_cgroup *iter;
2749
2750		for_each_mem_cgroup_tree(iter, memcg) {
2751			val += mem_cgroup_read_stat(iter,
2752					MEM_CGROUP_STAT_CACHE);
2753			val += mem_cgroup_read_stat(iter,
2754					MEM_CGROUP_STAT_RSS);
2755			if (swap)
2756				val += mem_cgroup_read_stat(iter,
2757						MEM_CGROUP_STAT_SWAP);
2758		}
2759	} else {
2760		if (!swap)
2761			val = page_counter_read(&memcg->memory);
2762		else
2763			val = page_counter_read(&memcg->memsw);
2764	}
2765	return val;
2766}
2767
2768enum {
2769	RES_USAGE,
2770	RES_LIMIT,
2771	RES_MAX_USAGE,
2772	RES_FAILCNT,
2773	RES_SOFT_LIMIT,
2774};
2775
2776static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2777			       struct cftype *cft)
2778{
2779	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2780	struct page_counter *counter;
2781
2782	switch (MEMFILE_TYPE(cft->private)) {
2783	case _MEM:
2784		counter = &memcg->memory;
2785		break;
2786	case _MEMSWAP:
2787		counter = &memcg->memsw;
2788		break;
2789	case _KMEM:
2790		counter = &memcg->kmem;
2791		break;
2792	case _TCP:
2793		counter = &memcg->tcpmem;
2794		break;
2795	default:
2796		BUG();
2797	}
2798
2799	switch (MEMFILE_ATTR(cft->private)) {
2800	case RES_USAGE:
2801		if (counter == &memcg->memory)
2802			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2803		if (counter == &memcg->memsw)
2804			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2805		return (u64)page_counter_read(counter) * PAGE_SIZE;
2806	case RES_LIMIT:
2807		return (u64)counter->limit * PAGE_SIZE;
2808	case RES_MAX_USAGE:
2809		return (u64)counter->watermark * PAGE_SIZE;
2810	case RES_FAILCNT:
2811		return counter->failcnt;
2812	case RES_SOFT_LIMIT:
2813		return (u64)memcg->soft_limit * PAGE_SIZE;
2814	default:
2815		BUG();
2816	}
2817}
2818
2819#ifndef CONFIG_SLOB
2820static int memcg_online_kmem(struct mem_cgroup *memcg)
2821{
 
2822	int memcg_id;
2823
2824	if (cgroup_memory_nokmem)
2825		return 0;
2826
2827	BUG_ON(memcg->kmemcg_id >= 0);
2828	BUG_ON(memcg->kmem_state);
2829
2830	memcg_id = memcg_alloc_cache_id();
2831	if (memcg_id < 0)
2832		return memcg_id;
2833
2834	static_branch_inc(&memcg_kmem_enabled_key);
2835	/*
2836	 * A memory cgroup is considered kmem-online as soon as it gets
2837	 * kmemcg_id. Setting the id after enabling static branching will
2838	 * guarantee no one starts accounting before all call sites are
2839	 * patched.
2840	 */
2841	memcg->kmemcg_id = memcg_id;
2842	memcg->kmem_state = KMEM_ONLINE;
2843
2844	return 0;
2845}
2846
2847static void memcg_offline_kmem(struct mem_cgroup *memcg)
2848{
2849	struct cgroup_subsys_state *css;
2850	struct mem_cgroup *parent, *child;
2851	int kmemcg_id;
2852
2853	if (memcg->kmem_state != KMEM_ONLINE)
2854		return;
2855	/*
2856	 * Clear the online state before clearing memcg_caches array
2857	 * entries. The slab_mutex in memcg_deactivate_kmem_caches()
2858	 * guarantees that no cache will be created for this cgroup
2859	 * after we are done (see memcg_create_kmem_cache()).
2860	 */
2861	memcg->kmem_state = KMEM_ALLOCATED;
2862
2863	memcg_deactivate_kmem_caches(memcg);
2864
2865	kmemcg_id = memcg->kmemcg_id;
2866	BUG_ON(kmemcg_id < 0);
2867
2868	parent = parent_mem_cgroup(memcg);
2869	if (!parent)
2870		parent = root_mem_cgroup;
2871
2872	/*
2873	 * Change kmemcg_id of this cgroup and all its descendants to the
2874	 * parent's id, and then move all entries from this cgroup's list_lrus
2875	 * to ones of the parent. After we have finished, all list_lrus
2876	 * corresponding to this cgroup are guaranteed to remain empty. The
2877	 * ordering is imposed by list_lru_node->lock taken by
2878	 * memcg_drain_all_list_lrus().
2879	 */
 
2880	css_for_each_descendant_pre(css, &memcg->css) {
2881		child = mem_cgroup_from_css(css);
2882		BUG_ON(child->kmemcg_id != kmemcg_id);
2883		child->kmemcg_id = parent->kmemcg_id;
2884		if (!memcg->use_hierarchy)
2885			break;
2886	}
2887	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
 
 
2888
2889	memcg_free_cache_id(kmemcg_id);
2890}
2891
2892static void memcg_free_kmem(struct mem_cgroup *memcg)
2893{
2894	/* css_alloc() failed, offlining didn't happen */
2895	if (unlikely(memcg->kmem_state == KMEM_ONLINE))
2896		memcg_offline_kmem(memcg);
2897
2898	if (memcg->kmem_state == KMEM_ALLOCATED) {
2899		memcg_destroy_kmem_caches(memcg);
2900		static_branch_dec(&memcg_kmem_enabled_key);
2901		WARN_ON(page_counter_read(&memcg->kmem));
2902	}
2903}
2904#else
2905static int memcg_online_kmem(struct mem_cgroup *memcg)
2906{
2907	return 0;
2908}
2909static void memcg_offline_kmem(struct mem_cgroup *memcg)
2910{
2911}
2912static void memcg_free_kmem(struct mem_cgroup *memcg)
2913{
2914}
2915#endif /* !CONFIG_SLOB */
2916
2917static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
2918				   unsigned long limit)
2919{
2920	int ret;
2921
2922	mutex_lock(&memcg_limit_mutex);
2923	ret = page_counter_limit(&memcg->kmem, limit);
2924	mutex_unlock(&memcg_limit_mutex);
2925	return ret;
2926}
2927
2928static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
2929{
2930	int ret;
2931
2932	mutex_lock(&memcg_limit_mutex);
2933
2934	ret = page_counter_limit(&memcg->tcpmem, limit);
2935	if (ret)
2936		goto out;
2937
2938	if (!memcg->tcpmem_active) {
2939		/*
2940		 * The active flag needs to be written after the static_key
2941		 * update. This is what guarantees that the socket activation
2942		 * function is the last one to run. See sock_update_memcg() for
2943		 * details, and note that we don't mark any socket as belonging
2944		 * to this memcg until that flag is up.
2945		 *
2946		 * We need to do this, because static_keys will span multiple
2947		 * sites, but we can't control their order. If we mark a socket
2948		 * as accounted, but the accounting functions are not patched in
2949		 * yet, we'll lose accounting.
2950		 *
2951		 * We never race with the readers in sock_update_memcg(),
2952		 * because when this value changes, the code to process it is not
2953		 * patched in yet.
2954		 */
2955		static_branch_inc(&memcg_sockets_enabled_key);
2956		memcg->tcpmem_active = true;
2957	}
2958out:
2959	mutex_unlock(&memcg_limit_mutex);
2960	return ret;
2961}
2962
2963/*
2964 * The user of this function is...
2965 * RES_LIMIT.
2966 */
2967static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2968				char *buf, size_t nbytes, loff_t off)
2969{
2970	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2971	unsigned long nr_pages;
2972	int ret;
2973
2974	buf = strstrip(buf);
2975	ret = page_counter_memparse(buf, "-1", &nr_pages);
2976	if (ret)
2977		return ret;
2978
2979	switch (MEMFILE_ATTR(of_cft(of)->private)) {
2980	case RES_LIMIT:
2981		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2982			ret = -EINVAL;
2983			break;
2984		}
2985		switch (MEMFILE_TYPE(of_cft(of)->private)) {
2986		case _MEM:
2987			ret = mem_cgroup_resize_limit(memcg, nr_pages);
2988			break;
2989		case _MEMSWAP:
2990			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
2991			break;
2992		case _KMEM:
2993			ret = memcg_update_kmem_limit(memcg, nr_pages);
2994			break;
2995		case _TCP:
2996			ret = memcg_update_tcp_limit(memcg, nr_pages);
2997			break;
2998		}
2999		break;
3000	case RES_SOFT_LIMIT:
3001		memcg->soft_limit = nr_pages;
3002		ret = 0;
3003		break;
3004	}
3005	return ret ?: nbytes;
3006}
3007
3008static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3009				size_t nbytes, loff_t off)
3010{
3011	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3012	struct page_counter *counter;
3013
3014	switch (MEMFILE_TYPE(of_cft(of)->private)) {
3015	case _MEM:
3016		counter = &memcg->memory;
3017		break;
3018	case _MEMSWAP:
3019		counter = &memcg->memsw;
3020		break;
3021	case _KMEM:
3022		counter = &memcg->kmem;
3023		break;
3024	case _TCP:
3025		counter = &memcg->tcpmem;
3026		break;
3027	default:
3028		BUG();
3029	}
3030
3031	switch (MEMFILE_ATTR(of_cft(of)->private)) {
3032	case RES_MAX_USAGE:
3033		page_counter_reset_watermark(counter);
3034		break;
3035	case RES_FAILCNT:
3036		counter->failcnt = 0;
3037		break;
3038	default:
3039		BUG();
3040	}
3041
3042	return nbytes;
3043}
3044
3045static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3046					struct cftype *cft)
3047{
3048	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3049}
3050
3051#ifdef CONFIG_MMU
3052static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3053					struct cftype *cft, u64 val)
3054{
3055	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3056
3057	if (val & ~MOVE_MASK)
3058		return -EINVAL;
3059
3060	/*
3061	 * No kind of locking is needed in here, because ->can_attach() will
3062	 * check this value once in the beginning of the process, and then carry
3063	 * on with stale data. This means that changes to this value will only
3064	 * affect task migrations starting after the change.
3065	 */
3066	memcg->move_charge_at_immigrate = val;
3067	return 0;
3068}
3069#else
3070static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3071					struct cftype *cft, u64 val)
3072{
3073	return -ENOSYS;
3074}
3075#endif
3076
3077#ifdef CONFIG_NUMA
3078static int memcg_numa_stat_show(struct seq_file *m, void *v)
3079{
3080	struct numa_stat {
3081		const char *name;
3082		unsigned int lru_mask;
3083	};
3084
3085	static const struct numa_stat stats[] = {
3086		{ "total", LRU_ALL },
3087		{ "file", LRU_ALL_FILE },
3088		{ "anon", LRU_ALL_ANON },
3089		{ "unevictable", BIT(LRU_UNEVICTABLE) },
3090	};
3091	const struct numa_stat *stat;
3092	int nid;
3093	unsigned long nr;
3094	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3095
3096	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3097		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3098		seq_printf(m, "%s=%lu", stat->name, nr);
3099		for_each_node_state(nid, N_MEMORY) {
3100			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3101							  stat->lru_mask);
3102			seq_printf(m, " N%d=%lu", nid, nr);
3103		}
3104		seq_putc(m, '\n');
3105	}
3106
3107	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3108		struct mem_cgroup *iter;
3109
3110		nr = 0;
3111		for_each_mem_cgroup_tree(iter, memcg)
3112			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3113		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3114		for_each_node_state(nid, N_MEMORY) {
3115			nr = 0;
3116			for_each_mem_cgroup_tree(iter, memcg)
3117				nr += mem_cgroup_node_nr_lru_pages(
3118					iter, nid, stat->lru_mask);
3119			seq_printf(m, " N%d=%lu", nid, nr);
3120		}
3121		seq_putc(m, '\n');
3122	}
3123
3124	return 0;
3125}
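/*
 * Illustrative sample, not part of the original file: with two online nodes
 * the memory.numa_stat file produced above looks roughly like
 *
 *	total=525 N0=419 N1=106
 *	file=98 N0=66 N1=32
 *	anon=427 N0=353 N1=74
 *	unevictable=0 N0=0 N1=0
 *	hierarchical_total=525 N0=419 N1=106
 *	...
 *
 * Values are page counts per node; the hierarchical_* lines aggregate the
 * whole subtree (the numbers here are made up for illustration).
 */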
3126#endif /* CONFIG_NUMA */
3127
3128static int memcg_stat_show(struct seq_file *m, void *v)
3129{
3130	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3131	unsigned long memory, memsw;
3132	struct mem_cgroup *mi;
3133	unsigned int i;
3134
3135	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3136		     MEM_CGROUP_STAT_NSTATS);
3137	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3138		     MEM_CGROUP_EVENTS_NSTATS);
3139	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3140
3141	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3142		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3143			continue;
3144		seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
3145			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
3146	}
3147
3148	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
3149		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
3150			   mem_cgroup_read_events(memcg, i));
3151
3152	for (i = 0; i < NR_LRU_LISTS; i++)
3153		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3154			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
 
3155
3156	/* Hierarchical information */
3157	memory = memsw = PAGE_COUNTER_MAX;
3158	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3159		memory = min(memory, mi->memory.limit);
3160		memsw = min(memsw, mi->memsw.limit);
3161	}
3162	seq_printf(m, "hierarchical_memory_limit %llu\n",
3163		   (u64)memory * PAGE_SIZE);
3164	if (do_memsw_account())
3165		seq_printf(m, "hierarchical_memsw_limit %llu\n",
3166			   (u64)memsw * PAGE_SIZE);
3167
3168	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3169		unsigned long long val = 0;
3170
3171		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
3172			continue;
3173		for_each_mem_cgroup_tree(mi, memcg)
3174			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
3175		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
3176	}
3177
3178	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
3179		unsigned long long val = 0;
3180
3181		for_each_mem_cgroup_tree(mi, memcg)
3182			val += mem_cgroup_read_events(mi, i);
3183		seq_printf(m, "total_%s %llu\n",
3184			   mem_cgroup_events_names[i], val);
3185	}
3186
3187	for (i = 0; i < NR_LRU_LISTS; i++) {
3188		unsigned long long val = 0;
3189
3190		for_each_mem_cgroup_tree(mi, memcg)
3191			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
3192		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
3193	}
3194
3195#ifdef CONFIG_DEBUG_VM
3196	{
3197		int nid, zid;
3198		struct mem_cgroup_per_zone *mz;
3199		struct zone_reclaim_stat *rstat;
3200		unsigned long recent_rotated[2] = {0, 0};
3201		unsigned long recent_scanned[2] = {0, 0};
3202
3203		for_each_online_node(nid)
3204			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3205				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
3206				rstat = &mz->lruvec.reclaim_stat;
3207
3208				recent_rotated[0] += rstat->recent_rotated[0];
3209				recent_rotated[1] += rstat->recent_rotated[1];
3210				recent_scanned[0] += rstat->recent_scanned[0];
3211				recent_scanned[1] += rstat->recent_scanned[1];
3212			}
3213		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3214		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3215		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3216		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3217	}
3218#endif
3219
3220	return 0;
3221}
3222
3223static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3224				      struct cftype *cft)
3225{
3226	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3227
3228	return mem_cgroup_swappiness(memcg);
3229}
3230
3231static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3232				       struct cftype *cft, u64 val)
3233{
3234	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3235
3236	if (val > 100)
3237		return -EINVAL;
3238
3239	if (css->parent)
3240		memcg->swappiness = val;
3241	else
3242		vm_swappiness = val;
3243
3244	return 0;
3245}
3246
3247static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3248{
3249	struct mem_cgroup_threshold_ary *t;
3250	unsigned long usage;
3251	int i;
3252
3253	rcu_read_lock();
3254	if (!swap)
3255		t = rcu_dereference(memcg->thresholds.primary);
3256	else
3257		t = rcu_dereference(memcg->memsw_thresholds.primary);
3258
3259	if (!t)
3260		goto unlock;
3261
3262	usage = mem_cgroup_usage(memcg, swap);
3263
3264	/*
3265	 * current_threshold points to the threshold just below or equal to usage.
3266	 * If that is not the case, a threshold was crossed after the last
3267	 * call of __mem_cgroup_threshold().
3268	 */
3269	i = t->current_threshold;
3270
3271	/*
3272	 * Iterate backward over array of thresholds starting from
3273	 * current_threshold and check if a threshold is crossed.
3274	 * If none of thresholds below usage is crossed, we read
3275	 * only one element of the array here.
3276	 */
3277	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3278		eventfd_signal(t->entries[i].eventfd, 1);
3279
3280	/* i = current_threshold + 1 */
3281	i++;
3282
3283	/*
3284	 * Iterate forward over array of thresholds starting from
3285	 * current_threshold+1 and check if a threshold is crossed.
3286	 * If none of thresholds above usage is crossed, we read
3287	 * only one element of the array here.
3288	 */
3289	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3290		eventfd_signal(t->entries[i].eventfd, 1);
3291
3292	/* Update current_threshold */
3293	t->current_threshold = i - 1;
3294unlock:
3295	rcu_read_unlock();
3296}
3297
3298static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3299{
3300	while (memcg) {
3301		__mem_cgroup_threshold(memcg, false);
3302		if (do_memsw_account())
3303			__mem_cgroup_threshold(memcg, true);
3304
3305		memcg = parent_mem_cgroup(memcg);
3306	}
3307}
3308
3309static int compare_thresholds(const void *a, const void *b)
3310{
3311	const struct mem_cgroup_threshold *_a = a;
3312	const struct mem_cgroup_threshold *_b = b;
3313
3314	if (_a->threshold > _b->threshold)
3315		return 1;
3316
3317	if (_a->threshold < _b->threshold)
3318		return -1;
3319
3320	return 0;
3321}
3322
3323static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3324{
3325	struct mem_cgroup_eventfd_list *ev;
3326
3327	spin_lock(&memcg_oom_lock);
3328
3329	list_for_each_entry(ev, &memcg->oom_notify, list)
3330		eventfd_signal(ev->eventfd, 1);
3331
3332	spin_unlock(&memcg_oom_lock);
3333	return 0;
3334}
3335
3336static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3337{
3338	struct mem_cgroup *iter;
3339
3340	for_each_mem_cgroup_tree(iter, memcg)
3341		mem_cgroup_oom_notify_cb(iter);
3342}
3343
3344static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3345	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3346{
3347	struct mem_cgroup_thresholds *thresholds;
3348	struct mem_cgroup_threshold_ary *new;
3349	unsigned long threshold;
3350	unsigned long usage;
3351	int i, size, ret;
3352
3353	ret = page_counter_memparse(args, "-1", &threshold);
3354	if (ret)
3355		return ret;
3356
3357	mutex_lock(&memcg->thresholds_lock);
3358
3359	if (type == _MEM) {
3360		thresholds = &memcg->thresholds;
3361		usage = mem_cgroup_usage(memcg, false);
3362	} else if (type == _MEMSWAP) {
3363		thresholds = &memcg->memsw_thresholds;
3364		usage = mem_cgroup_usage(memcg, true);
3365	} else
3366		BUG();
3367
3368	/* Check if a threshold crossed before adding a new one */
3369	if (thresholds->primary)
3370		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
3371
3372	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3373
3374	/* Allocate memory for new array of thresholds */
3375	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3376			GFP_KERNEL);
3377	if (!new) {
3378		ret = -ENOMEM;
3379		goto unlock;
3380	}
3381	new->size = size;
3382
3383	/* Copy thresholds (if any) to new array */
3384	if (thresholds->primary) {
3385		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3386				sizeof(struct mem_cgroup_threshold));
3387	}
3388
3389	/* Add new threshold */
3390	new->entries[size - 1].eventfd = eventfd;
3391	new->entries[size - 1].threshold = threshold;
3392
3393	/* Sort thresholds. Registering of new threshold isn't time-critical */
3394	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3395			compare_thresholds, NULL);
3396
3397	/* Find current threshold */
3398	new->current_threshold = -1;
3399	for (i = 0; i < size; i++) {
3400		if (new->entries[i].threshold <= usage) {
3401			/*
3402			 * new->current_threshold will not be used until
3403			 * rcu_assign_pointer(), so it's safe to increment
3404			 * it here.
3405			 */
3406			++new->current_threshold;
3407		} else
3408			break;
3409	}
3410
3411	/* Free old spare buffer and save old primary buffer as spare */
3412	kfree(thresholds->spare);
3413	thresholds->spare = thresholds->primary;
3414
3415	rcu_assign_pointer(thresholds->primary, new);
3416
3417	/* To be sure that nobody uses thresholds */
3418	synchronize_rcu();
3419
3420unlock:
3421	mutex_unlock(&memcg->thresholds_lock);
3422
3423	return ret;
3424}
3425
3426static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3427	struct eventfd_ctx *eventfd, const char *args)
3428{
3429	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3430}
3431
3432static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3433	struct eventfd_ctx *eventfd, const char *args)
3434{
3435	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3436}
3437
3438static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3439	struct eventfd_ctx *eventfd, enum res_type type)
3440{
3441	struct mem_cgroup_thresholds *thresholds;
3442	struct mem_cgroup_threshold_ary *new;
3443	unsigned long usage;
3444	int i, j, size;
3445
3446	mutex_lock(&memcg->thresholds_lock);
3447
3448	if (type == _MEM) {
3449		thresholds = &memcg->thresholds;
3450		usage = mem_cgroup_usage(memcg, false);
3451	} else if (type == _MEMSWAP) {
3452		thresholds = &memcg->memsw_thresholds;
3453		usage = mem_cgroup_usage(memcg, true);
3454	} else
3455		BUG();
3456
3457	if (!thresholds->primary)
3458		goto unlock;
3459
3460	/* Check if a threshold crossed before removing */
3461	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
3462
3463	/* Calculate the new number of thresholds */
3464	size = 0;
3465	for (i = 0; i < thresholds->primary->size; i++) {
3466		if (thresholds->primary->entries[i].eventfd != eventfd)
3467			size++;
 
 
3468	}
3469
3470	new = thresholds->spare;
3471
3472	/* Set thresholds array to NULL if we don't have thresholds */
3473	if (!size) {
3474		kfree(new);
3475		new = NULL;
3476		goto swap_buffers;
3477	}
3478
3479	new->size = size;
3480
3481	/* Copy thresholds and find current threshold */
3482	new->current_threshold = -1;
3483	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3484		if (thresholds->primary->entries[i].eventfd == eventfd)
3485			continue;
3486
3487		new->entries[j] = thresholds->primary->entries[i];
3488		if (new->entries[j].threshold <= usage) {
3489			/*
3490			 * new->current_threshold will not be used
3491			 * until rcu_assign_pointer(), so it's safe to increment
3492			 * it here.
3493			 */
3494			++new->current_threshold;
3495		}
3496		j++;
3497	}
3498
3499swap_buffers:
3500	/* Swap primary and spare array */
3501	thresholds->spare = thresholds->primary;
3502
3503	rcu_assign_pointer(thresholds->primary, new);
3504
3505	/* To be sure that nobody uses thresholds */
3506	synchronize_rcu();
3507
3508	/* If all events are unregistered, free the spare array */
3509	if (!new) {
3510		kfree(thresholds->spare);
3511		thresholds->spare = NULL;
3512	}
3513unlock:
3514	mutex_unlock(&memcg->thresholds_lock);
3515}
3516
3517static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3518	struct eventfd_ctx *eventfd)
3519{
3520	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3521}
3522
3523static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3524	struct eventfd_ctx *eventfd)
3525{
3526	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3527}
3528
3529static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3530	struct eventfd_ctx *eventfd, const char *args)
3531{
3532	struct mem_cgroup_eventfd_list *event;
3533
3534	event = kmalloc(sizeof(*event),	GFP_KERNEL);
3535	if (!event)
3536		return -ENOMEM;
3537
3538	spin_lock(&memcg_oom_lock);
3539
3540	event->eventfd = eventfd;
3541	list_add(&event->list, &memcg->oom_notify);
3542
3543	/* already in OOM ? */
3544	if (memcg->under_oom)
3545		eventfd_signal(eventfd, 1);
3546	spin_unlock(&memcg_oom_lock);
3547
3548	return 0;
3549}
3550
3551static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3552	struct eventfd_ctx *eventfd)
3553{
3554	struct mem_cgroup_eventfd_list *ev, *tmp;
3555
3556	spin_lock(&memcg_oom_lock);
3557
3558	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3559		if (ev->eventfd == eventfd) {
3560			list_del(&ev->list);
3561			kfree(ev);
3562		}
3563	}
3564
3565	spin_unlock(&memcg_oom_lock);
3566}
3567
3568static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3569{
3570	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3571
3572	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3573	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
 
 
3574	return 0;
3575}
3576
3577static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3578	struct cftype *cft, u64 val)
3579{
3580	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3581
3582	/* cannot set to root cgroup and only 0 and 1 are allowed */
3583	if (!css->parent || !((val == 0) || (val == 1)))
3584		return -EINVAL;
3585
3586	memcg->oom_kill_disable = val;
3587	if (!val)
3588		memcg_oom_recover(memcg);
3589
3590	return 0;
3591}
3592
3593#ifdef CONFIG_CGROUP_WRITEBACK
3594
3595struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
3596{
3597	return &memcg->cgwb_list;
3598}
3599
3600static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3601{
3602	return wb_domain_init(&memcg->cgwb_domain, gfp);
3603}
3604
3605static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3606{
3607	wb_domain_exit(&memcg->cgwb_domain);
3608}
3609
3610static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3611{
3612	wb_domain_size_changed(&memcg->cgwb_domain);
3613}
3614
3615struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3616{
3617	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3618
3619	if (!memcg->css.parent)
3620		return NULL;
3621
3622	return &memcg->cgwb_domain;
3623}
3624
3625/**
3626 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3627 * @wb: bdi_writeback in question
3628 * @pfilepages: out parameter for number of file pages
3629 * @pheadroom: out parameter for number of allocatable pages according to memcg
3630 * @pdirty: out parameter for number of dirty pages
3631 * @pwriteback: out parameter for number of pages under writeback
3632 *
3633 * Determine the numbers of file, headroom, dirty, and writeback pages in
3634 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3635 * is a bit more involved.
3636 *
3637 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3638 * headroom is calculated as the lowest headroom of itself and the
3639 * ancestors.  Note that this doesn't consider the actual amount of
3640 * available memory in the system.  The caller should further cap
3641 * *@pheadroom accordingly.
3642 */
3643void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3644			 unsigned long *pheadroom, unsigned long *pdirty,
3645			 unsigned long *pwriteback)
3646{
3647	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3648	struct mem_cgroup *parent;
3649
3650	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
3651
3652	/* this should eventually include NR_UNSTABLE_NFS */
3653	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
3654	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3655						     (1 << LRU_ACTIVE_FILE));
3656	*pheadroom = PAGE_COUNTER_MAX;
3657
3658	while ((parent = parent_mem_cgroup(memcg))) {
3659		unsigned long ceiling = min(memcg->memory.limit, memcg->high);
 
3660		unsigned long used = page_counter_read(&memcg->memory);
3661
3662		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3663		memcg = parent;
3664	}
3665}
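/*
 * Illustrative example, not part of the original file: suppose @wb's memcg
 * has limit=1G, high=768M and 512M in use, and its (non-root) parent has
 * limit=2G, no high configured and 1.5G in use.  The child's own headroom
 * is min(1G, 768M) - 512M = 256M, the parent allows 2G - 1.5G = 512M, so
 * *pheadroom is reported as the smaller value, 256M.
 */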
3666
3667#else	/* CONFIG_CGROUP_WRITEBACK */
3668
3669static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3670{
3671	return 0;
3672}
3673
3674static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3675{
3676}
3677
3678static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3679{
3680}
3681
3682#endif	/* CONFIG_CGROUP_WRITEBACK */
3683
3684/*
3685 * DO NOT USE IN NEW FILES.
3686 *
3687 * "cgroup.event_control" implementation.
3688 *
3689 * This is way over-engineered.  It tries to support fully configurable
3690 * events for each user.  Such a level of flexibility is completely
3691 * unnecessary, especially in light of the planned unified hierarchy.
3692 *
3693 * Please deprecate this and replace with something simpler if at all
3694 * possible.
3695 */
3696
3697/*
3698 * Unregister event and free resources.
3699 *
3700 * Gets called from workqueue.
3701 */
3702static void memcg_event_remove(struct work_struct *work)
3703{
3704	struct mem_cgroup_event *event =
3705		container_of(work, struct mem_cgroup_event, remove);
3706	struct mem_cgroup *memcg = event->memcg;
3707
3708	remove_wait_queue(event->wqh, &event->wait);
3709
3710	event->unregister_event(memcg, event->eventfd);
3711
3712	/* Notify userspace the event is going away. */
3713	eventfd_signal(event->eventfd, 1);
3714
3715	eventfd_ctx_put(event->eventfd);
3716	kfree(event);
3717	css_put(&memcg->css);
3718}
3719
3720/*
3721 * Gets called on POLLHUP on eventfd when user closes it.
3722 *
3723 * Called with wqh->lock held and interrupts disabled.
3724 */
3725static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
3726			    int sync, void *key)
3727{
3728	struct mem_cgroup_event *event =
3729		container_of(wait, struct mem_cgroup_event, wait);
3730	struct mem_cgroup *memcg = event->memcg;
3731	unsigned long flags = (unsigned long)key;
3732
3733	if (flags & POLLHUP) {
3734		/*
3735		 * If the event has been detached at cgroup removal, we
3736		 * can simply return knowing the other side will clean up
3737		 * for us.
3738		 *
3739		 * We can't race against event freeing since the other
3740		 * side will require wqh->lock via remove_wait_queue(),
3741		 * which we hold.
3742		 */
3743		spin_lock(&memcg->event_list_lock);
3744		if (!list_empty(&event->list)) {
3745			list_del_init(&event->list);
3746			/*
3747			 * We are in atomic context, but memcg_event_remove()
3748			 * may sleep, so we have to call it from a workqueue.
3749			 */
3750			schedule_work(&event->remove);
3751		}
3752		spin_unlock(&memcg->event_list_lock);
3753	}
3754
3755	return 0;
3756}
3757
3758static void memcg_event_ptable_queue_proc(struct file *file,
3759		wait_queue_head_t *wqh, poll_table *pt)
3760{
3761	struct mem_cgroup_event *event =
3762		container_of(pt, struct mem_cgroup_event, pt);
3763
3764	event->wqh = wqh;
3765	add_wait_queue(wqh, &event->wait);
3766}
3767
3768/*
3769 * DO NOT USE IN NEW FILES.
3770 *
3771 * Parse input and register new cgroup event handler.
3772 *
3773 * Input must be in format '<event_fd> <control_fd> <args>'.
3774 * Interpretation of args is defined by control file implementation.
3775 */
3776static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
3777					 char *buf, size_t nbytes, loff_t off)
3778{
3779	struct cgroup_subsys_state *css = of_css(of);
3780	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3781	struct mem_cgroup_event *event;
3782	struct cgroup_subsys_state *cfile_css;
3783	unsigned int efd, cfd;
3784	struct fd efile;
3785	struct fd cfile;
3786	const char *name;
3787	char *endp;
3788	int ret;
3789
3790	buf = strstrip(buf);
3791
3792	efd = simple_strtoul(buf, &endp, 10);
3793	if (*endp != ' ')
3794		return -EINVAL;
3795	buf = endp + 1;
3796
3797	cfd = simple_strtoul(buf, &endp, 10);
3798	if ((*endp != ' ') && (*endp != '\0'))
3799		return -EINVAL;
3800	buf = endp + 1;
3801
3802	event = kzalloc(sizeof(*event), GFP_KERNEL);
3803	if (!event)
3804		return -ENOMEM;
3805
3806	event->memcg = memcg;
3807	INIT_LIST_HEAD(&event->list);
3808	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
3809	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
3810	INIT_WORK(&event->remove, memcg_event_remove);
3811
3812	efile = fdget(efd);
3813	if (!efile.file) {
3814		ret = -EBADF;
3815		goto out_kfree;
3816	}
3817
3818	event->eventfd = eventfd_ctx_fileget(efile.file);
3819	if (IS_ERR(event->eventfd)) {
3820		ret = PTR_ERR(event->eventfd);
3821		goto out_put_efile;
3822	}
3823
3824	cfile = fdget(cfd);
3825	if (!cfile.file) {
3826		ret = -EBADF;
3827		goto out_put_eventfd;
3828	}
3829
3830	/* the process needs read permission on the control file */
3831	/* AV: shouldn't we check that it's been opened for read instead? */
3832	ret = inode_permission(file_inode(cfile.file), MAY_READ);
3833	if (ret < 0)
3834		goto out_put_cfile;
3835
3836	/*
3837	 * Determine the event callbacks and set them in @event.  This used
3838	 * to be done via struct cftype but cgroup core no longer knows
3839	 * about these events.  The following is crude but the whole thing
3840	 * is for compatibility anyway.
3841	 *
3842	 * DO NOT ADD NEW FILES.
3843	 */
3844	name = cfile.file->f_path.dentry->d_name.name;
3845
3846	if (!strcmp(name, "memory.usage_in_bytes")) {
3847		event->register_event = mem_cgroup_usage_register_event;
3848		event->unregister_event = mem_cgroup_usage_unregister_event;
3849	} else if (!strcmp(name, "memory.oom_control")) {
3850		event->register_event = mem_cgroup_oom_register_event;
3851		event->unregister_event = mem_cgroup_oom_unregister_event;
3852	} else if (!strcmp(name, "memory.pressure_level")) {
3853		event->register_event = vmpressure_register_event;
3854		event->unregister_event = vmpressure_unregister_event;
3855	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
3856		event->register_event = memsw_cgroup_usage_register_event;
3857		event->unregister_event = memsw_cgroup_usage_unregister_event;
3858	} else {
3859		ret = -EINVAL;
3860		goto out_put_cfile;
3861	}
3862
3863	/*
3864	 * Verify @cfile should belong to @css.  Also, remaining events are
3865	 * automatically removed on cgroup destruction but the removal is
3866	 * asynchronous, so take an extra ref on @css.
3867	 */
3868	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
3869					       &memory_cgrp_subsys);
3870	ret = -EINVAL;
3871	if (IS_ERR(cfile_css))
3872		goto out_put_cfile;
3873	if (cfile_css != css) {
3874		css_put(cfile_css);
3875		goto out_put_cfile;
3876	}
3877
3878	ret = event->register_event(memcg, event->eventfd, buf);
3879	if (ret)
3880		goto out_put_css;
3881
3882	efile.file->f_op->poll(efile.file, &event->pt);
3883
3884	spin_lock(&memcg->event_list_lock);
3885	list_add(&event->list, &memcg->event_list);
3886	spin_unlock(&memcg->event_list_lock);
3887
3888	fdput(cfile);
3889	fdput(efile);
3890
3891	return nbytes;
3892
3893out_put_css:
3894	css_put(css);
3895out_put_cfile:
3896	fdput(cfile);
3897out_put_eventfd:
3898	eventfd_ctx_put(event->eventfd);
3899out_put_efile:
3900	fdput(efile);
3901out_kfree:
3902	kfree(event);
3903
3904	return ret;
3905}
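/*
 * Illustrative userspace sketch of the interface parsed above.  This is
 * not kernel code, and the cgroup mount path is an assumption made for
 * the example (it needs <sys/eventfd.h>, <fcntl.h>, <unistd.h>,
 * <stdio.h>, <string.h> and <stdint.h>).  A usage threshold is armed by
 * writing "<event_fd> <control_fd> <threshold>" to cgroup.event_control
 * and then read()ing the eventfd:
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ecfd = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
 *			O_WRONLY);
 *	char line[64];
 *	uint64_t ticks;
 *
 *	snprintf(line, sizeof(line), "%d %d %llu", efd, cfd,
 *		 512ULL << 20);			(fire at 512M of usage)
 *	write(ecfd, line, strlen(line));
 *	read(efd, &ticks, sizeof(ticks));	(blocks until the threshold fires)
 */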
3906
3907static struct cftype mem_cgroup_legacy_files[] = {
3908	{
3909		.name = "usage_in_bytes",
3910		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3911		.read_u64 = mem_cgroup_read_u64,
3912	},
3913	{
3914		.name = "max_usage_in_bytes",
3915		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3916		.write = mem_cgroup_reset,
3917		.read_u64 = mem_cgroup_read_u64,
3918	},
3919	{
3920		.name = "limit_in_bytes",
3921		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3922		.write = mem_cgroup_write,
3923		.read_u64 = mem_cgroup_read_u64,
3924	},
3925	{
3926		.name = "soft_limit_in_bytes",
3927		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3928		.write = mem_cgroup_write,
3929		.read_u64 = mem_cgroup_read_u64,
3930	},
3931	{
3932		.name = "failcnt",
3933		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3934		.write = mem_cgroup_reset,
3935		.read_u64 = mem_cgroup_read_u64,
3936	},
3937	{
3938		.name = "stat",
3939		.seq_show = memcg_stat_show,
3940	},
3941	{
3942		.name = "force_empty",
3943		.write = mem_cgroup_force_empty_write,
3944	},
3945	{
3946		.name = "use_hierarchy",
3947		.write_u64 = mem_cgroup_hierarchy_write,
3948		.read_u64 = mem_cgroup_hierarchy_read,
3949	},
3950	{
3951		.name = "cgroup.event_control",		/* XXX: for compat */
3952		.write = memcg_write_event_control,
3953		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
3954	},
3955	{
3956		.name = "swappiness",
3957		.read_u64 = mem_cgroup_swappiness_read,
3958		.write_u64 = mem_cgroup_swappiness_write,
3959	},
3960	{
3961		.name = "move_charge_at_immigrate",
3962		.read_u64 = mem_cgroup_move_charge_read,
3963		.write_u64 = mem_cgroup_move_charge_write,
3964	},
3965	{
3966		.name = "oom_control",
3967		.seq_show = mem_cgroup_oom_control_read,
3968		.write_u64 = mem_cgroup_oom_control_write,
3969		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3970	},
3971	{
3972		.name = "pressure_level",
3973	},
3974#ifdef CONFIG_NUMA
3975	{
3976		.name = "numa_stat",
3977		.seq_show = memcg_numa_stat_show,
3978	},
3979#endif
3980	{
3981		.name = "kmem.limit_in_bytes",
3982		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
3983		.write = mem_cgroup_write,
3984		.read_u64 = mem_cgroup_read_u64,
3985	},
3986	{
3987		.name = "kmem.usage_in_bytes",
3988		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
3989		.read_u64 = mem_cgroup_read_u64,
3990	},
3991	{
3992		.name = "kmem.failcnt",
3993		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
3994		.write = mem_cgroup_reset,
3995		.read_u64 = mem_cgroup_read_u64,
3996	},
3997	{
3998		.name = "kmem.max_usage_in_bytes",
3999		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4000		.write = mem_cgroup_reset,
4001		.read_u64 = mem_cgroup_read_u64,
4002	},
4003#ifdef CONFIG_SLABINFO
4004	{
4005		.name = "kmem.slabinfo",
4006		.seq_start = slab_start,
4007		.seq_next = slab_next,
4008		.seq_stop = slab_stop,
4009		.seq_show = memcg_slab_show,
4010	},
4011#endif
4012	{
4013		.name = "kmem.tcp.limit_in_bytes",
4014		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4015		.write = mem_cgroup_write,
4016		.read_u64 = mem_cgroup_read_u64,
4017	},
4018	{
4019		.name = "kmem.tcp.usage_in_bytes",
4020		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4021		.read_u64 = mem_cgroup_read_u64,
4022	},
4023	{
4024		.name = "kmem.tcp.failcnt",
4025		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4026		.write = mem_cgroup_reset,
4027		.read_u64 = mem_cgroup_read_u64,
4028	},
4029	{
4030		.name = "kmem.tcp.max_usage_in_bytes",
4031		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4032		.write = mem_cgroup_reset,
4033		.read_u64 = mem_cgroup_read_u64,
4034	},
4035	{ },	/* terminate */
4036};
4037
4038static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4039{
4040	struct mem_cgroup_per_node *pn;
4041	struct mem_cgroup_per_zone *mz;
4042	int zone, tmp = node;
4043	/*
4044	 * This routine is called against all possible nodes.
4045	 * But it's a BUG to call kmalloc() against an offline node.
4046	 *
4047	 * TODO: this routine can waste a lot of memory for nodes which will
4048	 *       never be onlined. It's better to use a memory hotplug callback
4049	 *       function.
4050	 */
4051	if (!node_state(node, N_NORMAL_MEMORY))
4052		tmp = -1;
4053	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4054	if (!pn)
4055		return 1;
4056
4057	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4058		mz = &pn->zoneinfo[zone];
4059		lruvec_init(&mz->lruvec);
4060		mz->usage_in_excess = 0;
4061		mz->on_tree = false;
4062		mz->memcg = memcg;
4063	}
4064	memcg->nodeinfo[node] = pn;
4065	return 0;
4066}
4067
4068static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4069{
4070	kfree(memcg->nodeinfo[node]);
4071}
4072
4073static void mem_cgroup_free(struct mem_cgroup *memcg)
4074{
4075	int node;
4076
4077	memcg_wb_domain_exit(memcg);
4078	for_each_node(node)
4079		free_mem_cgroup_per_zone_info(memcg, node);
4080	free_percpu(memcg->stat);
4081	kfree(memcg);
4082}
4083
4084static struct mem_cgroup *mem_cgroup_alloc(void)
4085{
4086	struct mem_cgroup *memcg;
4087	size_t size;
4088	int node;
4089
4090	size = sizeof(struct mem_cgroup);
4091	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4092
4093	memcg = kzalloc(size, GFP_KERNEL);
4094	if (!memcg)
4095		return NULL;
4096
4097	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4098	if (!memcg->stat)
4099		goto fail;
4100
4101	for_each_node(node)
4102		if (alloc_mem_cgroup_per_zone_info(memcg, node))
4103			goto fail;
4104
4105	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4106		goto fail;
4107
4108	INIT_WORK(&memcg->high_work, high_work_func);
4109	memcg->last_scanned_node = MAX_NUMNODES;
4110	INIT_LIST_HEAD(&memcg->oom_notify);
4111	mutex_init(&memcg->thresholds_lock);
4112	spin_lock_init(&memcg->move_lock);
4113	vmpressure_init(&memcg->vmpressure);
4114	INIT_LIST_HEAD(&memcg->event_list);
4115	spin_lock_init(&memcg->event_list_lock);
4116	memcg->socket_pressure = jiffies;
4117#ifndef CONFIG_SLOB
4118	memcg->kmemcg_id = -1;
4119#endif
4120#ifdef CONFIG_CGROUP_WRITEBACK
4121	INIT_LIST_HEAD(&memcg->cgwb_list);
4122#endif
4123	return memcg;
4124fail:
4125	mem_cgroup_free(memcg);
4126	return NULL;
4127}
4128
4129static struct cgroup_subsys_state * __ref
4130mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4131{
4132	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4133	struct mem_cgroup *memcg;
4134	long error = -ENOMEM;
4135
4136	memcg = mem_cgroup_alloc();
4137	if (!memcg)
4138		return ERR_PTR(error);
4139
4140	memcg->high = PAGE_COUNTER_MAX;
4141	memcg->soft_limit = PAGE_COUNTER_MAX;
4142	if (parent) {
4143		memcg->swappiness = mem_cgroup_swappiness(parent);
4144		memcg->oom_kill_disable = parent->oom_kill_disable;
4145	}
4146	if (parent && parent->use_hierarchy) {
4147		memcg->use_hierarchy = true;
4148		page_counter_init(&memcg->memory, &parent->memory);
4149		page_counter_init(&memcg->swap, &parent->swap);
4150		page_counter_init(&memcg->memsw, &parent->memsw);
4151		page_counter_init(&memcg->kmem, &parent->kmem);
4152		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4153	} else {
4154		page_counter_init(&memcg->memory, NULL);
4155		page_counter_init(&memcg->swap, NULL);
4156		page_counter_init(&memcg->memsw, NULL);
4157		page_counter_init(&memcg->kmem, NULL);
4158		page_counter_init(&memcg->tcpmem, NULL);
4159		/*
4160		 * A deeper hierarchy with use_hierarchy == false doesn't make
4161		 * much sense so let cgroup subsystem know about this
4162		 * unfortunate state in our controller.
4163		 */
4164		if (parent != root_mem_cgroup)
4165			memory_cgrp_subsys.broken_hierarchy = true;
4166	}
4167
4168	/* The following stuff does not apply to the root */
4169	if (!parent) {
4170		root_mem_cgroup = memcg;
4171		return &memcg->css;
4172	}
4173
4174	error = memcg_online_kmem(memcg);
4175	if (error)
4176		goto fail;
4177
4178	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4179		static_branch_inc(&memcg_sockets_enabled_key);
4180
4181	return &memcg->css;
4182fail:
4183	mem_cgroup_free(memcg);
4184	return NULL;
4185}
4186
4187static int
4188mem_cgroup_css_online(struct cgroup_subsys_state *css)
4189{
4190	if (css->id > MEM_CGROUP_ID_MAX)
4191		return -ENOSPC;
4192
4193	return 0;
4194}
4195
4196static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4197{
4198	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4199	struct mem_cgroup_event *event, *tmp;
4200
4201	/*
4202	 * Unregister events and notify userspace.
4203	 * Notify userspace about cgroup removal only after rmdir of the cgroup
4204	 * directory, to avoid a race between userspace and kernel space.
4205	 */
4206	spin_lock(&memcg->event_list_lock);
4207	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4208		list_del_init(&event->list);
4209		schedule_work(&event->remove);
4210	}
4211	spin_unlock(&memcg->event_list_lock);
4212
4213	memcg_offline_kmem(memcg);
4214	wb_memcg_offline(memcg);
4215}
4216
4217static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4218{
4219	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4220
4221	invalidate_reclaim_iterators(memcg);
4222}
4223
4224static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4225{
4226	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4227
4228	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4229		static_branch_dec(&memcg_sockets_enabled_key);
4230
4231	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4232		static_branch_dec(&memcg_sockets_enabled_key);
4233
4234	vmpressure_cleanup(&memcg->vmpressure);
4235	cancel_work_sync(&memcg->high_work);
4236	mem_cgroup_remove_from_trees(memcg);
4237	memcg_free_kmem(memcg);
4238	mem_cgroup_free(memcg);
4239}
4240
4241/**
4242 * mem_cgroup_css_reset - reset the states of a mem_cgroup
4243 * @css: the target css
4244 *
4245 * Reset the states of the mem_cgroup associated with @css.  This is
4246 * invoked when the userland requests disabling on the default hierarchy
4247 * but the memcg is pinned through dependency.  The memcg should stop
4248 * applying policies and should revert to the vanilla state as it may be
4249 * made visible again.
4250 *
4251 * The current implementation only resets the essential configurations.
4252 * This needs to be expanded to cover all the visible parts.
4253 */
4254static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4255{
4256	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4257
4258	page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
4259	page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
4260	page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
4261	page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
4262	page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
4263	memcg->low = 0;
4264	memcg->high = PAGE_COUNTER_MAX;
4265	memcg->soft_limit = PAGE_COUNTER_MAX;
4266	memcg_wb_domain_size_changed(memcg);
4267}
4268
4269#ifdef CONFIG_MMU
4270/* Handlers for move charge at task migration. */
4271static int mem_cgroup_do_precharge(unsigned long count)
4272{
4273	int ret;
4274
4275	/* Try a single bulk charge without reclaim first, kswapd may wake */
4276	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4277	if (!ret) {
4278		mc.precharge += count;
4279		return ret;
4280	}
4281
4282	/* Try charges one by one with reclaim */
4283	while (count--) {
4284		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
4285		if (ret)
4286			return ret;
4287		mc.precharge++;
4288		cond_resched();
4289	}
4290	return 0;
4291}
4292
4293/**
4294 * get_mctgt_type - get target type of moving charge
4295 * @vma: the vma the pte to be checked belongs to
4296 * @addr: the address corresponding to the pte to be checked
4297 * @ptent: the pte to be checked
4298 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4299 *
4300 * Returns
4301 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
4302 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4303 *     move charge. If @target is not NULL, the page is stored in target->page
4304 *     with an extra refcount taken (callers should handle it).
4305 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4306 *     target for charge migration. If @target is not NULL, the entry is stored
4307 *     in target->ent.
4308 *
4309 * Called with pte lock held.
4310 */
4311union mc_target {
4312	struct page	*page;
4313	swp_entry_t	ent;
4314};
4315
4316enum mc_target_type {
4317	MC_TARGET_NONE = 0,
4318	MC_TARGET_PAGE,
4319	MC_TARGET_SWAP,
4320};
4321
4322static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4323						unsigned long addr, pte_t ptent)
4324{
4325	struct page *page = vm_normal_page(vma, addr, ptent);
4326
4327	if (!page || !page_mapped(page))
4328		return NULL;
4329	if (PageAnon(page)) {
4330		if (!(mc.flags & MOVE_ANON))
4331			return NULL;
4332	} else {
4333		if (!(mc.flags & MOVE_FILE))
4334			return NULL;
4335	}
4336	if (!get_page_unless_zero(page))
4337		return NULL;
4338
4339	return page;
4340}
4341
4342#ifdef CONFIG_SWAP
4343static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4344			unsigned long addr, pte_t ptent, swp_entry_t *entry)
4345{
4346	struct page *page = NULL;
4347	swp_entry_t ent = pte_to_swp_entry(ptent);
4348
4349	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4350		return NULL;
4351	/*
4352	 * Because lookup_swap_cache() updates some statistics counters,
4353	 * we call find_get_page() with swapper_space directly.
4354	 */
4355	page = find_get_page(swap_address_space(ent), ent.val);
4356	if (do_memsw_account())
4357		entry->val = ent.val;
4358
4359	return page;
4360}
4361#else
4362static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4363			unsigned long addr, pte_t ptent, swp_entry_t *entry)
4364{
4365	return NULL;
4366}
4367#endif
4368
4369static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4370			unsigned long addr, pte_t ptent, swp_entry_t *entry)
4371{
4372	struct page *page = NULL;
4373	struct address_space *mapping;
4374	pgoff_t pgoff;
4375
4376	if (!vma->vm_file) /* anonymous vma */
4377		return NULL;
4378	if (!(mc.flags & MOVE_FILE))
4379		return NULL;
4380
4381	mapping = vma->vm_file->f_mapping;
4382	pgoff = linear_page_index(vma, addr);
4383
4384	/* the page is moved even if it's not RSS of this task (page-faulted). */
4385#ifdef CONFIG_SWAP
4386	/* shmem/tmpfs may report page out on swap: account for that too. */
4387	if (shmem_mapping(mapping)) {
4388		page = find_get_entry(mapping, pgoff);
4389		if (radix_tree_exceptional_entry(page)) {
4390			swp_entry_t swp = radix_to_swp_entry(page);
4391			if (do_memsw_account())
4392				*entry = swp;
4393			page = find_get_page(swap_address_space(swp), swp.val);
4394		}
4395	} else
4396		page = find_get_page(mapping, pgoff);
4397#else
4398	page = find_get_page(mapping, pgoff);
4399#endif
4400	return page;
4401}
4402
4403/**
4404 * mem_cgroup_move_account - move account of the page
4405 * @page: the page
4406 * @compound: charge the page as compound or small page
4407 * @from: mem_cgroup which the page is moved from.
4408 * @to:	mem_cgroup which the page is moved to. @from != @to.
4409 *
4410 * The caller must make sure the page is not on the LRU (isolate_lru_page() is useful.)
4411 *
4412 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4413 * from old cgroup.
4414 */
4415static int mem_cgroup_move_account(struct page *page,
4416				   bool compound,
4417				   struct mem_cgroup *from,
4418				   struct mem_cgroup *to)
4419{
4420	unsigned long flags;
4421	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4422	int ret;
4423	bool anon;
4424
4425	VM_BUG_ON(from == to);
4426	VM_BUG_ON_PAGE(PageLRU(page), page);
4427	VM_BUG_ON(compound && !PageTransHuge(page));
4428
4429	/*
4430	 * Prevent mem_cgroup_migrate() from looking at
4431	 * page->mem_cgroup of its source page while we change it.
4432	 */
4433	ret = -EBUSY;
4434	if (!trylock_page(page))
4435		goto out;
4436
4437	ret = -EINVAL;
4438	if (page->mem_cgroup != from)
4439		goto out_unlock;
4440
4441	anon = PageAnon(page);
4442
4443	spin_lock_irqsave(&from->move_lock, flags);
4444
4445	if (!anon && page_mapped(page)) {
4446		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4447			       nr_pages);
4448		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4449			       nr_pages);
4450	}
4451
4452	/*
4453	 * move_lock is grabbed above and the caller set from->moving_account, so
4454	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
4455	 * The mapping should therefore be stable for dirty pages.
4456	 */
4457	if (!anon && PageDirty(page)) {
4458		struct address_space *mapping = page_mapping(page);
4459
4460		if (mapping_cap_account_dirty(mapping)) {
4461			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
4462				       nr_pages);
4463			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
4464				       nr_pages);
4465		}
4466	}
4467
4468	if (PageWriteback(page)) {
4469		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4470			       nr_pages);
4471		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4472			       nr_pages);
4473	}
4474
4475	/*
4476	 * It is safe to change page->mem_cgroup here because the page
4477	 * is referenced, charged, and isolated - we can't race with
4478	 * uncharging, charging, migration, or LRU putback.
4479	 */
4480
4481	/* caller should have done css_get */
4482	page->mem_cgroup = to;
4483	spin_unlock_irqrestore(&from->move_lock, flags);
4484
4485	ret = 0;
4486
4487	local_irq_disable();
4488	mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4489	memcg_check_events(to, page);
4490	mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4491	memcg_check_events(from, page);
4492	local_irq_enable();
4493out_unlock:
4494	unlock_page(page);
4495out:
4496	return ret;
4497}
4498
4499static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4500		unsigned long addr, pte_t ptent, union mc_target *target)
4501{
4502	struct page *page = NULL;
4503	enum mc_target_type ret = MC_TARGET_NONE;
4504	swp_entry_t ent = { .val = 0 };
4505
4506	if (pte_present(ptent))
4507		page = mc_handle_present_pte(vma, addr, ptent);
4508	else if (is_swap_pte(ptent))
4509		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4510	else if (pte_none(ptent))
4511		page = mc_handle_file_pte(vma, addr, ptent, &ent);
4512
4513	if (!page && !ent.val)
4514		return ret;
4515	if (page) {
4516		/*
4517		 * Only do a loose check without serialization.
4518		 * mem_cgroup_move_account() checks whether the page is
4519		 * valid under LRU exclusion.
4520		 */
4521		if (page->mem_cgroup == mc.from) {
4522			ret = MC_TARGET_PAGE;
4523			if (target)
4524				target->page = page;
4525		}
4526		if (!ret || !target)
4527			put_page(page);
4528	}
4529	/* There is a swap entry and a page doesn't exist or isn't charged */
4530	if (ent.val && !ret &&
4531	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4532		ret = MC_TARGET_SWAP;
4533		if (target)
4534			target->ent = ent;
4535	}
4536	return ret;
4537}
4538
4539#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4540/*
4541 * We don't consider swapping or file mapped pages because THP does not
4542 * support them for now.
4543 * Caller should make sure that pmd_trans_huge(pmd) is true.
4544 */
4545static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4546		unsigned long addr, pmd_t pmd, union mc_target *target)
4547{
4548	struct page *page = NULL;
4549	enum mc_target_type ret = MC_TARGET_NONE;
4550
4551	page = pmd_page(pmd);
4552	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4553	if (!(mc.flags & MOVE_ANON))
4554		return ret;
4555	if (page->mem_cgroup == mc.from) {
4556		ret = MC_TARGET_PAGE;
4557		if (target) {
4558			get_page(page);
4559			target->page = page;
4560		}
4561	}
4562	return ret;
4563}
4564#else
4565static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4566		unsigned long addr, pmd_t pmd, union mc_target *target)
4567{
4568	return MC_TARGET_NONE;
4569}
4570#endif
4571
4572static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4573					unsigned long addr, unsigned long end,
4574					struct mm_walk *walk)
4575{
4576	struct vm_area_struct *vma = walk->vma;
4577	pte_t *pte;
4578	spinlock_t *ptl;
4579
4580	ptl = pmd_trans_huge_lock(pmd, vma);
4581	if (ptl) {
4582		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4583			mc.precharge += HPAGE_PMD_NR;
4584		spin_unlock(ptl);
4585		return 0;
4586	}
4587
4588	if (pmd_trans_unstable(pmd))
4589		return 0;
4590	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4591	for (; addr != end; pte++, addr += PAGE_SIZE)
4592		if (get_mctgt_type(vma, addr, *pte, NULL))
4593			mc.precharge++;	/* increment precharge temporarily */
4594	pte_unmap_unlock(pte - 1, ptl);
4595	cond_resched();
4596
4597	return 0;
4598}
4599
4600static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4601{
4602	unsigned long precharge;
4603
4604	struct mm_walk mem_cgroup_count_precharge_walk = {
4605		.pmd_entry = mem_cgroup_count_precharge_pte_range,
4606		.mm = mm,
4607	};
4608	down_read(&mm->mmap_sem);
4609	walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
4610	up_read(&mm->mmap_sem);
4611
4612	precharge = mc.precharge;
4613	mc.precharge = 0;
4614
4615	return precharge;
4616}
4617
4618static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4619{
4620	unsigned long precharge = mem_cgroup_count_precharge(mm);
4621
4622	VM_BUG_ON(mc.moving_task);
4623	mc.moving_task = current;
4624	return mem_cgroup_do_precharge(precharge);
4625}
4626
4627/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4628static void __mem_cgroup_clear_mc(void)
4629{
4630	struct mem_cgroup *from = mc.from;
4631	struct mem_cgroup *to = mc.to;
4632
4633	/* we must uncharge all the leftover precharges from mc.to */
4634	if (mc.precharge) {
4635		cancel_charge(mc.to, mc.precharge);
4636		mc.precharge = 0;
4637	}
4638	/*
4639	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4640	 * we must uncharge here.
4641	 */
4642	if (mc.moved_charge) {
4643		cancel_charge(mc.from, mc.moved_charge);
4644		mc.moved_charge = 0;
4645	}
4646	/* we must fixup refcnts and charges */
4647	if (mc.moved_swap) {
4648		/* uncharge swap account from the old cgroup */
4649		if (!mem_cgroup_is_root(mc.from))
4650			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
4651
4652		/*
4653		 * we charged both to->memory and to->memsw, so we
4654		 * should uncharge to->memory.
4655		 */
4656		if (!mem_cgroup_is_root(mc.to))
4657			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
4658
4659		css_put_many(&mc.from->css, mc.moved_swap);
4660
4661		/* we've already done css_get(mc.to) */
4662		mc.moved_swap = 0;
4663	}
4664	memcg_oom_recover(from);
4665	memcg_oom_recover(to);
4666	wake_up_all(&mc.waitq);
4667}
4668
4669static void mem_cgroup_clear_mc(void)
4670{
4671	struct mm_struct *mm = mc.mm;
4672
4673	/*
4674	 * we must clear moving_task before waking up waiters at the end of
4675	 * task migration.
4676	 */
4677	mc.moving_task = NULL;
4678	__mem_cgroup_clear_mc();
4679	spin_lock(&mc.lock);
4680	mc.from = NULL;
4681	mc.to = NULL;
4682	mc.mm = NULL;
4683	spin_unlock(&mc.lock);
4684
4685	mmput(mm);
4686}
4687
4688static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
4689{
4690	struct cgroup_subsys_state *css;
4691	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
4692	struct mem_cgroup *from;
4693	struct task_struct *leader, *p;
4694	struct mm_struct *mm;
4695	unsigned long move_flags;
4696	int ret = 0;
4697
4698	/* charge immigration isn't supported on the default hierarchy */
4699	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
4700		return 0;
4701
4702	/*
4703	 * Multi-process migrations only happen on the default hierarchy
4704	 * where charge immigration is not used.  Perform charge
4705	 * immigration if @tset contains a leader and whine if there are
4706	 * multiple.
4707	 */
4708	p = NULL;
4709	cgroup_taskset_for_each_leader(leader, css, tset) {
4710		WARN_ON_ONCE(p);
4711		p = leader;
4712		memcg = mem_cgroup_from_css(css);
4713	}
4714	if (!p)
4715		return 0;
4716
4717	/*
4718	 * We are now committed to this value whatever it is. Changes in this
4719	 * tunable will only affect upcoming migrations, not the current one.
4720	 * So we need to save it, and keep it going.
4721	 */
4722	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
4723	if (!move_flags)
4724		return 0;
4725
4726	from = mem_cgroup_from_task(p);
4727
4728	VM_BUG_ON(from == memcg);
4729
4730	mm = get_task_mm(p);
4731	if (!mm)
4732		return 0;
4733	/* We move charges only when we move an owner of the mm */
4734	if (mm->owner == p) {
4735		VM_BUG_ON(mc.from);
4736		VM_BUG_ON(mc.to);
4737		VM_BUG_ON(mc.precharge);
4738		VM_BUG_ON(mc.moved_charge);
4739		VM_BUG_ON(mc.moved_swap);
4740
4741		spin_lock(&mc.lock);
4742		mc.mm = mm;
4743		mc.from = from;
4744		mc.to = memcg;
4745		mc.flags = move_flags;
4746		spin_unlock(&mc.lock);
4747		/* We set mc.moving_task later */
4748
4749		ret = mem_cgroup_precharge_mc(mm);
4750		if (ret)
4751			mem_cgroup_clear_mc();
4752	} else {
4753		mmput(mm);
4754	}
4755	return ret;
4756}
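/*
 * Usage note for the charge-moving machinery above (legacy hierarchy
 * only; the cgroup path is an assumption made for the example): charge
 * moving is opt-in per destination cgroup, e.g.
 *
 *	echo 3 > /sys/fs/cgroup/memory/dst/memory.move_charge_at_immigrate
 *
 * sets MOVE_ANON | MOVE_FILE (bits 0 and 1), so both anonymous and file
 * pages of a task follow it when it is migrated into "dst".
 */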
4757
4758static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
4759{
4760	if (mc.to)
4761		mem_cgroup_clear_mc();
4762}
4763
4764static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4765				unsigned long addr, unsigned long end,
4766				struct mm_walk *walk)
4767{
4768	int ret = 0;
4769	struct vm_area_struct *vma = walk->vma;
4770	pte_t *pte;
4771	spinlock_t *ptl;
4772	enum mc_target_type target_type;
4773	union mc_target target;
4774	struct page *page;
4775
4776	ptl = pmd_trans_huge_lock(pmd, vma);
4777	if (ptl) {
4778		if (mc.precharge < HPAGE_PMD_NR) {
4779			spin_unlock(ptl);
4780			return 0;
4781		}
4782		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
4783		if (target_type == MC_TARGET_PAGE) {
4784			page = target.page;
4785			if (!isolate_lru_page(page)) {
4786				if (!mem_cgroup_move_account(page, true,
4787							     mc.from, mc.to)) {
4788					mc.precharge -= HPAGE_PMD_NR;
4789					mc.moved_charge += HPAGE_PMD_NR;
4790				}
4791				putback_lru_page(page);
4792			}
4793			put_page(page);
4794		}
4795		spin_unlock(ptl);
4796		return 0;
4797	}
4798
4799	if (pmd_trans_unstable(pmd))
4800		return 0;
4801retry:
4802	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4803	for (; addr != end; addr += PAGE_SIZE) {
4804		pte_t ptent = *(pte++);
4805		swp_entry_t ent;
4806
4807		if (!mc.precharge)
4808			break;
4809
4810		switch (get_mctgt_type(vma, addr, ptent, &target)) {
4811		case MC_TARGET_PAGE:
4812			page = target.page;
4813			/*
4814			 * We can have a part of the split pmd here. Moving it
4815			 * can be done but it would be too convoluted so simply
4816			 * ignore such a partial THP and keep it in the original
4817			 * memcg. There should be somebody mapping the head.
4818			 */
4819			if (PageTransCompound(page))
4820				goto put;
4821			if (isolate_lru_page(page))
4822				goto put;
4823			if (!mem_cgroup_move_account(page, false,
4824						mc.from, mc.to)) {
4825				mc.precharge--;
4826				/* we uncharge from mc.from later. */
4827				mc.moved_charge++;
4828			}
4829			putback_lru_page(page);
4830put:			/* get_mctgt_type() gets the page */
4831			put_page(page);
4832			break;
4833		case MC_TARGET_SWAP:
4834			ent = target.ent;
4835			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
4836				mc.precharge--;
4837				/* we fixup refcnts and charges later. */
4838				mc.moved_swap++;
4839			}
4840			break;
4841		default:
4842			break;
4843		}
4844	}
4845	pte_unmap_unlock(pte - 1, ptl);
4846	cond_resched();
4847
4848	if (addr != end) {
4849		/*
4850		 * We have consumed all precharges we got in can_attach().
4851		 * We try charge one by one, but don't do any additional
4852		 * charges to mc.to if we have failed in charge once in attach()
4853		 * phase.
4854		 */
4855		ret = mem_cgroup_do_precharge(1);
4856		if (!ret)
4857			goto retry;
4858	}
4859
4860	return ret;
4861}
4862
4863static void mem_cgroup_move_charge(void)
4864{
4865	struct mm_walk mem_cgroup_move_charge_walk = {
4866		.pmd_entry = mem_cgroup_move_charge_pte_range,
4867		.mm = mc.mm,
4868	};
4869
4870	lru_add_drain_all();
4871	/*
4872	 * Signal lock_page_memcg() to take the memcg's move_lock
4873	 * while we're moving its pages to another memcg. Then wait
4874	 * for already started RCU-only updates to finish.
4875	 */
4876	atomic_inc(&mc.from->moving_account);
4877	synchronize_rcu();
4878retry:
4879	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
4880		/*
4881		 * Someone holding the mmap_sem might be waiting on the
4882		 * waitq. So we cancel all extra charges, wake up all waiters,
4883		 * and retry. Because we cancel precharges, we might not be able
4884		 * to move enough charges, but moving charge is a best-effort
4885		 * feature anyway, so it wouldn't be a big problem.
4886		 */
4887		__mem_cgroup_clear_mc();
4888		cond_resched();
4889		goto retry;
4890	}
4891	/*
4892	 * When we have consumed all precharges and failed in doing
4893	 * additional charge, the page walk just aborts.
4894	 */
4895	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
4896	up_read(&mc.mm->mmap_sem);
4897	atomic_dec(&mc.from->moving_account);
4898}
4899
4900static void mem_cgroup_move_task(void)
4901{
4902	if (mc.to) {
4903		mem_cgroup_move_charge();
4904		mem_cgroup_clear_mc();
4905	}
4906}
4907#else	/* !CONFIG_MMU */
4908static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
4909{
4910	return 0;
4911}
4912static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
4913{
4914}
4915static void mem_cgroup_move_task(void)
4916{
4917}
4918#endif
4919
4920/*
4921 * Cgroup retains root cgroups across [un]mount cycles making it necessary
4922 * to verify whether we're attached to the default hierarchy on each mount
4923 * attempt.
4924 */
4925static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
4926{
4927	/*
4928	 * use_hierarchy is forced on the default hierarchy.  cgroup core
4929	 * guarantees that @root doesn't have any children, so turning it
4930	 * on for the root memcg is enough.
4931	 */
4932	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
4933		root_mem_cgroup->use_hierarchy = true;
4934	else
4935		root_mem_cgroup->use_hierarchy = false;
4936}
4937
4938static u64 memory_current_read(struct cgroup_subsys_state *css,
4939			       struct cftype *cft)
4940{
4941	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4942
4943	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
4944}
4945
4946static int memory_low_show(struct seq_file *m, void *v)
4947{
4948	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4949	unsigned long low = READ_ONCE(memcg->low);
4950
4951	if (low == PAGE_COUNTER_MAX)
4952		seq_puts(m, "max\n");
4953	else
4954		seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
4955
4956	return 0;
4957}
4958
4959static ssize_t memory_low_write(struct kernfs_open_file *of,
4960				char *buf, size_t nbytes, loff_t off)
4961{
4962	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4963	unsigned long low;
4964	int err;
4965
4966	buf = strstrip(buf);
4967	err = page_counter_memparse(buf, "max", &low);
4968	if (err)
4969		return err;
4970
4971	memcg->low = low;
4972
4973	return nbytes;
4974}
4975
4976static int memory_high_show(struct seq_file *m, void *v)
4977{
4978	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4979	unsigned long high = READ_ONCE(memcg->high);
4980
4981	if (high == PAGE_COUNTER_MAX)
4982		seq_puts(m, "max\n");
4983	else
4984		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
4985
4986	return 0;
4987}
4988
4989static ssize_t memory_high_write(struct kernfs_open_file *of,
4990				 char *buf, size_t nbytes, loff_t off)
4991{
4992	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4993	unsigned long nr_pages;
4994	unsigned long high;
4995	int err;
4996
4997	buf = strstrip(buf);
4998	err = page_counter_memparse(buf, "max", &high);
4999	if (err)
5000		return err;
5001
5002	memcg->high = high;
5003
5004	nr_pages = page_counter_read(&memcg->memory);
5005	if (nr_pages > high)
5006		try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5007					     GFP_KERNEL, true);
5008
5009	memcg_wb_domain_size_changed(memcg);
5010	return nbytes;
5011}
5012
5013static int memory_max_show(struct seq_file *m, void *v)
5014{
5015	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5016	unsigned long max = READ_ONCE(memcg->memory.limit);
5017
5018	if (max == PAGE_COUNTER_MAX)
5019		seq_puts(m, "max\n");
5020	else
5021		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5022
5023	return 0;
5024}
5025
5026static ssize_t memory_max_write(struct kernfs_open_file *of,
5027				char *buf, size_t nbytes, loff_t off)
5028{
5029	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5030	unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
5031	bool drained = false;
5032	unsigned long max;
5033	int err;
5034
5035	buf = strstrip(buf);
5036	err = page_counter_memparse(buf, "max", &max);
5037	if (err)
5038		return err;
5039
5040	xchg(&memcg->memory.limit, max);
5041
5042	for (;;) {
5043		unsigned long nr_pages = page_counter_read(&memcg->memory);
5044
5045		if (nr_pages <= max)
5046			break;
5047
5048		if (signal_pending(current)) {
5049			err = -EINTR;
5050			break;
5051		}
5052
5053		if (!drained) {
5054			drain_all_stock(memcg);
5055			drained = true;
5056			continue;
5057		}
5058
5059		if (nr_reclaims) {
5060			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
5061							  GFP_KERNEL, true))
5062				nr_reclaims--;
5063			continue;
5064		}
5065
5066		mem_cgroup_events(memcg, MEMCG_OOM, 1);
5067		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
5068			break;
5069	}
5070
5071	memcg_wb_domain_size_changed(memcg);
5072	return nbytes;
5073}
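/*
 * Usage note for the cgroup2 interface files above (the mount point is
 * an assumption made for the example).  page_counter_memparse() accepts
 * either a byte value, with the usual K/M/G/... suffixes via memparse(),
 * or the literal string "max":
 *
 *	echo 512M > /sys/fs/cgroup/mygroup/memory.high
 *	echo max  > /sys/fs/cgroup/mygroup/memory.max
 *
 * Breaching "high" triggers reclaim but never the OOM killer, whereas
 * "max" is the hard limit enforced above with reclaim and, as a last
 * resort, the OOM killer.
 */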
5074
5075static int memory_events_show(struct seq_file *m, void *v)
5076{
5077	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5078
5079	seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5080	seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5081	seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5082	seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5083
5084	return 0;
5085}
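/*
 * Example memory.events output produced above (the values are
 * illustrative only):
 *
 *	low 0
 *	high 1204
 *	max 17
 *	oom 0
 */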
5086
5087static int memory_stat_show(struct seq_file *m, void *v)
5088{
5089	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5090	unsigned long stat[MEMCG_NR_STAT];
5091	unsigned long events[MEMCG_NR_EVENTS];
5092	int i;
5093
5094	/*
5095	 * Provide statistics on the state of the memory subsystem as
5096	 * well as cumulative event counters that show past behavior.
5097	 *
5098	 * This list is ordered following a combination of these gradients:
5099	 * 1) generic big picture -> specifics and details
5100	 * 2) reflecting userspace activity -> reflecting kernel heuristics
5101	 *
5102	 * Current memory state:
5103	 */
5104
5105	tree_stat(memcg, stat);
5106	tree_events(memcg, events);
5107
5108	seq_printf(m, "anon %llu\n",
5109		   (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
5110	seq_printf(m, "file %llu\n",
5111		   (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
5112	seq_printf(m, "kernel_stack %llu\n",
5113		   (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
5114	seq_printf(m, "slab %llu\n",
5115		   (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
5116			 stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5117	seq_printf(m, "sock %llu\n",
5118		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
5119
5120	seq_printf(m, "file_mapped %llu\n",
5121		   (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
5122	seq_printf(m, "file_dirty %llu\n",
5123		   (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
5124	seq_printf(m, "file_writeback %llu\n",
5125		   (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
5126
5127	for (i = 0; i < NR_LRU_LISTS; i++) {
5128		struct mem_cgroup *mi;
5129		unsigned long val = 0;
5130
5131		for_each_mem_cgroup_tree(mi, memcg)
5132			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
5133		seq_printf(m, "%s %llu\n",
5134			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
5135	}
5136
5137	seq_printf(m, "slab_reclaimable %llu\n",
5138		   (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
5139	seq_printf(m, "slab_unreclaimable %llu\n",
5140		   (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5141
5142	/* Accumulated memory events */
5143
5144	seq_printf(m, "pgfault %lu\n",
5145		   events[MEM_CGROUP_EVENTS_PGFAULT]);
5146	seq_printf(m, "pgmajfault %lu\n",
5147		   events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
5148
5149	return 0;
5150}
5151
5152static struct cftype memory_files[] = {
5153	{
5154		.name = "current",
5155		.flags = CFTYPE_NOT_ON_ROOT,
5156		.read_u64 = memory_current_read,
5157	},
5158	{
5159		.name = "low",
5160		.flags = CFTYPE_NOT_ON_ROOT,
5161		.seq_show = memory_low_show,
5162		.write = memory_low_write,
5163	},
5164	{
5165		.name = "high",
5166		.flags = CFTYPE_NOT_ON_ROOT,
5167		.seq_show = memory_high_show,
5168		.write = memory_high_write,
5169	},
5170	{
5171		.name = "max",
5172		.flags = CFTYPE_NOT_ON_ROOT,
5173		.seq_show = memory_max_show,
5174		.write = memory_max_write,
5175	},
5176	{
5177		.name = "events",
5178		.flags = CFTYPE_NOT_ON_ROOT,
5179		.file_offset = offsetof(struct mem_cgroup, events_file),
5180		.seq_show = memory_events_show,
5181	},
5182	{
5183		.name = "stat",
5184		.flags = CFTYPE_NOT_ON_ROOT,
5185		.seq_show = memory_stat_show,
5186	},
5187	{ }	/* terminate */
5188};
5189
5190struct cgroup_subsys memory_cgrp_subsys = {
5191	.css_alloc = mem_cgroup_css_alloc,
5192	.css_online = mem_cgroup_css_online,
5193	.css_offline = mem_cgroup_css_offline,
5194	.css_released = mem_cgroup_css_released,
5195	.css_free = mem_cgroup_css_free,
5196	.css_reset = mem_cgroup_css_reset,
5197	.can_attach = mem_cgroup_can_attach,
5198	.cancel_attach = mem_cgroup_cancel_attach,
5199	.post_attach = mem_cgroup_move_task,
5200	.bind = mem_cgroup_bind,
5201	.dfl_cftypes = memory_files,
5202	.legacy_cftypes = mem_cgroup_legacy_files,
5203	.early_init = 0,
5204};
5205
5206/**
5207 * mem_cgroup_low - check if memory consumption is below the normal range
5208 * @root: the highest ancestor to consider
5209 * @memcg: the memory cgroup to check
5210 *
5211 * Returns %true if memory consumption of @memcg, and that of all
5212 * configurable ancestors up to @root, is below the normal range.
5213 */
5214bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5215{
5216	if (mem_cgroup_disabled())
5217		return false;
5218
5219	/*
5220	 * The toplevel group doesn't have a configurable range, so
5221	 * it's never low when looked at directly, and it is not
5222	 * considered an ancestor when assessing the hierarchy.
5223	 */
5224
5225	if (memcg == root_mem_cgroup)
5226		return false;
5227
5228	if (page_counter_read(&memcg->memory) >= memcg->low)
5229		return false;
5230
5231	while (memcg != root) {
5232		memcg = parent_mem_cgroup(memcg);
5233
5234		if (memcg == root_mem_cgroup)
5235			break;
5236
5237		if (page_counter_read(&memcg->memory) >= memcg->low)
5238			return false;
5239	}
5240	return true;
5241}
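/*
 * Example of the check above (an illustrative configuration): given the
 * hierarchy root -> A (low=2G, usage=3G) -> B (low=1G, usage=512M),
 * mem_cgroup_low(root, B) returns false even though B itself is below
 * its low boundary, because its ancestor A is not - every configurable
 * ancestor up to @root must be within its own low setting.
 */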
5242
5243/**
5244 * mem_cgroup_try_charge - try charging a page
5245 * @page: page to charge
5246 * @mm: mm context of the victim
5247 * @gfp_mask: reclaim mode
5248 * @memcgp: charged memcg return
5249 *
5250 * Try to charge @page to the memcg that @mm belongs to, reclaiming
5251 * pages according to @gfp_mask if necessary.
5252 *
5253 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5254 * Otherwise, an error code is returned.
5255 *
5256 * After page->mapping has been set up, the caller must finalize the
5257 * charge with mem_cgroup_commit_charge().  Or abort the transaction
5258 * with mem_cgroup_cancel_charge() in case page instantiation fails.
5259 */
5260int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5261			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
5262			  bool compound)
5263{
5264	struct mem_cgroup *memcg = NULL;
5265	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5266	int ret = 0;
5267
5268	if (mem_cgroup_disabled())
5269		goto out;
5270
5271	if (PageSwapCache(page)) {
5272		/*
5273		 * Every swap fault against a single page tries to charge the
5274		 * page, bail as early as possible.  shmem_unuse() encounters
5275		 * already charged pages, too.  The USED bit is protected by
5276		 * the page lock, which serializes swap cache removal, which
5277		 * in turn serializes uncharging.
5278		 */
5279		VM_BUG_ON_PAGE(!PageLocked(page), page);
5280		if (page->mem_cgroup)
5281			goto out;
5282
5283		if (do_swap_account) {
5284			swp_entry_t ent = { .val = page_private(page), };
5285			unsigned short id = lookup_swap_cgroup_id(ent);
5286
5287			rcu_read_lock();
5288			memcg = mem_cgroup_from_id(id);
5289			if (memcg && !css_tryget_online(&memcg->css))
5290				memcg = NULL;
5291			rcu_read_unlock();
5292		}
5293	}
5294
5295	if (!memcg)
5296		memcg = get_mem_cgroup_from_mm(mm);
5297
5298	ret = try_charge(memcg, gfp_mask, nr_pages);
5299
5300	css_put(&memcg->css);
5301out:
5302	*memcgp = memcg;
5303	return ret;
5304}
5305
5306/**
5307 * mem_cgroup_commit_charge - commit a page charge
5308 * @page: page to charge
5309 * @memcg: memcg to charge the page to
5310 * @lrucare: page might be on LRU already
5311 *
5312 * Finalize a charge transaction started by mem_cgroup_try_charge(),
5313 * after page->mapping has been set up.  This must happen atomically
5314 * as part of the page instantiation, i.e. under the page table lock
5315 * for anonymous pages, under the page lock for page and swap cache.
5316 *
5317 * In addition, the page must not be on the LRU during the commit, to
5318 * prevent racing with task migration.  If it might be, use @lrucare.
5319 *
5320 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5321 */
5322void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5323			      bool lrucare, bool compound)
5324{
5325	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5326
5327	VM_BUG_ON_PAGE(!page->mapping, page);
5328	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5329
5330	if (mem_cgroup_disabled())
5331		return;
5332	/*
5333	 * Swap faults will attempt to charge the same page multiple
5334	 * times.  But reuse_swap_page() might have removed the page
5335	 * from swapcache already, so we can't check PageSwapCache().
5336	 */
5337	if (!memcg)
5338		return;
5339
5340	commit_charge(page, memcg, lrucare);
5341
5342	local_irq_disable();
5343	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5344	memcg_check_events(memcg, page);
5345	local_irq_enable();
5346
5347	if (do_memsw_account() && PageSwapCache(page)) {
5348		swp_entry_t entry = { .val = page_private(page) };
5349		/*
5350		 * The swap entry might not get freed for a long time,
5351		 * let's not wait for it.  The page already received a
5352		 * memory+swap charge, drop the swap entry duplicate.
5353		 */
5354		mem_cgroup_uncharge_swap(entry);
5355	}
5356}
5357
5358/**
5359 * mem_cgroup_cancel_charge - cancel a page charge
5360 * @page: page to charge
5361 * @memcg: memcg to charge the page to
5362 *
5363 * Cancel a charge transaction started by mem_cgroup_try_charge().
5364 */
5365void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5366		bool compound)
5367{
5368	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5369
5370	if (mem_cgroup_disabled())
5371		return;
5372	/*
5373	 * Swap faults will attempt to charge the same page multiple
5374	 * times.  But reuse_swap_page() might have removed the page
5375	 * from swapcache already, so we can't check PageSwapCache().
5376	 */
5377	if (!memcg)
5378		return;
5379
5380	cancel_charge(memcg, nr_pages);
5381}
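/*
 * Sketch of the charge transaction from a caller's point of view
 * (modeled loosely on the anonymous fault path; names and labels are
 * illustrative, not copied from any particular caller):
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
 *		goto oom;
 *	...set up page->mapping or the page table entry...
 *	if (instantiation_failed) {
 *		mem_cgroup_cancel_charge(page, memcg, false);
 *		goto release;
 *	}
 *	mem_cgroup_commit_charge(page, memcg, false, false);
 */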
5382
5383static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
5384			   unsigned long nr_anon, unsigned long nr_file,
5385			   unsigned long nr_huge, struct page *dummy_page)
5386{
5387	unsigned long nr_pages = nr_anon + nr_file;
5388	unsigned long flags;
5389
5390	if (!mem_cgroup_is_root(memcg)) {
5391		page_counter_uncharge(&memcg->memory, nr_pages);
5392		if (do_memsw_account())
5393			page_counter_uncharge(&memcg->memsw, nr_pages);
5394		memcg_oom_recover(memcg);
5395	}
5396
5397	local_irq_save(flags);
5398	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
5399	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
5400	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
5401	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
5402	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
5403	memcg_check_events(memcg, dummy_page);
5404	local_irq_restore(flags);
5405
5406	if (!mem_cgroup_is_root(memcg))
5407		css_put_many(&memcg->css, nr_pages);
5408}
5409
5410static void uncharge_list(struct list_head *page_list)
5411{
5412	struct mem_cgroup *memcg = NULL;
5413	unsigned long nr_anon = 0;
5414	unsigned long nr_file = 0;
5415	unsigned long nr_huge = 0;
5416	unsigned long pgpgout = 0;
5417	struct list_head *next;
5418	struct page *page;
5419
5420	/*
5421	 * Note that the list can be a single page->lru; hence the
5422	 * do-while loop instead of a simple list_for_each_entry().
5423	 */
5424	next = page_list->next;
5425	do {
5426		unsigned int nr_pages = 1;
5427
5428		page = list_entry(next, struct page, lru);
5429		next = page->lru.next;
5430
5431		VM_BUG_ON_PAGE(PageLRU(page), page);
5432		VM_BUG_ON_PAGE(page_count(page), page);
5433
5434		if (!page->mem_cgroup)
5435			continue;
5436
5437		/*
5438		 * Nobody should be changing or seriously looking at
5439		 * page->mem_cgroup at this point, we have fully
5440		 * exclusive access to the page.
5441		 */
5442
5443		if (memcg != page->mem_cgroup) {
5444			if (memcg) {
5445				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5446					       nr_huge, page);
5447				pgpgout = nr_anon = nr_file = nr_huge = 0;
5448			}
5449			memcg = page->mem_cgroup;
5450		}
5451
5452		if (PageTransHuge(page)) {
5453			nr_pages <<= compound_order(page);
5454			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5455			nr_huge += nr_pages;
5456		}
5457
5458		if (PageAnon(page))
5459			nr_anon += nr_pages;
5460		else
5461			nr_file += nr_pages;
5462
5463		page->mem_cgroup = NULL;
5464
5465		pgpgout++;
5466	} while (next != page_list);
5467
5468	if (memcg)
5469		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5470			       nr_huge, page);
5471}
5472
5473/**
5474 * mem_cgroup_uncharge - uncharge a page
5475 * @page: page to uncharge
5476 *
5477 * Uncharge a page previously charged with mem_cgroup_try_charge() and
5478 * mem_cgroup_commit_charge().
5479 */
5480void mem_cgroup_uncharge(struct page *page)
5481{
5482	if (mem_cgroup_disabled())
5483		return;
5484
5485	/* Don't touch page->lru of any random page, pre-check: */
5486	if (!page->mem_cgroup)
5487		return;
5488
5489	INIT_LIST_HEAD(&page->lru);
5490	uncharge_list(&page->lru);
5491}
5492
5493/**
5494 * mem_cgroup_uncharge_list - uncharge a list of page
5495 * @page_list: list of pages to uncharge
5496 *
5497 * Uncharge a list of pages previously charged with
5498 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
5499 */
5500void mem_cgroup_uncharge_list(struct list_head *page_list)
5501{
5502	if (mem_cgroup_disabled())
5503		return;
5504
5505	if (!list_empty(page_list))
5506		uncharge_list(page_list);
5507}
5508
5509/**
5510 * mem_cgroup_migrate - charge a page's replacement
5511 * @oldpage: currently circulating page
5512 * @newpage: replacement page
5513 *
5514 * Charge @newpage as a replacement page for @oldpage. @oldpage will
5515 * be uncharged upon free.
5516 *
5517 * Both pages must be locked, @newpage->mapping must be set up.
5518 */
5519void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
5520{
5521	struct mem_cgroup *memcg;
5522	unsigned int nr_pages;
5523	bool compound;
5524
5525	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
5526	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
5527	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
5528	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
5529		       newpage);
5530
5531	if (mem_cgroup_disabled())
5532		return;
5533
5534	/* Page cache replacement: new page already charged? */
5535	if (newpage->mem_cgroup)
5536		return;
5537
5538	/* Swapcache readahead pages can get replaced before being charged */
5539	memcg = oldpage->mem_cgroup;
5540	if (!memcg)
5541		return;
5542
5543	/* Force-charge the new page. The old one will be freed soon */
5544	compound = PageTransHuge(newpage);
5545	nr_pages = compound ? hpage_nr_pages(newpage) : 1;
5546
5547	page_counter_charge(&memcg->memory, nr_pages);
5548	if (do_memsw_account())
5549		page_counter_charge(&memcg->memsw, nr_pages);
5550	css_get_many(&memcg->css, nr_pages);
5551
5552	commit_charge(newpage, memcg, false);
5553
5554	local_irq_disable();
5555	mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
5556	memcg_check_events(memcg, newpage);
5557	local_irq_enable();
5558}
5559
5560DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
5561EXPORT_SYMBOL(memcg_sockets_enabled_key);
5562
5563void sock_update_memcg(struct sock *sk)
5564{
5565	struct mem_cgroup *memcg;
5566
5567	/* Socket cloning can throw us here with sk->sk_memcg already
5568	 * filled. It won't, however, necessarily happen from
5569	 * process context. So the test for root memcg given
5570	 * the current task's memcg won't help us in this case.
5571	 *
5572	 * Respecting the original socket's memcg is a better
5573	 * decision in this case.
5574	 */
5575	if (sk->sk_memcg) {
5576		BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
5577		css_get(&sk->sk_memcg->css);
5578		return;
5579	}
5580
5581	rcu_read_lock();
5582	memcg = mem_cgroup_from_task(current);
5583	if (memcg == root_mem_cgroup)
5584		goto out;
5585	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
5586		goto out;
5587	if (css_tryget_online(&memcg->css))
5588		sk->sk_memcg = memcg;
5589out:
5590	rcu_read_unlock();
5591}
5592EXPORT_SYMBOL(sock_update_memcg);
5593
5594void sock_release_memcg(struct sock *sk)
5595{
5596	WARN_ON(!sk->sk_memcg);
5597	css_put(&sk->sk_memcg->css);
5598}
5599
5600/**
5601 * mem_cgroup_charge_skmem - charge socket memory
5602 * @memcg: memcg to charge
5603 * @nr_pages: number of pages to charge
5604 *
5605 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
5606 * @memcg's configured limit, %false if the charge had to be forced.
5607 */
5608bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5609{
5610	gfp_t gfp_mask = GFP_KERNEL;
5611
5612	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5613		struct page_counter *fail;
5614
5615		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
5616			memcg->tcpmem_pressure = 0;
5617			return true;
5618		}
5619		page_counter_charge(&memcg->tcpmem, nr_pages);
5620		memcg->tcpmem_pressure = 1;
5621		return false;
5622	}
5623
5624	/* Don't block in the packet receive path */
5625	if (in_softirq())
5626		gfp_mask = GFP_NOWAIT;
5627
5628	this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
5629
5630	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
5631		return true;
5632
5633	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
5634	return false;
5635}
5636
5637/**
5638 * mem_cgroup_uncharge_skmem - uncharge socket memory
5639 * @memcg: memcg to uncharge
5640 * @nr_pages: number of pages to uncharge
5641 */
5642void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5643{
5644	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5645		page_counter_uncharge(&memcg->tcpmem, nr_pages);
5646		return;
5647	}
5648
5649	this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
5650
5651	page_counter_uncharge(&memcg->memory, nr_pages);
5652	css_put_many(&memcg->css, nr_pages);
5653}
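/*
 * Illustrative sketch (not part of the original source): it shows the pairing
 * a networking caller is expected to follow - charge sk->sk_memcg when memory
 * is committed to a socket and uncharge the same number of pages on release.
 * The sk_doc_example_*() helpers are hypothetical.
 */
static bool __maybe_unused sk_doc_example_commit(struct sock *sk,
						 unsigned int nr_pages)
{
	if (!sk->sk_memcg)
		return true;
	/* false means the charge only succeeded by being forced */
	return mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages);
}

static void __maybe_unused sk_doc_example_release(struct sock *sk,
						  unsigned int nr_pages)
{
	if (sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
}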
5654
5655static int __init cgroup_memory(char *s)
5656{
5657	char *token;
5658
5659	while ((token = strsep(&s, ",")) != NULL) {
5660		if (!*token)
5661			continue;
5662		if (!strcmp(token, "nosocket"))
5663			cgroup_memory_nosocket = true;
5664		if (!strcmp(token, "nokmem"))
5665			cgroup_memory_nokmem = true;
5666	}
5667	return 0;
5668}
5669__setup("cgroup.memory=", cgroup_memory);
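/*
 * Usage note (illustrative): booting with "cgroup.memory=nosocket,nokmem" on
 * the kernel command line makes the parser above set both
 * cgroup_memory_nosocket and cgroup_memory_nokmem; unrecognized tokens are
 * silently ignored.
 */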
5670
5671/*
5672 * subsys_initcall() for memory controller.
5673 *
5674 * Some parts like hotcpu_notifier() have to be initialized from this context
5675 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
5676 * everything that doesn't depend on a specific mem_cgroup structure should
5677 * be initialized from here.
5678 */
5679static int __init mem_cgroup_init(void)
5680{
5681	int cpu, node;
5682
5683	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 
5684
5685	for_each_possible_cpu(cpu)
5686		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5687			  drain_local_stock);
5688
5689	for_each_node(node) {
5690		struct mem_cgroup_tree_per_node *rtpn;
5691		int zone;
5692
5693		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5694				    node_online(node) ? node : NUMA_NO_NODE);
5695
5696		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5697			struct mem_cgroup_tree_per_zone *rtpz;
5698
5699			rtpz = &rtpn->rb_tree_per_zone[zone];
5700			rtpz->rb_root = RB_ROOT;
5701			spin_lock_init(&rtpz->lock);
5702		}
5703		soft_limit_tree.rb_tree_per_node[node] = rtpn;
5704	}
5705
5706	return 0;
5707}
5708subsys_initcall(mem_cgroup_init);
5709
5710#ifdef CONFIG_MEMCG_SWAP
5711/**
5712 * mem_cgroup_swapout - transfer a memsw charge to swap
5713 * @page: page whose memsw charge to transfer
5714 * @entry: swap entry to move the charge to
5715 *
5716 * Transfer the memsw charge of @page to @entry.
5717 */
5718void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5719{
5720	struct mem_cgroup *memcg;
 
5721	unsigned short oldid;
5722
5723	VM_BUG_ON_PAGE(PageLRU(page), page);
5724	VM_BUG_ON_PAGE(page_count(page), page);
5725
5726	if (!do_memsw_account())
5727		return;
5728
5729	memcg = page->mem_cgroup;
5730
5731	/* Readahead page, never charged */
5732	if (!memcg)
5733		return;
5734
5735	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5736	VM_BUG_ON_PAGE(oldid, page);
5737	mem_cgroup_swap_statistics(memcg, true);
5738
5739	page->mem_cgroup = NULL;
5740
5741	if (!mem_cgroup_is_root(memcg))
5742		page_counter_uncharge(&memcg->memory, 1);
5743
5744	/*
5745	 * Interrupts should be disabled here because the caller holds the
5746	 * mapping->tree_lock lock which is taken with interrupts-off. It is
5747	 * important here to have the interrupts disabled because it is the
5748	 * only synchronisation we have for updating the per-CPU variables.
5749	 */
5750	VM_BUG_ON(!irqs_disabled());
5751	mem_cgroup_charge_statistics(memcg, page, false, -1);
5752	memcg_check_events(memcg, page);
 
 
5753}
5754
5755/*
5756 * mem_cgroup_try_charge_swap - try charging a swap entry
5757 * @page: page being added to swap
5758 * @entry: swap entry to charge
5759 *
5760 * Try to charge @entry to the memcg that @page belongs to.
5761 *
5762 * Returns 0 on success, -ENOMEM on failure.
5763 */
5764int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
5765{
5766	struct mem_cgroup *memcg;
5767	struct page_counter *counter;
 
5768	unsigned short oldid;
5769
5770	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
5771		return 0;
5772
5773	memcg = page->mem_cgroup;
5774
5775	/* Readahead page, never charged */
5776	if (!memcg)
5777		return 0;
5778
5779	if (!mem_cgroup_is_root(memcg) &&
5780	    !page_counter_try_charge(&memcg->swap, 1, &counter))
5781		return -ENOMEM;
 
5782
5783	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5784	VM_BUG_ON_PAGE(oldid, page);
5785	mem_cgroup_swap_statistics(memcg, true);
5786
5787	css_get(&memcg->css);
5788	return 0;
5789}
5790
5791/**
5792 * mem_cgroup_uncharge_swap - uncharge a swap entry
5793 * @entry: swap entry to uncharge
5794 *
5795 * Drop the swap charge associated with @entry.
5796 */
5797void mem_cgroup_uncharge_swap(swp_entry_t entry)
5798{
5799	struct mem_cgroup *memcg;
5800	unsigned short id;
5801
5802	if (!do_swap_account)
5803		return;
5804
5805	id = swap_cgroup_record(entry, 0);
5806	rcu_read_lock();
5807	memcg = mem_cgroup_from_id(id);
5808	if (memcg) {
5809		if (!mem_cgroup_is_root(memcg)) {
5810			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5811				page_counter_uncharge(&memcg->swap, 1);
5812			else
5813				page_counter_uncharge(&memcg->memsw, 1);
5814		}
5815		mem_cgroup_swap_statistics(memcg, false);
5816		css_put(&memcg->css);
5817	}
5818	rcu_read_unlock();
5819}
5820
5821long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5822{
5823	long nr_swap_pages = get_nr_swap_pages();
5824
5825	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
5826		return nr_swap_pages;
5827	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
5828		nr_swap_pages = min_t(long, nr_swap_pages,
5829				      READ_ONCE(memcg->swap.limit) -
5830				      page_counter_read(&memcg->swap));
5831	return nr_swap_pages;
5832}
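/*
 * Worked example (illustrative, made-up numbers): if a leaf memcg has a swap
 * limit of 512 pages with 200 charged while its parent allows 300 pages with
 * 250 charged, the walk above returns min(512 - 200, 300 - 250) = 50 pages,
 * further clamped by the globally available swap space.
 */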
5833
5834bool mem_cgroup_swap_full(struct page *page)
5835{
5836	struct mem_cgroup *memcg;
5837
5838	VM_BUG_ON_PAGE(!PageLocked(page), page);
5839
5840	if (vm_swap_full())
5841		return true;
5842	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
5843		return false;
5844
5845	memcg = page->mem_cgroup;
5846	if (!memcg)
5847		return false;
5848
5849	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
5850		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
5851			return true;
 
5852
5853	return false;
5854}
5855
5856/* to remember the boot option */
5857#ifdef CONFIG_MEMCG_SWAP_ENABLED
5858static int really_do_swap_account __initdata = 1;
5859#else
5860static int really_do_swap_account __initdata;
5861#endif
5862
5863static int __init enable_swap_account(char *s)
5864{
5865	if (!strcmp(s, "1"))
5866		really_do_swap_account = 1;
5867	else if (!strcmp(s, "0"))
5868		really_do_swap_account = 0;
5869	return 1;
5870}
5871__setup("swapaccount=", enable_swap_account);
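/*
 * Usage note (illustrative): "swapaccount=0" on the kernel command line
 * clears really_do_swap_account, so mem_cgroup_swap_init() below leaves
 * do_swap_account at zero; "swapaccount=1" enables swap accounting even when
 * CONFIG_MEMCG_SWAP_ENABLED is not set.
 */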
5872
5873static u64 swap_current_read(struct cgroup_subsys_state *css,
5874			     struct cftype *cft)
5875{
5876	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5877
5878	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
5879}
5880
5881static int swap_max_show(struct seq_file *m, void *v)
5882{
5883	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5884	unsigned long max = READ_ONCE(memcg->swap.limit);
 
5885
5886	if (max == PAGE_COUNTER_MAX)
5887		seq_puts(m, "max\n");
5888	else
5889		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
 
 
5890
5891	return 0;
5892}
5893
5894static ssize_t swap_max_write(struct kernfs_open_file *of,
5895			      char *buf, size_t nbytes, loff_t off)
5896{
5897	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5898	unsigned long max;
5899	int err;
5900
5901	buf = strstrip(buf);
5902	err = page_counter_memparse(buf, "max", &max);
5903	if (err)
5904		return err;
5905
5906	mutex_lock(&memcg_limit_mutex);
5907	err = page_counter_limit(&memcg->swap, max);
5908	mutex_unlock(&memcg_limit_mutex);
5909	if (err)
5910		return err;
5911
5912	return nbytes;
5913}
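/*
 * Usage note (illustrative): writing "max" to memory.swap.max restores
 * PAGE_COUNTER_MAX, while a byte value such as "1073741824" is parsed by
 * page_counter_memparse() and applied as a 262144-page swap limit (assuming
 * 4KiB pages).
 */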
5914
5915static struct cftype swap_files[] = {
5916	{
5917		.name = "swap.current",
5918		.flags = CFTYPE_NOT_ON_ROOT,
5919		.read_u64 = swap_current_read,
5920	},
5921	{
5922		.name = "swap.max",
5923		.flags = CFTYPE_NOT_ON_ROOT,
5924		.seq_show = swap_max_show,
5925		.write = swap_max_write,
5926	},
5927	{ }	/* terminate */
5928};
5929
5930static struct cftype memsw_cgroup_files[] = {
5931	{
5932		.name = "memsw.usage_in_bytes",
5933		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5934		.read_u64 = mem_cgroup_read_u64,
5935	},
5936	{
5937		.name = "memsw.max_usage_in_bytes",
5938		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5939		.write = mem_cgroup_reset,
5940		.read_u64 = mem_cgroup_read_u64,
5941	},
5942	{
5943		.name = "memsw.limit_in_bytes",
5944		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5945		.write = mem_cgroup_write,
5946		.read_u64 = mem_cgroup_read_u64,
5947	},
5948	{
5949		.name = "memsw.failcnt",
5950		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5951		.write = mem_cgroup_reset,
5952		.read_u64 = mem_cgroup_read_u64,
5953	},
5954	{ },	/* terminate */
5955};
5956
5957static int __init mem_cgroup_swap_init(void)
5958{
5959	if (!mem_cgroup_disabled() && really_do_swap_account) {
5960		do_swap_account = 1;
5961		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
5962					       swap_files));
5963		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5964						  memsw_cgroup_files));
5965	}
5966	return 0;
5967}
5968subsys_initcall(mem_cgroup_swap_init);
5969
5970#endif /* CONFIG_MEMCG_SWAP */
v5.9
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* memcontrol.c - Memory Controller
   3 *
   4 * Copyright IBM Corporation, 2007
   5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   6 *
   7 * Copyright 2007 OpenVZ SWsoft Inc
   8 * Author: Pavel Emelianov <xemul@openvz.org>
   9 *
  10 * Memory thresholds
  11 * Copyright (C) 2009 Nokia Corporation
  12 * Author: Kirill A. Shutemov
  13 *
  14 * Kernel Memory Controller
  15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
  16 * Authors: Glauber Costa and Suleiman Souhlal
  17 *
  18 * Native page reclaim
  19 * Charge lifetime sanitation
  20 * Lockless page tracking & accounting
  21 * Unified hierarchy configuration model
  22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  23 */
  24
  25#include <linux/page_counter.h>
  26#include <linux/memcontrol.h>
  27#include <linux/cgroup.h>
  28#include <linux/pagewalk.h>
  29#include <linux/sched/mm.h>
  30#include <linux/shmem_fs.h>
  31#include <linux/hugetlb.h>
  32#include <linux/pagemap.h>
  33#include <linux/vm_event_item.h>
  34#include <linux/smp.h>
  35#include <linux/page-flags.h>
  36#include <linux/backing-dev.h>
  37#include <linux/bit_spinlock.h>
  38#include <linux/rcupdate.h>
  39#include <linux/limits.h>
  40#include <linux/export.h>
  41#include <linux/mutex.h>
  42#include <linux/rbtree.h>
  43#include <linux/slab.h>
  44#include <linux/swap.h>
  45#include <linux/swapops.h>
  46#include <linux/spinlock.h>
  47#include <linux/eventfd.h>
  48#include <linux/poll.h>
  49#include <linux/sort.h>
  50#include <linux/fs.h>
  51#include <linux/seq_file.h>
  52#include <linux/vmpressure.h>
  53#include <linux/mm_inline.h>
  54#include <linux/swap_cgroup.h>
  55#include <linux/cpu.h>
  56#include <linux/oom.h>
  57#include <linux/lockdep.h>
  58#include <linux/file.h>
  59#include <linux/tracehook.h>
  60#include <linux/psi.h>
  61#include <linux/seq_buf.h>
  62#include "internal.h"
  63#include <net/sock.h>
  64#include <net/ip.h>
  65#include "slab.h"
  66
  67#include <linux/uaccess.h>
  68
  69#include <trace/events/vmscan.h>
  70
  71struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  72EXPORT_SYMBOL(memory_cgrp_subsys);
  73
  74struct mem_cgroup *root_mem_cgroup __read_mostly;
  75
 
 
  76/* Socket memory accounting disabled? */
  77static bool cgroup_memory_nosocket;
  78
  79/* Kernel memory accounting disabled? */
  80static bool cgroup_memory_nokmem;
  81
  82/* Whether the swap controller is active */
  83#ifdef CONFIG_MEMCG_SWAP
  84bool cgroup_memory_noswap __read_mostly;
  85#else
  86#define cgroup_memory_noswap		1
  87#endif
  88
  89#ifdef CONFIG_CGROUP_WRITEBACK
  90static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
  91#endif
  92
  93/* Whether legacy memory+swap accounting is active */
  94static bool do_memsw_account(void)
  95{
  96	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
  97}
  98
  99#define THRESHOLDS_EVENTS_TARGET 128
 100#define SOFTLIMIT_EVENTS_TARGET 1024
 
 101
 102/*
 103 * Cgroups above their limits are maintained in a RB-Tree, independent of
 104 * their hierarchy representation
 105 */
 106
 107struct mem_cgroup_tree_per_node {
 108	struct rb_root rb_root;
 109	struct rb_node *rb_rightmost;
 110	spinlock_t lock;
 111};
 112
 113struct mem_cgroup_tree {
 114	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 115};
 116
 117static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 118
 119/* for OOM */
 120struct mem_cgroup_eventfd_list {
 121	struct list_head list;
 122	struct eventfd_ctx *eventfd;
 123};
 124
 125/*
 126 * cgroup_event represents events which userspace wants to receive.
 127 */
 128struct mem_cgroup_event {
 129	/*
 130	 * memcg which the event belongs to.
 131	 */
 132	struct mem_cgroup *memcg;
 133	/*
 134	 * eventfd to signal userspace about the event.
 135	 */
 136	struct eventfd_ctx *eventfd;
 137	/*
 138	 * Each of these stored in a list by the cgroup.
 139	 */
 140	struct list_head list;
 141	/*
 142	 * register_event() callback will be used to add a new userspace
 143	 * waiter for changes related to this event.  Use eventfd_signal()
 144	 * on eventfd to send notification to userspace.
 145	 */
 146	int (*register_event)(struct mem_cgroup *memcg,
 147			      struct eventfd_ctx *eventfd, const char *args);
 148	/*
 149	 * unregister_event() callback will be called when userspace closes
 150	 * the eventfd or on cgroup removing.  This callback must be set,
 151	 * if you want provide notification functionality.
 152	 */
 153	void (*unregister_event)(struct mem_cgroup *memcg,
 154				 struct eventfd_ctx *eventfd);
 155	/*
 156	 * All fields below are needed to unregister the event when
 157	 * userspace closes eventfd.
 158	 */
 159	poll_table pt;
 160	wait_queue_head_t *wqh;
 161	wait_queue_entry_t wait;
 162	struct work_struct remove;
 163};
 164
 165static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 166static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 167
 168/* Stuffs for move charges at task migration. */
 169/*
 170 * Types of charges to be moved.
 171 */
 172#define MOVE_ANON	0x1U
 173#define MOVE_FILE	0x2U
 174#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
 175
 176/* "mc" and its members are protected by cgroup_mutex */
 177static struct move_charge_struct {
 178	spinlock_t	  lock; /* for from, to */
 179	struct mm_struct  *mm;
 180	struct mem_cgroup *from;
 181	struct mem_cgroup *to;
 182	unsigned long flags;
 183	unsigned long precharge;
 184	unsigned long moved_charge;
 185	unsigned long moved_swap;
 186	struct task_struct *moving_task;	/* a task moving charges */
 187	wait_queue_head_t waitq;		/* a waitq for other context */
 188} mc = {
 189	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 190	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 191};
 192
 193/*
 194 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 195 * limit reclaim to prevent infinite loops, if they ever occur.
 196 */
 197#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 198#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 199
 200enum charge_type {
 201	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 202	MEM_CGROUP_CHARGE_TYPE_ANON,
 203	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
 204	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
 205	NR_CHARGE_TYPE,
 206};
 207
 208/* for encoding cft->private value on file */
 209enum res_type {
 210	_MEM,
 211	_MEMSWAP,
 212	_OOM_TYPE,
 213	_KMEM,
 214	_TCP,
 215};
 216
 217#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 218#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 219#define MEMFILE_ATTR(val)	((val) & 0xffff)
 220/* Used for OOM notifier */
 221#define OOM_CONTROL		(0)
 222
 223/*
 224 * Iteration constructs for visiting all cgroups (under a tree).  If
 225 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 226 * be used for reference counting.
 227 */
 228#define for_each_mem_cgroup_tree(iter, root)		\
 229	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
 230	     iter != NULL;				\
 231	     iter = mem_cgroup_iter(root, iter, NULL))
 232
 233#define for_each_mem_cgroup(iter)			\
 234	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
 235	     iter != NULL;				\
 236	     iter = mem_cgroup_iter(NULL, iter, NULL))
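/*
 * Illustrative sketch (not part of the original source): a walker that
 * leaves one of these loops early must pair the break with
 * mem_cgroup_iter_break() so the css reference taken by mem_cgroup_iter()
 * is dropped; running the loop to completion needs no cleanup because the
 * iterator only returns NULL after releasing its last reference.
 */
static bool __maybe_unused
memcg_doc_example_tree_has_charges(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (page_counter_read(&iter->memory)) {
			/* Early exit: drop the reference held by the iterator */
			mem_cgroup_iter_break(root, iter);
			return true;
		}
	}
	return false;
}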
 237
 238static inline bool should_force_charge(void)
 239{
 240	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 241		(current->flags & PF_EXITING);
 242}
 243
 244/* Some nice accessors for the vmpressure. */
 245struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 246{
 247	if (!memcg)
 248		memcg = root_mem_cgroup;
 249	return &memcg->vmpressure;
 250}
 251
 252struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 253{
 254	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 255}
 256
 257#ifdef CONFIG_MEMCG_KMEM
 258extern spinlock_t css_set_lock;
 259
 260static void obj_cgroup_release(struct percpu_ref *ref)
 261{
 262	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
 263	struct mem_cgroup *memcg;
 264	unsigned int nr_bytes;
 265	unsigned int nr_pages;
 266	unsigned long flags;
 267
 268	/*
 269	 * At this point all allocated objects are freed, and
 270	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
 271	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
 272	 *
 273	 * The following sequence can lead to it:
 274	 * 1) CPU0: objcg == stock->cached_objcg
 275	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
 276	 *          PAGE_SIZE bytes are charged
 277	 * 3) CPU1: a process from another memcg is allocating something,
 278	 *          the stock is flushed,
 279	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
 280	 * 4) CPU0: we release this object,
 281	 *          92 bytes are added to stock->nr_bytes
 282	 * 5) CPU0: stock is flushed,
 283	 *          92 bytes are added to objcg->nr_charged_bytes
 284	 *
 285	 * As a result, nr_charged_bytes == PAGE_SIZE.
 286	 * This page will be uncharged in obj_cgroup_release().
 287	 */
 288	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
 289	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
 290	nr_pages = nr_bytes >> PAGE_SHIFT;
 291
 292	spin_lock_irqsave(&css_set_lock, flags);
 293	memcg = obj_cgroup_memcg(objcg);
 294	if (nr_pages)
 295		__memcg_kmem_uncharge(memcg, nr_pages);
 296	list_del(&objcg->list);
 297	mem_cgroup_put(memcg);
 298	spin_unlock_irqrestore(&css_set_lock, flags);
 299
 300	percpu_ref_exit(ref);
 301	kfree_rcu(objcg, rcu);
 302}
 303
 304static struct obj_cgroup *obj_cgroup_alloc(void)
 305{
 306	struct obj_cgroup *objcg;
 307	int ret;
 308
 309	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
 310	if (!objcg)
 311		return NULL;
 312
 313	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
 314			      GFP_KERNEL);
 315	if (ret) {
 316		kfree(objcg);
 317		return NULL;
 318	}
 319	INIT_LIST_HEAD(&objcg->list);
 320	return objcg;
 321}
 322
 323static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
 324				  struct mem_cgroup *parent)
 325{
 326	struct obj_cgroup *objcg, *iter;
 327
 328	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
 329
 330	spin_lock_irq(&css_set_lock);
 331
 332	/* Move active objcg to the parent's list */
 333	xchg(&objcg->memcg, parent);
 334	css_get(&parent->css);
 335	list_add(&objcg->list, &parent->objcg_list);
 336
 337	/* Move already reparented objcgs to the parent's list */
 338	list_for_each_entry(iter, &memcg->objcg_list, list) {
 339		css_get(&parent->css);
 340		xchg(&iter->memcg, parent);
 341		css_put(&memcg->css);
 342	}
 343	list_splice(&memcg->objcg_list, &parent->objcg_list);
 344
 345	spin_unlock_irq(&css_set_lock);
 346
 347	percpu_ref_kill(&objcg->refcnt);
 348}
 349
 
 350/*
 351 * This will be used as a shrinker list's index.
 352 * The main reason for not using cgroup id for this:
 353 *  this works better in sparse environments, where we have a lot of memcgs,
 354 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 355 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 356 *  200 entry array for that.
 357 *
 358 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 359 * will double each time we have to increase it.
 360 */
 361static DEFINE_IDA(memcg_cache_ida);
 362int memcg_nr_cache_ids;
 363
 364/* Protects memcg_nr_cache_ids */
 365static DECLARE_RWSEM(memcg_cache_ids_sem);
 366
 367void memcg_get_cache_ids(void)
 368{
 369	down_read(&memcg_cache_ids_sem);
 370}
 371
 372void memcg_put_cache_ids(void)
 373{
 374	up_read(&memcg_cache_ids_sem);
 375}
 376
 377/*
 378 * MIN_SIZE is greater than 1 because we would like to avoid going through
 379 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 380 * cgroups is a reasonable guess. In the future, it could be a parameter or
 381 * tunable, but that is strictly not necessary.
 382 *
 383 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 384 * this constant directly from cgroup, but it is understandable that this is
 385 * better kept as an internal representation in cgroup.c. In any case, the
 386 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 387 * increase ours as well if it increases.
 388 */
 389#define MEMCG_CACHES_MIN_SIZE 4
 390#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 391
 392/*
 393 * A lot of the calls to the cache allocation functions are expected to be
 394 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 395 * conditional on this static branch, we have to allow modules that do
 396 * kmem_cache_alloc and the like to see this symbol as well.
 397 */
 398DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 399EXPORT_SYMBOL(memcg_kmem_enabled_key);
 400#endif
 401
 402static int memcg_shrinker_map_size;
 403static DEFINE_MUTEX(memcg_shrinker_map_mutex);
 404
 405static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
 
 406{
 407	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
 408}
 409
 410static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
 411					 int size, int old_size)
 412{
 413	struct memcg_shrinker_map *new, *old;
 414	int nid;
 415
 416	lockdep_assert_held(&memcg_shrinker_map_mutex);
 417
 418	for_each_node(nid) {
 419		old = rcu_dereference_protected(
 420			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
 421		/* Not yet online memcg */
 422		if (!old)
 423			return 0;
 424
 425		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 426		if (!new)
 427			return -ENOMEM;
 428
 429		/* Set all old bits, clear all new bits */
 430		memset(new->map, (int)0xff, old_size);
 431		memset((void *)new->map + old_size, 0, size - old_size);
 432
 433		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
 434		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
 435	}
 436
 437	return 0;
 438}
 439
 440static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
 441{
 442	struct mem_cgroup_per_node *pn;
 443	struct memcg_shrinker_map *map;
 444	int nid;
 445
 446	if (mem_cgroup_is_root(memcg))
 447		return;
 448
 449	for_each_node(nid) {
 450		pn = mem_cgroup_nodeinfo(memcg, nid);
 451		map = rcu_dereference_protected(pn->shrinker_map, true);
 452		if (map)
 453			kvfree(map);
 454		rcu_assign_pointer(pn->shrinker_map, NULL);
 455	}
 456}
 457
 458static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
 459{
 460	struct memcg_shrinker_map *map;
 461	int nid, size, ret = 0;
 462
 463	if (mem_cgroup_is_root(memcg))
 464		return 0;
 465
 466	mutex_lock(&memcg_shrinker_map_mutex);
 467	size = memcg_shrinker_map_size;
 468	for_each_node(nid) {
 469		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
 470		if (!map) {
 471			memcg_free_shrinker_maps(memcg);
 472			ret = -ENOMEM;
 473			break;
 474		}
 475		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
 476	}
 477	mutex_unlock(&memcg_shrinker_map_mutex);
 478
 479	return ret;
 480}
 481
 482int memcg_expand_shrinker_maps(int new_id)
 483{
 484	int size, old_size, ret = 0;
 485	struct mem_cgroup *memcg;
 486
 487	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
 488	old_size = memcg_shrinker_map_size;
 489	if (size <= old_size)
 490		return 0;
 491
 492	mutex_lock(&memcg_shrinker_map_mutex);
 493	if (!root_mem_cgroup)
 494		goto unlock;
 495
 496	for_each_mem_cgroup(memcg) {
 497		if (mem_cgroup_is_root(memcg))
 498			continue;
 499		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
 500		if (ret) {
 501			mem_cgroup_iter_break(NULL, memcg);
 502			goto unlock;
 503		}
 504	}
 505unlock:
 506	if (!ret)
 507		memcg_shrinker_map_size = size;
 508	mutex_unlock(&memcg_shrinker_map_mutex);
 509	return ret;
 510}
 511
 512void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 513{
 514	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
 515		struct memcg_shrinker_map *map;
 516
 517		rcu_read_lock();
 518		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
 519		/* Pairs with smp mb in shrink_slab() */
 520		smp_mb__before_atomic();
 521		set_bit(shrinker_id, map->map);
 522		rcu_read_unlock();
 523	}
 524}
 525
 526/**
 527 * mem_cgroup_css_from_page - css of the memcg associated with a page
 528 * @page: page of interest
 529 *
 530 * If memcg is bound to the default hierarchy, css of the memcg associated
 531 * with @page is returned.  The returned css remains associated with @page
 532 * until it is released.
 533 *
 534 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 535 * is returned.
 536 */
 537struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 538{
 539	struct mem_cgroup *memcg;
 540
 541	memcg = page->mem_cgroup;
 542
 543	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 544		memcg = root_mem_cgroup;
 545
 546	return &memcg->css;
 547}
 548
 549/**
 550 * page_cgroup_ino - return inode number of the memcg a page is charged to
 551 * @page: the page
 552 *
 553 * Look up the closest online ancestor of the memory cgroup @page is charged to
 554 * and return its inode number or 0 if @page is not charged to any cgroup. It
 555 * is safe to call this function without holding a reference to @page.
 556 *
 557 * Note, this function is inherently racy, because there is nothing to prevent
 558 * the cgroup inode from getting torn down and potentially reallocated a moment
 559 * after page_cgroup_ino() returns, so it should only be used by callers that
 560 * do not care (such as procfs interfaces).
 561 */
 562ino_t page_cgroup_ino(struct page *page)
 563{
 564	struct mem_cgroup *memcg;
 565	unsigned long ino = 0;
 566
 567	rcu_read_lock();
 568	memcg = page->mem_cgroup;
 569
 570	/*
 571	 * The lowest bit set means that memcg isn't a valid
 572	 * memcg pointer, but an obj_cgroups pointer.
 573	 * In this case the page is shared and doesn't belong
 574	 * to any specific memory cgroup.
 575	 */
 576	if ((unsigned long) memcg & 0x1UL)
 577		memcg = NULL;
 578
 579	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 580		memcg = parent_mem_cgroup(memcg);
 581	if (memcg)
 582		ino = cgroup_ino(memcg->css.cgroup);
 583	rcu_read_unlock();
 584	return ino;
 585}
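/*
 * Note (illustrative): pages whose objects are accounted per allocation
 * carry an obj_cgroups vector in place of the mem_cgroup pointer, tagged by
 * setting bit 0; that is why the odd-pointer check above treats such pages
 * as belonging to no specific memcg.
 */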
 586
 587static struct mem_cgroup_per_node *
 588mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 589{
 590	int nid = page_to_nid(page);
 
 591
 592	return memcg->nodeinfo[nid];
 593}
 594
 595static struct mem_cgroup_tree_per_node *
 596soft_limit_tree_node(int nid)
 597{
 598	return soft_limit_tree.rb_tree_per_node[nid];
 599}
 600
 601static struct mem_cgroup_tree_per_node *
 602soft_limit_tree_from_page(struct page *page)
 603{
 604	int nid = page_to_nid(page);
 
 605
 606	return soft_limit_tree.rb_tree_per_node[nid];
 607}
 608
 609static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 610					 struct mem_cgroup_tree_per_node *mctz,
 611					 unsigned long new_usage_in_excess)
 612{
 613	struct rb_node **p = &mctz->rb_root.rb_node;
 614	struct rb_node *parent = NULL;
 615	struct mem_cgroup_per_node *mz_node;
 616	bool rightmost = true;
 617
 618	if (mz->on_tree)
 619		return;
 620
 621	mz->usage_in_excess = new_usage_in_excess;
 622	if (!mz->usage_in_excess)
 623		return;
 624	while (*p) {
 625		parent = *p;
 626		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 627					tree_node);
 628		if (mz->usage_in_excess < mz_node->usage_in_excess) {
 629			p = &(*p)->rb_left;
 630			rightmost = false;
 631		}
 632
 633		/*
 634		 * We can't avoid mem cgroups that are over their soft
 635		 * limit by the same amount
 636		 */
 637		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 638			p = &(*p)->rb_right;
 639	}
 640
 641	if (rightmost)
 642		mctz->rb_rightmost = &mz->tree_node;
 643
 644	rb_link_node(&mz->tree_node, parent, p);
 645	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 646	mz->on_tree = true;
 647}
 648
 649static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 650					 struct mem_cgroup_tree_per_node *mctz)
 651{
 652	if (!mz->on_tree)
 653		return;
 654
 655	if (&mz->tree_node == mctz->rb_rightmost)
 656		mctz->rb_rightmost = rb_prev(&mz->tree_node);
 657
 658	rb_erase(&mz->tree_node, &mctz->rb_root);
 659	mz->on_tree = false;
 660}
 661
 662static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 663				       struct mem_cgroup_tree_per_node *mctz)
 664{
 665	unsigned long flags;
 666
 667	spin_lock_irqsave(&mctz->lock, flags);
 668	__mem_cgroup_remove_exceeded(mz, mctz);
 669	spin_unlock_irqrestore(&mctz->lock, flags);
 670}
 671
 672static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 673{
 674	unsigned long nr_pages = page_counter_read(&memcg->memory);
 675	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 676	unsigned long excess = 0;
 677
 678	if (nr_pages > soft_limit)
 679		excess = nr_pages - soft_limit;
 680
 681	return excess;
 682}
 683
 684static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 685{
 686	unsigned long excess;
 687	struct mem_cgroup_per_node *mz;
 688	struct mem_cgroup_tree_per_node *mctz;
 689
 690	mctz = soft_limit_tree_from_page(page);
 691	if (!mctz)
 692		return;
 693	/*
 694	 * Necessary to update all ancestors when hierarchy is used,
 695	 * because their event counters are not touched.
 696	 */
 697	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 698		mz = mem_cgroup_page_nodeinfo(memcg, page);
 699		excess = soft_limit_excess(memcg);
 700		/*
 701		 * We have to update the tree if mz is on RB-tree or
 702		 * mem is over its softlimit.
 703		 */
 704		if (excess || mz->on_tree) {
 705			unsigned long flags;
 706
 707			spin_lock_irqsave(&mctz->lock, flags);
 708			/* if on-tree, remove it */
 709			if (mz->on_tree)
 710				__mem_cgroup_remove_exceeded(mz, mctz);
 711			/*
 712			 * Insert again. mz->usage_in_excess will be updated.
 713			 * If excess is 0, no tree ops.
 714			 */
 715			__mem_cgroup_insert_exceeded(mz, mctz, excess);
 716			spin_unlock_irqrestore(&mctz->lock, flags);
 717		}
 718	}
 719}
 720
 721static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 722{
 723	struct mem_cgroup_tree_per_node *mctz;
 724	struct mem_cgroup_per_node *mz;
 725	int nid;
 726
 727	for_each_node(nid) {
 728		mz = mem_cgroup_nodeinfo(memcg, nid);
 729		mctz = soft_limit_tree_node(nid);
 730		if (mctz)
 731			mem_cgroup_remove_exceeded(mz, mctz);
 
 732	}
 733}
 734
 735static struct mem_cgroup_per_node *
 736__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 737{
 738	struct mem_cgroup_per_node *mz;
 
 739
 740retry:
 741	mz = NULL;
 742	if (!mctz->rb_rightmost)
 
 743		goto done;		/* Nothing to reclaim from */
 744
 745	mz = rb_entry(mctz->rb_rightmost,
 746		      struct mem_cgroup_per_node, tree_node);
 747	/*
 748	 * Remove the node now, but someone else can add it back;
 749	 * we will add it back at the end of reclaim to its correct
 750	 * position in the tree.
 751	 */
 752	__mem_cgroup_remove_exceeded(mz, mctz);
 753	if (!soft_limit_excess(mz->memcg) ||
 754	    !css_tryget(&mz->memcg->css))
 755		goto retry;
 756done:
 757	return mz;
 758}
 759
 760static struct mem_cgroup_per_node *
 761mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 762{
 763	struct mem_cgroup_per_node *mz;
 764
 765	spin_lock_irq(&mctz->lock);
 766	mz = __mem_cgroup_largest_soft_limit_node(mctz);
 767	spin_unlock_irq(&mctz->lock);
 768	return mz;
 769}
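/*
 * Worked example (illustrative, made-up numbers): if three memcgs on a node
 * exceed their soft limits by 10, 25 and 40 pages, the per-node tree keeps
 * them ordered by excess and rb_rightmost points at the 40-page offender, so
 * soft limit reclaim always picks the worst violator first.
 */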
 770
 771/**
 772 * __mod_memcg_state - update cgroup memory statistics
 773 * @memcg: the memory cgroup
 774 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 775 * @val: delta to add to the counter, can be negative
 776 */
 777void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 
 778{
 779	long x, threshold = MEMCG_CHARGE_BATCH;
 
 780
 781	if (mem_cgroup_disabled())
 782		return;
 783
 784	if (memcg_stat_item_in_bytes(idx))
 785		threshold <<= PAGE_SHIFT;
 786
 787	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 788	if (unlikely(abs(x) > threshold)) {
 789		struct mem_cgroup *mi;
 790
 791		/*
 792		 * Batch local counters to keep them in sync with
 793		 * the hierarchical ones.
 794		 */
 795		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
 796		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 797			atomic_long_add(x, &mi->vmstats[idx]);
 798		x = 0;
 799	}
 800	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
 801}
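/*
 * Note (illustrative): the batching above means repeated small updates only
 * touch the per-cpu counter; only once the pending delta exceeds
 * MEMCG_CHARGE_BATCH (shifted to bytes for byte-sized items) is the whole
 * batch folded into memcg->vmstats and every ancestor, keeping the common
 * path free of atomic operations.
 */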
 802
 803static struct mem_cgroup_per_node *
 804parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
 805{
 806	struct mem_cgroup *parent;
 
 807
 808	parent = parent_mem_cgroup(pn->memcg);
 809	if (!parent)
 810		return NULL;
 811	return mem_cgroup_nodeinfo(parent, nid);
 812}
 813
 814void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 815			      int val)
 
 816{
 817	struct mem_cgroup_per_node *pn;
 818	struct mem_cgroup *memcg;
 819	long x, threshold = MEMCG_CHARGE_BATCH;
 820
 821	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 822	memcg = pn->memcg;
 823
 824	/* Update memcg */
 825	__mod_memcg_state(memcg, idx, val);
 826
 827	/* Update lruvec */
 828	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 829
 830	if (vmstat_item_in_bytes(idx))
 831		threshold <<= PAGE_SHIFT;
 832
 833	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 834	if (unlikely(abs(x) > threshold)) {
 835		pg_data_t *pgdat = lruvec_pgdat(lruvec);
 836		struct mem_cgroup_per_node *pi;
 837
 838		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
 839			atomic_long_add(x, &pi->lruvec_stat[idx]);
 840		x = 0;
 841	}
 842	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 843}
 844
 845/**
 846 * __mod_lruvec_state - update lruvec memory statistics
 847 * @lruvec: the lruvec
 848 * @idx: the stat item
 849 * @val: delta to add to the counter, can be negative
 850 *
 851 * The lruvec is the intersection of the NUMA node and a cgroup. This
 852 * function updates all three counters that are affected by a
 853 * change of state at this level: per-node, per-cgroup, per-lruvec.
 854 */
 855void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 856			int val)
 857{
 858	/* Update node */
 859	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 860
 861	/* Update memcg and lruvec */
 862	if (!mem_cgroup_disabled())
 863		__mod_memcg_lruvec_state(lruvec, idx, val);
 864}
 865
 866void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 867{
 868	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
 869	struct mem_cgroup *memcg;
 870	struct lruvec *lruvec;
 871
 872	rcu_read_lock();
 873	memcg = mem_cgroup_from_obj(p);
 874
 875	/* Untracked pages have no memcg, no lruvec. Update only the node */
 876	if (!memcg || memcg == root_mem_cgroup) {
 877		__mod_node_page_state(pgdat, idx, val);
 878	} else {
 879		lruvec = mem_cgroup_lruvec(memcg, pgdat);
 880		__mod_lruvec_state(lruvec, idx, val);
 881	}
 882	rcu_read_unlock();
 883}
 884
 885void mod_memcg_obj_state(void *p, int idx, int val)
 886{
 887	struct mem_cgroup *memcg;
 888
 889	rcu_read_lock();
 890	memcg = mem_cgroup_from_obj(p);
 891	if (memcg)
 892		mod_memcg_state(memcg, idx, val);
 893	rcu_read_unlock();
 894}
 895
 896/**
 897 * __count_memcg_events - account VM events in a cgroup
 898 * @memcg: the memory cgroup
 899 * @idx: the event item
 900 * @count: the number of events that occurred
 901 */
 902void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 903			  unsigned long count)
 904{
 905	unsigned long x;
 
 906
 907	if (mem_cgroup_disabled())
 908		return;
 909
 910	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 911	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
 912		struct mem_cgroup *mi;
 913
 914		/*
 915		 * Batch local counters to keep them in sync with
 916		 * the hierarchical ones.
 917		 */
 918		__this_cpu_add(memcg->vmstats_local->events[idx], x);
 919		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 920			atomic_long_add(x, &mi->vmevents[idx]);
 921		x = 0;
 922	}
 923	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
 924}
 925
 926static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 
 927{
 928	return atomic_long_read(&memcg->vmevents[event]);
 929}
 930
 931static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 932{
 933	long x = 0;
 934	int cpu;
 935
 936	for_each_possible_cpu(cpu)
 937		x += per_cpu(memcg->vmstats_local->events[event], cpu);
 938	return x;
 939}
 940
 941static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 942					 struct page *page,
 943					 int nr_pages)
 944{
 945	/* pagein of a big page is an event. So, ignore page size */
 946	if (nr_pages > 0)
 947		__count_memcg_events(memcg, PGPGIN, 1);
 948	else {
 949		__count_memcg_events(memcg, PGPGOUT, 1);
 950		nr_pages = -nr_pages; /* for event */
 951	}
 952
 953	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 954}
 955
 956static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 957				       enum mem_cgroup_events_target target)
 958{
 959	unsigned long val, next;
 960
 961	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 962	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 963	/* from time_after() in jiffies.h */
 964	if ((long)(next - val) < 0) {
 965		switch (target) {
 966		case MEM_CGROUP_TARGET_THRESH:
 967			next = val + THRESHOLDS_EVENTS_TARGET;
 968			break;
 969		case MEM_CGROUP_TARGET_SOFTLIMIT:
 970			next = val + SOFTLIMIT_EVENTS_TARGET;
 971			break;
 972		default:
 973			break;
 974		}
 975		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 976		return true;
 977	}
 978	return false;
 979}
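/*
 * Worked example (illustrative): the signed comparison above behaves like
 * time_after().  If the stored target has wrapped past ULONG_MAX while the
 * event counter has not, (long)(next - val) is still positive, so the target
 * is not treated as reached merely because of the wrap-around.
 */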
 980
 981/*
 982 * Check events in order.
 983 *
 984 */
 985static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 986{
 987	/* threshold event is triggered in finer grain than soft limit */
 988	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 989						MEM_CGROUP_TARGET_THRESH))) {
 990		bool do_softlimit;
 
 991
 992		do_softlimit = mem_cgroup_event_ratelimit(memcg,
 993						MEM_CGROUP_TARGET_SOFTLIMIT);
 994		mem_cgroup_threshold(memcg);
 995		if (unlikely(do_softlimit))
 996			mem_cgroup_update_tree(memcg, page);
 997	}
 998}
 999
1000struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1001{
1002	/*
1003	 * mm_update_next_owner() may clear mm->owner to NULL
1004	 * if it races with swapoff, page migration, etc.
1005	 * So this can be called with p == NULL.
1006	 */
1007	if (unlikely(!p))
1008		return NULL;
1009
1010	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1011}
1012EXPORT_SYMBOL(mem_cgroup_from_task);
1013
1014/**
1015 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
1016 * @mm: mm from which memcg should be extracted. It can be NULL.
1017 *
1018 * Obtain a reference on mm->memcg and return it if successful. Otherwise
1019 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
1020 * returned.
1021 */
1022struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1023{
1024	struct mem_cgroup *memcg;
1025
1026	if (mem_cgroup_disabled())
1027		return NULL;
1028
1029	rcu_read_lock();
1030	do {
1031		/*
1032		 * Page cache insertions can happen without an
1033		 * actual mm context, e.g. during disk probing
1034		 * on boot, loopback IO, acct() writes etc.
1035		 */
1036		if (unlikely(!mm))
1037			memcg = root_mem_cgroup;
1038		else {
1039			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1040			if (unlikely(!memcg))
1041				memcg = root_mem_cgroup;
1042		}
1043	} while (!css_tryget(&memcg->css));
1044	rcu_read_unlock();
1045	return memcg;
1046}
1047EXPORT_SYMBOL(get_mem_cgroup_from_mm);
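/*
 * Illustrative sketch (not part of the original source): callers of
 * get_mem_cgroup_from_mm() own a css reference on the returned memcg and
 * must drop it with css_put() when they are done.  The
 * memcg_doc_example_peek_usage() helper is hypothetical.
 */
static unsigned long __maybe_unused
memcg_doc_example_peek_usage(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
	unsigned long usage;

	if (!memcg)	/* only when the memory controller is disabled */
		return 0;

	usage = page_counter_read(&memcg->memory);
	css_put(&memcg->css);
	return usage;
}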
1048
1049/**
1050 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
1051 * @page: page from which memcg should be extracted.
1052 *
1053 * Obtain a reference on page->memcg and return it if successful. Otherwise
1054 * root_mem_cgroup is returned.
1055 */
1056struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
1057{
1058	struct mem_cgroup *memcg = page->mem_cgroup;
1059
1060	if (mem_cgroup_disabled())
1061		return NULL;
1062
1063	rcu_read_lock();
1064	/* The page's memcg should not get uncharged and freed under us. */
1065	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
1066		memcg = root_mem_cgroup;
1067	rcu_read_unlock();
1068	return memcg;
1069}
1070EXPORT_SYMBOL(get_mem_cgroup_from_page);
1071
1072/**
1073 * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
1074 */
1075static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
1076{
1077	if (unlikely(current->active_memcg)) {
1078		struct mem_cgroup *memcg;
1079
1080		rcu_read_lock();
1081		/* current->active_memcg must hold a ref. */
1082		if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
1083			memcg = root_mem_cgroup;
1084		else
1085			memcg = current->active_memcg;
1086		rcu_read_unlock();
1087		return memcg;
1088	}
1089	return get_mem_cgroup_from_mm(current->mm);
1090}
1091
1092/**
1093 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1094 * @root: hierarchy root
1095 * @prev: previously returned memcg, NULL on first invocation
1096 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1097 *
1098 * Returns references to children of the hierarchy below @root, or
1099 * @root itself, or %NULL after a full round-trip.
1100 *
1101 * Caller must pass the return value in @prev on subsequent
1102 * invocations for reference counting, or use mem_cgroup_iter_break()
1103 * to cancel a hierarchy walk before the round-trip is complete.
1104 *
1105 * Reclaimers can specify a node and a priority level in @reclaim to
1106 * divide up the memcgs in the hierarchy among all concurrent
1107 * reclaimers operating on the same node and priority.
1108 */
1109struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1110				   struct mem_cgroup *prev,
1111				   struct mem_cgroup_reclaim_cookie *reclaim)
1112{
1113	struct mem_cgroup_reclaim_iter *iter;
1114	struct cgroup_subsys_state *css = NULL;
1115	struct mem_cgroup *memcg = NULL;
1116	struct mem_cgroup *pos = NULL;
1117
1118	if (mem_cgroup_disabled())
1119		return NULL;
1120
1121	if (!root)
1122		root = root_mem_cgroup;
1123
1124	if (prev && !reclaim)
1125		pos = prev;
1126
1127	if (!root->use_hierarchy && root != root_mem_cgroup) {
1128		if (prev)
1129			goto out;
1130		return root;
1131	}
1132
1133	rcu_read_lock();
1134
1135	if (reclaim) {
1136		struct mem_cgroup_per_node *mz;
1137
1138		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
1139		iter = &mz->iter;
1140
1141		if (prev && reclaim->generation != iter->generation)
1142			goto out_unlock;
1143
1144		while (1) {
1145			pos = READ_ONCE(iter->position);
1146			if (!pos || css_tryget(&pos->css))
1147				break;
1148			/*
1149			 * css reference reached zero, so iter->position will
1150			 * be cleared by ->css_released. However, we should not
1151			 * rely on this happening soon, because ->css_released
1152			 * is called from a work queue, and by busy-waiting we
1153			 * might block it. So we clear iter->position right
1154			 * away.
1155			 */
1156			(void)cmpxchg(&iter->position, pos, NULL);
1157		}
1158	}
1159
1160	if (pos)
1161		css = &pos->css;
1162
1163	for (;;) {
1164		css = css_next_descendant_pre(css, &root->css);
1165		if (!css) {
1166			/*
1167			 * Reclaimers share the hierarchy walk, and a
1168			 * new one might jump in right at the end of
1169			 * the hierarchy - make sure they see at least
1170			 * one group and restart from the beginning.
1171			 */
1172			if (!prev)
1173				continue;
1174			break;
1175		}
1176
1177		/*
1178		 * Verify the css and acquire a reference.  The root
1179		 * is provided by the caller, so we know it's alive
1180		 * and kicking, and don't take an extra reference.
1181		 */
1182		memcg = mem_cgroup_from_css(css);
1183
1184		if (css == &root->css)
1185			break;
1186
1187		if (css_tryget(css))
1188			break;
1189
1190		memcg = NULL;
1191	}
1192
1193	if (reclaim) {
1194		/*
1195		 * The position could have already been updated by a competing
1196		 * thread, so check that the value hasn't changed since we read
1197		 * it to avoid reclaiming from the same cgroup twice.
1198		 */
1199		(void)cmpxchg(&iter->position, pos, memcg);
1200
1201		if (pos)
1202			css_put(&pos->css);
1203
1204		if (!memcg)
1205			iter->generation++;
1206		else if (!prev)
1207			reclaim->generation = iter->generation;
1208	}
1209
1210out_unlock:
1211	rcu_read_unlock();
1212out:
1213	if (prev && prev != root)
1214		css_put(&prev->css);
1215
1216	return memcg;
1217}
1218
1219/**
1220 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1221 * @root: hierarchy root
1222 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1223 */
1224void mem_cgroup_iter_break(struct mem_cgroup *root,
1225			   struct mem_cgroup *prev)
1226{
1227	if (!root)
1228		root = root_mem_cgroup;
1229	if (prev && prev != root)
1230		css_put(&prev->css);
1231}
1232
1233static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1234					struct mem_cgroup *dead_memcg)
1235{
 
1236	struct mem_cgroup_reclaim_iter *iter;
1237	struct mem_cgroup_per_node *mz;
1238	int nid;
 
1239
1240	for_each_node(nid) {
1241		mz = mem_cgroup_nodeinfo(from, nid);
1242		iter = &mz->iter;
1243		cmpxchg(&iter->position, dead_memcg, NULL);
1244	}
1245}
1246
1247static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1248{
1249	struct mem_cgroup *memcg = dead_memcg;
1250	struct mem_cgroup *last;
1251
1252	do {
1253		__invalidate_reclaim_iterators(memcg, dead_memcg);
1254		last = memcg;
1255	} while ((memcg = parent_mem_cgroup(memcg)));
1256
1257	/*
1258	 * When cgroup1 non-hierarchy mode is used,
1259	 * parent_mem_cgroup() does not walk all the way up to the
1260	 * cgroup root (root_mem_cgroup). So we have to handle
1261	 * dead_memcg from cgroup root separately.
1262	 */
1263	if (last != root_mem_cgroup)
1264		__invalidate_reclaim_iterators(root_mem_cgroup,
1265						dead_memcg);
1266}
1267
1268/**
1269 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1270 * @memcg: hierarchy root
1271 * @fn: function to call for each task
1272 * @arg: argument passed to @fn
1273 *
1274 * This function iterates over tasks attached to @memcg or to any of its
1275 * descendants and calls @fn for each task. If @fn returns a non-zero
1276 * value, the function breaks the iteration loop and returns the value.
1277 * Otherwise, it will iterate over all tasks and return 0.
1278 *
1279 * This function must not be called for the root memory cgroup.
1280 */
1281int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1282			  int (*fn)(struct task_struct *, void *), void *arg)
1283{
1284	struct mem_cgroup *iter;
1285	int ret = 0;
1286
1287	BUG_ON(memcg == root_mem_cgroup);
1288
1289	for_each_mem_cgroup_tree(iter, memcg) {
1290		struct css_task_iter it;
1291		struct task_struct *task;
1292
1293		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1294		while (!ret && (task = css_task_iter_next(&it)))
1295			ret = fn(task, arg);
1296		css_task_iter_end(&it);
1297		if (ret) {
1298			mem_cgroup_iter_break(memcg, iter);
1299			break;
1300		}
1301	}
1302	return ret;
1303}
1304
1305/**
1306 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1307 * @page: the page
1308 * @pgdat: pgdat of the page
1309 *
1310 * This function relies on page->mem_cgroup being stable - see the
1311 * access rules in commit_charge().
 
1312 */
1313struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1314{
1315	struct mem_cgroup_per_node *mz;
1316	struct mem_cgroup *memcg;
1317	struct lruvec *lruvec;
1318
1319	if (mem_cgroup_disabled()) {
1320		lruvec = &pgdat->__lruvec;
1321		goto out;
1322	}
1323
1324	memcg = page->mem_cgroup;
1325	/*
1326	 * Swapcache readahead pages are added to the LRU - and
1327	 * possibly migrated - before they are charged.
1328	 */
1329	if (!memcg)
1330		memcg = root_mem_cgroup;
1331
1332	mz = mem_cgroup_page_nodeinfo(memcg, page);
1333	lruvec = &mz->lruvec;
1334out:
1335	/*
1336	 * Since a node can be onlined after the mem_cgroup was created,
1337	 * we have to be prepared to initialize lruvec->pgdat here;
1338	 * and if offlined then reonlined, we need to reinitialize it.
1339	 */
1340	if (unlikely(lruvec->pgdat != pgdat))
1341		lruvec->pgdat = pgdat;
1342	return lruvec;
1343}
1344
1345/**
1346 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1347 * @lruvec: mem_cgroup per zone lru vector
1348 * @lru: index of lru list the page is sitting on
1349 * @zid: zone id of the accounted pages
1350 * @nr_pages: positive when adding or negative when removing
1351 *
1352 * This function must be called under lru_lock, just before a page is added
1353 * to or just after a page is removed from an lru list (that ordering being
1354 * so as to allow it to check that lru_size 0 is consistent with list_empty).
1355 */
1356void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1357				int zid, int nr_pages)
1358{
1359	struct mem_cgroup_per_node *mz;
1360	unsigned long *lru_size;
1361	long size;
1362
1363	if (mem_cgroup_disabled())
1364		return;
1365
1366	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1367	lru_size = &mz->lru_zone_size[zid][lru];
1368
1369	if (nr_pages < 0)
1370		*lru_size += nr_pages;
1371
1372	size = *lru_size;
1373	if (WARN_ONCE(size < 0,
1374		"%s(%p, %d, %d): lru_size %ld\n",
1375		__func__, lruvec, lru, nr_pages, size)) {
1376		VM_BUG_ON(1);
1377		*lru_size = 0;
1378	}
1379
1380	if (nr_pages > 0)
1381		*lru_size += nr_pages;
1382}
1383
1384/**
1385 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1386 * @memcg: the memory cgroup
1387 *
1388 * Returns the maximum amount of memory @memcg can be charged with, in
1389 * pages.
1390 */
1391static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1392{
1393	unsigned long margin = 0;
1394	unsigned long count;
1395	unsigned long limit;
1396
1397	count = page_counter_read(&memcg->memory);
1398	limit = READ_ONCE(memcg->memory.max);
1399	if (count < limit)
1400		margin = limit - count;
1401
1402	if (do_memsw_account()) {
1403		count = page_counter_read(&memcg->memsw);
1404		limit = READ_ONCE(memcg->memsw.max);
1405		if (count < limit)
1406			margin = min(margin, limit - count);
1407		else
1408			margin = 0;
1409	}
1410
1411	return margin;
1412}
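/*
 * Worked example (illustrative, made-up numbers): with memory.max at 100
 * pages and 70 pages charged, the margin is 30 pages.  Under legacy
 * memory+swap accounting, a memsw limit of 80 pages with 76 charged caps the
 * result at min(30, 4) = 4 pages, since the tighter of the two counters wins.
 */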
1413
1414/*
1415 * A routine for checking whether "mem" is under move_account() or not.
1416 *
1417 * Checks whether a cgroup is mc.from, mc.to, or under the hierarchy of
1418 * the moving cgroups. This is used for waiting at high memory pressure
1419 * caused by "move".
1420 */
1421static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1422{
1423	struct mem_cgroup *from;
1424	struct mem_cgroup *to;
1425	bool ret = false;
1426	/*
1427	 * Unlike task_move routines, we access mc.to, mc.from not under
1428	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1429	 */
1430	spin_lock(&mc.lock);
1431	from = mc.from;
1432	to = mc.to;
1433	if (!from)
1434		goto unlock;
1435
1436	ret = mem_cgroup_is_descendant(from, memcg) ||
1437		mem_cgroup_is_descendant(to, memcg);
1438unlock:
1439	spin_unlock(&mc.lock);
1440	return ret;
1441}
1442
1443static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1444{
1445	if (mc.moving_task && current != mc.moving_task) {
1446		if (mem_cgroup_under_move(memcg)) {
1447			DEFINE_WAIT(wait);
1448			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1449			/* moving charge context might have finished. */
1450			if (mc.moving_task)
1451				schedule();
1452			finish_wait(&mc.waitq, &wait);
1453			return true;
1454		}
1455	}
1456	return false;
1457}
1458
1459static char *memory_stat_format(struct mem_cgroup *memcg)
1460{
1461	struct seq_buf s;
1462	int i;
1463
1464	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1465	if (!s.buffer)
1466		return NULL;
1467
1468	/*
1469	 * Provide statistics on the state of the memory subsystem as
1470	 * well as cumulative event counters that show past behavior.
1471	 *
1472	 * This list is ordered following a combination of these gradients:
1473	 * 1) generic big picture -> specifics and details
1474	 * 2) reflecting userspace activity -> reflecting kernel heuristics
1475	 *
1476	 * Current memory state:
1477	 */
1478
1479	seq_buf_printf(&s, "anon %llu\n",
1480		       (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
1481		       PAGE_SIZE);
1482	seq_buf_printf(&s, "file %llu\n",
1483		       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
1484		       PAGE_SIZE);
1485	seq_buf_printf(&s, "kernel_stack %llu\n",
1486		       (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
1487		       1024);
1488	seq_buf_printf(&s, "slab %llu\n",
1489		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1490			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
1491	seq_buf_printf(&s, "percpu %llu\n",
1492		       (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
1493	seq_buf_printf(&s, "sock %llu\n",
1494		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
1495		       PAGE_SIZE);
1496
1497	seq_buf_printf(&s, "shmem %llu\n",
1498		       (u64)memcg_page_state(memcg, NR_SHMEM) *
1499		       PAGE_SIZE);
1500	seq_buf_printf(&s, "file_mapped %llu\n",
1501		       (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
1502		       PAGE_SIZE);
1503	seq_buf_printf(&s, "file_dirty %llu\n",
1504		       (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
1505		       PAGE_SIZE);
1506	seq_buf_printf(&s, "file_writeback %llu\n",
1507		       (u64)memcg_page_state(memcg, NR_WRITEBACK) *
1508		       PAGE_SIZE);
1509
1510#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1511	seq_buf_printf(&s, "anon_thp %llu\n",
1512		       (u64)memcg_page_state(memcg, NR_ANON_THPS) *
1513		       HPAGE_PMD_SIZE);
1514#endif
1515
1516	for (i = 0; i < NR_LRU_LISTS; i++)
1517		seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
1518			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1519			       PAGE_SIZE);
1520
1521	seq_buf_printf(&s, "slab_reclaimable %llu\n",
1522		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
1523	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
1524		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
1525
1526	/* Accumulated memory events */
1527
1528	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1529		       memcg_events(memcg, PGFAULT));
1530	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1531		       memcg_events(memcg, PGMAJFAULT));
1532
1533	seq_buf_printf(&s, "workingset_refault_anon %lu\n",
1534		       memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
1535	seq_buf_printf(&s, "workingset_refault_file %lu\n",
1536		       memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
1537	seq_buf_printf(&s, "workingset_activate_anon %lu\n",
1538		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
1539	seq_buf_printf(&s, "workingset_activate_file %lu\n",
1540		       memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
1541	seq_buf_printf(&s, "workingset_restore_anon %lu\n",
1542		       memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
1543	seq_buf_printf(&s, "workingset_restore_file %lu\n",
1544		       memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
1545	seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
1546		       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
1547
1548	seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
1549		       memcg_events(memcg, PGREFILL));
1550	seq_buf_printf(&s, "pgscan %lu\n",
1551		       memcg_events(memcg, PGSCAN_KSWAPD) +
1552		       memcg_events(memcg, PGSCAN_DIRECT));
1553	seq_buf_printf(&s, "pgsteal %lu\n",
1554		       memcg_events(memcg, PGSTEAL_KSWAPD) +
1555		       memcg_events(memcg, PGSTEAL_DIRECT));
1556	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1557		       memcg_events(memcg, PGACTIVATE));
1558	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1559		       memcg_events(memcg, PGDEACTIVATE));
1560	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1561		       memcg_events(memcg, PGLAZYFREE));
1562	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1563		       memcg_events(memcg, PGLAZYFREED));
1564
1565#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1566	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1567		       memcg_events(memcg, THP_FAULT_ALLOC));
1568	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1569		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
1570#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1571
1572	/* The above should easily fit into one page */
1573	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1574
1575	return s.buffer;
1576}
1577
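/* Convert a page count to kilobytes: PAGE_SHIFT - 10 == log2(PAGE_SIZE / 1024) */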
1578#define K(x) ((x) << (PAGE_SHIFT-10))
1579/**
1580 * mem_cgroup_print_oom_context: Print OOM information relevant to
1581 * memory controller.
1582 * @memcg: The memory cgroup that went over limit
1583 * @p: Task that is going to be killed
1584 *
1585 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1586 * enabled
1587 */
1588void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1589{
1590	rcu_read_lock();
1591
1592	if (memcg) {
1593		pr_cont(",oom_memcg=");
1594		pr_cont_cgroup_path(memcg->css.cgroup);
1595	} else
1596		pr_cont(",global_oom");
1597	if (p) {
1598		pr_cont(",task_memcg=");
1599		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1600	}
1601	rcu_read_unlock();
1602}
1603
1604/**
1605 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1606 * memory controller.
1607 * @memcg: The memory cgroup that went over limit
1608 */
1609void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1610{
1611	char *buf;
1612
1613	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1614		K((u64)page_counter_read(&memcg->memory)),
1615		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1616	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1617		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1618			K((u64)page_counter_read(&memcg->swap)),
1619			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1620	else {
1621		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1622			K((u64)page_counter_read(&memcg->memsw)),
1623			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1624		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1625			K((u64)page_counter_read(&memcg->kmem)),
1626			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1627	}
1628
1629	pr_info("Memory cgroup stats for ");
1630	pr_cont_cgroup_path(memcg->css.cgroup);
1631	pr_cont(":");
1632	buf = memory_stat_format(memcg);
1633	if (!buf)
1634		return;
1635	pr_info("%s", buf);
1636	kfree(buf);
1637}
1638
1639/*
1640 * Return the memory (and swap, if configured) limit for a memcg.
1641 */
1642unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1643{
1644	unsigned long max;
1645
1646	max = READ_ONCE(memcg->memory.max);
1647	if (mem_cgroup_swappiness(memcg)) {
1648		unsigned long memsw_max;
1649		unsigned long swap_max;
1650
1651		memsw_max = memcg->memsw.max;
1652		swap_max = READ_ONCE(memcg->swap.max);
1653		swap_max = min(swap_max, (unsigned long)total_swap_pages);
1654		max = min(max + swap_max, memsw_max);
1655	}
1656	return max;
1657}
1658
1659unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1660{
1661	return page_counter_read(&memcg->memory);
1662}
1663
1664static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1665				     int order)
1666{
1667	struct oom_control oc = {
1668		.zonelist = NULL,
1669		.nodemask = NULL,
1670		.memcg = memcg,
1671		.gfp_mask = gfp_mask,
1672		.order = order,
1673	};
1674	bool ret = true;
1675
1676	if (mutex_lock_killable(&oom_lock))
1677		return true;
1678
1679	if (mem_cgroup_margin(memcg) >= (1 << order))
1680		goto unlock;
1681
1682	/*
1683	 * A few threads which were not waiting at mutex_lock_killable() can
1684	 * fail to bail out. Therefore, check again after holding oom_lock.
1685	 */
1686	ret = should_force_charge() || out_of_memory(&oc);
1687
1688unlock:
1689	mutex_unlock(&oom_lock);
1690	return ret;
1691}
1692
1693static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1694				   pg_data_t *pgdat,
1695				   gfp_t gfp_mask,
1696				   unsigned long *total_scanned)
1697{
1698	struct mem_cgroup *victim = NULL;
1699	int total = 0;
1700	int loop = 0;
1701	unsigned long excess;
1702	unsigned long nr_scanned;
1703	struct mem_cgroup_reclaim_cookie reclaim = {
1704		.pgdat = pgdat,
1705	};
1706
1707	excess = soft_limit_excess(root_memcg);
1708
1709	while (1) {
1710		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1711		if (!victim) {
1712			loop++;
1713			if (loop >= 2) {
1714				/*
1715				 * If we have not been able to reclaim
1716				 * anything, it might be because there are
1717				 * no reclaimable pages under this hierarchy
1718				 */
1719				if (!total)
1720					break;
1721				/*
1722				 * We want to do more targeted reclaim.
1723				 * excess >> 2 is not too excessive, so that we
1724				 * don't reclaim too much, nor too little, which would
1725				 * keep us coming back to reclaim from this cgroup
1726				 */
1727				if (total >= (excess >> 2) ||
1728					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1729					break;
1730			}
1731			continue;
1732		}
1733		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1734					pgdat, &nr_scanned);
1735		*total_scanned += nr_scanned;
1736		if (!soft_limit_excess(root_memcg))
1737			break;
1738	}
1739	mem_cgroup_iter_break(root_memcg, victim);
1740	return total;
1741}
1742
1743#ifdef CONFIG_LOCKDEP
1744static struct lockdep_map memcg_oom_lock_dep_map = {
1745	.name = "memcg_oom_lock",
1746};
1747#endif
1748
1749static DEFINE_SPINLOCK(memcg_oom_lock);
1750
1751/*
1752 * Check whether the OOM killer is already running under our hierarchy.
1753 * If someone else is running it, return false.
1754 */
1755static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1756{
1757	struct mem_cgroup *iter, *failed = NULL;
1758
1759	spin_lock(&memcg_oom_lock);
1760
1761	for_each_mem_cgroup_tree(iter, memcg) {
1762		if (iter->oom_lock) {
1763			/*
1764			 * this subtree of our hierarchy is already locked
1765			 * so we cannot take the lock.
1766			 */
1767			failed = iter;
1768			mem_cgroup_iter_break(memcg, iter);
1769			break;
1770		} else
1771			iter->oom_lock = true;
1772	}
1773
1774	if (failed) {
1775		/*
1776		 * OK, we failed to lock the whole subtree so we have
1777		 * to clean up what we set up, up to the failing cgroup
1778		 */
1779		for_each_mem_cgroup_tree(iter, memcg) {
1780			if (iter == failed) {
1781				mem_cgroup_iter_break(memcg, iter);
1782				break;
1783			}
1784			iter->oom_lock = false;
1785		}
1786	} else
1787		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1788
1789	spin_unlock(&memcg_oom_lock);
1790
1791	return !failed;
1792}
1793
1794static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1795{
1796	struct mem_cgroup *iter;
1797
1798	spin_lock(&memcg_oom_lock);
1799	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1800	for_each_mem_cgroup_tree(iter, memcg)
1801		iter->oom_lock = false;
1802	spin_unlock(&memcg_oom_lock);
1803}
1804
1805static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1806{
1807	struct mem_cgroup *iter;
1808
1809	spin_lock(&memcg_oom_lock);
1810	for_each_mem_cgroup_tree(iter, memcg)
1811		iter->under_oom++;
1812	spin_unlock(&memcg_oom_lock);
1813}
1814
1815static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1816{
1817	struct mem_cgroup *iter;
1818
1819	/*
1820	 * When a new child is created while the hierarchy is under oom,
1821	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1822	 */
1823	spin_lock(&memcg_oom_lock);
1824	for_each_mem_cgroup_tree(iter, memcg)
1825		if (iter->under_oom > 0)
1826			iter->under_oom--;
1827	spin_unlock(&memcg_oom_lock);
1828}
1829
1830static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1831
1832struct oom_wait_info {
1833	struct mem_cgroup *memcg;
1834	wait_queue_entry_t	wait;
1835};
1836
1837static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1838	unsigned mode, int sync, void *arg)
1839{
1840	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1841	struct mem_cgroup *oom_wait_memcg;
1842	struct oom_wait_info *oom_wait_info;
1843
1844	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1845	oom_wait_memcg = oom_wait_info->memcg;
1846
1847	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1848	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1849		return 0;
1850	return autoremove_wake_function(wait, mode, sync, arg);
1851}
1852
1853static void memcg_oom_recover(struct mem_cgroup *memcg)
1854{
1855	/*
1856	 * For the following lockless ->under_oom test, the only required
1857	 * guarantee is that it must see the state asserted by an OOM when
1858	 * this function is called as a result of userland actions
1859	 * triggered by the notification of the OOM.  This is trivially
1860	 * achieved by invoking mem_cgroup_mark_under_oom() before
1861	 * triggering notification.
1862	 */
1863	if (memcg && memcg->under_oom)
1864		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1865}
1866
1867enum oom_status {
1868	OOM_SUCCESS,
1869	OOM_FAILED,
1870	OOM_ASYNC,
1871	OOM_SKIPPED
1872};
1873
1874static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1875{
1876	enum oom_status ret;
1877	bool locked;
1878
1879	if (order > PAGE_ALLOC_COSTLY_ORDER)
1880		return OOM_SKIPPED;
1881
1882	memcg_memory_event(memcg, MEMCG_OOM);
1883
1884	/*
1885	 * We are in the middle of the charge context here, so we
1886	 * don't want to block when potentially sitting on a callstack
1887	 * that holds all kinds of filesystem and mm locks.
1888	 *
1889	 * cgroup1 allows disabling the OOM killer and waiting for outside
1890	 * handling until the charge can succeed; remember the context and put
1891	 * the task to sleep at the end of the page fault when all locks are
1892	 * released.
1893	 *
1894	 * On the other hand, the in-kernel OOM killer allows for async victim
1895	 * memory reclaim (oom_reaper), which means that we are not solely
1896	 * relying on the oom victim to make forward progress, so we can
1897	 * invoke the oom killer here.
1898	 *
1899	 * Please note that mem_cgroup_out_of_memory might fail to find a
1900	 * victim and then we have to bail out from the charge path.
1901	 */
1902	if (memcg->oom_kill_disable) {
1903		if (!current->in_user_fault)
1904			return OOM_SKIPPED;
1905		css_get(&memcg->css);
1906		current->memcg_in_oom = memcg;
1907		current->memcg_oom_gfp_mask = mask;
1908		current->memcg_oom_order = order;
1909
1910		return OOM_ASYNC;
1911	}
1912
1913	mem_cgroup_mark_under_oom(memcg);
1914
1915	locked = mem_cgroup_oom_trylock(memcg);
1916
1917	if (locked)
1918		mem_cgroup_oom_notify(memcg);
1919
1920	mem_cgroup_unmark_under_oom(memcg);
1921	if (mem_cgroup_out_of_memory(memcg, mask, order))
1922		ret = OOM_SUCCESS;
1923	else
1924		ret = OOM_FAILED;
1925
1926	if (locked)
1927		mem_cgroup_oom_unlock(memcg);
1928
1929	return ret;
1930}
1931
1932/**
1933 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1934 * @handle: actually kill/wait or just clean up the OOM state
1935 *
1936 * This has to be called at the end of a page fault if the memcg OOM
1937 * handler was enabled.
1938 *
1939 * Memcg supports userspace OOM handling where failed allocations must
1940 * sleep on a waitqueue until the userspace task resolves the
1941 * situation.  Sleeping directly in the charge context with all kinds
1942 * of locks held is not a good idea, instead we remember an OOM state
1943 * in the task and mem_cgroup_oom_synchronize() has to be called at
1944 * the end of the page fault to complete the OOM handling.
1945 *
1946 * Returns %true if an ongoing memcg OOM situation was detected and
1947 * completed, %false otherwise.
1948 */
1949bool mem_cgroup_oom_synchronize(bool handle)
1950{
1951	struct mem_cgroup *memcg = current->memcg_in_oom;
1952	struct oom_wait_info owait;
1953	bool locked;
1954
1955	/* OOM is global, do not handle */
1956	if (!memcg)
1957		return false;
1958
1959	if (!handle)
1960		goto cleanup;
1961
1962	owait.memcg = memcg;
1963	owait.wait.flags = 0;
1964	owait.wait.func = memcg_oom_wake_function;
1965	owait.wait.private = current;
1966	INIT_LIST_HEAD(&owait.wait.entry);
1967
1968	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1969	mem_cgroup_mark_under_oom(memcg);
1970
1971	locked = mem_cgroup_oom_trylock(memcg);
1972
1973	if (locked)
1974		mem_cgroup_oom_notify(memcg);
1975
1976	if (locked && !memcg->oom_kill_disable) {
1977		mem_cgroup_unmark_under_oom(memcg);
1978		finish_wait(&memcg_oom_waitq, &owait.wait);
1979		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1980					 current->memcg_oom_order);
1981	} else {
1982		schedule();
1983		mem_cgroup_unmark_under_oom(memcg);
1984		finish_wait(&memcg_oom_waitq, &owait.wait);
1985	}
1986
1987	if (locked) {
1988		mem_cgroup_oom_unlock(memcg);
1989		/*
1990		 * There is no guarantee that an OOM-lock contender
1991		 * sees the wakeups triggered by the OOM kill
1992		 * uncharges.  Wake any sleepers explicitly.
1993		 */
1994		memcg_oom_recover(memcg);
1995	}
1996cleanup:
1997	current->memcg_in_oom = NULL;
1998	css_put(&memcg->css);
1999	return true;
2000}
2001
2002/**
2003 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
2004 * @victim: task to be killed by the OOM killer
2005 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
2006 *
2007 * Returns a pointer to a memory cgroup, which has to be cleaned up
2008 * by killing all belonging OOM-killable tasks.
2009 *
2010 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
2011 */
2012struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
2013					    struct mem_cgroup *oom_domain)
2014{
2015	struct mem_cgroup *oom_group = NULL;
2016	struct mem_cgroup *memcg;
2017
2018	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2019		return NULL;
2020
2021	if (!oom_domain)
2022		oom_domain = root_mem_cgroup;
2023
2024	rcu_read_lock();
2025
2026	memcg = mem_cgroup_from_task(victim);
2027	if (memcg == root_mem_cgroup)
2028		goto out;
2029
2030	/*
2031	 * If the victim task has been asynchronously moved to a different
2032	 * memory cgroup, we might end up killing tasks outside oom_domain.
2033	 * In this case it's better to ignore memory.group.oom.
2034	 */
2035	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2036		goto out;
2037
2038	/*
2039	 * Traverse the memory cgroup hierarchy from the victim task's
2040	 * cgroup up to the OOMing cgroup (or root) to find the
2041	 * highest-level memory cgroup with oom.group set.
2042	 */
2043	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2044		if (memcg->oom_group)
2045			oom_group = memcg;
2046
2047		if (memcg == oom_domain)
2048			break;
2049	}
2050
2051	if (oom_group)
2052		css_get(&oom_group->css);
2053out:
2054	rcu_read_unlock();
2055
2056	return oom_group;
2057}
2058
2059void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2060{
2061	pr_info("Tasks in ");
2062	pr_cont_cgroup_path(memcg->css.cgroup);
2063	pr_cont(" are going to be killed due to memory.oom.group set\n");
2064}
2065
2066/**
2067 * lock_page_memcg - lock a page->mem_cgroup binding
2068 * @page: the page
2069 *
2070 * This function protects unlocked LRU pages from being moved to
2071 * another cgroup.
2072 *
2073 * It ensures the lifetime of the returned memcg. The caller is responsible
2074 * for the lifetime of the page; __unlock_page_memcg() is available
2075 * when @page might get freed inside the locked section.
2076 */
2077struct mem_cgroup *lock_page_memcg(struct page *page)
2078{
2079	struct page *head = compound_head(page); /* rmap on tail pages */
2080	struct mem_cgroup *memcg;
2081	unsigned long flags;
2082
2083	/*
2084	 * The RCU lock is held throughout the transaction.  The fast
2085	 * path can get away without acquiring the memcg->move_lock
2086	 * because page moving starts with an RCU grace period.
2087	 *
2088	 * The RCU lock also protects the memcg from being freed when
2089	 * the page state that is going to change is the only thing
2090	 * preventing the page itself from being freed. E.g. writeback
2091	 * doesn't hold a page reference and relies on PG_writeback to
2092	 * keep off truncation, migration and so forth.
2093	 */
2094	rcu_read_lock();
2095
2096	if (mem_cgroup_disabled())
2097		return NULL;
2098again:
2099	memcg = head->mem_cgroup;
2100	if (unlikely(!memcg))
2101		return NULL;
2102
2103	if (atomic_read(&memcg->moving_account) <= 0)
2104		return memcg;
2105
2106	spin_lock_irqsave(&memcg->move_lock, flags);
2107	if (memcg != head->mem_cgroup) {
2108		spin_unlock_irqrestore(&memcg->move_lock, flags);
2109		goto again;
2110	}
2111
2112	/*
2113	 * When charge migration first begins, we can have locked and
2114	 * unlocked page stat updates happening concurrently.  Track
2115	 * the task that has the lock for unlock_page_memcg().
2116	 */
2117	memcg->move_lock_task = current;
2118	memcg->move_lock_flags = flags;
2119
2120	return memcg;
2121}
2122EXPORT_SYMBOL(lock_page_memcg);
2123
2124/**
2125 * __unlock_page_memcg - unlock and unpin a memcg
2126 * @memcg: the memcg
2127 *
2128 * Unlock and unpin a memcg returned by lock_page_memcg().
2129 */
2130void __unlock_page_memcg(struct mem_cgroup *memcg)
2131{
2132	if (memcg && memcg->move_lock_task == current) {
2133		unsigned long flags = memcg->move_lock_flags;
2134
2135		memcg->move_lock_task = NULL;
2136		memcg->move_lock_flags = 0;
2137
2138		spin_unlock_irqrestore(&memcg->move_lock, flags);
2139	}
2140
2141	rcu_read_unlock();
2142}
2143
2144/**
2145 * unlock_page_memcg - unlock a page->mem_cgroup binding
2146 * @page: the page
2147 */
2148void unlock_page_memcg(struct page *page)
2149{
2150	struct page *head = compound_head(page);
2151
2152	__unlock_page_memcg(head->mem_cgroup);
2153}
2154EXPORT_SYMBOL(unlock_page_memcg);
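/*
 * Minimal usage sketch of the locking pair above (editor's note):
 *
 *	memcg = lock_page_memcg(page);
 *	... update page state that feeds memcg counters ...
 *	unlock_page_memcg(page);
 *
 * or __unlock_page_memcg(memcg) when @page may be freed inside the locked
 * section, as described in the lock_page_memcg() comment.
 */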
2155
2156struct memcg_stock_pcp {
2157	struct mem_cgroup *cached; /* this is never the root cgroup */
2158	unsigned int nr_pages;
2159
2160#ifdef CONFIG_MEMCG_KMEM
2161	struct obj_cgroup *cached_objcg;
2162	unsigned int nr_bytes;
2163#endif
2164
2165	struct work_struct work;
2166	unsigned long flags;
2167#define FLUSHING_CACHED_CHARGE	0
2168};
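/*
 * Per-CPU charge cache (editor's summary of the functions below): try_charge()
 * over-charges the page counters by up to MEMCG_CHARGE_BATCH pages and parks
 * the surplus here via refill_stock(); subsequent charges for the same memcg
 * are then served locklessly by consume_stock().  drain_stock() and
 * drain_all_stock() return the surplus to the counters when the slot is
 * needed for another memcg or when the hierarchy is under pressure.
 */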
2169static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2170static DEFINE_MUTEX(percpu_charge_mutex);
2171
2172#ifdef CONFIG_MEMCG_KMEM
2173static void drain_obj_stock(struct memcg_stock_pcp *stock);
2174static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2175				     struct mem_cgroup *root_memcg);
2176
2177#else
2178static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2179{
2180}
2181static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2182				     struct mem_cgroup *root_memcg)
2183{
2184	return false;
2185}
2186#endif
2187
2188/**
2189 * consume_stock: Try to consume stocked charge on this cpu.
2190 * @memcg: memcg to consume from.
2191 * @nr_pages: how many pages to charge.
2192 *
2193 * The charges will only happen if @memcg matches the current cpu's memcg
2194 * stock, and at least @nr_pages are available in that stock.  Failure to
2195 * service an allocation will refill the stock.
2196 *
2197 * returns true if successful, false otherwise.
2198 */
2199static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2200{
2201	struct memcg_stock_pcp *stock;
2202	unsigned long flags;
2203	bool ret = false;
2204
2205	if (nr_pages > MEMCG_CHARGE_BATCH)
2206		return ret;
2207
2208	local_irq_save(flags);
2209
2210	stock = this_cpu_ptr(&memcg_stock);
2211	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2212		stock->nr_pages -= nr_pages;
2213		ret = true;
2214	}
2215
2216	local_irq_restore(flags);
2217
2218	return ret;
2219}
2220
2221/*
2222 * Return the cached per-cpu charges to the page counters and reset the cached information.
2223 */
2224static void drain_stock(struct memcg_stock_pcp *stock)
2225{
2226	struct mem_cgroup *old = stock->cached;
2227
2228	if (!old)
2229		return;
2230
2231	if (stock->nr_pages) {
2232		page_counter_uncharge(&old->memory, stock->nr_pages);
2233		if (do_memsw_account())
2234			page_counter_uncharge(&old->memsw, stock->nr_pages);
2235		stock->nr_pages = 0;
2236	}
2237
2238	css_put(&old->css);
2239	stock->cached = NULL;
2240}
2241
2242static void drain_local_stock(struct work_struct *dummy)
2243{
2244	struct memcg_stock_pcp *stock;
2245	unsigned long flags;
2246
2247	/*
2248	 * The only protection from memory hotplug vs. drain_stock races is
2249	 * that we always operate on local CPU stock here with IRQ disabled
2250	 */
2251	local_irq_save(flags);
2252
2253	stock = this_cpu_ptr(&memcg_stock);
2254	drain_obj_stock(stock);
2255	drain_stock(stock);
2256	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2257
2258	local_irq_restore(flags);
2259}
2260
2261/*
2262 * Cache charges (nr_pages) in the local per-cpu area.
2263 * They will be consumed by consume_stock() later.
2264 */
2265static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2266{
2267	struct memcg_stock_pcp *stock;
2268	unsigned long flags;
2269
2270	local_irq_save(flags);
2271
2272	stock = this_cpu_ptr(&memcg_stock);
2273	if (stock->cached != memcg) { /* reset if necessary */
2274		drain_stock(stock);
2275		css_get(&memcg->css);
2276		stock->cached = memcg;
2277	}
2278	stock->nr_pages += nr_pages;
2279
2280	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2281		drain_stock(stock);
2282
2283	local_irq_restore(flags);
2284}
2285
2286/*
2287 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2288 * of the hierarchy under it.
2289 */
2290static void drain_all_stock(struct mem_cgroup *root_memcg)
2291{
2292	int cpu, curcpu;
2293
2294	/* If someone's already draining, avoid adding more workers. */
2295	if (!mutex_trylock(&percpu_charge_mutex))
2296		return;
2297	/*
2298	 * Notify other cpus that a system-wide "drain" is running.
2299	 * We do not care about races with the cpu hotplug because cpu down
2300	 * as well as workers from this path always operate on the local
2301	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
2302	 */
2303	curcpu = get_cpu();
2304	for_each_online_cpu(cpu) {
2305		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2306		struct mem_cgroup *memcg;
2307		bool flush = false;
2308
2309		rcu_read_lock();
2310		memcg = stock->cached;
2311		if (memcg && stock->nr_pages &&
2312		    mem_cgroup_is_descendant(memcg, root_memcg))
2313			flush = true;
2314		if (obj_stock_flush_required(stock, root_memcg))
2315			flush = true;
2316		rcu_read_unlock();
2317
2318		if (flush &&
2319		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2320			if (cpu == curcpu)
2321				drain_local_stock(&stock->work);
2322			else
2323				schedule_work_on(cpu, &stock->work);
2324		}
2325	}
2326	put_cpu();
2327	mutex_unlock(&percpu_charge_mutex);
2328}
2329
2330static int memcg_hotplug_cpu_dead(unsigned int cpu)
2331{
2332	struct memcg_stock_pcp *stock;
2333	struct mem_cgroup *memcg, *mi;
2334
2335	stock = &per_cpu(memcg_stock, cpu);
2336	drain_stock(stock);
2337
2338	for_each_mem_cgroup(memcg) {
2339		int i;
2340
2341		for (i = 0; i < MEMCG_NR_STAT; i++) {
2342			int nid;
2343			long x;
2344
2345			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
2346			if (x)
2347				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2348					atomic_long_add(x, &memcg->vmstats[i]);
2349
2350			if (i >= NR_VM_NODE_STAT_ITEMS)
2351				continue;
2352
2353			for_each_node(nid) {
2354				struct mem_cgroup_per_node *pn;
2355
2356				pn = mem_cgroup_nodeinfo(memcg, nid);
2357				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2358				if (x)
2359					do {
2360						atomic_long_add(x, &pn->lruvec_stat[i]);
2361					} while ((pn = parent_nodeinfo(pn, nid)));
2362			}
2363		}
2364
2365		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2366			long x;
2367
2368			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
2369			if (x)
2370				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2371					atomic_long_add(x, &memcg->vmevents[i]);
2372		}
2373	}
2374
2375	return 0;
2376}
2377
2378static unsigned long reclaim_high(struct mem_cgroup *memcg,
2379				  unsigned int nr_pages,
2380				  gfp_t gfp_mask)
2381{
2382	unsigned long nr_reclaimed = 0;
2383
2384	do {
2385		unsigned long pflags;
2386
2387		if (page_counter_read(&memcg->memory) <=
2388		    READ_ONCE(memcg->memory.high))
2389			continue;
2390
2391		memcg_memory_event(memcg, MEMCG_HIGH);
2392
2393		psi_memstall_enter(&pflags);
2394		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2395							     gfp_mask, true);
2396		psi_memstall_leave(&pflags);
2397	} while ((memcg = parent_mem_cgroup(memcg)) &&
2398		 !mem_cgroup_is_root(memcg));
2399
2400	return nr_reclaimed;
2401}
2402
2403static void high_work_func(struct work_struct *work)
2404{
2405	struct mem_cgroup *memcg;
2406
2407	memcg = container_of(work, struct mem_cgroup, high_work);
2408	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2409}
2410
2411/*
2412 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2413 * enough to still cause a significant slowdown in most cases, while still
2414 * allowing diagnostics and tracing to proceed without becoming stuck.
2415 */
2416#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2417
2418/*
2419 * When calculating the delay, we use these on either side of the exponentiation
2420 * to maintain precision and to scale to a reasonable number of jiffies (see the
2421 * table below).
2422 *
2423 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2424 *   overage ratio to a delay.
2425 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2426 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
2427 *   to produce a reasonable delay curve.
2428 *
2429 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2430 * reasonable delay curve compared to precision-adjusted overage, not
2431 * penalising heavily at first, but still making sure that growth beyond the
2432 * limit penalises misbehaving cgroups by slowing them down exponentially. For
2433 * example, with a high of 100 megabytes:
2434 *
2435 *  +-------+------------------------+
2436 *  | usage | time to allocate in ms |
2437 *  +-------+------------------------+
2438 *  | 100M  |                      0 |
2439 *  | 101M  |                      6 |
2440 *  | 102M  |                     25 |
2441 *  | 103M  |                     57 |
2442 *  | 104M  |                    102 |
2443 *  | 105M  |                    159 |
2444 *  | 106M  |                    230 |
2445 *  | 107M  |                    313 |
2446 *  | 108M  |                    409 |
2447 *  | 109M  |                    518 |
2448 *  | 110M  |                    639 |
2449 *  | 111M  |                    774 |
2450 *  | 112M  |                    921 |
2451 *  | 113M  |                   1081 |
2452 *  | 114M  |                   1254 |
2453 *  | 115M  |                   1439 |
2454 *  | 116M  |                   1638 |
2455 *  | 117M  |                   1849 |
2456 *  | 118M  |                   2000 |
2457 *  | 119M  |                   2000 |
2458 *  | 120M  |                   2000 |
2459 *  +-------+------------------------+
2460 */
2461 #define MEMCG_DELAY_PRECISION_SHIFT 20
2462 #define MEMCG_DELAY_SCALING_SHIFT 14
2463
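/*
 * Rough worked example (editor's note): with memory.high = 100M and usage =
 * 110M, calculate_overage() below yields
 *
 *	overage = (10M << MEMCG_DELAY_PRECISION_SHIFT) / 100M ~= 104857
 *
 * and calculate_high_delay() turns that into
 *
 *	penalty = overage * overage * HZ >> (20 + 14) ~= 0.64 * HZ
 *
 * i.e. roughly the 639ms shown in the table above for a full
 * MEMCG_CHARGE_BATCH-sized charge; smaller charges are scaled down
 * proportionally.
 */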
2464static u64 calculate_overage(unsigned long usage, unsigned long high)
2465{
2466	u64 overage;
2467
2468	if (usage <= high)
2469		return 0;
2470
2471	/*
2472	 * Prevent division by 0 in overage calculation by acting as if
2473	 * it was a threshold of 1 page
2474	 */
2475	high = max(high, 1UL);
2476
2477	overage = usage - high;
2478	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2479	return div64_u64(overage, high);
2480}
2481
2482static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2483{
2484	u64 overage, max_overage = 0;
2485
2486	do {
2487		overage = calculate_overage(page_counter_read(&memcg->memory),
2488					    READ_ONCE(memcg->memory.high));
2489		max_overage = max(overage, max_overage);
2490	} while ((memcg = parent_mem_cgroup(memcg)) &&
2491		 !mem_cgroup_is_root(memcg));
2492
2493	return max_overage;
2494}
2495
2496static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2497{
2498	u64 overage, max_overage = 0;
2499
2500	do {
2501		overage = calculate_overage(page_counter_read(&memcg->swap),
2502					    READ_ONCE(memcg->swap.high));
2503		if (overage)
2504			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2505		max_overage = max(overage, max_overage);
2506	} while ((memcg = parent_mem_cgroup(memcg)) &&
2507		 !mem_cgroup_is_root(memcg));
2508
2509	return max_overage;
2510}
2511
2512/*
2513 * Get the number of jiffies that we should penalise a mischievous cgroup which
2514 * is exceeding its memory.high by checking both it and its ancestors.
2515 */
2516static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2517					  unsigned int nr_pages,
2518					  u64 max_overage)
2519{
2520	unsigned long penalty_jiffies;
2521
2522	if (!max_overage)
2523		return 0;
2524
2525	/*
2526	 * We use overage compared to memory.high to calculate the number of
2527	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
2528	 * fairly lenient on small overages, and increasingly harsh when the
2529	 * memcg in question makes it clear that it has no intention of stopping
2530	 * its crazy behaviour, so we exponentially increase the delay based on
2531	 * overage amount.
2532	 */
2533	penalty_jiffies = max_overage * max_overage * HZ;
2534	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2535	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2536
2537	/*
2538	 * Factor in the task's own contribution to the overage, such that four
2539	 * N-sized allocations are throttled approximately the same as one
2540	 * 4N-sized allocation.
2541	 *
2542	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2543	 * larger the current charge batch is than that.
2544	 */
2545	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2546}
2547
2548/*
2549 * Scheduled by try_charge() to be executed from the userland return path
2550 * and reclaims memory over the high limit.
2551 */
2552void mem_cgroup_handle_over_high(void)
2553{
2554	unsigned long penalty_jiffies;
2555	unsigned long pflags;
2556	unsigned long nr_reclaimed;
2557	unsigned int nr_pages = current->memcg_nr_pages_over_high;
2558	int nr_retries = MAX_RECLAIM_RETRIES;
2559	struct mem_cgroup *memcg;
2560	bool in_retry = false;
2561
2562	if (likely(!nr_pages))
2563		return;
2564
2565	memcg = get_mem_cgroup_from_mm(current->mm);
2566	current->memcg_nr_pages_over_high = 0;
2567
2568retry_reclaim:
2569	/*
2570	 * The allocating task should reclaim at least the batch size, but for
2571	 * subsequent retries we only want to do what's necessary to prevent oom
2572	 * or breaching resource isolation.
2573	 *
2574	 * This is distinct from memory.max or page allocator behaviour because
2575	 * memory.high is currently batched, whereas memory.max and the page
2576	 * allocator run every time an allocation is made.
2577	 */
2578	nr_reclaimed = reclaim_high(memcg,
2579				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2580				    GFP_KERNEL);
2581
2582	/*
2583	 * memory.high is breached and reclaim is unable to keep up. Throttle
2584	 * allocators proactively to slow down excessive growth.
2585	 */
2586	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2587					       mem_find_max_overage(memcg));
2588
2589	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2590						swap_find_max_overage(memcg));
2591
2592	/*
2593	 * Clamp the max delay per usermode return so as to still keep the
2594	 * application moving forwards and also permit diagnostics, albeit
2595	 * extremely slowly.
2596	 */
2597	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2598
2599	/*
2600	 * Don't sleep if the amount of jiffies this memcg owes us is so low
2601	 * that it's not even worth doing, in an attempt to be nice to those who
2602	 * go only a small amount over their memory.high value and maybe haven't
2603	 * been aggressively reclaimed enough yet.
2604	 */
2605	if (penalty_jiffies <= HZ / 100)
2606		goto out;
2607
2608	/*
2609	 * If reclaim is making forward progress but we're still over
2610	 * memory.high, we want to encourage that rather than doing allocator
2611	 * throttling.
2612	 */
2613	if (nr_reclaimed || nr_retries--) {
2614		in_retry = true;
2615		goto retry_reclaim;
2616	}
2617
2618	/*
2619	 * If we exit early, we're guaranteed to die (since
2620	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2621	 * need to account for any ill-begotten jiffies to pay them off later.
2622	 */
2623	psi_memstall_enter(&pflags);
2624	schedule_timeout_killable(penalty_jiffies);
2625	psi_memstall_leave(&pflags);
2626
2627out:
2628	css_put(&memcg->css);
2629}
2630
2631static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2632		      unsigned int nr_pages)
2633{
2634	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2635	int nr_retries = MAX_RECLAIM_RETRIES;
2636	struct mem_cgroup *mem_over_limit;
2637	struct page_counter *counter;
2638	enum oom_status oom_status;
2639	unsigned long nr_reclaimed;
2640	bool may_swap = true;
2641	bool drained = false;
2642	unsigned long pflags;
2643
2644	if (mem_cgroup_is_root(memcg))
2645		return 0;
2646retry:
2647	if (consume_stock(memcg, nr_pages))
2648		return 0;
2649
2650	if (!do_memsw_account() ||
2651	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2652		if (page_counter_try_charge(&memcg->memory, batch, &counter))
2653			goto done_restock;
2654		if (do_memsw_account())
2655			page_counter_uncharge(&memcg->memsw, batch);
2656		mem_over_limit = mem_cgroup_from_counter(counter, memory);
2657	} else {
2658		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2659		may_swap = false;
2660	}
2661
2662	if (batch > nr_pages) {
2663		batch = nr_pages;
2664		goto retry;
2665	}
2666
2667	/*
2668	 * Memcg doesn't have a dedicated reserve for atomic
2669	 * allocations. But like the global atomic pool, we need to
2670	 * put the burden of reclaim on regular allocation requests
2671	 * and let these go through as privileged allocations.
2672	 */
2673	if (gfp_mask & __GFP_ATOMIC)
2674		goto force;
2675
2676	/*
2677	 * Unlike in global OOM situations, memcg is not in a physical
2678	 * memory shortage.  Allow dying and OOM-killed tasks to
2679	 * bypass the last charges so that they can exit quickly and
2680	 * free their memory.
2681	 */
2682	if (unlikely(should_force_charge()))
2683		goto force;
2684
2685	/*
2686	 * Prevent unbounded recursion when reclaim operations need to
2687	 * allocate memory. This might exceed the limits temporarily,
2688	 * but we prefer facilitating memory reclaim and getting back
2689	 * under the limit over triggering OOM kills in these cases.
2690	 */
2691	if (unlikely(current->flags & PF_MEMALLOC))
2692		goto force;
2693
2694	if (unlikely(task_in_memcg_oom(current)))
2695		goto nomem;
2696
2697	if (!gfpflags_allow_blocking(gfp_mask))
2698		goto nomem;
2699
2700	memcg_memory_event(mem_over_limit, MEMCG_MAX);
2701
2702	psi_memstall_enter(&pflags);
2703	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2704						    gfp_mask, may_swap);
2705	psi_memstall_leave(&pflags);
2706
2707	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2708		goto retry;
2709
2710	if (!drained) {
2711		drain_all_stock(mem_over_limit);
2712		drained = true;
2713		goto retry;
2714	}
2715
2716	if (gfp_mask & __GFP_NORETRY)
2717		goto nomem;
2718	/*
2719	 * Even though the limit is exceeded at this point, reclaim
2720	 * may have been able to free some pages.  Retry the charge
2721	 * before killing the task.
2722	 *
2723	 * Only for regular pages, though: huge pages are rather
2724	 * unlikely to succeed so close to the limit, and we fall back
2725	 * to regular pages anyway in case of failure.
2726	 */
2727	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2728		goto retry;
2729	/*
2730	 * During a task move, charges can be doubly counted. So it's
2731	 * better to wait until the end of the move if one is in progress.
2732	 */
2733	if (mem_cgroup_wait_acct_move(mem_over_limit))
2734		goto retry;
2735
2736	if (nr_retries--)
2737		goto retry;
2738
2739	if (gfp_mask & __GFP_RETRY_MAYFAIL)
2740		goto nomem;
2741
2742	if (gfp_mask & __GFP_NOFAIL)
2743		goto force;
2744
2745	if (fatal_signal_pending(current))
2746		goto force;
2747
2748	/*
2749	 * keep retrying as long as the memcg oom killer is able to make
2750	 * a forward progress or bypass the charge if the oom killer
2751	 * couldn't make any progress.
2752	 */
2753	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2754		       get_order(nr_pages * PAGE_SIZE));
2755	switch (oom_status) {
2756	case OOM_SUCCESS:
2757		nr_retries = MAX_RECLAIM_RETRIES;
2758		goto retry;
2759	case OOM_FAILED:
2760		goto force;
2761	default:
2762		goto nomem;
2763	}
2764nomem:
2765	if (!(gfp_mask & __GFP_NOFAIL))
2766		return -ENOMEM;
2767force:
2768	/*
2769	 * The allocation either can't fail or will lead to more memory
2770	 * being freed very soon.  Allow memory usage to go over the limit
2771	 * temporarily by force charging it.
2772	 */
2773	page_counter_charge(&memcg->memory, nr_pages);
2774	if (do_memsw_account())
2775		page_counter_charge(&memcg->memsw, nr_pages);
2776
2777	return 0;
2778
2779done_restock:
2780	if (batch > nr_pages)
2781		refill_stock(memcg, batch - nr_pages);
2782
2783	/*
2784	 * If the hierarchy is above the normal consumption range, schedule
2785	 * reclaim on returning to userland.  We can perform reclaim here
2786	 * if __GFP_RECLAIM is set, but let's always punt for simplicity and so that
2787	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2788	 * not recorded as it most likely matches current's and won't
2789	 * change in the meantime.  As high limit is checked again before
2790	 * reclaim, the cost of mismatch is negligible.
2791	 */
2792	do {
2793		bool mem_high, swap_high;
2794
2795		mem_high = page_counter_read(&memcg->memory) >
2796			READ_ONCE(memcg->memory.high);
2797		swap_high = page_counter_read(&memcg->swap) >
2798			READ_ONCE(memcg->swap.high);
2799
2800		/* Don't bother a random interrupted task */
2801		if (in_interrupt()) {
2802			if (mem_high) {
2803				schedule_work(&memcg->high_work);
2804				break;
2805			}
2806			continue;
2807		}
2808
2809		if (mem_high || swap_high) {
2810			/*
2811			 * The allocating tasks in this cgroup will need to do
2812			 * reclaim or be throttled to prevent further growth
2813			 * of the memory or swap footprints.
2814			 *
2815			 * Target some best-effort fairness between the tasks,
2816			 * and distribute reclaim work and delay penalties
2817			 * based on how much each task is actually allocating.
2818			 */
2819			current->memcg_nr_pages_over_high += batch;
2820			set_notify_resume(current);
2821			break;
2822		}
2823	} while ((memcg = parent_mem_cgroup(memcg)));
2824
2825	return 0;
2826}
2827
2828#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
2829static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2830{
2831	if (mem_cgroup_is_root(memcg))
2832		return;
2833
2834	page_counter_uncharge(&memcg->memory, nr_pages);
2835	if (do_memsw_account())
2836		page_counter_uncharge(&memcg->memsw, nr_pages);
2837}
2838#endif
2839
2840static void commit_charge(struct page *page, struct mem_cgroup *memcg)
2841{
2842	VM_BUG_ON_PAGE(page->mem_cgroup, page);
2843	/*
2844	 * Any of the following ensures page->mem_cgroup stability:
2845	 *
2846	 * - the page lock
2847	 * - LRU isolation
2848	 * - lock_page_memcg()
2849	 * - exclusive reference
2850	 */
2851	page->mem_cgroup = memcg;
2852}
2853
2854#ifdef CONFIG_MEMCG_KMEM
2855int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2856				 gfp_t gfp)
2857{
2858	unsigned int objects = objs_per_slab_page(s, page);
2859	void *vec;
2860
2861	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2862			   page_to_nid(page));
2863	if (!vec)
2864		return -ENOMEM;
2865
2866	if (cmpxchg(&page->obj_cgroups, NULL,
2867		    (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2868		kfree(vec);
2869	else
2870		kmemleak_not_leak(vec);
2871
2872	return 0;
2873}
2874
2875/*
2876 * Returns a pointer to the memory cgroup to which the kernel object is charged.
2877 *
2878 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2879 * cgroup_mutex, etc.
2880 */
2881struct mem_cgroup *mem_cgroup_from_obj(void *p)
2882{
2883	struct page *page;
2884
2885	if (mem_cgroup_disabled())
2886		return NULL;
2887
2888	page = virt_to_head_page(p);
2889
2890	/*
2891	 * Slab objects are accounted individually, not per-page.
2892	 * Memcg membership data for each individual object is saved in
2893	 * the page->obj_cgroups.
2894	 */
2895	if (page_has_obj_cgroups(page)) {
2896		struct obj_cgroup *objcg;
2897		unsigned int off;
2898
2899		off = obj_to_index(page->slab_cache, page, p);
2900		objcg = page_obj_cgroups(page)[off];
2901		if (objcg)
2902			return obj_cgroup_memcg(objcg);
2903
2904		return NULL;
2905	}
2906
2907	/* All other pages use page->mem_cgroup */
2908	return page->mem_cgroup;
2909}
2910
2911__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2912{
2913	struct obj_cgroup *objcg = NULL;
2914	struct mem_cgroup *memcg;
2915
2916	if (unlikely(!current->mm && !current->active_memcg))
2917		return NULL;
2918
2919	rcu_read_lock();
2920	if (unlikely(current->active_memcg))
2921		memcg = rcu_dereference(current->active_memcg);
2922	else
2923		memcg = mem_cgroup_from_task(current);
2924
2925	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
2926		objcg = rcu_dereference(memcg->objcg);
2927		if (objcg && obj_cgroup_tryget(objcg))
2928			break;
2929	}
2930	rcu_read_unlock();
2931
2932	return objcg;
2933}
2934
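/*
 * Sketch of how the obj_cgroup API in this file fits together (editor's
 * note, not a verbatim caller):
 *
 *	objcg = get_obj_cgroup_from_current();
 *	if (objcg && !obj_cgroup_charge(objcg, gfp, size))
 *		... account the object and remember objcg for the free path ...
 *	...
 *	obj_cgroup_uncharge(objcg, size);
 *	obj_cgroup_put(objcg);
 */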
2935static int memcg_alloc_cache_id(void)
2936{
2937	int id, size;
2938	int err;
2939
2940	id = ida_simple_get(&memcg_cache_ida,
2941			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2942	if (id < 0)
2943		return id;
2944
2945	if (id < memcg_nr_cache_ids)
2946		return id;
2947
2948	/*
2949	 * There's no space for the new id in memcg_caches arrays,
2950	 * so we have to grow them.
2951	 */
2952	down_write(&memcg_cache_ids_sem);
2953
2954	size = 2 * (id + 1);
2955	if (size < MEMCG_CACHES_MIN_SIZE)
2956		size = MEMCG_CACHES_MIN_SIZE;
2957	else if (size > MEMCG_CACHES_MAX_SIZE)
2958		size = MEMCG_CACHES_MAX_SIZE;
2959
2960	err = memcg_update_all_list_lrus(size);
2961	if (!err)
2962		memcg_nr_cache_ids = size;
2963
2964	up_write(&memcg_cache_ids_sem);
2965
2966	if (err) {
2967		ida_simple_remove(&memcg_cache_ida, id);
2968		return err;
2969	}
2970	return id;
2971}
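/*
 * Example of the sizing above (editor's note): a freshly allocated id of 100
 * requests 2 * (100 + 1) = 202 slots, which is then clamped to the
 * [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE] range.
 */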
2972
2973static void memcg_free_cache_id(int id)
2974{
2975	ida_simple_remove(&memcg_cache_ida, id);
2976}
2977
2978/**
2979 * __memcg_kmem_charge: charge a number of kernel pages to a memcg
2980 * @memcg: memory cgroup to charge
2981 * @gfp: reclaim mode
2982 * @nr_pages: number of pages to charge
2983 *
2984 * Returns 0 on success, an error code on failure.
2985 */
2986int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
2987			unsigned int nr_pages)
2988{
2989	struct page_counter *counter;
2990	int ret;
2991
2992	ret = try_charge(memcg, gfp, nr_pages);
2993	if (ret)
2994		return ret;
2995
2996	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2997	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2998
2999		/*
3000		 * Enforce __GFP_NOFAIL allocation because callers are not
3001		 * prepared to see failures and likely do not have any failure
3002		 * handling code.
3003		 */
3004		if (gfp & __GFP_NOFAIL) {
3005			page_counter_charge(&memcg->kmem, nr_pages);
3006			return 0;
3007		}
3008		cancel_charge(memcg, nr_pages);
3009		return -ENOMEM;
3010	}
3011	return 0;
3012}
3013
3014/**
3015 * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3016 * @memcg: memcg to uncharge
3017 * @nr_pages: number of pages to uncharge
3018 */
3019void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3020{
3021	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3022		page_counter_uncharge(&memcg->kmem, nr_pages);
3023
3024	page_counter_uncharge(&memcg->memory, nr_pages);
3025	if (do_memsw_account())
3026		page_counter_uncharge(&memcg->memsw, nr_pages);
3027}
3028
3029/**
3030 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
3031 * @page: page to charge
3032 * @gfp: reclaim mode
3033 * @order: allocation order
3034 *
3035 * Returns 0 on success, an error code on failure.
3036 */
3037int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3038{
3039	struct mem_cgroup *memcg;
3040	int ret = 0;
3041
3042	if (memcg_kmem_bypass())
3043		return 0;
3044
3045	memcg = get_mem_cgroup_from_current();
3046	if (!mem_cgroup_is_root(memcg)) {
3047		ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3048		if (!ret) {
3049			page->mem_cgroup = memcg;
3050			__SetPageKmemcg(page);
3051			return 0;
3052		}
3053	}
3054	css_put(&memcg->css);
3055	return ret;
3056}
3057
3058/**
3059 * __memcg_kmem_uncharge_page: uncharge a kmem page
3060 * @page: page to uncharge
3061 * @order: allocation order
3062 */
3063void __memcg_kmem_uncharge_page(struct page *page, int order)
3064{
3065	struct mem_cgroup *memcg = page->mem_cgroup;
3066	unsigned int nr_pages = 1 << order;
3067
3068	if (!memcg)
3069		return;
3070
3071	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3072	__memcg_kmem_uncharge(memcg, nr_pages);
3073	page->mem_cgroup = NULL;
3074	css_put(&memcg->css);
3075
3076	/* slab pages do not have PageKmemcg flag set */
3077	if (PageKmemcg(page))
3078		__ClearPageKmemcg(page);
3079}
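/*
 * Editor's note: as far as we can tell, __memcg_kmem_charge_page() and
 * __memcg_kmem_uncharge_page() above back __GFP_ACCOUNT page allocations,
 * while the byte-granular obj_cgroup machinery below serves slab objects.
 */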
3080
3081static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3082{
3083	struct memcg_stock_pcp *stock;
3084	unsigned long flags;
3085	bool ret = false;
3086
3087	local_irq_save(flags);
3088
3089	stock = this_cpu_ptr(&memcg_stock);
3090	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3091		stock->nr_bytes -= nr_bytes;
3092		ret = true;
3093	}
3094
3095	local_irq_restore(flags);
3096
3097	return ret;
3098}
3099
3100static void drain_obj_stock(struct memcg_stock_pcp *stock)
3101{
3102	struct obj_cgroup *old = stock->cached_objcg;
3103
3104	if (!old)
3105		return;
3106
3107	if (stock->nr_bytes) {
3108		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3109		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3110
3111		if (nr_pages) {
3112			rcu_read_lock();
3113			__memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
3114			rcu_read_unlock();
3115		}
3116
3117		/*
3118		 * The leftover is flushed to the centralized per-memcg value.
3119		 * On the next attempt to refill obj stock it will be moved
3120		 * to a per-cpu stock (probably on another CPU), see
3121		 * refill_obj_stock().
3122		 *
3123		 * How often it's flushed is a trade-off between the memory
3124		 * limit enforcement accuracy and potential CPU contention,
3125		 * so it might be changed in the future.
3126		 */
3127		atomic_add(nr_bytes, &old->nr_charged_bytes);
3128		stock->nr_bytes = 0;
3129	}
3130
3131	obj_cgroup_put(old);
3132	stock->cached_objcg = NULL;
3133}
3134
3135static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3136				     struct mem_cgroup *root_memcg)
3137{
3138	struct mem_cgroup *memcg;
3139
3140	if (stock->cached_objcg) {
3141		memcg = obj_cgroup_memcg(stock->cached_objcg);
3142		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3143			return true;
3144	}
3145
3146	return false;
3147}
3148
3149static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3150{
3151	struct memcg_stock_pcp *stock;
3152	unsigned long flags;
3153
3154	local_irq_save(flags);
3155
3156	stock = this_cpu_ptr(&memcg_stock);
3157	if (stock->cached_objcg != objcg) { /* reset if necessary */
3158		drain_obj_stock(stock);
3159		obj_cgroup_get(objcg);
3160		stock->cached_objcg = objcg;
3161		stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3162	}
3163	stock->nr_bytes += nr_bytes;
3164
3165	if (stock->nr_bytes > PAGE_SIZE)
3166		drain_obj_stock(stock);
3167
3168	local_irq_restore(flags);
3169}
3170
3171int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3172{
3173	struct mem_cgroup *memcg;
3174	unsigned int nr_pages, nr_bytes;
3175	int ret;
3176
3177	if (consume_obj_stock(objcg, size))
3178		return 0;
3179
3180	/*
3181	 * In theory, objcg->nr_charged_bytes can have enough
3182	 * pre-charged bytes to satisfy the allocation. However,
3183	 * flushing objcg->nr_charged_bytes requires two atomic
3184	 * operations, and objcg->nr_charged_bytes can't be big,
3185	 * so it's better to ignore it and try to grab some new pages.
3186	 * objcg->nr_charged_bytes will be flushed in
3187	 * refill_obj_stock(), called from this function or
3188	 * independently later.
3189	 */
3190	rcu_read_lock();
3191	memcg = obj_cgroup_memcg(objcg);
3192	css_get(&memcg->css);
3193	rcu_read_unlock();
3194
3195	nr_pages = size >> PAGE_SHIFT;
3196	nr_bytes = size & (PAGE_SIZE - 1);
3197
3198	if (nr_bytes)
3199		nr_pages += 1;
3200
3201	ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3202	if (!ret && nr_bytes)
3203		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3204
3205	css_put(&memcg->css);
3206	return ret;
3207}
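/*
 * Worked example (editor's note): charging a 700-byte object charges one full
 * page through __memcg_kmem_charge() and immediately hands the unused
 * PAGE_SIZE - 700 bytes back to the per-cpu stock via refill_obj_stock(), so
 * the next small allocation against the same objcg can be satisfied by
 * consume_obj_stock() without touching the page counters.
 */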
3208
3209void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3210{
3211	refill_obj_stock(objcg, size);
3212}
3213
3214#endif /* CONFIG_MEMCG_KMEM */
3215
3216#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3217
3218/*
3219 * Because tail pages do not carry the memcg binding, set it here. We're under
3220 * pgdat->lru_lock and migration entries are set up in all page mappings.
3221 */
3222void mem_cgroup_split_huge_fixup(struct page *head)
3223{
3224	struct mem_cgroup *memcg = head->mem_cgroup;
3225	int i;
3226
3227	if (mem_cgroup_disabled())
3228		return;
3229
3230	for (i = 1; i < HPAGE_PMD_NR; i++) {
3231		css_get(&memcg->css);
3232		head[i].mem_cgroup = memcg;
3233	}
3234}
3235#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3236
3237#ifdef CONFIG_MEMCG_SWAP
3238/**
3239 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3240 * @entry: swap entry to be moved
3241 * @from:  mem_cgroup which the entry is moved from
3242 * @to:  mem_cgroup which the entry is moved to
3243 *
3244 * It succeeds only when the swap_cgroup's record for this entry is the same
3245 * as the mem_cgroup's id of @from.
3246 *
3247 * Returns 0 on success, -EINVAL on failure.
3248 *
3249 * The caller must have charged to @to, IOW, called page_counter_charge() on
3250 * both res and memsw, and called css_get().
3251 */
3252static int mem_cgroup_move_swap_account(swp_entry_t entry,
3253				struct mem_cgroup *from, struct mem_cgroup *to)
3254{
3255	unsigned short old_id, new_id;
3256
3257	old_id = mem_cgroup_id(from);
3258	new_id = mem_cgroup_id(to);
3259
3260	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3261		mod_memcg_state(from, MEMCG_SWAP, -1);
3262		mod_memcg_state(to, MEMCG_SWAP, 1);
3263		return 0;
3264	}
3265	return -EINVAL;
3266}
3267#else
3268static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3269				struct mem_cgroup *from, struct mem_cgroup *to)
3270{
3271	return -EINVAL;
3272}
3273#endif
3274
3275static DEFINE_MUTEX(memcg_max_mutex);
3276
3277static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
3278				 unsigned long max, bool memsw)
3279{
3280	bool enlarge = false;
3281	bool drained = false;
3282	int ret;
3283	bool limits_invariant;
3284	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
3285
3286	do {
3287		if (signal_pending(current)) {
3288			ret = -EINTR;
3289			break;
3290		}
3291
3292		mutex_lock(&memcg_max_mutex);
3293		/*
3294		 * Make sure that the new limit (memsw or memory limit) doesn't
3295		 * break our basic invariant rule memory.max <= memsw.max.
3296		 */
3297		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
3298					   max <= memcg->memsw.max;
3299		if (!limits_invariant) {
3300			mutex_unlock(&memcg_max_mutex);
3301			ret = -EINVAL;
3302			break;
3303		}
3304		if (max > counter->max)
3305			enlarge = true;
3306		ret = page_counter_set_max(counter, max);
3307		mutex_unlock(&memcg_max_mutex);
3308
3309		if (!ret)
3310			break;
3311
3312		if (!drained) {
3313			drain_all_stock(memcg);
3314			drained = true;
3315			continue;
3316		}
3317
3318		if (!try_to_free_mem_cgroup_pages(memcg, 1,
3319					GFP_KERNEL, !memsw)) {
3320			ret = -EBUSY;
3321			break;
3322		}
3323	} while (true);
3324
3325	if (!ret && enlarge)
3326		memcg_oom_recover(memcg);
3327
3328	return ret;
3329}
3330
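/*
 * Reclaim from the memcgs on this node that exceed their soft limit the
 * most, reinserting each one into the soft limit tree according to its
 * remaining excess. Only order-0 allocations are served here.
 */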
3331unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
3332					    gfp_t gfp_mask,
3333					    unsigned long *total_scanned)
3334{
3335	unsigned long nr_reclaimed = 0;
3336	struct mem_cgroup_per_node *mz, *next_mz = NULL;
3337	unsigned long reclaimed;
3338	int loop = 0;
3339	struct mem_cgroup_tree_per_node *mctz;
3340	unsigned long excess;
3341	unsigned long nr_scanned;
3342
3343	if (order > 0)
3344		return 0;
3345
3346	mctz = soft_limit_tree_node(pgdat->node_id);
3347
3348	/*
3349	 * Do not even bother to check the largest node if the root
3350	 * is empty. Do it lockless to prevent lock bouncing. Races
3351	 * are acceptable as soft limit is best effort anyway.
3352	 */
3353	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
3354		return 0;
3355
3356	/*
3357	 * This loop can run for a while, especially if mem_cgroups continuously
3358	 * keep exceeding their soft limit and putting the system under
3359	 * pressure.
3360	 */
3361	do {
3362		if (next_mz)
3363			mz = next_mz;
3364		else
3365			mz = mem_cgroup_largest_soft_limit_node(mctz);
3366		if (!mz)
3367			break;
3368
3369		nr_scanned = 0;
3370		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
3371						    gfp_mask, &nr_scanned);
3372		nr_reclaimed += reclaimed;
3373		*total_scanned += nr_scanned;
3374		spin_lock_irq(&mctz->lock);
3375		__mem_cgroup_remove_exceeded(mz, mctz);
3376
3377		/*
3378		 * If we failed to reclaim anything from this memory cgroup,
3379		 * it is time to move on to the next cgroup.
3380		 */
3381		next_mz = NULL;
3382		if (!reclaimed)
3383			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3384
3385		excess = soft_limit_excess(mz->memcg);
3386		/*
3387		 * One school of thought says that we should not add
3388		 * back the node to the tree if reclaim returns 0.
3389		 * But our reclaim could return 0 simply because, due
3390		 * to priority, we are exposing a smaller subset of
3391		 * memory to reclaim from. Consider this a longer-term
3392		 * TODO.
3393		 */
3394		/* If excess == 0, no tree ops */
3395		__mem_cgroup_insert_exceeded(mz, mctz, excess);
3396		spin_unlock_irq(&mctz->lock);
3397		css_put(&mz->memcg->css);
3398		loop++;
3399		/*
3400		 * Could not reclaim anything and there are no more
3401		 * mem cgroups to try or we seem to be looping without
3402		 * reclaiming anything.
3403		 */
3404		if (!nr_reclaimed &&
3405			(next_mz == NULL ||
3406			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3407			break;
3408	} while (!nr_reclaimed);
3409	if (next_mz)
3410		css_put(&next_mz->memcg->css);
3411	return nr_reclaimed;
3412}
3413
3414/*
3415 * Test whether @memcg has children, dead or alive.  Note that this
3416 * function doesn't care whether @memcg has use_hierarchy enabled and
3417 * returns %true if there are child csses according to the cgroup
3418 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
3419 */
3420static inline bool memcg_has_children(struct mem_cgroup *memcg)
3421{
3422	bool ret;
3423
3424	rcu_read_lock();
3425	ret = css_next_child(NULL, &memcg->css);
3426	rcu_read_unlock();
3427	return ret;
3428}
3429
3430/*
3431 * Reclaims as many pages from the given memcg as possible.
3432 *
3433 * Caller is responsible for holding a css reference on the memcg.
3434 */
3435static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3436{
3437	int nr_retries = MAX_RECLAIM_RETRIES;
3438
3439	/* we call try-to-free pages to make this cgroup empty */
3440	lru_add_drain_all();
3441
3442	drain_all_stock(memcg);
3443
3444	/* try to free all pages in this cgroup */
3445	while (nr_retries && page_counter_read(&memcg->memory)) {
3446		int progress;
3447
3448		if (signal_pending(current))
3449			return -EINTR;
3450
3451		progress = try_to_free_mem_cgroup_pages(memcg, 1,
3452							GFP_KERNEL, true);
3453		if (!progress) {
3454			nr_retries--;
3455			/* maybe some writeback is necessary */
3456			congestion_wait(BLK_RW_ASYNC, HZ/10);
3457		}
3458
3459	}
3460
3461	return 0;
3462}
3463
3464static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3465					    char *buf, size_t nbytes,
3466					    loff_t off)
3467{
3468	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3469
3470	if (mem_cgroup_is_root(memcg))
3471		return -EINVAL;
3472	return mem_cgroup_force_empty(memcg) ?: nbytes;
3473}
3474
3475static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3476				     struct cftype *cft)
3477{
3478	return mem_cgroup_from_css(css)->use_hierarchy;
3479}
3480
3481static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3482				      struct cftype *cft, u64 val)
3483{
3484	int retval = 0;
3485	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3486	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3487
3488	if (memcg->use_hierarchy == val)
3489		return 0;
3490
3491	/*
3492	 * If parent's use_hierarchy is set, we can't make any modifications
3493	 * in the child subtrees. If it is unset, then the change can
3494	 * occur, provided the current cgroup has no children.
3495	 *
3496	 * For the root cgroup, parent_memcg is NULL, so we allow the value to
3497	 * be set if there are no children.
3498	 */
3499	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3500				(val == 1 || val == 0)) {
3501		if (!memcg_has_children(memcg))
3502			memcg->use_hierarchy = val;
3503		else
3504			retval = -EBUSY;
3505	} else
3506		retval = -EINVAL;
3507
3508	return retval;
3509}
3510
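/*
 * Return the memory (or memory+swap) usage of @memcg in pages. For the
 * root cgroup the value is derived from its file, anon and (optionally)
 * swap statistics; for everything else the page counters are read
 * directly.
 */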
3511static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3512{
3513	unsigned long val;
3514
3515	if (mem_cgroup_is_root(memcg)) {
3516		val = memcg_page_state(memcg, NR_FILE_PAGES) +
3517			memcg_page_state(memcg, NR_ANON_MAPPED);
3518		if (swap)
3519			val += memcg_page_state(memcg, MEMCG_SWAP);
3520	} else {
3521		if (!swap)
3522			val = page_counter_read(&memcg->memory);
3523		else
3524			val = page_counter_read(&memcg->memsw);
3525	}
3526	return val;
3527}
3528
3529enum {
3530	RES_USAGE,
3531	RES_LIMIT,
3532	RES_MAX_USAGE,
3533	RES_FAILCNT,
3534	RES_SOFT_LIMIT,
3535};
3536
3537static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3538			       struct cftype *cft)
3539{
3540	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3541	struct page_counter *counter;
3542
3543	switch (MEMFILE_TYPE(cft->private)) {
3544	case _MEM:
3545		counter = &memcg->memory;
3546		break;
3547	case _MEMSWAP:
3548		counter = &memcg->memsw;
3549		break;
3550	case _KMEM:
3551		counter = &memcg->kmem;
3552		break;
3553	case _TCP:
3554		counter = &memcg->tcpmem;
3555		break;
3556	default:
3557		BUG();
3558	}
3559
3560	switch (MEMFILE_ATTR(cft->private)) {
3561	case RES_USAGE:
3562		if (counter == &memcg->memory)
3563			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3564		if (counter == &memcg->memsw)
3565			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3566		return (u64)page_counter_read(counter) * PAGE_SIZE;
3567	case RES_LIMIT:
3568		return (u64)counter->max * PAGE_SIZE;
3569	case RES_MAX_USAGE:
3570		return (u64)counter->watermark * PAGE_SIZE;
3571	case RES_FAILCNT:
3572		return counter->failcnt;
3573	case RES_SOFT_LIMIT:
3574		return (u64)memcg->soft_limit * PAGE_SIZE;
3575	default:
3576		BUG();
3577	}
3578}
3579
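/*
 * Fold the remaining per-cpu vmstat deltas of @memcg into the atomic
 * counters of the memcg and all of its ancestors, including the
 * per-node lruvec statistics. Called when the memcg is freed.
 */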
3580static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3581{
3582	unsigned long stat[MEMCG_NR_STAT] = {0};
3583	struct mem_cgroup *mi;
3584	int node, cpu, i;
3585
3586	for_each_online_cpu(cpu)
3587		for (i = 0; i < MEMCG_NR_STAT; i++)
3588			stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3589
3590	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3591		for (i = 0; i < MEMCG_NR_STAT; i++)
3592			atomic_long_add(stat[i], &mi->vmstats[i]);
3593
3594	for_each_node(node) {
3595		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3596		struct mem_cgroup_per_node *pi;
3597
3598		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3599			stat[i] = 0;
3600
3601		for_each_online_cpu(cpu)
3602			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3603				stat[i] += per_cpu(
3604					pn->lruvec_stat_cpu->count[i], cpu);
3605
3606		for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3607			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3608				atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3609	}
3610}
3611
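/* Same as above, but for the per-cpu vm event counters. */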
3612static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3613{
3614	unsigned long events[NR_VM_EVENT_ITEMS];
3615	struct mem_cgroup *mi;
3616	int cpu, i;
3617
3618	for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3619		events[i] = 0;
3620
3621	for_each_online_cpu(cpu)
3622		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3623			events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3624					     cpu);
3625
3626	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3627		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3628			atomic_long_add(events[i], &mi->vmevents[i]);
3629}
3630
3631#ifdef CONFIG_MEMCG_KMEM
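/*
 * Enable kernel memory accounting for @memcg: allocate a kmemcg_id and
 * an obj_cgroup, and enable the static key that patches in the kmem
 * charge paths.
 */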
3632static int memcg_online_kmem(struct mem_cgroup *memcg)
3633{
3634	struct obj_cgroup *objcg;
3635	int memcg_id;
3636
3637	if (cgroup_memory_nokmem)
3638		return 0;
3639
3640	BUG_ON(memcg->kmemcg_id >= 0);
3641	BUG_ON(memcg->kmem_state);
3642
3643	memcg_id = memcg_alloc_cache_id();
3644	if (memcg_id < 0)
3645		return memcg_id;
3646
3647	objcg = obj_cgroup_alloc();
3648	if (!objcg) {
3649		memcg_free_cache_id(memcg_id);
3650		return -ENOMEM;
3651	}
3652	objcg->memcg = memcg;
3653	rcu_assign_pointer(memcg->objcg, objcg);
3654
3655	static_branch_enable(&memcg_kmem_enabled_key);
3656
3657	/*
3658	 * A memory cgroup is considered kmem-online as soon as it gets a
3659	 * kmemcg_id. Setting the id after enabling the static branch
3660	 * guarantees that no one starts accounting before all call sites are
3661	 * patched.
3662	 */
3663	memcg->kmemcg_id = memcg_id;
3664	memcg->kmem_state = KMEM_ONLINE;
3665
3666	return 0;
3667}
3668
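/*
 * Tear down kmem accounting for an offlined memcg: reparent its
 * obj_cgroups and list_lrus to the parent and release its kmemcg_id.
 */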
3669static void memcg_offline_kmem(struct mem_cgroup *memcg)
3670{
3671	struct cgroup_subsys_state *css;
3672	struct mem_cgroup *parent, *child;
3673	int kmemcg_id;
3674
3675	if (memcg->kmem_state != KMEM_ONLINE)
3676		return;
3677
3678	memcg->kmem_state = KMEM_ALLOCATED;
3679
3680	parent = parent_mem_cgroup(memcg);
3681	if (!parent)
3682		parent = root_mem_cgroup;
3683
3684	memcg_reparent_objcgs(memcg, parent);
3685
3686	kmemcg_id = memcg->kmemcg_id;
3687	BUG_ON(kmemcg_id < 0);
3688
3689	/*
3690	 * Change kmemcg_id of this cgroup and all its descendants to the
3691	 * parent's id, and then move all entries from this cgroup's list_lrus
3692	 * to ones of the parent. After we have finished, all list_lrus
3693	 * corresponding to this cgroup are guaranteed to remain empty. The
3694	 * ordering is imposed by list_lru_node->lock taken by
3695	 * memcg_drain_all_list_lrus().
3696	 */
3697	rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
3698	css_for_each_descendant_pre(css, &memcg->css) {
3699		child = mem_cgroup_from_css(css);
3700		BUG_ON(child->kmemcg_id != kmemcg_id);
3701		child->kmemcg_id = parent->kmemcg_id;
3702		if (!memcg->use_hierarchy)
3703			break;
3704	}
3705	rcu_read_unlock();
3706
3707	memcg_drain_all_list_lrus(kmemcg_id, parent);
3708
3709	memcg_free_cache_id(kmemcg_id);
3710}
3711
3712static void memcg_free_kmem(struct mem_cgroup *memcg)
3713{
3714	/* css_alloc() failed, offlining didn't happen */
3715	if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3716		memcg_offline_kmem(memcg);
3717}
3718#else
3719static int memcg_online_kmem(struct mem_cgroup *memcg)
3720{
3721	return 0;
3722}
3723static void memcg_offline_kmem(struct mem_cgroup *memcg)
3724{
3725}
3726static void memcg_free_kmem(struct mem_cgroup *memcg)
3727{
3728}
3729#endif /* CONFIG_MEMCG_KMEM */
3730
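/* Set the limit on the kmem counter (cgroup1 kmem.limit_in_bytes). */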
3731static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3732				 unsigned long max)
3733{
3734	int ret;
3735
3736	mutex_lock(&memcg_max_mutex);
3737	ret = page_counter_set_max(&memcg->kmem, max);
3738	mutex_unlock(&memcg_max_mutex);
3739	return ret;
3740}
3741
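/*
 * Set the limit on the tcp counter (cgroup1 kmem.tcp.limit_in_bytes) and
 * turn on socket memory accounting on first use.
 */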
3742static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3743{
3744	int ret;
3745
3746	mutex_lock(&memcg_max_mutex);
3747
3748	ret = page_counter_set_max(&memcg->tcpmem, max);
3749	if (ret)
3750		goto out;
3751
3752	if (!memcg->tcpmem_active) {
3753		/*
3754		 * The active flag needs to be written after the static_key
3755		 * update. This is what guarantees that the socket activation
3756		 * function is the last one to run. See mem_cgroup_sk_alloc()
3757		 * for details, and note that we don't mark any socket as
3758		 * belonging to this memcg until that flag is up.
3759		 *
3760		 * We need to do this, because static_keys will span multiple
3761		 * sites, but we can't control their order. If we mark a socket
3762		 * as accounted, but the accounting functions are not patched in
3763		 * yet, we'll lose accounting.
3764		 *
3765		 * We never race with the readers in mem_cgroup_sk_alloc(),
3766		 * because when this value changes, the code to process it is not
3767		 * patched in yet.
3768		 */
3769		static_branch_inc(&memcg_sockets_enabled_key);
3770		memcg->tcpmem_active = true;
3771	}
3772out:
3773	mutex_unlock(&memcg_max_mutex);
3774	return ret;
3775}
3776
3777/*
3778 * The user of this function is...
3779 * RES_LIMIT.
3780 */
3781static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3782				char *buf, size_t nbytes, loff_t off)
3783{
3784	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3785	unsigned long nr_pages;
3786	int ret;
3787
3788	buf = strstrip(buf);
3789	ret = page_counter_memparse(buf, "-1", &nr_pages);
3790	if (ret)
3791		return ret;
3792
3793	switch (MEMFILE_ATTR(of_cft(of)->private)) {
3794	case RES_LIMIT:
3795		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3796			ret = -EINVAL;
3797			break;
3798		}
3799		switch (MEMFILE_TYPE(of_cft(of)->private)) {
3800		case _MEM:
3801			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3802			break;
3803		case _MEMSWAP:
3804			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3805			break;
3806		case _KMEM:
3807			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3808				     "Please report your usecase to linux-mm@kvack.org if you "
3809				     "depend on this functionality.\n");
3810			ret = memcg_update_kmem_max(memcg, nr_pages);
3811			break;
3812		case _TCP:
3813			ret = memcg_update_tcp_max(memcg, nr_pages);
3814			break;
3815		}
3816		break;
3817	case RES_SOFT_LIMIT:
3818		memcg->soft_limit = nr_pages;
3819		ret = 0;
3820		break;
3821	}
3822	return ret ?: nbytes;
3823}
3824
3825static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3826				size_t nbytes, loff_t off)
3827{
3828	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3829	struct page_counter *counter;
3830
3831	switch (MEMFILE_TYPE(of_cft(of)->private)) {
3832	case _MEM:
3833		counter = &memcg->memory;
3834		break;
3835	case _MEMSWAP:
3836		counter = &memcg->memsw;
3837		break;
3838	case _KMEM:
3839		counter = &memcg->kmem;
3840		break;
3841	case _TCP:
3842		counter = &memcg->tcpmem;
3843		break;
3844	default:
3845		BUG();
3846	}
3847
3848	switch (MEMFILE_ATTR(of_cft(of)->private)) {
3849	case RES_MAX_USAGE:
3850		page_counter_reset_watermark(counter);
3851		break;
3852	case RES_FAILCNT:
3853		counter->failcnt = 0;
3854		break;
3855	default:
3856		BUG();
3857	}
3858
3859	return nbytes;
3860}
3861
3862static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3863					struct cftype *cft)
3864{
3865	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3866}
3867
3868#ifdef CONFIG_MMU
3869static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3870					struct cftype *cft, u64 val)
3871{
3872	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3873
3874	if (val & ~MOVE_MASK)
3875		return -EINVAL;
3876
3877	/*
3878	 * No kind of locking is needed in here, because ->can_attach() will
3879	 * check this value once in the beginning of the process, and then carry
3880	 * on with stale data. This means that changes to this value will only
3881	 * affect task migrations starting after the change.
3882	 */
3883	memcg->move_charge_at_immigrate = val;
3884	return 0;
3885}
3886#else
3887static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3888					struct cftype *cft, u64 val)
3889{
3890	return -ENOSYS;
3891}
3892#endif
3893
3894#ifdef CONFIG_NUMA
3895
3896#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3897#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3898#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
3899
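/*
 * Sum the LRU lists selected by @lru_mask on node @nid, either for the
 * whole subtree (@tree) or for @memcg alone.
 */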
3900static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3901				int nid, unsigned int lru_mask, bool tree)
3902{
3903	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3904	unsigned long nr = 0;
3905	enum lru_list lru;
3906
3907	VM_BUG_ON((unsigned)nid >= nr_node_ids);
3908
3909	for_each_lru(lru) {
3910		if (!(BIT(lru) & lru_mask))
3911			continue;
3912		if (tree)
3913			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3914		else
3915			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3916	}
3917	return nr;
3918}
3919
3920static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3921					     unsigned int lru_mask,
3922					     bool tree)
3923{
3924	unsigned long nr = 0;
3925	enum lru_list lru;
3926
3927	for_each_lru(lru) {
3928		if (!(BIT(lru) & lru_mask))
3929			continue;
3930		if (tree)
3931			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3932		else
3933			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3934	}
3935	return nr;
3936}
3937
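/*
 * memory.numa_stat: per-node breakdown of the LRU pages, both local to
 * this memcg and aggregated over its subtree ("hierarchical_*").
 */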
3938static int memcg_numa_stat_show(struct seq_file *m, void *v)
3939{
3940	struct numa_stat {
3941		const char *name;
3942		unsigned int lru_mask;
3943	};
3944
3945	static const struct numa_stat stats[] = {
3946		{ "total", LRU_ALL },
3947		{ "file", LRU_ALL_FILE },
3948		{ "anon", LRU_ALL_ANON },
3949		{ "unevictable", BIT(LRU_UNEVICTABLE) },
3950	};
3951	const struct numa_stat *stat;
3952	int nid;
3953	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3954
3955	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3956		seq_printf(m, "%s=%lu", stat->name,
3957			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3958						   false));
3959		for_each_node_state(nid, N_MEMORY)
3960			seq_printf(m, " N%d=%lu", nid,
3961				   mem_cgroup_node_nr_lru_pages(memcg, nid,
3962							stat->lru_mask, false));
3963		seq_putc(m, '\n');
3964	}
3965
3966	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3967
3968		seq_printf(m, "hierarchical_%s=%lu", stat->name,
3969			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3970						   true));
3971		for_each_node_state(nid, N_MEMORY)
3972			seq_printf(m, " N%d=%lu", nid,
3973				   mem_cgroup_node_nr_lru_pages(memcg, nid,
3974							stat->lru_mask, true));
3975		seq_putc(m, '\n');
3976	}
3977
3978	return 0;
3979}
3980#endif /* CONFIG_NUMA */
3981
3982static const unsigned int memcg1_stats[] = {
3983	NR_FILE_PAGES,
3984	NR_ANON_MAPPED,
3985#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3986	NR_ANON_THPS,
3987#endif
3988	NR_SHMEM,
3989	NR_FILE_MAPPED,
3990	NR_FILE_DIRTY,
3991	NR_WRITEBACK,
3992	MEMCG_SWAP,
3993};
3994
3995static const char *const memcg1_stat_names[] = {
3996	"cache",
3997	"rss",
3998#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3999	"rss_huge",
4000#endif
4001	"shmem",
4002	"mapped_file",
4003	"dirty",
4004	"writeback",
4005	"swap",
4006};
4007
4008/* Universal VM events cgroup1 shows, original sort order */
4009static const unsigned int memcg1_events[] = {
4010	PGPGIN,
4011	PGPGOUT,
4012	PGFAULT,
4013	PGMAJFAULT,
4014};
4015
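/*
 * memory.stat for cgroup1: local counters and events first, then the
 * hierarchical limits and the "total_*" values aggregated over the
 * subtree.
 */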
4016static int memcg_stat_show(struct seq_file *m, void *v)
4017{
4018	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4019	unsigned long memory, memsw;
4020	struct mem_cgroup *mi;
4021	unsigned int i;
4022
4023	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
4024
4025	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4026		unsigned long nr;
4027
4028		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4029			continue;
4030		nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4031#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4032		if (memcg1_stats[i] == NR_ANON_THPS)
4033			nr *= HPAGE_PMD_NR;
4034#endif
4035		seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
4036	}
4037
4038	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4039		seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4040			   memcg_events_local(memcg, memcg1_events[i]));
4041
4042	for (i = 0; i < NR_LRU_LISTS; i++)
4043		seq_printf(m, "%s %lu\n", lru_list_name(i),
4044			   memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4045			   PAGE_SIZE);
4046
4047	/* Hierarchical information */
4048	memory = memsw = PAGE_COUNTER_MAX;
4049	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4050		memory = min(memory, READ_ONCE(mi->memory.max));
4051		memsw = min(memsw, READ_ONCE(mi->memsw.max));
4052	}
4053	seq_printf(m, "hierarchical_memory_limit %llu\n",
4054		   (u64)memory * PAGE_SIZE);
4055	if (do_memsw_account())
4056		seq_printf(m, "hierarchical_memsw_limit %llu\n",
4057			   (u64)memsw * PAGE_SIZE);
4058
4059	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4060		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4061			continue;
4062		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
4063			   (u64)memcg_page_state(memcg, memcg1_stats[i]) *
4064			   PAGE_SIZE);
4065	}
4066
4067	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4068		seq_printf(m, "total_%s %llu\n",
4069			   vm_event_name(memcg1_events[i]),
4070			   (u64)memcg_events(memcg, memcg1_events[i]));
4071
4072	for (i = 0; i < NR_LRU_LISTS; i++)
4073		seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4074			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4075			   PAGE_SIZE);
4076
4077#ifdef CONFIG_DEBUG_VM
4078	{
4079		pg_data_t *pgdat;
4080		struct mem_cgroup_per_node *mz;
4081		unsigned long anon_cost = 0;
4082		unsigned long file_cost = 0;
4083
4084		for_each_online_pgdat(pgdat) {
4085			mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
4086
4087			anon_cost += mz->lruvec.anon_cost;
4088			file_cost += mz->lruvec.file_cost;
4089		}
4090		seq_printf(m, "anon_cost %lu\n", anon_cost);
4091		seq_printf(m, "file_cost %lu\n", file_cost);
4092	}
4093#endif
4094
4095	return 0;
4096}
4097
4098static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4099				      struct cftype *cft)
4100{
4101	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4102
4103	return mem_cgroup_swappiness(memcg);
4104}
4105
4106static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4107				       struct cftype *cft, u64 val)
4108{
4109	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4110
4111	if (val > 100)
4112		return -EINVAL;
4113
4114	if (css->parent)
4115		memcg->swappiness = val;
4116	else
4117		vm_swappiness = val;
4118
4119	return 0;
4120}
4121
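/*
 * Signal every registered eventfd whose threshold has been crossed since
 * the last check, and update current_threshold to match the new usage.
 */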
4122static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4123{
4124	struct mem_cgroup_threshold_ary *t;
4125	unsigned long usage;
4126	int i;
4127
4128	rcu_read_lock();
4129	if (!swap)
4130		t = rcu_dereference(memcg->thresholds.primary);
4131	else
4132		t = rcu_dereference(memcg->memsw_thresholds.primary);
4133
4134	if (!t)
4135		goto unlock;
4136
4137	usage = mem_cgroup_usage(memcg, swap);
4138
4139	/*
4140	 * current_threshold points to the threshold just below or equal to usage.
4141	 * If that is not the case, a threshold was crossed after the last
4142	 * call to __mem_cgroup_threshold().
4143	 */
4144	i = t->current_threshold;
4145
4146	/*
4147	 * Iterate backward over array of thresholds starting from
4148	 * current_threshold and check if a threshold is crossed.
4149	 * If none of thresholds below usage is crossed, we read
4150	 * only one element of the array here.
4151	 */
4152	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4153		eventfd_signal(t->entries[i].eventfd, 1);
4154
4155	/* i = current_threshold + 1 */
4156	i++;
4157
4158	/*
4159	 * Iterate forward over array of thresholds starting from
4160	 * current_threshold+1 and check if a threshold is crossed.
4161	 * If none of thresholds above usage is crossed, we read
4162	 * only one element of the array here.
4163	 */
4164	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4165		eventfd_signal(t->entries[i].eventfd, 1);
4166
4167	/* Update current_threshold */
4168	t->current_threshold = i - 1;
4169unlock:
4170	rcu_read_unlock();
4171}
4172
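/* Check the memory and memsw thresholds of @memcg and all its ancestors. */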
4173static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4174{
4175	while (memcg) {
4176		__mem_cgroup_threshold(memcg, false);
4177		if (do_memsw_account())
4178			__mem_cgroup_threshold(memcg, true);
4179
4180		memcg = parent_mem_cgroup(memcg);
4181	}
4182}
4183
4184static int compare_thresholds(const void *a, const void *b)
4185{
4186	const struct mem_cgroup_threshold *_a = a;
4187	const struct mem_cgroup_threshold *_b = b;
4188
4189	if (_a->threshold > _b->threshold)
4190		return 1;
4191
4192	if (_a->threshold < _b->threshold)
4193		return -1;
4194
4195	return 0;
4196}
4197
4198static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4199{
4200	struct mem_cgroup_eventfd_list *ev;
4201
4202	spin_lock(&memcg_oom_lock);
4203
4204	list_for_each_entry(ev, &memcg->oom_notify, list)
4205		eventfd_signal(ev->eventfd, 1);
4206
4207	spin_unlock(&memcg_oom_lock);
4208	return 0;
4209}
4210
4211static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4212{
4213	struct mem_cgroup *iter;
4214
4215	for_each_mem_cgroup_tree(iter, memcg)
4216		mem_cgroup_oom_notify_cb(iter);
4217}
4218
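/*
 * Register an eventfd to be signalled when memory (or memsw) usage
 * crosses the threshold given in @args. The thresholds are kept in a
 * sorted, RCU-protected array that is rebuilt on every registration.
 */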
4219static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4220	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4221{
4222	struct mem_cgroup_thresholds *thresholds;
4223	struct mem_cgroup_threshold_ary *new;
4224	unsigned long threshold;
4225	unsigned long usage;
4226	int i, size, ret;
4227
4228	ret = page_counter_memparse(args, "-1", &threshold);
4229	if (ret)
4230		return ret;
4231
4232	mutex_lock(&memcg->thresholds_lock);
4233
4234	if (type == _MEM) {
4235		thresholds = &memcg->thresholds;
4236		usage = mem_cgroup_usage(memcg, false);
4237	} else if (type == _MEMSWAP) {
4238		thresholds = &memcg->memsw_thresholds;
4239		usage = mem_cgroup_usage(memcg, true);
4240	} else
4241		BUG();
4242
4243	/* Check if a threshold crossed before adding a new one */
4244	if (thresholds->primary)
4245		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
4246
4247	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4248
4249	/* Allocate memory for new array of thresholds */
4250	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4251	if (!new) {
4252		ret = -ENOMEM;
4253		goto unlock;
4254	}
4255	new->size = size;
4256
4257	/* Copy thresholds (if any) to new array */
4258	if (thresholds->primary) {
4259		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4260				sizeof(struct mem_cgroup_threshold));
4261	}
4262
4263	/* Add new threshold */
4264	new->entries[size - 1].eventfd = eventfd;
4265	new->entries[size - 1].threshold = threshold;
4266
4267	/* Sort thresholds. Registering a new threshold isn't time-critical */
4268	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4269			compare_thresholds, NULL);
4270
4271	/* Find current threshold */
4272	new->current_threshold = -1;
4273	for (i = 0; i < size; i++) {
4274		if (new->entries[i].threshold <= usage) {
4275			/*
4276			 * new->current_threshold will not be used until
4277			 * rcu_assign_pointer(), so it's safe to increment
4278			 * it here.
4279			 */
4280			++new->current_threshold;
4281		} else
4282			break;
4283	}
4284
4285	/* Free old spare buffer and save old primary buffer as spare */
4286	kfree(thresholds->spare);
4287	thresholds->spare = thresholds->primary;
4288
4289	rcu_assign_pointer(thresholds->primary, new);
4290
4291	/* To be sure that nobody uses thresholds */
4292	synchronize_rcu();
4293
4294unlock:
4295	mutex_unlock(&memcg->thresholds_lock);
4296
4297	return ret;
4298}
4299
4300static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4301	struct eventfd_ctx *eventfd, const char *args)
4302{
4303	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4304}
4305
4306static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4307	struct eventfd_ctx *eventfd, const char *args)
4308{
4309	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4310}
4311
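/*
 * Remove every threshold registered for @eventfd, rebuilding the array
 * in the spare buffer and publishing it with RCU.
 */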
4312static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4313	struct eventfd_ctx *eventfd, enum res_type type)
4314{
4315	struct mem_cgroup_thresholds *thresholds;
4316	struct mem_cgroup_threshold_ary *new;
4317	unsigned long usage;
4318	int i, j, size, entries;
4319
4320	mutex_lock(&memcg->thresholds_lock);
4321
4322	if (type == _MEM) {
4323		thresholds = &memcg->thresholds;
4324		usage = mem_cgroup_usage(memcg, false);
4325	} else if (type == _MEMSWAP) {
4326		thresholds = &memcg->memsw_thresholds;
4327		usage = mem_cgroup_usage(memcg, true);
4328	} else
4329		BUG();
4330
4331	if (!thresholds->primary)
4332		goto unlock;
4333
4334	/* Check if a threshold crossed before removing */
4335	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
4336
4337	/* Calculate the new number of thresholds */
4338	size = entries = 0;
4339	for (i = 0; i < thresholds->primary->size; i++) {
4340		if (thresholds->primary->entries[i].eventfd != eventfd)
4341			size++;
4342		else
4343			entries++;
4344	}
4345
4346	new = thresholds->spare;
4347
4348	/* If no items related to eventfd have been cleared, nothing to do */
4349	if (!entries)
4350		goto unlock;
4351
4352	/* Set thresholds array to NULL if we don't have thresholds */
4353	if (!size) {
4354		kfree(new);
4355		new = NULL;
4356		goto swap_buffers;
4357	}
4358
4359	new->size = size;
4360
4361	/* Copy thresholds and find current threshold */
4362	new->current_threshold = -1;
4363	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4364		if (thresholds->primary->entries[i].eventfd == eventfd)
4365			continue;
4366
4367		new->entries[j] = thresholds->primary->entries[i];
4368		if (new->entries[j].threshold <= usage) {
4369			/*
4370			 * new->current_threshold will not be used
4371			 * until rcu_assign_pointer(), so it's safe to increment
4372			 * it here.
4373			 */
4374			++new->current_threshold;
4375		}
4376		j++;
4377	}
4378
4379swap_buffers:
4380	/* Swap primary and spare array */
4381	thresholds->spare = thresholds->primary;
4382
4383	rcu_assign_pointer(thresholds->primary, new);
4384
4385	/* To be sure that nobody uses thresholds */
4386	synchronize_rcu();
4387
4388	/* If all events are unregistered, free the spare array */
4389	if (!new) {
4390		kfree(thresholds->spare);
4391		thresholds->spare = NULL;
4392	}
4393unlock:
4394	mutex_unlock(&memcg->thresholds_lock);
4395}
4396
4397static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4398	struct eventfd_ctx *eventfd)
4399{
4400	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4401}
4402
4403static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4404	struct eventfd_ctx *eventfd)
4405{
4406	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4407}
4408
4409static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4410	struct eventfd_ctx *eventfd, const char *args)
4411{
4412	struct mem_cgroup_eventfd_list *event;
4413
4414	event = kmalloc(sizeof(*event),	GFP_KERNEL);
4415	if (!event)
4416		return -ENOMEM;
4417
4418	spin_lock(&memcg_oom_lock);
4419
4420	event->eventfd = eventfd;
4421	list_add(&event->list, &memcg->oom_notify);
4422
4423	/* already in OOM ? */
4424	if (memcg->under_oom)
4425		eventfd_signal(eventfd, 1);
4426	spin_unlock(&memcg_oom_lock);
4427
4428	return 0;
4429}
4430
4431static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4432	struct eventfd_ctx *eventfd)
4433{
4434	struct mem_cgroup_eventfd_list *ev, *tmp;
4435
4436	spin_lock(&memcg_oom_lock);
4437
4438	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4439		if (ev->eventfd == eventfd) {
4440			list_del(&ev->list);
4441			kfree(ev);
4442		}
4443	}
4444
4445	spin_unlock(&memcg_oom_lock);
4446}
4447
4448static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4449{
4450	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4451
4452	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4453	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4454	seq_printf(sf, "oom_kill %lu\n",
4455		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4456	return 0;
4457}
4458
4459static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4460	struct cftype *cft, u64 val)
4461{
4462	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4463
4464	/* cannot set to root cgroup and only 0 and 1 are allowed */
4465	if (!css->parent || !((val == 0) || (val == 1)))
4466		return -EINVAL;
4467
4468	memcg->oom_kill_disable = val;
4469	if (!val)
4470		memcg_oom_recover(memcg);
4471
4472	return 0;
4473}
4474
4475#ifdef CONFIG_CGROUP_WRITEBACK
4476
4477#include <trace/events/writeback.h>
4478
4479static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4480{
4481	return wb_domain_init(&memcg->cgwb_domain, gfp);
4482}
4483
4484static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4485{
4486	wb_domain_exit(&memcg->cgwb_domain);
4487}
4488
4489static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4490{
4491	wb_domain_size_changed(&memcg->cgwb_domain);
4492}
4493
4494struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4495{
4496	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4497
4498	if (!memcg->css.parent)
4499		return NULL;
4500
4501	return &memcg->cgwb_domain;
4502}
4503
4504/*
4505 * idx can be of type enum memcg_stat_item or node_stat_item.
4506 * Keep in sync with memcg_exact_page().
4507 */
4508static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
4509{
4510	long x = atomic_long_read(&memcg->vmstats[idx]);
4511	int cpu;
4512
4513	for_each_online_cpu(cpu)
4514		x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
4515	if (x < 0)
4516		x = 0;
4517	return x;
4518}
4519
4520/**
4521 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4522 * @wb: bdi_writeback in question
4523 * @pfilepages: out parameter for number of file pages
4524 * @pheadroom: out parameter for number of allocatable pages according to memcg
4525 * @pdirty: out parameter for number of dirty pages
4526 * @pwriteback: out parameter for number of pages under writeback
4527 *
4528 * Determine the numbers of file, headroom, dirty, and writeback pages in
4529 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4530 * is a bit more involved.
4531 *
4532 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4533 * headroom is calculated as the lowest headroom of itself and the
4534 * ancestors.  Note that this doesn't consider the actual amount of
4535 * available memory in the system.  The caller should further cap
4536 * *@pheadroom accordingly.
4537 */
4538void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4539			 unsigned long *pheadroom, unsigned long *pdirty,
4540			 unsigned long *pwriteback)
4541{
4542	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4543	struct mem_cgroup *parent;
4544
4545	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
4546
4547	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
4548	*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4549			memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
4550	*pheadroom = PAGE_COUNTER_MAX;
4551
4552	while ((parent = parent_mem_cgroup(memcg))) {
4553		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4554					    READ_ONCE(memcg->memory.high));
4555		unsigned long used = page_counter_read(&memcg->memory);
4556
4557		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4558		memcg = parent;
4559	}
4560}
4561
4562/*
4563 * Foreign dirty flushing
4564 *
4565 * There's an inherent mismatch between memcg and writeback.  The former
4566 * tracks ownership per-page while the latter per-inode.  This was a
4567 * deliberate design decision because honoring per-page ownership in the
4568 * writeback path is complicated, may lead to higher CPU and IO overheads
4569 * and deemed unnecessary given that write-sharing an inode across
4570 * different cgroups isn't a common use-case.
4571 *
4572 * Combined with inode majority-writer ownership switching, this works well
4573 * enough in most cases but there are some pathological cases.  For
4574 * example, let's say there are two cgroups A and B which keep writing to
4575 * different but confined parts of the same inode.  B owns the inode and
4576 * A's memory is limited far below B's.  A's dirty ratio can rise enough to
4577 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4578 * triggering background writeback.  A will be slowed down without a way to
4579 * make writeback of the dirty pages happen.
4580 *
4581 * Conditions like the above can lead to a cgroup getting repeatedly and
4582 * severely throttled, making only a little progress each
4583 * dirty_expire_interval, while the underlying IO device is almost
4584 * completely idle.
4585 *
4586 * Solving this problem completely requires matching the ownership tracking
4587 * granularities between memcg and writeback in either direction.  However,
4588 * the more egregious behaviors can be avoided by simply remembering the
4589 * most recent foreign dirtying events and initiating remote flushes on
4590 * them when local writeback isn't enough to keep the memory clean enough.
4591 *
4592 * The following two functions implement such mechanism.  When a foreign
4593 * page - a page whose memcg and writeback ownerships don't match - is
4594 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4595 * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
4596 * decides that the memcg needs to sleep due to high dirty ratio, it calls
4597 * mem_cgroup_flush_foreign() which queues writeback on the recorded
4598 * foreign bdi_writebacks which haven't expired.  Both the numbers of
4599 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4600 * limited to MEMCG_CGWB_FRN_CNT.
4601 *
4602 * The mechanism only remembers IDs and doesn't hold any object references.
4603 * As being wrong occasionally doesn't matter, updates and accesses to the
4604 * records are lockless and racy.
4605 */
4606void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4607					     struct bdi_writeback *wb)
4608{
4609	struct mem_cgroup *memcg = page->mem_cgroup;
4610	struct memcg_cgwb_frn *frn;
4611	u64 now = get_jiffies_64();
4612	u64 oldest_at = now;
4613	int oldest = -1;
4614	int i;
4615
4616	trace_track_foreign_dirty(page, wb);
4617
4618	/*
4619	 * Pick the slot to use.  If there is already a slot for @wb, keep
4620	 * using it.  If not, replace the oldest one which isn't being
4621	 * written out.
4622	 */
4623	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4624		frn = &memcg->cgwb_frn[i];
4625		if (frn->bdi_id == wb->bdi->id &&
4626		    frn->memcg_id == wb->memcg_css->id)
4627			break;
4628		if (time_before64(frn->at, oldest_at) &&
4629		    atomic_read(&frn->done.cnt) == 1) {
4630			oldest = i;
4631			oldest_at = frn->at;
4632		}
4633	}
4634
4635	if (i < MEMCG_CGWB_FRN_CNT) {
4636		/*
4637		 * Re-using an existing one.  Update timestamp lazily to
4638		 * avoid making the cacheline hot.  We want them to be
4639		 * reasonably up-to-date and significantly shorter than
4640		 * dirty_expire_interval as that's what expires the record.
4641		 * Use the shorter of 1s and dirty_expire_interval / 8.
4642		 */
4643		unsigned long update_intv =
4644			min_t(unsigned long, HZ,
4645			      msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4646
4647		if (time_before64(frn->at, now - update_intv))
4648			frn->at = now;
4649	} else if (oldest >= 0) {
4650		/* replace the oldest free one */
4651		frn = &memcg->cgwb_frn[oldest];
4652		frn->bdi_id = wb->bdi->id;
4653		frn->memcg_id = wb->memcg_css->id;
4654		frn->at = now;
4655	}
4656}
4657
4658/* issue foreign writeback flushes for recorded foreign dirtying events */
4659void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4660{
4661	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4662	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4663	u64 now = jiffies_64;
4664	int i;
4665
4666	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4667		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4668
4669		/*
4670		 * If the record is older than dirty_expire_interval,
4671		 * writeback on it has already started.  No need to kick it
4672		 * off again.  Also, don't start a new one if there's
4673		 * already one in flight.
4674		 */
4675		if (time_after64(frn->at, now - intv) &&
4676		    atomic_read(&frn->done.cnt) == 1) {
4677			frn->at = 0;
4678			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4679			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4680					       WB_REASON_FOREIGN_FLUSH,
4681					       &frn->done);
4682		}
4683	}
4684}
4685
4686#else	/* CONFIG_CGROUP_WRITEBACK */
4687
4688static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4689{
4690	return 0;
4691}
4692
4693static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4694{
4695}
4696
4697static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4698{
4699}
4700
4701#endif	/* CONFIG_CGROUP_WRITEBACK */
4702
4703/*
4704 * DO NOT USE IN NEW FILES.
4705 *
4706 * "cgroup.event_control" implementation.
4707 *
4708 * This is way over-engineered.  It tries to support fully configurable
4709 * events for each user.  Such level of flexibility is completely
4710 * unnecessary especially in the light of the planned unified hierarchy.
4711 *
4712 * Please deprecate this and replace with something simpler if at all
4713 * possible.
4714 */
4715
4716/*
4717 * Unregister event and free resources.
4718 *
4719 * Gets called from workqueue.
4720 */
4721static void memcg_event_remove(struct work_struct *work)
4722{
4723	struct mem_cgroup_event *event =
4724		container_of(work, struct mem_cgroup_event, remove);
4725	struct mem_cgroup *memcg = event->memcg;
4726
4727	remove_wait_queue(event->wqh, &event->wait);
4728
4729	event->unregister_event(memcg, event->eventfd);
4730
4731	/* Notify userspace the event is going away. */
4732	eventfd_signal(event->eventfd, 1);
4733
4734	eventfd_ctx_put(event->eventfd);
4735	kfree(event);
4736	css_put(&memcg->css);
4737}
4738
4739/*
4740 * Gets called on EPOLLHUP on eventfd when user closes it.
4741 *
4742 * Called with wqh->lock held and interrupts disabled.
4743 */
4744static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4745			    int sync, void *key)
4746{
4747	struct mem_cgroup_event *event =
4748		container_of(wait, struct mem_cgroup_event, wait);
4749	struct mem_cgroup *memcg = event->memcg;
4750	__poll_t flags = key_to_poll(key);
4751
4752	if (flags & EPOLLHUP) {
4753		/*
4754		 * If the event has been detached at cgroup removal, we
4755		 * can simply return knowing the other side will cleanup
4756		 * for us.
4757		 *
4758		 * We can't race against event freeing since the other
4759		 * side will require wqh->lock via remove_wait_queue(),
4760		 * which we hold.
4761		 */
4762		spin_lock(&memcg->event_list_lock);
4763		if (!list_empty(&event->list)) {
4764			list_del_init(&event->list);
4765			/*
4766			 * We are in atomic context, but cgroup_event_remove()
4767			 * may sleep, so we have to call it in workqueue.
4768			 */
4769			schedule_work(&event->remove);
4770		}
4771		spin_unlock(&memcg->event_list_lock);
4772	}
4773
4774	return 0;
4775}
4776
4777static void memcg_event_ptable_queue_proc(struct file *file,
4778		wait_queue_head_t *wqh, poll_table *pt)
4779{
4780	struct mem_cgroup_event *event =
4781		container_of(pt, struct mem_cgroup_event, pt);
4782
4783	event->wqh = wqh;
4784	add_wait_queue(wqh, &event->wait);
4785}
4786
4787/*
4788 * DO NOT USE IN NEW FILES.
4789 *
4790 * Parse input and register new cgroup event handler.
4791 *
4792 * Input must be in format '<event_fd> <control_fd> <args>'.
4793 * Interpretation of args is defined by control file implementation.
4794 */
4795static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4796					 char *buf, size_t nbytes, loff_t off)
4797{
4798	struct cgroup_subsys_state *css = of_css(of);
4799	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4800	struct mem_cgroup_event *event;
4801	struct cgroup_subsys_state *cfile_css;
4802	unsigned int efd, cfd;
4803	struct fd efile;
4804	struct fd cfile;
4805	const char *name;
4806	char *endp;
4807	int ret;
4808
4809	buf = strstrip(buf);
4810
4811	efd = simple_strtoul(buf, &endp, 10);
4812	if (*endp != ' ')
4813		return -EINVAL;
4814	buf = endp + 1;
4815
4816	cfd = simple_strtoul(buf, &endp, 10);
4817	if ((*endp != ' ') && (*endp != '\0'))
4818		return -EINVAL;
4819	buf = endp + 1;
4820
4821	event = kzalloc(sizeof(*event), GFP_KERNEL);
4822	if (!event)
4823		return -ENOMEM;
4824
4825	event->memcg = memcg;
4826	INIT_LIST_HEAD(&event->list);
4827	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4828	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4829	INIT_WORK(&event->remove, memcg_event_remove);
4830
4831	efile = fdget(efd);
4832	if (!efile.file) {
4833		ret = -EBADF;
4834		goto out_kfree;
4835	}
4836
4837	event->eventfd = eventfd_ctx_fileget(efile.file);
4838	if (IS_ERR(event->eventfd)) {
4839		ret = PTR_ERR(event->eventfd);
4840		goto out_put_efile;
4841	}
4842
4843	cfile = fdget(cfd);
4844	if (!cfile.file) {
4845		ret = -EBADF;
4846		goto out_put_eventfd;
4847	}
4848
4849	/* the process needs read permission on the control file */
4850	/* AV: shouldn't we check that it's been opened for read instead? */
4851	ret = inode_permission(file_inode(cfile.file), MAY_READ);
4852	if (ret < 0)
4853		goto out_put_cfile;
4854
4855	/*
4856	 * Determine the event callbacks and set them in @event.  This used
4857	 * to be done via struct cftype but cgroup core no longer knows
4858	 * about these events.  The following is crude but the whole thing
4859	 * is for compatibility anyway.
4860	 *
4861	 * DO NOT ADD NEW FILES.
4862	 */
4863	name = cfile.file->f_path.dentry->d_name.name;
4864
4865	if (!strcmp(name, "memory.usage_in_bytes")) {
4866		event->register_event = mem_cgroup_usage_register_event;
4867		event->unregister_event = mem_cgroup_usage_unregister_event;
4868	} else if (!strcmp(name, "memory.oom_control")) {
4869		event->register_event = mem_cgroup_oom_register_event;
4870		event->unregister_event = mem_cgroup_oom_unregister_event;
4871	} else if (!strcmp(name, "memory.pressure_level")) {
4872		event->register_event = vmpressure_register_event;
4873		event->unregister_event = vmpressure_unregister_event;
4874	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4875		event->register_event = memsw_cgroup_usage_register_event;
4876		event->unregister_event = memsw_cgroup_usage_unregister_event;
4877	} else {
4878		ret = -EINVAL;
4879		goto out_put_cfile;
4880	}
4881
4882	/*
4883	 * Verify @cfile should belong to @css.  Also, remaining events are
4884	 * automatically removed on cgroup destruction but the removal is
4885	 * asynchronous, so take an extra ref on @css.
4886	 */
4887	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4888					       &memory_cgrp_subsys);
4889	ret = -EINVAL;
4890	if (IS_ERR(cfile_css))
4891		goto out_put_cfile;
4892	if (cfile_css != css) {
4893		css_put(cfile_css);
4894		goto out_put_cfile;
4895	}
4896
4897	ret = event->register_event(memcg, event->eventfd, buf);
4898	if (ret)
4899		goto out_put_css;
4900
4901	vfs_poll(efile.file, &event->pt);
4902
4903	spin_lock(&memcg->event_list_lock);
4904	list_add(&event->list, &memcg->event_list);
4905	spin_unlock(&memcg->event_list_lock);
4906
4907	fdput(cfile);
4908	fdput(efile);
4909
4910	return nbytes;
4911
4912out_put_css:
4913	css_put(css);
4914out_put_cfile:
4915	fdput(cfile);
4916out_put_eventfd:
4917	eventfd_ctx_put(event->eventfd);
4918out_put_efile:
4919	fdput(efile);
4920out_kfree:
4921	kfree(event);
4922
4923	return ret;
4924}
4925
4926static struct cftype mem_cgroup_legacy_files[] = {
4927	{
4928		.name = "usage_in_bytes",
4929		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4930		.read_u64 = mem_cgroup_read_u64,
4931	},
4932	{
4933		.name = "max_usage_in_bytes",
4934		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4935		.write = mem_cgroup_reset,
4936		.read_u64 = mem_cgroup_read_u64,
4937	},
4938	{
4939		.name = "limit_in_bytes",
4940		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4941		.write = mem_cgroup_write,
4942		.read_u64 = mem_cgroup_read_u64,
4943	},
4944	{
4945		.name = "soft_limit_in_bytes",
4946		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4947		.write = mem_cgroup_write,
4948		.read_u64 = mem_cgroup_read_u64,
4949	},
4950	{
4951		.name = "failcnt",
4952		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4953		.write = mem_cgroup_reset,
4954		.read_u64 = mem_cgroup_read_u64,
4955	},
4956	{
4957		.name = "stat",
4958		.seq_show = memcg_stat_show,
4959	},
4960	{
4961		.name = "force_empty",
4962		.write = mem_cgroup_force_empty_write,
4963	},
4964	{
4965		.name = "use_hierarchy",
4966		.write_u64 = mem_cgroup_hierarchy_write,
4967		.read_u64 = mem_cgroup_hierarchy_read,
4968	},
4969	{
4970		.name = "cgroup.event_control",		/* XXX: for compat */
4971		.write = memcg_write_event_control,
4972		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4973	},
4974	{
4975		.name = "swappiness",
4976		.read_u64 = mem_cgroup_swappiness_read,
4977		.write_u64 = mem_cgroup_swappiness_write,
4978	},
4979	{
4980		.name = "move_charge_at_immigrate",
4981		.read_u64 = mem_cgroup_move_charge_read,
4982		.write_u64 = mem_cgroup_move_charge_write,
4983	},
4984	{
4985		.name = "oom_control",
4986		.seq_show = mem_cgroup_oom_control_read,
4987		.write_u64 = mem_cgroup_oom_control_write,
4988		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4989	},
4990	{
4991		.name = "pressure_level",
4992	},
4993#ifdef CONFIG_NUMA
4994	{
4995		.name = "numa_stat",
4996		.seq_show = memcg_numa_stat_show,
4997	},
4998#endif
4999	{
5000		.name = "kmem.limit_in_bytes",
5001		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5002		.write = mem_cgroup_write,
5003		.read_u64 = mem_cgroup_read_u64,
5004	},
5005	{
5006		.name = "kmem.usage_in_bytes",
5007		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5008		.read_u64 = mem_cgroup_read_u64,
5009	},
5010	{
5011		.name = "kmem.failcnt",
5012		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5013		.write = mem_cgroup_reset,
5014		.read_u64 = mem_cgroup_read_u64,
5015	},
5016	{
5017		.name = "kmem.max_usage_in_bytes",
5018		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5019		.write = mem_cgroup_reset,
5020		.read_u64 = mem_cgroup_read_u64,
5021	},
5022#if defined(CONFIG_MEMCG_KMEM) && \
5023	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
5024	{
5025		.name = "kmem.slabinfo",
5026		.seq_show = memcg_slab_show,
5027	},
5028#endif
5029	{
5030		.name = "kmem.tcp.limit_in_bytes",
5031		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
5032		.write = mem_cgroup_write,
5033		.read_u64 = mem_cgroup_read_u64,
5034	},
5035	{
5036		.name = "kmem.tcp.usage_in_bytes",
5037		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
5038		.read_u64 = mem_cgroup_read_u64,
5039	},
5040	{
5041		.name = "kmem.tcp.failcnt",
5042		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
5043		.write = mem_cgroup_reset,
5044		.read_u64 = mem_cgroup_read_u64,
5045	},
5046	{
5047		.name = "kmem.tcp.max_usage_in_bytes",
5048		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
5049		.write = mem_cgroup_reset,
5050		.read_u64 = mem_cgroup_read_u64,
5051	},
5052	{ },	/* terminate */
5053};
5054
5055/*
5056 * Private memory cgroup IDR
5057 *
5058 * Swap-out records and page cache shadow entries need to store memcg
5059 * references in constrained space, so we maintain an ID space that is
5060 * limited to 16 bits (MEM_CGROUP_ID_MAX), limiting the total number of
5061 * memory-controlled cgroups to 64k.
5062 *
5063 * However, there usually are many references to the offline CSS after
5064 * the cgroup has been destroyed, such as page cache or reclaimable
5065 * slab objects, that don't need to hang on to the ID. We want to keep
5066 * those dead CSS from occupying IDs, or we might quickly exhaust the
5067 * relatively small ID space and prevent the creation of new cgroups
5068 * even when there are much fewer than 64k cgroups - possibly none.
5069 *
5070 * Maintain a private 16-bit ID space for memcg, and allow the ID to
5071 * be freed and recycled when it's no longer needed, which is usually
5072 * when the CSS is offlined.
5073 *
5074 * The only exception to that are records of swapped out tmpfs/shmem
5075 * pages that need to be attributed to live ancestors on swapin. But
5076 * those references are manageable from userspace.
5077 */
5078
5079static DEFINE_IDR(mem_cgroup_idr);
5080
5081static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5082{
5083	if (memcg->id.id > 0) {
5084		idr_remove(&mem_cgroup_idr, memcg->id.id);
5085		memcg->id.id = 0;
5086	}
5087}
5088
5089static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5090						  unsigned int n)
5091{
5092	refcount_add(n, &memcg->id.ref);
5093}
5094
5095static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5096{
5097	if (refcount_sub_and_test(n, &memcg->id.ref)) {
5098		mem_cgroup_id_remove(memcg);
5099
5100		/* Memcg ID pins CSS */
5101		css_put(&memcg->css);
5102	}
5103}
5104
5105static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
5106{
5107	mem_cgroup_id_put_many(memcg, 1);
5108}
5109
5110/**
5111 * mem_cgroup_from_id - look up a memcg from a memcg id
5112 * @id: the memcg id to look up
5113 *
5114 * Caller must hold rcu_read_lock().
5115 */
5116struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
5117{
5118	WARN_ON_ONCE(!rcu_read_lock_held());
5119	return idr_find(&mem_cgroup_idr, id);
5120}
5121
5122static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5123{
5124	struct mem_cgroup_per_node *pn;
5125	int tmp = node;
5126	/*
5127	 * This routine is called against all possible nodes,
5128	 * but it's a BUG to call kmalloc() against an offline node.
5129	 *
5130	 * TODO: this routine can waste much memory for nodes which will
5131	 *       never be onlined. It would be better to use a memory hotplug
5132	 *       callback function.
5133	 */
5134	if (!node_state(node, N_NORMAL_MEMORY))
5135		tmp = -1;
5136	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5137	if (!pn)
5138		return 1;
5139
5140	pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5141						 GFP_KERNEL_ACCOUNT);
5142	if (!pn->lruvec_stat_local) {
5143		kfree(pn);
5144		return 1;
5145	}
5146
5147	pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5148					       GFP_KERNEL_ACCOUNT);
5149	if (!pn->lruvec_stat_cpu) {
5150		free_percpu(pn->lruvec_stat_local);
5151		kfree(pn);
5152		return 1;
5153	}
5154
5155	lruvec_init(&pn->lruvec);
5156	pn->usage_in_excess = 0;
5157	pn->on_tree = false;
5158	pn->memcg = memcg;
5159
5160	memcg->nodeinfo[node] = pn;
5161	return 0;
5162}
5163
5164static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5165{
5166	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5167
5168	if (!pn)
5169		return;
5170
5171	free_percpu(pn->lruvec_stat_cpu);
5172	free_percpu(pn->lruvec_stat_local);
5173	kfree(pn);
5174}
5175
5176static void __mem_cgroup_free(struct mem_cgroup *memcg)
5177{
5178	int node;
5179
5180	for_each_node(node)
5181		free_mem_cgroup_per_node_info(memcg, node);
5182	free_percpu(memcg->vmstats_percpu);
5183	free_percpu(memcg->vmstats_local);
5184	kfree(memcg);
5185}
5186
5187static void mem_cgroup_free(struct mem_cgroup *memcg)
5188{
5189	memcg_wb_domain_exit(memcg);
5190	/*
5191	 * Flush percpu vmstats and vmevents to guarantee the value correctness
5192	 * on parent's and all ancestor levels.
5193	 */
5194	memcg_flush_percpu_vmstats(memcg);
5195	memcg_flush_percpu_vmevents(memcg);
5196	__mem_cgroup_free(memcg);
5197}
5198
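/*
 * Allocate and minimally initialize a mem_cgroup: ID, per-cpu and
 * per-node statistics, writeback domain, locks and lists. The page
 * counters and hierarchy linkage are set up later in css_alloc.
 */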
5199static struct mem_cgroup *mem_cgroup_alloc(void)
5200{
5201	struct mem_cgroup *memcg;
5202	unsigned int size;
5203	int node;
5204	int __maybe_unused i;
5205	long error = -ENOMEM;
5206
5207	size = sizeof(struct mem_cgroup);
5208	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5209
5210	memcg = kzalloc(size, GFP_KERNEL);
5211	if (!memcg)
5212		return ERR_PTR(error);
5213
5214	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5215				 1, MEM_CGROUP_ID_MAX,
5216				 GFP_KERNEL);
5217	if (memcg->id.id < 0) {
5218		error = memcg->id.id;
5219		goto fail;
5220	}
5221
5222	memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5223						GFP_KERNEL_ACCOUNT);
5224	if (!memcg->vmstats_local)
5225		goto fail;
5226
5227	memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5228						 GFP_KERNEL_ACCOUNT);
5229	if (!memcg->vmstats_percpu)
5230		goto fail;
5231
5232	for_each_node(node)
5233		if (alloc_mem_cgroup_per_node_info(memcg, node))
5234			goto fail;
5235
5236	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5237		goto fail;
5238
5239	INIT_WORK(&memcg->high_work, high_work_func);
 
5240	INIT_LIST_HEAD(&memcg->oom_notify);
5241	mutex_init(&memcg->thresholds_lock);
5242	spin_lock_init(&memcg->move_lock);
5243	vmpressure_init(&memcg->vmpressure);
5244	INIT_LIST_HEAD(&memcg->event_list);
5245	spin_lock_init(&memcg->event_list_lock);
5246	memcg->socket_pressure = jiffies;
5247#ifdef CONFIG_MEMCG_KMEM
5248	memcg->kmemcg_id = -1;
5249	INIT_LIST_HEAD(&memcg->objcg_list);
5250#endif
5251#ifdef CONFIG_CGROUP_WRITEBACK
5252	INIT_LIST_HEAD(&memcg->cgwb_list);
5253	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5254		memcg->cgwb_frn[i].done =
5255			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5256#endif
5257#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5258	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5259	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5260	memcg->deferred_split_queue.split_queue_len = 0;
5261#endif
5262	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5263	return memcg;
5264fail:
5265	mem_cgroup_id_remove(memcg);
5266	__mem_cgroup_free(memcg);
5267	return ERR_PTR(error);
5268}
5269
5270static struct cgroup_subsys_state * __ref
5271mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5272{
5273	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5274	struct mem_cgroup *memcg;
5275	long error = -ENOMEM;
5276
5277	memalloc_use_memcg(parent);
5278	memcg = mem_cgroup_alloc();
5279	memalloc_unuse_memcg();
5280	if (IS_ERR(memcg))
5281		return ERR_CAST(memcg);
5282
5283	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5284	memcg->soft_limit = PAGE_COUNTER_MAX;
5285	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5286	if (parent) {
5287		memcg->swappiness = mem_cgroup_swappiness(parent);
5288		memcg->oom_kill_disable = parent->oom_kill_disable;
5289	}
5290	if (parent && parent->use_hierarchy) {
5291		memcg->use_hierarchy = true;
5292		page_counter_init(&memcg->memory, &parent->memory);
5293		page_counter_init(&memcg->swap, &parent->swap);
5294		page_counter_init(&memcg->memsw, &parent->memsw);
5295		page_counter_init(&memcg->kmem, &parent->kmem);
5296		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5297	} else {
5298		page_counter_init(&memcg->memory, NULL);
5299		page_counter_init(&memcg->swap, NULL);
5300		page_counter_init(&memcg->memsw, NULL);
5301		page_counter_init(&memcg->kmem, NULL);
5302		page_counter_init(&memcg->tcpmem, NULL);
5303		/*
5304		 * A deeper hierarchy with use_hierarchy == false doesn't make
5305		 * much sense, so let the cgroup subsystem know about this
5306		 * unfortunate state in our controller.
5307		 */
5308		if (parent != root_mem_cgroup)
5309			memory_cgrp_subsys.broken_hierarchy = true;
5310	}
5311
5312	/* The following stuff does not apply to the root */
5313	if (!parent) {
5314		root_mem_cgroup = memcg;
5315		return &memcg->css;
5316	}
5317
5318	error = memcg_online_kmem(memcg);
5319	if (error)
5320		goto fail;
5321
5322	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5323		static_branch_inc(&memcg_sockets_enabled_key);
5324
5325	return &memcg->css;
5326fail:
5327	mem_cgroup_id_remove(memcg);
5328	mem_cgroup_free(memcg);
5329	return ERR_PTR(error);
5330}
5331
5332static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
5333{
5334	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5335
5336	/*
5337	 * A memcg must be visible for memcg_expand_shrinker_maps()
5338	 * by the time the maps are allocated. So, we allocate maps
5339	 * here, when for_each_mem_cgroup() can't skip it.
5340	 */
5341	if (memcg_alloc_shrinker_maps(memcg)) {
5342		mem_cgroup_id_remove(memcg);
5343		return -ENOMEM;
5344	}
5345
5346	/* Online state pins memcg ID, memcg ID pins CSS */
5347	refcount_set(&memcg->id.ref, 1);
5348	css_get(css);
5349	return 0;
5350}
5351
5352static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5353{
5354	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5355	struct mem_cgroup_event *event, *tmp;
5356
5357	/*
5358	 * Unregister events and notify userspace.
5359	 * Notify userspace about the cgroup's removal only after rmdir of the
5360	 * cgroup directory, to avoid a race between userspace and kernelspace.
5361	 */
5362	spin_lock(&memcg->event_list_lock);
5363	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5364		list_del_init(&event->list);
5365		schedule_work(&event->remove);
5366	}
5367	spin_unlock(&memcg->event_list_lock);
5368
5369	page_counter_set_min(&memcg->memory, 0);
5370	page_counter_set_low(&memcg->memory, 0);
5371
5372	memcg_offline_kmem(memcg);
5373	wb_memcg_offline(memcg);
5374
5375	drain_all_stock(memcg);
5376
5377	mem_cgroup_id_put(memcg);
5378}
5379
5380static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
5381{
5382	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5383
5384	invalidate_reclaim_iterators(memcg);
5385}
5386
5387static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5388{
5389	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5390	int __maybe_unused i;
5391
5392#ifdef CONFIG_CGROUP_WRITEBACK
5393	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5394		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5395#endif
5396	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5397		static_branch_dec(&memcg_sockets_enabled_key);
5398
5399	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
5400		static_branch_dec(&memcg_sockets_enabled_key);
5401
5402	vmpressure_cleanup(&memcg->vmpressure);
5403	cancel_work_sync(&memcg->high_work);
5404	mem_cgroup_remove_from_trees(memcg);
5405	memcg_free_shrinker_maps(memcg);
5406	memcg_free_kmem(memcg);
5407	mem_cgroup_free(memcg);
5408}
5409
5410/**
5411 * mem_cgroup_css_reset - reset the states of a mem_cgroup
5412 * @css: the target css
5413 *
5414 * Reset the states of the mem_cgroup associated with @css.  This is
5415 * invoked when the userland requests disabling on the default hierarchy
5416 * but the memcg is pinned through dependency.  The memcg should stop
5417 * applying policies and should revert to the vanilla state as it may be
5418 * made visible again.
5419 *
5420 * The current implementation only resets the essential configurations.
5421 * This needs to be expanded to cover all the visible parts.
5422 */
5423static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5424{
5425	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5426
5427	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
5428	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
5429	page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
5430	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
5431	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
5432	page_counter_set_min(&memcg->memory, 0);
5433	page_counter_set_low(&memcg->memory, 0);
5434	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5435	memcg->soft_limit = PAGE_COUNTER_MAX;
5436	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5437	memcg_wb_domain_size_changed(memcg);
5438}
5439
5440#ifdef CONFIG_MMU
5441/* Handlers for move charge at task migration. */
5442static int mem_cgroup_do_precharge(unsigned long count)
5443{
5444	int ret;
5445
5446	/* Try a single bulk charge without reclaim first, kswapd may wake */
5447	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5448	if (!ret) {
5449		mc.precharge += count;
5450		return ret;
5451	}
5452
5453	/* Try charges one by one with reclaim, but do not retry */
5454	while (count--) {
5455		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5456		if (ret)
5457			return ret;
5458		mc.precharge++;
5459		cond_resched();
5460	}
5461	return 0;
5462}
5463
5464union mc_target {
5465	struct page	*page;
5466	swp_entry_t	ent;
5467};
5468
5469enum mc_target_type {
5470	MC_TARGET_NONE = 0,
5471	MC_TARGET_PAGE,
5472	MC_TARGET_SWAP,
5473	MC_TARGET_DEVICE,
5474};
5475
5476static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5477						unsigned long addr, pte_t ptent)
5478{
5479	struct page *page = vm_normal_page(vma, addr, ptent);
5480
5481	if (!page || !page_mapped(page))
5482		return NULL;
5483	if (PageAnon(page)) {
5484		if (!(mc.flags & MOVE_ANON))
5485			return NULL;
5486	} else {
5487		if (!(mc.flags & MOVE_FILE))
5488			return NULL;
5489	}
5490	if (!get_page_unless_zero(page))
5491		return NULL;
5492
5493	return page;
5494}
5495
5496#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5497static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5498			pte_t ptent, swp_entry_t *entry)
5499{
5500	struct page *page = NULL;
5501	swp_entry_t ent = pte_to_swp_entry(ptent);
5502
5503	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
5504		return NULL;
5505
5506	/*
5507	 * Handle MEMORY_DEVICE_PRIVATE, i.e. ZONE_DEVICE pages belonging to
5508	 * a device; because they are not accessible by the CPU they are stored
5509	 * as special swap entries in the CPU page table.
5510	 */
5511	if (is_device_private_entry(ent)) {
5512		page = device_private_entry_to_page(ent);
5513		/*
5514		 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
5515		 * a refcount of 1 when free (unlike a normal page).
5516		 */
5517		if (!page_ref_add_unless(page, 1, 1))
5518			return NULL;
5519		return page;
5520	}
5521
5522	/*
5523	 * Because lookup_swap_cache() updates some statistics counters,
5524	 * we call find_get_page() on the swap address space directly.
5525	 */
5526	page = find_get_page(swap_address_space(ent), swp_offset(ent));
5527	entry->val = ent.val;
 
5528
5529	return page;
5530}
5531#else
5532static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5533			pte_t ptent, swp_entry_t *entry)
5534{
5535	return NULL;
5536}
5537#endif
5538
5539static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5540			unsigned long addr, pte_t ptent, swp_entry_t *entry)
5541{
5542	struct page *page = NULL;
5543	struct address_space *mapping;
5544	pgoff_t pgoff;
5545
5546	if (!vma->vm_file) /* anonymous vma */
5547		return NULL;
5548	if (!(mc.flags & MOVE_FILE))
5549		return NULL;
5550
5551	mapping = vma->vm_file->f_mapping;
5552	pgoff = linear_page_index(vma, addr);
5553
5554	/* The page is moved even if it isn't part of this task's RSS (not faulted in). */
5555#ifdef CONFIG_SWAP
5556	/* shmem/tmpfs may report page out on swap: account for that too. */
5557	if (shmem_mapping(mapping)) {
5558		page = find_get_entry(mapping, pgoff);
5559		if (xa_is_value(page)) {
5560			swp_entry_t swp = radix_to_swp_entry(page);
5561			*entry = swp;
5562			page = find_get_page(swap_address_space(swp),
5563					     swp_offset(swp));
5564		}
5565	} else
5566		page = find_get_page(mapping, pgoff);
5567#else
5568	page = find_get_page(mapping, pgoff);
5569#endif
5570	return page;
5571}
5572
5573/**
5574 * mem_cgroup_move_account - move account of the page
5575 * @page: the page
5576 * @compound: charge the page as compound or small page
5577 * @from: mem_cgroup which the page is moved from.
5578 * @to:	mem_cgroup which the page is moved to. @from != @to.
5579 *
5580 * The caller must make sure the page is not on the LRU (isolate_lru_page() is useful).
5581 *
5582 * This function doesn't "charge" the new cgroup and doesn't "uncharge"
5583 * the old cgroup.
5584 */
5585static int mem_cgroup_move_account(struct page *page,
5586				   bool compound,
5587				   struct mem_cgroup *from,
5588				   struct mem_cgroup *to)
5589{
5590	struct lruvec *from_vec, *to_vec;
5591	struct pglist_data *pgdat;
5592	unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
5593	int ret;
 
5594
5595	VM_BUG_ON(from == to);
5596	VM_BUG_ON_PAGE(PageLRU(page), page);
5597	VM_BUG_ON(compound && !PageTransHuge(page));
5598
5599	/*
5600	 * Prevent mem_cgroup_migrate() from looking at
5601	 * page->mem_cgroup of its source page while we change it.
5602	 */
5603	ret = -EBUSY;
5604	if (!trylock_page(page))
5605		goto out;
5606
5607	ret = -EINVAL;
5608	if (page->mem_cgroup != from)
5609		goto out_unlock;
5610
5611	pgdat = page_pgdat(page);
5612	from_vec = mem_cgroup_lruvec(from, pgdat);
5613	to_vec = mem_cgroup_lruvec(to, pgdat);
5614
5615	lock_page_memcg(page);
5616
5617	if (PageAnon(page)) {
5618		if (page_mapped(page)) {
5619			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5620			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5621			if (PageTransHuge(page)) {
5622				__mod_lruvec_state(from_vec, NR_ANON_THPS,
5623						   -nr_pages);
5624				__mod_lruvec_state(to_vec, NR_ANON_THPS,
5625						   nr_pages);
5626			}
5627
5628		}
5629	} else {
5630		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5631		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
5632
5633		if (PageSwapBacked(page)) {
5634			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5635			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5636		}
5637
5638		if (page_mapped(page)) {
5639			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5640			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5641		}
5642
5643		if (PageDirty(page)) {
5644			struct address_space *mapping = page_mapping(page);
5645
5646			if (mapping_cap_account_dirty(mapping)) {
5647				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5648						   -nr_pages);
5649				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5650						   nr_pages);
5651			}
5652		}
5653	}
5654
5655	if (PageWriteback(page)) {
5656		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5657		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
5658	}
5659
5660	/*
5661	 * All state has been migrated, let's switch to the new memcg.
5662	 *
5663	 * It is safe to change page->mem_cgroup here because the page
5664	 * is referenced, charged, isolated, and locked: we can't race
5665	 * with (un)charging, migration, LRU putback, or anything else
5666	 * that would rely on a stable page->mem_cgroup.
5667	 *
5668	 * Note that lock_page_memcg is a memcg lock, not a page lock,
5669	 * to save space. As soon as we switch page->mem_cgroup to a
5670	 * new memcg that isn't locked, the above state can change
5671	 * concurrently again. Make sure we're truly done with it.
5672	 */
5673	smp_mb();
5674
5675	css_get(&to->css);
5676	css_put(&from->css);
5677
 
5678	page->mem_cgroup = to;
5679
5680	__unlock_page_memcg(from);
5681
5682	ret = 0;
5683
5684	local_irq_disable();
5685	mem_cgroup_charge_statistics(to, page, nr_pages);
5686	memcg_check_events(to, page);
5687	mem_cgroup_charge_statistics(from, page, -nr_pages);
5688	memcg_check_events(from, page);
5689	local_irq_enable();
5690out_unlock:
5691	unlock_page(page);
5692out:
5693	return ret;
5694}
5695
5696/**
5697 * get_mctgt_type - get target type of moving charge
5698 * @vma: the vma the pte to be checked belongs
5699 * @addr: the address corresponding to the pte to be checked
5700 * @ptent: the pte to be checked
5701 * @target: the pointer the target page or swap ent will be stored(can be NULL)
5702 *
5703 * Returns
5704 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
5705 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5706 *     move charge. If @target is not NULL, the page is stored in target->page
5707 *     with an extra refcount taken (callers should handle it).
5708 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5709 *     target for charge migration. If @target is not NULL, the entry is stored
5710 *     in target->ent.
5711 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PRIVATE
5712 *     (so a ZONE_DEVICE page and thus not on the LRU).
5713 *     For now such a page is charged like a regular page would be, as for all
5714 *     intents and purposes it is just special memory taking the place of a
5715 *     regular page.
5716 *
5717 *     See Documentation/vm/hmm.txt and include/linux/hmm.h
5718 *
5719 * Called with pte lock held.
5720 */
5721
5722static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5723		unsigned long addr, pte_t ptent, union mc_target *target)
5724{
5725	struct page *page = NULL;
5726	enum mc_target_type ret = MC_TARGET_NONE;
5727	swp_entry_t ent = { .val = 0 };
5728
5729	if (pte_present(ptent))
5730		page = mc_handle_present_pte(vma, addr, ptent);
5731	else if (is_swap_pte(ptent))
5732		page = mc_handle_swap_pte(vma, ptent, &ent);
5733	else if (pte_none(ptent))
5734		page = mc_handle_file_pte(vma, addr, ptent, &ent);
5735
5736	if (!page && !ent.val)
5737		return ret;
5738	if (page) {
5739		/*
5740		 * Do only a loose check without serialization.
5741		 * mem_cgroup_move_account() checks whether the page is
5742		 * valid under LRU exclusion.
5743		 */
5744		if (page->mem_cgroup == mc.from) {
5745			ret = MC_TARGET_PAGE;
5746			if (is_device_private_page(page))
5747				ret = MC_TARGET_DEVICE;
5748			if (target)
5749				target->page = page;
5750		}
5751		if (!ret || !target)
5752			put_page(page);
5753	}
5754	/*
5755	 * There is a swap entry and the page doesn't exist or isn't charged.
5756	 * But we cannot move the tail page of a THP.
5757	 */
5758	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
5759	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5760		ret = MC_TARGET_SWAP;
5761		if (target)
5762			target->ent = ent;
5763	}
5764	return ret;
5765}
5766
5767#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5768/*
5769 * We don't consider PMD mapped swapping or file mapped pages because THP does
5770 * not support them for now.
5771 * Caller should make sure that pmd_trans_huge(pmd) is true.
5772 */
5773static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5774		unsigned long addr, pmd_t pmd, union mc_target *target)
5775{
5776	struct page *page = NULL;
5777	enum mc_target_type ret = MC_TARGET_NONE;
5778
5779	if (unlikely(is_swap_pmd(pmd))) {
5780		VM_BUG_ON(thp_migration_supported() &&
5781				  !is_pmd_migration_entry(pmd));
5782		return ret;
5783	}
5784	page = pmd_page(pmd);
5785	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5786	if (!(mc.flags & MOVE_ANON))
5787		return ret;
5788	if (page->mem_cgroup == mc.from) {
5789		ret = MC_TARGET_PAGE;
5790		if (target) {
5791			get_page(page);
5792			target->page = page;
5793		}
5794	}
5795	return ret;
5796}
5797#else
5798static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5799		unsigned long addr, pmd_t pmd, union mc_target *target)
5800{
5801	return MC_TARGET_NONE;
5802}
5803#endif
5804
5805static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5806					unsigned long addr, unsigned long end,
5807					struct mm_walk *walk)
5808{
5809	struct vm_area_struct *vma = walk->vma;
5810	pte_t *pte;
5811	spinlock_t *ptl;
5812
5813	ptl = pmd_trans_huge_lock(pmd, vma);
5814	if (ptl) {
5815		/*
5816		 * Note there can not be MC_TARGET_DEVICE for now as we do not
5817		 * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
5818		 * this might change.
5819		 */
5820		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5821			mc.precharge += HPAGE_PMD_NR;
5822		spin_unlock(ptl);
5823		return 0;
5824	}
5825
5826	if (pmd_trans_unstable(pmd))
5827		return 0;
5828	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5829	for (; addr != end; pte++, addr += PAGE_SIZE)
5830		if (get_mctgt_type(vma, addr, *pte, NULL))
5831			mc.precharge++;	/* increment precharge temporarily */
5832	pte_unmap_unlock(pte - 1, ptl);
5833	cond_resched();
5834
5835	return 0;
5836}
5837
5838static const struct mm_walk_ops precharge_walk_ops = {
5839	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
5840};
5841
5842static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5843{
5844	unsigned long precharge;
5845
5846	mmap_read_lock(mm);
5847	walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5848	mmap_read_unlock(mm);
5849
5850	precharge = mc.precharge;
5851	mc.precharge = 0;
5852
5853	return precharge;
5854}
5855
5856static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5857{
5858	unsigned long precharge = mem_cgroup_count_precharge(mm);
5859
5860	VM_BUG_ON(mc.moving_task);
5861	mc.moving_task = current;
5862	return mem_cgroup_do_precharge(precharge);
5863}
5864
5865/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5866static void __mem_cgroup_clear_mc(void)
5867{
5868	struct mem_cgroup *from = mc.from;
5869	struct mem_cgroup *to = mc.to;
5870
5871	/* we must uncharge all the leftover precharges from mc.to */
5872	if (mc.precharge) {
5873		cancel_charge(mc.to, mc.precharge);
5874		mc.precharge = 0;
5875	}
5876	/*
5877	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5878	 * we must uncharge here.
5879	 */
5880	if (mc.moved_charge) {
5881		cancel_charge(mc.from, mc.moved_charge);
5882		mc.moved_charge = 0;
5883	}
5884	/* we must fixup refcnts and charges */
5885	if (mc.moved_swap) {
5886		/* uncharge swap account from the old cgroup */
5887		if (!mem_cgroup_is_root(mc.from))
5888			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5889
5890		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5891
5892		/*
5893		 * we charged both to->memory and to->memsw, so we
5894		 * should uncharge to->memory.
5895		 */
5896		if (!mem_cgroup_is_root(mc.to))
5897			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5898
5899		mc.moved_swap = 0;
5900	}
5901	memcg_oom_recover(from);
5902	memcg_oom_recover(to);
5903	wake_up_all(&mc.waitq);
5904}
5905
5906static void mem_cgroup_clear_mc(void)
5907{
5908	struct mm_struct *mm = mc.mm;
5909
5910	/*
5911	 * we must clear moving_task before waking up waiters at the end of
5912	 * task migration.
5913	 */
5914	mc.moving_task = NULL;
5915	__mem_cgroup_clear_mc();
5916	spin_lock(&mc.lock);
5917	mc.from = NULL;
5918	mc.to = NULL;
5919	mc.mm = NULL;
5920	spin_unlock(&mc.lock);
5921
5922	mmput(mm);
5923}
5924
5925static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5926{
5927	struct cgroup_subsys_state *css;
5928	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
5929	struct mem_cgroup *from;
5930	struct task_struct *leader, *p;
5931	struct mm_struct *mm;
5932	unsigned long move_flags;
5933	int ret = 0;
5934
5935	/* charge immigration isn't supported on the default hierarchy */
5936	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5937		return 0;
5938
5939	/*
5940	 * Multi-process migrations only happen on the default hierarchy
5941	 * where charge immigration is not used.  Perform charge
5942	 * immigration if @tset contains a leader and whine if there are
5943	 * multiple.
5944	 */
5945	p = NULL;
5946	cgroup_taskset_for_each_leader(leader, css, tset) {
5947		WARN_ON_ONCE(p);
5948		p = leader;
5949		memcg = mem_cgroup_from_css(css);
5950	}
5951	if (!p)
5952		return 0;
5953
5954	/*
5955	 * We are now committed to this value, whatever it is. Changes in this
5956	 * tunable will only affect upcoming migrations, not the current one.
5957	 * So we need to save it, and keep it going.
5958	 */
5959	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5960	if (!move_flags)
5961		return 0;
5962
5963	from = mem_cgroup_from_task(p);
5964
5965	VM_BUG_ON(from == memcg);
5966
5967	mm = get_task_mm(p);
5968	if (!mm)
5969		return 0;
5970	/* We move charges only when we move the owner of the mm */
5971	if (mm->owner == p) {
5972		VM_BUG_ON(mc.from);
5973		VM_BUG_ON(mc.to);
5974		VM_BUG_ON(mc.precharge);
5975		VM_BUG_ON(mc.moved_charge);
5976		VM_BUG_ON(mc.moved_swap);
5977
5978		spin_lock(&mc.lock);
5979		mc.mm = mm;
5980		mc.from = from;
5981		mc.to = memcg;
5982		mc.flags = move_flags;
5983		spin_unlock(&mc.lock);
5984		/* We set mc.moving_task later */
5985
5986		ret = mem_cgroup_precharge_mc(mm);
5987		if (ret)
5988			mem_cgroup_clear_mc();
5989	} else {
5990		mmput(mm);
5991	}
5992	return ret;
5993}
5994
5995static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5996{
5997	if (mc.to)
5998		mem_cgroup_clear_mc();
5999}
6000
6001static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6002				unsigned long addr, unsigned long end,
6003				struct mm_walk *walk)
6004{
6005	int ret = 0;
6006	struct vm_area_struct *vma = walk->vma;
6007	pte_t *pte;
6008	spinlock_t *ptl;
6009	enum mc_target_type target_type;
6010	union mc_target target;
6011	struct page *page;
6012
6013	ptl = pmd_trans_huge_lock(pmd, vma);
6014	if (ptl) {
6015		if (mc.precharge < HPAGE_PMD_NR) {
6016			spin_unlock(ptl);
6017			return 0;
6018		}
6019		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6020		if (target_type == MC_TARGET_PAGE) {
6021			page = target.page;
6022			if (!isolate_lru_page(page)) {
6023				if (!mem_cgroup_move_account(page, true,
6024							     mc.from, mc.to)) {
6025					mc.precharge -= HPAGE_PMD_NR;
6026					mc.moved_charge += HPAGE_PMD_NR;
6027				}
6028				putback_lru_page(page);
6029			}
6030			put_page(page);
6031		} else if (target_type == MC_TARGET_DEVICE) {
6032			page = target.page;
6033			if (!mem_cgroup_move_account(page, true,
6034						     mc.from, mc.to)) {
6035				mc.precharge -= HPAGE_PMD_NR;
6036				mc.moved_charge += HPAGE_PMD_NR;
6037			}
6038			put_page(page);
6039		}
6040		spin_unlock(ptl);
6041		return 0;
6042	}
6043
6044	if (pmd_trans_unstable(pmd))
6045		return 0;
6046retry:
6047	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6048	for (; addr != end; addr += PAGE_SIZE) {
6049		pte_t ptent = *(pte++);
6050		bool device = false;
6051		swp_entry_t ent;
6052
6053		if (!mc.precharge)
6054			break;
6055
6056		switch (get_mctgt_type(vma, addr, ptent, &target)) {
6057		case MC_TARGET_DEVICE:
6058			device = true;
6059			fallthrough;
6060		case MC_TARGET_PAGE:
6061			page = target.page;
6062			/*
6063			 * We can have a part of the split pmd here. Moving it
6064			 * can be done but it would be too convoluted, so simply
6065			 * ignore such a partial THP and keep it in the original
6066			 * memcg. There should be somebody mapping the head.
6067			 */
6068			if (PageTransCompound(page))
6069				goto put;
6070			if (!device && isolate_lru_page(page))
6071				goto put;
6072			if (!mem_cgroup_move_account(page, false,
6073						mc.from, mc.to)) {
6074				mc.precharge--;
6075				/* we uncharge from mc.from later. */
6076				mc.moved_charge++;
6077			}
6078			if (!device)
6079				putback_lru_page(page);
6080put:			/* get_mctgt_type() gets the page */
6081			put_page(page);
6082			break;
6083		case MC_TARGET_SWAP:
6084			ent = target.ent;
6085			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6086				mc.precharge--;
6087				mem_cgroup_id_get_many(mc.to, 1);
6088				/* we fixup other refcnts and charges later. */
6089				mc.moved_swap++;
6090			}
6091			break;
6092		default:
6093			break;
6094		}
6095	}
6096	pte_unmap_unlock(pte - 1, ptl);
6097	cond_resched();
6098
6099	if (addr != end) {
6100		/*
6101		 * We have consumed all the precharges we got in can_attach().
6102		 * We try to charge one by one, but don't do any additional
6103		 * charges to mc.to once a charge has failed in the attach()
6104		 * phase.
6105		 */
6106		ret = mem_cgroup_do_precharge(1);
6107		if (!ret)
6108			goto retry;
6109	}
6110
6111	return ret;
6112}
6113
6114static const struct mm_walk_ops charge_walk_ops = {
6115	.pmd_entry	= mem_cgroup_move_charge_pte_range,
6116};
6117
6118static void mem_cgroup_move_charge(void)
6119{
6120	lru_add_drain_all();
6121	/*
6122	 * Signal lock_page_memcg() to take the memcg's move_lock
6123	 * while we're moving its pages to another memcg. Then wait
6124	 * for already started RCU-only updates to finish.
6125	 */
6126	atomic_inc(&mc.from->moving_account);
6127	synchronize_rcu();
6128retry:
6129	if (unlikely(!mmap_read_trylock(mc.mm))) {
6130		/*
6131		 * Someone who is holding the mmap_lock might be waiting on
6132		 * the waitq. So we cancel all extra charges, wake up all waiters,
6133		 * and retry. Because we cancel precharges, we might not be able
6134		 * to move enough charges, but moving charge is a best-effort
6135		 * feature anyway, so it wouldn't be a big problem.
6136		 */
6137		__mem_cgroup_clear_mc();
6138		cond_resched();
6139		goto retry;
6140	}
6141	/*
6142	 * When we have consumed all precharges and failed in doing
6143	 * additional charge, the page walk just aborts.
6144	 */
6145	walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6146			NULL);
6147
6148	mmap_read_unlock(mc.mm);
6149	atomic_dec(&mc.from->moving_account);
6150}
6151
6152static void mem_cgroup_move_task(void)
6153{
6154	if (mc.to) {
6155		mem_cgroup_move_charge();
6156		mem_cgroup_clear_mc();
6157	}
6158}
6159#else	/* !CONFIG_MMU */
6160static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
6161{
6162	return 0;
6163}
6164static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6165{
6166}
6167static void mem_cgroup_move_task(void)
6168{
6169}
6170#endif
6171
6172/*
6173 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6174 * to verify whether we're attached to the default hierarchy on each mount
6175 * attempt.
6176 */
6177static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6178{
6179	/*
6180	 * use_hierarchy is forced on the default hierarchy.  cgroup core
6181	 * guarantees that @root doesn't have any children, so turning it
6182	 * on for the root memcg is enough.
6183	 */
6184	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
6185		root_mem_cgroup->use_hierarchy = true;
6186	else
6187		root_mem_cgroup->use_hierarchy = false;
6188}
6189
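/*
 * Helper for the memory.{min,low,high,max} show methods below: prints
 * "max" for the no-limit sentinel, otherwise the value in bytes.  E.g.
 * a counter of 25600 pages prints as "104857600" on a system with 4K
 * pages (illustrative numbers).
 */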
6190static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6191{
6192	if (value == PAGE_COUNTER_MAX)
6193		seq_puts(m, "max\n");
6194	else
6195		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6196
6197	return 0;
6198}
6199
6200static u64 memory_current_read(struct cgroup_subsys_state *css,
6201			       struct cftype *cft)
6202{
6203	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6204
6205	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6206}
6207
6208static int memory_min_show(struct seq_file *m, void *v)
6209{
6210	return seq_puts_memcg_tunable(m,
6211		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6212}
6213
6214static ssize_t memory_min_write(struct kernfs_open_file *of,
6215				char *buf, size_t nbytes, loff_t off)
6216{
6217	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6218	unsigned long min;
6219	int err;
6220
6221	buf = strstrip(buf);
6222	err = page_counter_memparse(buf, "max", &min);
6223	if (err)
6224		return err;
6225
6226	page_counter_set_min(&memcg->memory, min);
6227
6228	return nbytes;
6229}
6230
6231static int memory_low_show(struct seq_file *m, void *v)
6232{
6233	return seq_puts_memcg_tunable(m,
6234		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6235}
6236
6237static ssize_t memory_low_write(struct kernfs_open_file *of,
6238				char *buf, size_t nbytes, loff_t off)
6239{
6240	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6241	unsigned long low;
6242	int err;
6243
6244	buf = strstrip(buf);
6245	err = page_counter_memparse(buf, "max", &low);
6246	if (err)
6247		return err;
6248
6249	page_counter_set_low(&memcg->memory, low);
6250
6251	return nbytes;
6252}
6253
6254static int memory_high_show(struct seq_file *m, void *v)
6255{
6256	return seq_puts_memcg_tunable(m,
6257		READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6258}
6259
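/*
 * Writing memory.high accepts "max" or a byte value with the usual
 * memparse() suffixes, e.g. "echo 512M > memory.high" from userspace
 * (illustrative value).  Unlike memory.max below, breaching the high
 * boundary never invokes the OOM killer; it is handled with reclaim
 * (done synchronously here when the boundary is lowered) and, on the
 * allocation side, by throttling the offending cgroup.
 */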
6260static ssize_t memory_high_write(struct kernfs_open_file *of,
6261				 char *buf, size_t nbytes, loff_t off)
6262{
6263	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6264	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6265	bool drained = false;
6266	unsigned long high;
6267	int err;
6268
6269	buf = strstrip(buf);
6270	err = page_counter_memparse(buf, "max", &high);
6271	if (err)
6272		return err;
6273
6274	for (;;) {
6275		unsigned long nr_pages = page_counter_read(&memcg->memory);
6276		unsigned long reclaimed;
6277
6278		if (nr_pages <= high)
6279			break;
6280
6281		if (signal_pending(current))
6282			break;
6283
6284		if (!drained) {
6285			drain_all_stock(memcg);
6286			drained = true;
6287			continue;
6288		}
6289
6290		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6291							 GFP_KERNEL, true);
6292
6293		if (!reclaimed && !nr_retries--)
6294			break;
6295	}
6296
6297	page_counter_set_high(&memcg->memory, high);
6298
6299	memcg_wb_domain_size_changed(memcg);
6300
6301	return nbytes;
6302}
6303
6304static int memory_max_show(struct seq_file *m, void *v)
6305{
6306	return seq_puts_memcg_tunable(m,
6307		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6308}
6309
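/*
 * memory.max is the hard limit and takes the same input format as
 * memory.high above (e.g. "1G" or "max", illustrative values).  The new
 * limit is published first via xchg(); if usage still exceeds it after
 * draining the per-cpu stocks and MAX_RECLAIM_RETRIES rounds of
 * reclaim, the memcg OOM killer is invoked until usage fits or the
 * writer receives a signal.
 */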
6310static ssize_t memory_max_write(struct kernfs_open_file *of,
6311				char *buf, size_t nbytes, loff_t off)
6312{
6313	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6314	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6315	bool drained = false;
6316	unsigned long max;
6317	int err;
6318
6319	buf = strstrip(buf);
6320	err = page_counter_memparse(buf, "max", &max);
6321	if (err)
6322		return err;
6323
6324	xchg(&memcg->memory.max, max);
6325
6326	for (;;) {
6327		unsigned long nr_pages = page_counter_read(&memcg->memory);
6328
6329		if (nr_pages <= max)
6330			break;
6331
6332		if (signal_pending(current))
6333			break;
6334
6335		if (!drained) {
6336			drain_all_stock(memcg);
6337			drained = true;
6338			continue;
6339		}
6340
6341		if (nr_reclaims) {
6342			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6343							  GFP_KERNEL, true))
6344				nr_reclaims--;
6345			continue;
6346		}
6347
6348		memcg_memory_event(memcg, MEMCG_OOM);
6349		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6350			break;
6351	}
6352
6353	memcg_wb_domain_size_changed(memcg);
6354	return nbytes;
6355}
6356
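/*
 * memory.events counts events in this cgroup and its descendants,
 * while memory.events.local only counts events that affected this
 * cgroup itself; see Documentation/admin-guide/cgroup-v2.rst.
 */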
6357static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6358{
6359	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6360	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6361	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6362	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6363	seq_printf(m, "oom_kill %lu\n",
6364		   atomic_long_read(&events[MEMCG_OOM_KILL]));
6365}
6366
6367static int memory_events_show(struct seq_file *m, void *v)
6368{
6369	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6370
6371	__memory_events_show(m, memcg->memory_events);
6372	return 0;
6373}
6374
6375static int memory_events_local_show(struct seq_file *m, void *v)
6376{
6377	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6378
6379	__memory_events_show(m, memcg->memory_events_local);
6380	return 0;
6381}
6382
6383static int memory_stat_show(struct seq_file *m, void *v)
6384{
6385	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6386	char *buf;
6387
6388	buf = memory_stat_format(memcg);
6389	if (!buf)
6390		return -ENOMEM;
6391	seq_puts(m, buf);
6392	kfree(buf);
6393	return 0;
6394}
6395
6396static int memory_oom_group_show(struct seq_file *m, void *v)
6397{
6398	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6399
6400	seq_printf(m, "%d\n", memcg->oom_group);
6401
6402	return 0;
6403}
 
6404
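/*
 * Only "0" and "1" are accepted, e.g. "echo 1 > memory.oom.group"
 * (illustrative shell command) asks the OOM killer to treat this memcg
 * as an indivisible unit; see Documentation/admin-guide/cgroup-v2.rst
 * for the exact semantics.
 */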
6405static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6406				      char *buf, size_t nbytes, loff_t off)
6407{
6408	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6409	int ret, oom_group;
6410
6411	buf = strstrip(buf);
6412	if (!buf)
6413		return -EINVAL;
 
6414
6415	ret = kstrtoint(buf, 0, &oom_group);
6416	if (ret)
6417		return ret;
6418
6419	if (oom_group != 0 && oom_group != 1)
6420		return -EINVAL;
6421
6422	memcg->oom_group = oom_group;
6423
6424	return nbytes;
6425}
6426
6427static struct cftype memory_files[] = {
6428	{
6429		.name = "current",
6430		.flags = CFTYPE_NOT_ON_ROOT,
6431		.read_u64 = memory_current_read,
6432	},
6433	{
6434		.name = "min",
6435		.flags = CFTYPE_NOT_ON_ROOT,
6436		.seq_show = memory_min_show,
6437		.write = memory_min_write,
6438	},
6439	{
6440		.name = "low",
6441		.flags = CFTYPE_NOT_ON_ROOT,
6442		.seq_show = memory_low_show,
6443		.write = memory_low_write,
6444	},
6445	{
6446		.name = "high",
6447		.flags = CFTYPE_NOT_ON_ROOT,
6448		.seq_show = memory_high_show,
6449		.write = memory_high_write,
6450	},
6451	{
6452		.name = "max",
6453		.flags = CFTYPE_NOT_ON_ROOT,
6454		.seq_show = memory_max_show,
6455		.write = memory_max_write,
6456	},
6457	{
6458		.name = "events",
6459		.flags = CFTYPE_NOT_ON_ROOT,
6460		.file_offset = offsetof(struct mem_cgroup, events_file),
6461		.seq_show = memory_events_show,
6462	},
6463	{
6464		.name = "events.local",
6465		.flags = CFTYPE_NOT_ON_ROOT,
6466		.file_offset = offsetof(struct mem_cgroup, events_local_file),
6467		.seq_show = memory_events_local_show,
6468	},
6469	{
6470		.name = "stat",
6471		.seq_show = memory_stat_show,
6472	},
6473	{
6474		.name = "oom.group",
6475		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
6476		.seq_show = memory_oom_group_show,
6477		.write = memory_oom_group_write,
6478	},
6479	{ }	/* terminate */
6480};
6481
6482struct cgroup_subsys memory_cgrp_subsys = {
6483	.css_alloc = mem_cgroup_css_alloc,
6484	.css_online = mem_cgroup_css_online,
6485	.css_offline = mem_cgroup_css_offline,
6486	.css_released = mem_cgroup_css_released,
6487	.css_free = mem_cgroup_css_free,
6488	.css_reset = mem_cgroup_css_reset,
6489	.can_attach = mem_cgroup_can_attach,
6490	.cancel_attach = mem_cgroup_cancel_attach,
6491	.post_attach = mem_cgroup_move_task,
6492	.bind = mem_cgroup_bind,
6493	.dfl_cftypes = memory_files,
6494	.legacy_cftypes = mem_cgroup_legacy_files,
6495	.early_init = 0,
6496};
6497
6498/*
6499 * This function calculates an individual cgroup's effective
6500 * protection which is derived from its own memory.min/low, its
6501 * parent's and siblings' settings, as well as the actual memory
6502 * distribution in the tree.
6503 *
6504 * The following rules apply to the effective protection values:
6505 *
6506 * 1. At the first level of reclaim, effective protection is equal to
6507 *    the declared protection in memory.min and memory.low.
6508 *
6509 * 2. To enable safe delegation of the protection configuration, at
6510 *    subsequent levels the effective protection is capped to the
6511 *    parent's effective protection.
6512 *
6513 * 3. To make complex and dynamic subtrees easier to configure, the
6514 *    user is allowed to overcommit the declared protection at a given
6515 *    level. If that is the case, the parent's effective protection is
6516 *    distributed to the children in proportion to how much protection
6517 *    they have declared and how much of it they are utilizing.
6518 *
6519 *    This makes distribution proportional, but also work-conserving:
6520 *    if one cgroup claims much more protection than it uses memory,
6521 *    the unused remainder is available to its siblings.
6522 *
6523 * 4. Conversely, when the declared protection is undercommitted at a
6524 *    given level, the distribution of the larger parental protection
6525 *    budget is NOT proportional. A cgroup's protection from a sibling
6526 *    is capped to its own memory.min/low setting.
6527 *
6528 * 5. However, to allow protecting recursive subtrees from each other
6529 *    without having to declare each individual cgroup's fixed share
6530 *    of the ancestor's claim to protection, any unutilized -
6531 *    "floating" - protection from up the tree is distributed in
6532 *    proportion to each cgroup's *usage*. This makes the protection
6533 *    neutral wrt sibling cgroups and lets them compete freely over
6534 *    the shared parental protection budget, but it protects the
6535 *    subtree as a whole from neighboring subtrees.
6536 *
6537 * Note that 4. and 5. are not in conflict: 4. is about protecting
6538 * against immediate siblings whereas 5. is about protecting against
6539 * neighboring subtrees.
6540 */
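/*
 * Worked example with made-up numbers: if the children of a parent with
 * parent_effective = 100 pages together claim and use
 * siblings_protected = 150 pages, then a child with
 * protected = min(usage, setting) = 60 pages ends up with an effective
 * protection of 60 * 100 / 150 = 40 pages.  If the siblings' claims fit
 * within the parent's budget instead, the child simply keeps its own 60
 * pages, plus a usage-proportional share of any unclaimed "floating"
 * protection when recursive protection is enabled.
 */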
6541static unsigned long effective_protection(unsigned long usage,
6542					  unsigned long parent_usage,
6543					  unsigned long setting,
6544					  unsigned long parent_effective,
6545					  unsigned long siblings_protected)
6546{
6547	unsigned long protected;
6548	unsigned long ep;
6549
6550	protected = min(usage, setting);
6551	/*
6552	 * If all cgroups at this level combined claim and use more
6553	 * protection then what the parent affords them, distribute
6554	 * shares in proportion to utilization.
6555	 *
6556	 * We are using actual utilization rather than the statically
6557	 * claimed protection in order to be work-conserving: claimed
6558	 * but unused protection is available to siblings that would
6559	 * otherwise get a smaller chunk than what they claimed.
6560	 */
6561	if (siblings_protected > parent_effective)
6562		return protected * parent_effective / siblings_protected;
6563
6564	/*
6565	 * Ok, utilized protection of all children is within what the
6566	 * parent affords them, so we know whatever this child claims
6567	 * and utilizes is effectively protected.
6568	 *
6569	 * If there is unprotected usage beyond this value, reclaim
6570	 * will apply pressure in proportion to that amount.
6571	 *
6572	 * If there is unutilized protection, the cgroup will be fully
6573	 * shielded from reclaim, but we do return a smaller value for
6574	 * protection than what the group could enjoy in theory. This
6575	 * is okay. With the overcommit distribution above, effective
6576	 * protection is always dependent on how memory is actually
6577	 * consumed among the siblings anyway.
6578	 */
6579	ep = protected;
6580
6581	/*
6582	 * If the children aren't claiming (all of) the protection
6583	 * afforded to them by the parent, distribute the remainder in
6584	 * proportion to the (unprotected) memory of each cgroup. That
6585	 * way, cgroups that aren't explicitly prioritized wrt each
6586	 * other compete freely over the allowance, but they are
6587	 * collectively protected from neighboring trees.
6588	 *
6589	 * We're using unprotected memory for the weight so that if
6590	 * some cgroups DO claim explicit protection, we don't protect
6591	 * the same bytes twice.
6592	 *
6593	 * Check both usage and parent_usage against the respective
6594	 * protected values. One should imply the other, but they
6595	 * aren't read atomically - make sure the division is sane.
6596	 */
6597	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6598		return ep;
6599	if (parent_effective > siblings_protected &&
6600	    parent_usage > siblings_protected &&
6601	    usage > protected) {
6602		unsigned long unclaimed;
6603
6604		unclaimed = parent_effective - siblings_protected;
6605		unclaimed *= usage - protected;
6606		unclaimed /= parent_usage - siblings_protected;
6607
6608		ep += unclaimed;
6609	}
6610
6611	return ep;
6612}
6613
6614/**
6615 * mem_cgroup_calculate_protection - calculate a memcg's effective memory protection
6616 * @root: the top ancestor of the sub-tree being checked
6617 * @memcg: the memory cgroup to check
6618 *
6619 * WARNING: This function is not stateless! It can only be used as part
6620 *          of a top-down tree iteration, not for isolated queries.
6621 */
6622void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6623				     struct mem_cgroup *memcg)
6624{
6625	unsigned long usage, parent_usage;
6626	struct mem_cgroup *parent;
6627
6628	if (mem_cgroup_disabled())
6629		return;
6630
6631	if (!root)
6632		root = root_mem_cgroup;
6633
6634	/*
6635	 * Effective values of the reclaim targets are ignored so they
6636	 * can be stale. Have a look at mem_cgroup_protection for more
6637	 * details.
6638	 * TODO: calculation should be more robust so that we do not need
6639	 * that special casing.
6640	 */
6641	if (memcg == root)
6642		return;
6643
6644	usage = page_counter_read(&memcg->memory);
6645	if (!usage)
6646		return;
6647
6648	parent = parent_mem_cgroup(memcg);
6649	/* No parent means a non-hierarchical mode on v1 memcg */
6650	if (!parent)
6651		return;
6652
6653	if (parent == root) {
6654		memcg->memory.emin = READ_ONCE(memcg->memory.min);
6655		memcg->memory.elow = READ_ONCE(memcg->memory.low);
6656		return;
6657	}
6658
6659	parent_usage = page_counter_read(&parent->memory);
 
6660
6661	WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6662			READ_ONCE(memcg->memory.min),
6663			READ_ONCE(parent->memory.emin),
6664			atomic_long_read(&parent->memory.children_min_usage)));
6665
6666	WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6667			READ_ONCE(memcg->memory.low),
6668			READ_ONCE(parent->memory.elow),
6669			atomic_long_read(&parent->memory.children_low_usage)));
6670}
6671
6672/**
6673 * mem_cgroup_charge - charge a newly allocated page to a cgroup
6674 * @page: page to charge
6675 * @mm: mm context of the victim
6676 * @gfp_mask: reclaim mode
6677 *
6678 * Try to charge @page to the memcg that @mm belongs to, reclaiming
6679 * pages according to @gfp_mask if necessary.
6680 *
6681 * Returns 0 on success. Otherwise, an error code is returned.
6682 */
6683int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
6684{
6685	unsigned int nr_pages = thp_nr_pages(page);
6686	struct mem_cgroup *memcg = NULL;
6687	int ret = 0;
6688
6689	if (mem_cgroup_disabled())
6690		goto out;
6691
6692	if (PageSwapCache(page)) {
6693		swp_entry_t ent = { .val = page_private(page), };
6694		unsigned short id;
6695
6696		/*
6697		 * Every swap fault against a single page tries to charge the
6698		 * page, bail as early as possible.  shmem_unuse() encounters
6699		 * already charged pages, too.  page->mem_cgroup is protected
6700		 * by the page lock, which serializes swap cache removal, which
6701		 * in turn serializes uncharging.
6702		 */
6703		VM_BUG_ON_PAGE(!PageLocked(page), page);
6704		if (compound_head(page)->mem_cgroup)
6705			goto out;
6706
6707		id = lookup_swap_cgroup_id(ent);
6708		rcu_read_lock();
6709		memcg = mem_cgroup_from_id(id);
6710		if (memcg && !css_tryget_online(&memcg->css))
6711			memcg = NULL;
6712		rcu_read_unlock();
6713	}
6714
6715	if (!memcg)
6716		memcg = get_mem_cgroup_from_mm(mm);
6717
6718	ret = try_charge(memcg, gfp_mask, nr_pages);
6719	if (ret)
6720		goto out_put;
6721
6722	css_get(&memcg->css);
6723	commit_charge(page, memcg);
6724
6725	local_irq_disable();
6726	mem_cgroup_charge_statistics(memcg, page, nr_pages);
6727	memcg_check_events(memcg, page);
6728	local_irq_enable();
6729
6730	if (PageSwapCache(page)) {
6731		swp_entry_t entry = { .val = page_private(page) };
6732		/*
6733		 * The swap entry might not get freed for a long time,
6734		 * let's not wait for it.  The page already received a
6735		 * memory+swap charge, drop the swap entry duplicate.
6736		 */
6737		mem_cgroup_uncharge_swap(entry, nr_pages);
6738	}
 
6739
6740out_put:
6741	css_put(&memcg->css);
6742out:
6743	return ret;
6744}
6745
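/*
 * Uncharging is batched: uncharge_page() gathers consecutive pages that
 * belong to the same memcg into a struct uncharge_gather, and
 * uncharge_batch() releases them with one set of page_counter and
 * statistics updates.  The batch is flushed whenever a page from a
 * different memcg is encountered.
 */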
6746struct uncharge_gather {
6747	struct mem_cgroup *memcg;
6748	unsigned long nr_pages;
6749	unsigned long pgpgout;
6750	unsigned long nr_kmem;
6751	struct page *dummy_page;
6752};
6753
6754static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6755{
6756	memset(ug, 0, sizeof(*ug));
6757}
6758
6759static void uncharge_batch(const struct uncharge_gather *ug)
6760{
6761	unsigned long flags;
6762
6763	if (!mem_cgroup_is_root(ug->memcg)) {
6764		page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
6765		if (do_memsw_account())
6766			page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
6767		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6768			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6769		memcg_oom_recover(ug->memcg);
6770	}
6771
6772	local_irq_save(flags);
6773	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6774	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
6775	memcg_check_events(ug->memcg, ug->dummy_page);
6776	local_irq_restore(flags);
6777
6778	/* drop reference from uncharge_page */
6779	css_put(&ug->memcg->css);
6780}
6781
6782static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6783{
6784	unsigned long nr_pages;
6785
6786	VM_BUG_ON_PAGE(PageLRU(page), page);
6787
6788	if (!page->mem_cgroup)
6789		return;
6790
6791	/*
6792	 * Nobody should be changing or seriously looking at
6793	 * page->mem_cgroup at this point; we have fully
6794	 * exclusive access to the page.
6795	 */
6796
6797	if (ug->memcg != page->mem_cgroup) {
6798		if (ug->memcg) {
6799			uncharge_batch(ug);
6800			uncharge_gather_clear(ug);
6801		}
6802		ug->memcg = page->mem_cgroup;
6803
6804		/* pairs with css_put in uncharge_batch */
6805		css_get(&ug->memcg->css);
6806	}
6807
6808	nr_pages = compound_nr(page);
6809	ug->nr_pages += nr_pages;
6810
6811	if (!PageKmemcg(page)) {
6812		ug->pgpgout++;
6813	} else {
6814		ug->nr_kmem += nr_pages;
6815		__ClearPageKmemcg(page);
6816	}
6817
6818	ug->dummy_page = page;
6819	page->mem_cgroup = NULL;
6820	css_put(&ug->memcg->css);
6821}
6822
6823static void uncharge_list(struct list_head *page_list)
6824{
6825	struct uncharge_gather ug;
6826	struct list_head *next;
6827
6828	uncharge_gather_clear(&ug);
6829
6830	/*
6831	 * Note that the list can be a single page->lru; hence the
6832	 * do-while loop instead of a simple list_for_each_entry().
6833	 */
6834	next = page_list->next;
6835	do {
6836		struct page *page;
6837
6838		page = list_entry(next, struct page, lru);
6839		next = page->lru.next;
6840
6841		uncharge_page(page, &ug);
6842	} while (next != page_list);
6843
6844	if (ug.memcg)
6845		uncharge_batch(&ug);
 
6846}
6847
6848/**
6849 * mem_cgroup_uncharge - uncharge a page
6850 * @page: page to uncharge
6851 *
6852 * Uncharge a page previously charged with mem_cgroup_charge().
6853 */
6854void mem_cgroup_uncharge(struct page *page)
6855{
6856	struct uncharge_gather ug;
6857
6858	if (mem_cgroup_disabled())
6859		return;
6860
6861	/* Don't touch page->lru of any random page, pre-check: */
6862	if (!page->mem_cgroup)
6863		return;
6864
6865	uncharge_gather_clear(&ug);
6866	uncharge_page(page, &ug);
6867	uncharge_batch(&ug);
6868}
6869
6870/**
6871 * mem_cgroup_uncharge_list - uncharge a list of pages
6872 * @page_list: list of pages to uncharge
6873 *
6874 * Uncharge a list of pages previously charged with
6875 * mem_cgroup_charge().
6876 */
6877void mem_cgroup_uncharge_list(struct list_head *page_list)
6878{
6879	if (mem_cgroup_disabled())
6880		return;
6881
6882	if (!list_empty(page_list))
6883		uncharge_list(page_list);
6884}
6885
6886/**
6887 * mem_cgroup_migrate - charge a page's replacement
6888 * @oldpage: currently circulating page
6889 * @newpage: replacement page
6890 *
6891 * Charge @newpage as a replacement page for @oldpage. @oldpage will
6892 * be uncharged upon free.
6893 *
6894 * Both pages must be locked, @newpage->mapping must be set up.
6895 */
6896void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6897{
6898	struct mem_cgroup *memcg;
6899	unsigned int nr_pages;
6900	unsigned long flags;
6901
6902	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6903	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6904	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6905	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6906		       newpage);
6907
6908	if (mem_cgroup_disabled())
6909		return;
6910
6911	/* Page cache replacement: new page already charged? */
6912	if (newpage->mem_cgroup)
6913		return;
6914
6915	/* Swapcache readahead pages can get replaced before being charged */
6916	memcg = oldpage->mem_cgroup;
6917	if (!memcg)
6918		return;
6919
6920	/* Force-charge the new page. The old one will be freed soon */
6921	nr_pages = thp_nr_pages(newpage);
 
6922
6923	page_counter_charge(&memcg->memory, nr_pages);
6924	if (do_memsw_account())
6925		page_counter_charge(&memcg->memsw, nr_pages);
 
6926
6927	css_get(&memcg->css);
6928	commit_charge(newpage, memcg);
6929
6930	local_irq_save(flags);
6931	mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
6932	memcg_check_events(memcg, newpage);
6933	local_irq_restore(flags);
6934}
6935
6936DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6937EXPORT_SYMBOL(memcg_sockets_enabled_key);
6938
6939void mem_cgroup_sk_alloc(struct sock *sk)
6940{
6941	struct mem_cgroup *memcg;
6942
6943	if (!mem_cgroup_sockets_enabled)
6944		return;
6945
6946	/* Do not associate the sock with an unrelated interrupted task's memcg. */
6947	if (in_interrupt())
6948		return;
6949
6950	rcu_read_lock();
6951	memcg = mem_cgroup_from_task(current);
6952	if (memcg == root_mem_cgroup)
6953		goto out;
6954	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
6955		goto out;
6956	if (css_tryget(&memcg->css))
6957		sk->sk_memcg = memcg;
6958out:
6959	rcu_read_unlock();
6960}
 
6961
6962void mem_cgroup_sk_free(struct sock *sk)
6963{
6964	if (sk->sk_memcg)
6965		css_put(&sk->sk_memcg->css);
6966}
6967
6968/**
6969 * mem_cgroup_charge_skmem - charge socket memory
6970 * @memcg: memcg to charge
6971 * @nr_pages: number of pages to charge
6972 *
6973 * Charges @nr_pages to @memcg. Returns %true if the charge fit within
6974 * @memcg's configured limit, %false if the charge had to be forced.
6975 */
6976bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
6977{
6978	gfp_t gfp_mask = GFP_KERNEL;
6979
6980	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
6981		struct page_counter *fail;
6982
6983		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
6984			memcg->tcpmem_pressure = 0;
6985			return true;
6986		}
6987		page_counter_charge(&memcg->tcpmem, nr_pages);
6988		memcg->tcpmem_pressure = 1;
6989		return false;
6990	}
6991
6992	/* Don't block in the packet receive path */
6993	if (in_softirq())
6994		gfp_mask = GFP_NOWAIT;
6995
6996	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
6997
6998	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
6999		return true;
7000
7001	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
7002	return false;
7003}
7004
7005/**
7006 * mem_cgroup_uncharge_skmem - uncharge socket memory
7007 * @memcg: memcg to uncharge
7008 * @nr_pages: number of pages to uncharge
7009 */
7010void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7011{
7012	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7013		page_counter_uncharge(&memcg->tcpmem, nr_pages);
7014		return;
7015	}
7016
7017	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7018
7019	refill_stock(memcg, nr_pages);
 
7020}
7021
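/*
 * Parse the "cgroup.memory=" boot option.  For example, booting with
 * "cgroup.memory=nosocket,nokmem" (illustrative command line) disables
 * both socket memory accounting and kernel memory accounting; either
 * token may also be given on its own.
 */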
7022static int __init cgroup_memory(char *s)
7023{
7024	char *token;
7025
7026	while ((token = strsep(&s, ",")) != NULL) {
7027		if (!*token)
7028			continue;
7029		if (!strcmp(token, "nosocket"))
7030			cgroup_memory_nosocket = true;
7031		if (!strcmp(token, "nokmem"))
7032			cgroup_memory_nokmem = true;
7033	}
7034	return 0;
7035}
7036__setup("cgroup.memory=", cgroup_memory);
7037
7038/*
7039 * subsys_initcall() for memory controller.
7040 *
7041 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
7042 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
7043 * basically everything that doesn't depend on a specific mem_cgroup structure
7044 * should be initialized from here.
7045 */
7046static int __init mem_cgroup_init(void)
7047{
7048	int cpu, node;
7049
7050	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7051				  memcg_hotplug_cpu_dead);
7052
7053	for_each_possible_cpu(cpu)
7054		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7055			  drain_local_stock);
7056
7057	for_each_node(node) {
7058		struct mem_cgroup_tree_per_node *rtpn;
 
7059
7060		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
7061				    node_online(node) ? node : NUMA_NO_NODE);
7062
7063		rtpn->rb_root = RB_ROOT;
7064		rtpn->rb_rightmost = NULL;
7065		spin_lock_init(&rtpn->lock);
7066		soft_limit_tree.rb_tree_per_node[node] = rtpn;
7067	}
7068
7069	return 0;
7070}
7071subsys_initcall(mem_cgroup_init);
7072
7073#ifdef CONFIG_MEMCG_SWAP
7074static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7075{
7076	while (!refcount_inc_not_zero(&memcg->id.ref)) {
7077		/*
7078		 * The root cgroup cannot be destroyed, so its refcount must
7079		 * always be >= 1.
7080		 */
7081		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7082			VM_BUG_ON(1);
7083			break;
7084		}
7085		memcg = parent_mem_cgroup(memcg);
7086		if (!memcg)
7087			memcg = root_mem_cgroup;
7088	}
7089	return memcg;
7090}
7091
7092/**
7093 * mem_cgroup_swapout - transfer a memsw charge to swap
7094 * @page: page whose memsw charge to transfer
7095 * @entry: swap entry to move the charge to
7096 *
7097 * Transfer the memsw charge of @page to @entry.
7098 */
7099void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
7100{
7101	struct mem_cgroup *memcg, *swap_memcg;
7102	unsigned int nr_entries;
7103	unsigned short oldid;
7104
7105	VM_BUG_ON_PAGE(PageLRU(page), page);
7106	VM_BUG_ON_PAGE(page_count(page), page);
7107
7108	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7109		return;
7110
7111	memcg = page->mem_cgroup;
7112
7113	/* Readahead page, never charged */
7114	if (!memcg)
7115		return;
7116
7117	/*
7118	 * In case the memcg owning these pages has been offlined and doesn't
7119	 * have an ID allocated to it anymore, charge the closest online
7120	 * ancestor for the swap instead and transfer the memory+swap charge.
7121	 */
7122	swap_memcg = mem_cgroup_id_get_online(memcg);
7123	nr_entries = thp_nr_pages(page);
7124	/* Get references for the tail pages, too */
7125	if (nr_entries > 1)
7126		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
7127	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
7128				   nr_entries);
7129	VM_BUG_ON_PAGE(oldid, page);
7130	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
7131
7132	page->mem_cgroup = NULL;
7133
7134	if (!mem_cgroup_is_root(memcg))
7135		page_counter_uncharge(&memcg->memory, nr_entries);
7136
7137	if (!cgroup_memory_noswap && memcg != swap_memcg) {
7138		if (!mem_cgroup_is_root(swap_memcg))
7139			page_counter_charge(&swap_memcg->memsw, nr_entries);
7140		page_counter_uncharge(&memcg->memsw, nr_entries);
7141	}
7142
7143	/*
7144	 * Interrupts should be disabled here because the caller holds the
7145	 * i_pages lock which is taken with interrupts-off. It is
7146	 * important here to have the interrupts disabled because it is the
7147	 * only synchronisation we have for updating the per-CPU variables.
7148	 */
7149	VM_BUG_ON(!irqs_disabled());
7150	mem_cgroup_charge_statistics(memcg, page, -nr_entries);
7151	memcg_check_events(memcg, page);
7152
7153	css_put(&memcg->css);
7154}
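
/*
 * Worked example (illustrative): a 512-page THP owned by A, a non-root memcg
 * that is still online, gets swapped out.  Then swap_memcg == A, so 512 swap
 * entries are recorded against A's ID, A->memory is uncharged by 512 pages
 * and A->memsw stays charged; the memory+swap charge merely changes form.
 * Only if A had already been offlined would the swap side move to the
 * nearest online ancestor.
 */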
7155
7156/**
7157 * mem_cgroup_try_charge_swap - try charging swap space for a page
7158 * @page: page being added to swap
7159 * @entry: swap entry to charge
7160 *
7161 * Try to charge @page's memcg for the swap space at @entry.
7162 *
7163 * Returns 0 on success, -ENOMEM on failure.
7164 */
7165int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7166{
7167	unsigned int nr_pages = thp_nr_pages(page);
7168	struct page_counter *counter;
7169	struct mem_cgroup *memcg;
7170	unsigned short oldid;
7171
7172	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7173		return 0;
7174
7175	memcg = page->mem_cgroup;
7176
7177	/* Readahead page, never charged */
7178	if (!memcg)
7179		return 0;
7180
7181	if (!entry.val) {
7182		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7183		return 0;
7184	}
7185
7186	memcg = mem_cgroup_id_get_online(memcg);
7187
7188	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
7189	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7190		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7191		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7192		mem_cgroup_id_put(memcg);
7193		return -ENOMEM;
7194	}
7195
7196	/* Get references for the tail pages, too */
7197	if (nr_pages > 1)
7198		mem_cgroup_id_get_many(memcg, nr_pages - 1);
7199	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7200	VM_BUG_ON_PAGE(oldid, page);
7201	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7202
7203	return 0;
7204}
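
/*
 * Expected calling convention (illustrative sketch, not a verbatim caller):
 * the swap slot allocator charges right after reserving a slot and gives the
 * slot back if the memcg's swap limit is hit, roughly:
 *
 *	if (mem_cgroup_try_charge_swap(page, entry)) {
 *		put_swap_page(page, entry);
 *		entry.val = 0;
 *	}
 */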
7205
7206/**
7207 * mem_cgroup_uncharge_swap - uncharge swap space
7208 * @entry: swap entry to uncharge
7209 * @nr_pages: the amount of swap space to uncharge
7210 */
7211void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7212{
7213	struct mem_cgroup *memcg;
7214	unsigned short id;
7215
7216	id = swap_cgroup_record(entry, 0, nr_pages);
7217	rcu_read_lock();
7218	memcg = mem_cgroup_from_id(id);
7219	if (memcg) {
7220		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7221			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7222				page_counter_uncharge(&memcg->swap, nr_pages);
7223			else
7224				page_counter_uncharge(&memcg->memsw, nr_pages);
7225		}
7226		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7227		mem_cgroup_id_put_many(memcg, nr_pages);
7228	}
7229	rcu_read_unlock();
7230}
7231
7232long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7233{
7234	long nr_swap_pages = get_nr_swap_pages();
7235
7236	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7237		return nr_swap_pages;
7238	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7239		nr_swap_pages = min_t(long, nr_swap_pages,
7240				      READ_ONCE(memcg->swap.max) -
7241				      page_counter_read(&memcg->swap));
7242	return nr_swap_pages;
7243}
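
/*
 * Worked example (illustrative numbers): with 4GiB of swap free globally, a
 * parent memcg whose swap.max is 1GiB with 256MiB already charged, and a
 * child whose swap.max is left at "max", a lookup on the child returns
 * min(4GiB, 1GiB - 256MiB) = 768MiB worth of pages.
 */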
7244
7245bool mem_cgroup_swap_full(struct page *page)
7246{
7247	struct mem_cgroup *memcg;
7248
7249	VM_BUG_ON_PAGE(!PageLocked(page), page);
7250
7251	if (vm_swap_full())
7252		return true;
7253	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7254		return false;
7255
7256	memcg = page->mem_cgroup;
7257	if (!memcg)
7258		return false;
7259
7260	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7261		unsigned long usage = page_counter_read(&memcg->swap);
7262
7263		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7264		    usage * 2 >= READ_ONCE(memcg->swap.max))
7265			return true;
7266	}
7267
7268	return false;
7269}
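
/*
 * Example (illustrative): if some ancestor has swap.max = 1GiB and at least
 * 512MiB of swap charged to it, usage * 2 >= max holds, so this returns true
 * for any page charged below that ancestor; callers then treat the memcg the
 * way vm_swap_full() treats 50% global swap utilization and prefer to free
 * the page's swap slot rather than keep both copies around.
 */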
7270
7271static int __init setup_swap_account(char *s)
7272{
7273	if (!strcmp(s, "1"))
7274		cgroup_memory_noswap = 0;
7275	else if (!strcmp(s, "0"))
7276		cgroup_memory_noswap = 1;
7277	return 1;
7278}
7279__setup("swapaccount=", setup_swap_account);
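
/*
 * Example (boot command line): "swapaccount=0" sets cgroup_memory_noswap and
 * disables swap accounting; "swapaccount=1" explicitly enables it.
 */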
7280
7281static u64 swap_current_read(struct cgroup_subsys_state *css,
7282			     struct cftype *cft)
7283{
7284	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7285
7286	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7287}
7288
7289static int swap_high_show(struct seq_file *m, void *v)
7290{
7291	return seq_puts_memcg_tunable(m,
7292		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7293}
7294
7295static ssize_t swap_high_write(struct kernfs_open_file *of,
7296			       char *buf, size_t nbytes, loff_t off)
7297{
7298	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7299	unsigned long high;
7300	int err;
7301
7302	buf = strstrip(buf);
7303	err = page_counter_memparse(buf, "max", &high);
7304	if (err)
7305		return err;
7306
7307	page_counter_set_high(&memcg->swap, high);
7308
7309	return nbytes;
7310}
7311
7312static int swap_max_show(struct seq_file *m, void *v)
7313{
7314	return seq_puts_memcg_tunable(m,
7315		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7316}
7317
7318static ssize_t swap_max_write(struct kernfs_open_file *of,
7319			      char *buf, size_t nbytes, loff_t off)
7320{
7321	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7322	unsigned long max;
7323	int err;
7324
7325	buf = strstrip(buf);
7326	err = page_counter_memparse(buf, "max", &max);
7327	if (err)
7328		return err;
7329
7330	xchg(&memcg->swap.max, max);
7331
7332	return nbytes;
7333}
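
/*
 * Example (cgroup2 interface, illustrative paths): the two write handlers
 * above back the memory.swap.high and memory.swap.max files, e.g.:
 *
 *	# echo 256M > /sys/fs/cgroup/<group>/memory.swap.high
 *	# echo 512M > /sys/fs/cgroup/<group>/memory.swap.max
 *	# echo max > /sys/fs/cgroup/<group>/memory.swap.max
 *
 * where "max" is parsed by page_counter_memparse() as "no limit".
 */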
7334
7335static int swap_events_show(struct seq_file *m, void *v)
7336{
7337	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7338
7339	seq_printf(m, "high %lu\n",
7340		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7341	seq_printf(m, "max %lu\n",
7342		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7343	seq_printf(m, "fail %lu\n",
7344		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7345
7346	return 0;
7347}
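
/*
 * Example output of memory.swap.events (illustrative counts):
 *
 *	high 0
 *	max 4
 *	fail 4
 *
 * Every time mem_cgroup_try_charge_swap() hits swap.max it bumps both the
 * "max" and the "fail" counter, so "fail" is always >= "max".
 */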
7348
7349static struct cftype swap_files[] = {
7350	{
7351		.name = "swap.current",
7352		.flags = CFTYPE_NOT_ON_ROOT,
7353		.read_u64 = swap_current_read,
7354	},
7355	{
7356		.name = "swap.high",
7357		.flags = CFTYPE_NOT_ON_ROOT,
7358		.seq_show = swap_high_show,
7359		.write = swap_high_write,
7360	},
7361	{
7362		.name = "swap.max",
7363		.flags = CFTYPE_NOT_ON_ROOT,
7364		.seq_show = swap_max_show,
7365		.write = swap_max_write,
7366	},
7367	{
7368		.name = "swap.events",
7369		.flags = CFTYPE_NOT_ON_ROOT,
7370		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
7371		.seq_show = swap_events_show,
7372	},
7373	{ }	/* terminate */
7374};
7375
7376static struct cftype memsw_files[] = {
7377	{
7378		.name = "memsw.usage_in_bytes",
7379		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
7380		.read_u64 = mem_cgroup_read_u64,
7381	},
7382	{
7383		.name = "memsw.max_usage_in_bytes",
7384		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
7385		.write = mem_cgroup_reset,
7386		.read_u64 = mem_cgroup_read_u64,
7387	},
7388	{
7389		.name = "memsw.limit_in_bytes",
7390		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
7391		.write = mem_cgroup_write,
7392		.read_u64 = mem_cgroup_read_u64,
7393	},
7394	{
7395		.name = "memsw.failcnt",
7396		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
7397		.write = mem_cgroup_reset,
7398		.read_u64 = mem_cgroup_read_u64,
7399	},
7400	{ },	/* terminate */
7401};
7402
7403/*
7404 * If mem_cgroup_swap_init() were implemented as a subsys_initcall()
7405 * instead of a core_initcall(), cgroup_memory_noswap could still be false
7406 * when memcg is disabled via the "cgroup_disable=memory" boot parameter.
7407 * In corner cases that would lead to a premature oops in
7408 * mem_cgroup_get_nr_swap_pages().
7409 */
7410static int __init mem_cgroup_swap_init(void)
7411{
7412	/* No memory control -> no swap control */
7413	if (mem_cgroup_disabled())
7414		cgroup_memory_noswap = true;
7415
7416	if (cgroup_memory_noswap)
7417		return 0;
7418
7419	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7420	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7421
7422	return 0;
7423}
7424core_initcall(mem_cgroup_swap_init);
7425
7426#endif /* CONFIG_MEMCG_SWAP */