   1// SPDX-License-Identifier: GPL-2.0-or-later
   2
   3#include <linux/memcontrol.h>
   4#include <linux/swap.h>
   5#include <linux/mm_inline.h>
   6#include <linux/pagewalk.h>
   7#include <linux/backing-dev.h>
   8#include <linux/swap_cgroup.h>
   9#include <linux/eventfd.h>
  10#include <linux/poll.h>
  11#include <linux/sort.h>
  12#include <linux/file.h>
  13#include <linux/seq_buf.h>
  14
  15#include "internal.h"
  16#include "swap.h"
  17#include "memcontrol-v1.h"
  18
  19/*
  20 * Cgroups above their limits are maintained in an RB-tree, independent of
  21 * their hierarchy representation.
  22 */
  23
  24struct mem_cgroup_tree_per_node {
  25	struct rb_root rb_root;
  26	struct rb_node *rb_rightmost;
  27	spinlock_t lock;
  28};
  29
  30struct mem_cgroup_tree {
  31	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
  32};
  33
  34static struct mem_cgroup_tree soft_limit_tree __read_mostly;
  35
  36/*
  37 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
  38 * limit reclaim to prevent infinite loops, if they ever occur.
  39 */
  40#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
  41#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
  42
  43/* for OOM */
  44struct mem_cgroup_eventfd_list {
  45	struct list_head list;
  46	struct eventfd_ctx *eventfd;
  47};
  48
  49/*
  50 * cgroup_event represents events which userspace wants to receive.
  51 */
  52struct mem_cgroup_event {
  53	/*
  54	 * memcg which the event belongs to.
  55	 */
  56	struct mem_cgroup *memcg;
  57	/*
  58	 * eventfd to signal userspace about the event.
  59	 */
  60	struct eventfd_ctx *eventfd;
  61	/*
  62	 * Each of these is stored in a list by the cgroup.
  63	 */
  64	struct list_head list;
  65	/*
  66	 * register_event() callback will be used to add a new userspace
  67	 * waiter for changes related to this event.  Use eventfd_signal()
  68	 * on the eventfd to send a notification to userspace.
  69	 */
  70	int (*register_event)(struct mem_cgroup *memcg,
  71			      struct eventfd_ctx *eventfd, const char *args);
  72	/*
  73	 * unregister_event() callback will be called when userspace closes
  74	 * the eventfd or when the cgroup is removed.  This callback must be
  75	 * set if you want to provide notification functionality.
  76	 */
  77	void (*unregister_event)(struct mem_cgroup *memcg,
  78				 struct eventfd_ctx *eventfd);
  79	/*
  80	 * All fields below are needed to unregister the event when
  81	 * userspace closes the eventfd.
  82	 */
  83	poll_table pt;
  84	wait_queue_head_t *wqh;
  85	wait_queue_entry_t wait;
  86	struct work_struct remove;
  87};
  88
  89#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
  90#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
  91#define MEMFILE_ATTR(val)	((val) & 0xffff)
  92
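/*
 * Editor's note, illustrative sketch (not part of the original file): the
 * MEMFILE_* macros above pack a resource type and an attribute into the two
 * 16-bit halves of cftype->private.  Assuming _MEM == 0 and _MEMSWAP == 1
 * (their real values come from memcontrol-v1.h), the round trip is:
 *
 *	int val = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);	// (1 << 16) | 1
 *	MEMFILE_TYPE(val);				// -> _MEMSWAP
 *	MEMFILE_ATTR(val);				// -> RES_LIMIT
 *
 * mem_cgroup_read_u64(), mem_cgroup_write() and mem_cgroup_reset() below
 * decode cft->private this way to pick the page_counter and the operation.
 */
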
  93enum {
  94	RES_USAGE,
  95	RES_LIMIT,
  96	RES_MAX_USAGE,
  97	RES_FAILCNT,
  98	RES_SOFT_LIMIT,
  99};
 100
 101#ifdef CONFIG_LOCKDEP
 102static struct lockdep_map memcg_oom_lock_dep_map = {
 103	.name = "memcg_oom_lock",
 104};
 105#endif
 106
 107DEFINE_SPINLOCK(memcg_oom_lock);
 108
 109static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 110					 struct mem_cgroup_tree_per_node *mctz,
 111					 unsigned long new_usage_in_excess)
 112{
 113	struct rb_node **p = &mctz->rb_root.rb_node;
 114	struct rb_node *parent = NULL;
 115	struct mem_cgroup_per_node *mz_node;
 116	bool rightmost = true;
 117
 118	if (mz->on_tree)
 119		return;
 120
 121	mz->usage_in_excess = new_usage_in_excess;
 122	if (!mz->usage_in_excess)
 123		return;
 124	while (*p) {
 125		parent = *p;
 126		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 127					tree_node);
 128		if (mz->usage_in_excess < mz_node->usage_in_excess) {
 129			p = &(*p)->rb_left;
 130			rightmost = false;
 131		} else {
 132			p = &(*p)->rb_right;
 133		}
 134	}
 135
 136	if (rightmost)
 137		mctz->rb_rightmost = &mz->tree_node;
 138
 139	rb_link_node(&mz->tree_node, parent, p);
 140	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 141	mz->on_tree = true;
 142}
 143
 144static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 145					 struct mem_cgroup_tree_per_node *mctz)
 146{
 147	if (!mz->on_tree)
 148		return;
 149
 150	if (&mz->tree_node == mctz->rb_rightmost)
 151		mctz->rb_rightmost = rb_prev(&mz->tree_node);
 152
 153	rb_erase(&mz->tree_node, &mctz->rb_root);
 154	mz->on_tree = false;
 155}
 156
 157static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 158				       struct mem_cgroup_tree_per_node *mctz)
 159{
 160	unsigned long flags;
 161
 162	spin_lock_irqsave(&mctz->lock, flags);
 163	__mem_cgroup_remove_exceeded(mz, mctz);
 164	spin_unlock_irqrestore(&mctz->lock, flags);
 165}
 166
 167static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 168{
 169	unsigned long nr_pages = page_counter_read(&memcg->memory);
 170	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 171	unsigned long excess = 0;
 172
 173	if (nr_pages > soft_limit)
 174		excess = nr_pages - soft_limit;
 175
 176	return excess;
 177}
 178
 179static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
 180{
 181	unsigned long excess;
 182	struct mem_cgroup_per_node *mz;
 183	struct mem_cgroup_tree_per_node *mctz;
 184
 185	if (lru_gen_enabled()) {
 186		if (soft_limit_excess(memcg))
 187			lru_gen_soft_reclaim(memcg, nid);
 188		return;
 189	}
 190
 191	mctz = soft_limit_tree.rb_tree_per_node[nid];
 192	if (!mctz)
 193		return;
 194	/*
 195	 * Necessary to update all ancestors when hierarchy is used,
 196	 * because their event counter is not touched.
 197	 */
 198	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 199		mz = memcg->nodeinfo[nid];
 200		excess = soft_limit_excess(memcg);
 201		/*
 202		 * We have to update the tree if mz is on the RB-tree or
 203		 * memcg is over its soft limit.
 204		 */
 205		if (excess || mz->on_tree) {
 206			unsigned long flags;
 207
 208			spin_lock_irqsave(&mctz->lock, flags);
 209			/* if on-tree, remove it */
 210			if (mz->on_tree)
 211				__mem_cgroup_remove_exceeded(mz, mctz);
 212			/*
 213			 * Insert again. mz->usage_in_excess will be updated.
 214			 * If excess is 0, no tree ops.
 215			 */
 216			__mem_cgroup_insert_exceeded(mz, mctz, excess);
 217			spin_unlock_irqrestore(&mctz->lock, flags);
 218		}
 219	}
 220}
 221
 222void memcg1_remove_from_trees(struct mem_cgroup *memcg)
 223{
 224	struct mem_cgroup_tree_per_node *mctz;
 225	struct mem_cgroup_per_node *mz;
 226	int nid;
 227
 228	for_each_node(nid) {
 229		mz = memcg->nodeinfo[nid];
 230		mctz = soft_limit_tree.rb_tree_per_node[nid];
 231		if (mctz)
 232			mem_cgroup_remove_exceeded(mz, mctz);
 233	}
 234}
 235
 236static struct mem_cgroup_per_node *
 237__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 238{
 239	struct mem_cgroup_per_node *mz;
 240
 241retry:
 242	mz = NULL;
 243	if (!mctz->rb_rightmost)
 244		goto done;		/* Nothing to reclaim from */
 245
 246	mz = rb_entry(mctz->rb_rightmost,
 247		      struct mem_cgroup_per_node, tree_node);
 248	/*
 249	 * Remove the node now, but someone else can add it back;
 250	 * we will add it back at the end of reclaim to its correct
 251	 * position in the tree.
 252	 */
 253	__mem_cgroup_remove_exceeded(mz, mctz);
 254	if (!soft_limit_excess(mz->memcg) ||
 255	    !css_tryget(&mz->memcg->css))
 256		goto retry;
 257done:
 258	return mz;
 259}
 260
 261static struct mem_cgroup_per_node *
 262mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 263{
 264	struct mem_cgroup_per_node *mz;
 265
 266	spin_lock_irq(&mctz->lock);
 267	mz = __mem_cgroup_largest_soft_limit_node(mctz);
 268	spin_unlock_irq(&mctz->lock);
 269	return mz;
 270}
 271
 272static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 273				   pg_data_t *pgdat,
 274				   gfp_t gfp_mask,
 275				   unsigned long *total_scanned)
 276{
 277	struct mem_cgroup *victim = NULL;
 278	int total = 0;
 279	int loop = 0;
 280	unsigned long excess;
 281	unsigned long nr_scanned;
 282	struct mem_cgroup_reclaim_cookie reclaim = {
 283		.pgdat = pgdat,
 284	};
 285
 286	excess = soft_limit_excess(root_memcg);
 287
 288	while (1) {
 289		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
 290		if (!victim) {
 291			loop++;
 292			if (loop >= 2) {
 293				/*
 294				 * If we have not been able to reclaim
 295				 * anything, it might be because there are
 296				 * no reclaimable pages under this hierarchy.
 297				 */
 298				if (!total)
 299					break;
 300				/*
 301				 * We want to do more targeted reclaim.
 302				 * excess >> 2 is not too large, so we don't
 303				 * reclaim too much, nor too small, so we don't
 304				 * keep coming back to reclaim from this cgroup.
 305				 */
 306				if (total >= (excess >> 2) ||
 307					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
 308					break;
 309			}
 310			continue;
 311		}
 312		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
 313					pgdat, &nr_scanned);
 314		*total_scanned += nr_scanned;
 315		if (!soft_limit_excess(root_memcg))
 316			break;
 317	}
 318	mem_cgroup_iter_break(root_memcg, victim);
 319	return total;
 320}
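/*
 * Editor's note, illustrative example: if the root memcg is 1024 pages over
 * its soft limit, the loop above keeps picking victims in the hierarchy
 * until roughly excess >> 2 = 256 pages have been reclaimed, the excess
 * drops to zero, nothing at all could be reclaimed after two full hierarchy
 * walks, or MEM_CGROUP_MAX_RECLAIM_LOOPS walks have been made.
 */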
 321
 322unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
 323					    gfp_t gfp_mask,
 324					    unsigned long *total_scanned)
 325{
 326	unsigned long nr_reclaimed = 0;
 327	struct mem_cgroup_per_node *mz, *next_mz = NULL;
 328	unsigned long reclaimed;
 329	int loop = 0;
 330	struct mem_cgroup_tree_per_node *mctz;
 331	unsigned long excess;
 332
 333	if (lru_gen_enabled())
 334		return 0;
 335
 336	if (order > 0)
 337		return 0;
 338
 339	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
 340
 341	/*
 342	 * Do not even bother to check the largest node if the root
 343	 * is empty. Do it lockless to prevent lock bouncing. Races
 344	 * are acceptable as soft limit is best effort anyway.
 345	 */
 346	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
 347		return 0;
 348
 349	/*
 350	 * This loop can run for a while, especially if mem_cgroups continuously
 351	 * keep exceeding their soft limit and putting the system under
 352	 * pressure.
 353	 */
 354	do {
 355		if (next_mz)
 356			mz = next_mz;
 357		else
 358			mz = mem_cgroup_largest_soft_limit_node(mctz);
 359		if (!mz)
 360			break;
 361
 362		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
 363						    gfp_mask, total_scanned);
 364		nr_reclaimed += reclaimed;
 365		spin_lock_irq(&mctz->lock);
 366
 367		/*
 368		 * If we failed to reclaim anything from this memory cgroup,
 369		 * it is time to move on to the next cgroup.
 370		 */
 371		next_mz = NULL;
 372		if (!reclaimed)
 373			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
 374
 375		excess = soft_limit_excess(mz->memcg);
 376		/*
 377		 * One school of thought says that we should not add
 378		 * back the node to the tree if reclaim returns 0.
 379		 * But our reclaim could return 0 simply because, due
 380		 * to priority, we are exposing a smaller subset of
 381		 * memory to reclaim from. Consider this as a longer
 382		 * term TODO.
 383		 */
 384		/* If excess == 0, no tree ops */
 385		__mem_cgroup_insert_exceeded(mz, mctz, excess);
 386		spin_unlock_irq(&mctz->lock);
 387		css_put(&mz->memcg->css);
 388		loop++;
 389		/*
 390		 * Could not reclaim anything and there are no more
 391		 * mem cgroups to try or we seem to be looping without
 392		 * reclaiming anything.
 393		 */
 394		if (!nr_reclaimed &&
 395			(next_mz == NULL ||
 396			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
 397			break;
 398	} while (!nr_reclaimed);
 399	if (next_mz)
 400		css_put(&next_mz->memcg->css);
 401	return nr_reclaimed;
 402}
 403
 404static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
 405				struct cftype *cft)
 406{
 407	return 0;
 408}
 409
 410#ifdef CONFIG_MMU
 411static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 412				 struct cftype *cft, u64 val)
 413{
 414	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
 415		     "Please report your usecase to linux-mm@kvack.org if you "
 416		     "depend on this functionality.\n");
 417
 418	if (val != 0)
 419		return -EINVAL;
 420	return 0;
 421}
 422#else
 423static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 424				 struct cftype *cft, u64 val)
 425{
 426	return -ENOSYS;
 427}
 428#endif
 429
 430static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 431{
 432	struct mem_cgroup_threshold_ary *t;
 433	unsigned long usage;
 434	int i;
 435
 436	rcu_read_lock();
 437	if (!swap)
 438		t = rcu_dereference(memcg->thresholds.primary);
 439	else
 440		t = rcu_dereference(memcg->memsw_thresholds.primary);
 441
 442	if (!t)
 443		goto unlock;
 444
 445	usage = mem_cgroup_usage(memcg, swap);
 446
 447	/*
 448	 * current_threshold points to the threshold just below or equal to usage.
 449	 * If that is not true, a threshold was crossed after the last
 450	 * call of __mem_cgroup_threshold().
 451	 */
 452	i = t->current_threshold;
 453
 454	/*
 455	 * Iterate backward over the array of thresholds starting from
 456	 * current_threshold and check if a threshold is crossed.
 457	 * If none of the thresholds below usage is crossed, we read
 458	 * only one element of the array here.
 459	 */
 460	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
 461		eventfd_signal(t->entries[i].eventfd);
 462
 463	/* i = current_threshold + 1 */
 464	i++;
 465
 466	/*
 467	 * Iterate forward over the array of thresholds starting from
 468	 * current_threshold+1 and check if a threshold is crossed.
 469	 * If none of the thresholds above usage is crossed, we read
 470	 * only one element of the array here.
 471	 */
 472	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
 473		eventfd_signal(t->entries[i].eventfd);
 474
 475	/* Update current_threshold */
 476	t->current_threshold = i - 1;
 477unlock:
 478	rcu_read_unlock();
 479}
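/*
 * Editor's note, worked example (illustrative only): with registered
 * thresholds of 4M, 8M and 16M and a previous usage of 10M,
 * current_threshold points at the 8M entry.  If usage has since grown to
 * 20M, the forward loop above signals the 16M eventfd and current_threshold
 * moves to that entry; if usage instead dropped to 3M, the backward loop
 * signals the 8M and 4M eventfds and current_threshold ends up at -1, i.e.
 * no threshold at or below the current usage.
 */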
 480
 481static void mem_cgroup_threshold(struct mem_cgroup *memcg)
 482{
 483	while (memcg) {
 484		__mem_cgroup_threshold(memcg, false);
 485		if (do_memsw_account())
 486			__mem_cgroup_threshold(memcg, true);
 487
 488		memcg = parent_mem_cgroup(memcg);
 489	}
 490}
 491
 492/* Cgroup1: threshold notifications & softlimit tree updates */
 493struct memcg1_events_percpu {
 494	unsigned long nr_page_events;
 495	unsigned long targets[MEM_CGROUP_NTARGETS];
 496};
 497
 498static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
 499{
 500	/* pagein of a big page is an event. So, ignore page size */
 501	if (nr_pages > 0)
 502		__count_memcg_events(memcg, PGPGIN, 1);
 503	else {
 504		__count_memcg_events(memcg, PGPGOUT, 1);
 505		nr_pages = -nr_pages; /* for event */
 506	}
 507
 508	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
 509}
 510
 511#define THRESHOLDS_EVENTS_TARGET 128
 512#define SOFTLIMIT_EVENTS_TARGET 1024
 513
 514static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
 515				enum mem_cgroup_events_target target)
 516{
 517	unsigned long val, next;
 518
 519	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
 520	next = __this_cpu_read(memcg->events_percpu->targets[target]);
 521	/* from time_after() in jiffies.h */
 522	if ((long)(next - val) < 0) {
 523		switch (target) {
 524		case MEM_CGROUP_TARGET_THRESH:
 525			next = val + THRESHOLDS_EVENTS_TARGET;
 526			break;
 527		case MEM_CGROUP_TARGET_SOFTLIMIT:
 528			next = val + SOFTLIMIT_EVENTS_TARGET;
 529			break;
 530		default:
 531			break;
 532		}
 533		__this_cpu_write(memcg->events_percpu->targets[target], next);
 534		return true;
 535	}
 536	return false;
 537}
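/*
 * Editor's note, illustrative example: with nr_page_events at 130 and
 * targets[MEM_CGROUP_TARGET_THRESH] still at 128, (long)(next - val) is
 * negative, so the target is bumped to 130 + THRESHOLDS_EVENTS_TARGET = 258
 * and the function returns true.  The time_after()-style signed comparison
 * keeps working even when the per-cpu counters eventually wrap around.
 */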
 538
 539/*
 540 * Check events in order: thresholds first, then the soft limit tree.
 541 *
 542 */
 543static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
 544{
 545	if (IS_ENABLED(CONFIG_PREEMPT_RT))
 546		return;
 547
 548	/* the threshold event is triggered at a finer grain than the soft limit */
 549	if (unlikely(memcg1_event_ratelimit(memcg,
 550						MEM_CGROUP_TARGET_THRESH))) {
 551		bool do_softlimit;
 552
 553		do_softlimit = memcg1_event_ratelimit(memcg,
 554						MEM_CGROUP_TARGET_SOFTLIMIT);
 555		mem_cgroup_threshold(memcg);
 556		if (unlikely(do_softlimit))
 557			memcg1_update_tree(memcg, nid);
 558	}
 559}
 560
 561void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 562{
 563	unsigned long flags;
 564
 565	local_irq_save(flags);
 566	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
 567	memcg1_check_events(memcg, folio_nid(folio));
 568	local_irq_restore(flags);
 569}
 570
 571void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
 572{
 573	/*
 574	 * Interrupts should be disabled here because the caller holds the
 575	 * i_pages lock which is taken with interrupts-off. It is
 576	 * important here to have the interrupts disabled because it is the
 577	 * only synchronisation we have for updating the per-CPU variables.
 578	 */
 579	preempt_disable_nested();
 580	VM_WARN_ON_IRQS_ENABLED();
 581	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
 582	preempt_enable_nested();
 583	memcg1_check_events(memcg, folio_nid(folio));
 584}
 585
 586void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 587			   unsigned long nr_memory, int nid)
 588{
 589	unsigned long flags;
 590
 591	local_irq_save(flags);
 592	__count_memcg_events(memcg, PGPGOUT, pgpgout);
 593	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
 594	memcg1_check_events(memcg, nid);
 595	local_irq_restore(flags);
 596}
 597
 598static int compare_thresholds(const void *a, const void *b)
 599{
 600	const struct mem_cgroup_threshold *_a = a;
 601	const struct mem_cgroup_threshold *_b = b;
 602
 603	if (_a->threshold > _b->threshold)
 604		return 1;
 605
 606	if (_a->threshold < _b->threshold)
 607		return -1;
 608
 609	return 0;
 610}
 611
 612static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 613{
 614	struct mem_cgroup_eventfd_list *ev;
 615
 616	spin_lock(&memcg_oom_lock);
 617
 618	list_for_each_entry(ev, &memcg->oom_notify, list)
 619		eventfd_signal(ev->eventfd);
 620
 621	spin_unlock(&memcg_oom_lock);
 622	return 0;
 623}
 624
 625static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
 626{
 627	struct mem_cgroup *iter;
 628
 629	for_each_mem_cgroup_tree(iter, memcg)
 630		mem_cgroup_oom_notify_cb(iter);
 631}
 632
 633static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
 634	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
 635{
 636	struct mem_cgroup_thresholds *thresholds;
 637	struct mem_cgroup_threshold_ary *new;
 638	unsigned long threshold;
 639	unsigned long usage;
 640	int i, size, ret;
 641
 642	ret = page_counter_memparse(args, "-1", &threshold);
 643	if (ret)
 644		return ret;
 645
 646	mutex_lock(&memcg->thresholds_lock);
 647
 648	if (type == _MEM) {
 649		thresholds = &memcg->thresholds;
 650		usage = mem_cgroup_usage(memcg, false);
 651	} else if (type == _MEMSWAP) {
 652		thresholds = &memcg->memsw_thresholds;
 653		usage = mem_cgroup_usage(memcg, true);
 654	} else
 655		BUG();
 656
 657	/* Check if a threshold was crossed before adding a new one */
 658	if (thresholds->primary)
 659		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
 660
 661	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
 662
 663	/* Allocate memory for new array of thresholds */
 664	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
 665	if (!new) {
 666		ret = -ENOMEM;
 667		goto unlock;
 668	}
 669	new->size = size;
 670
 671	/* Copy thresholds (if any) to new array */
 672	if (thresholds->primary)
 673		memcpy(new->entries, thresholds->primary->entries,
 674		       flex_array_size(new, entries, size - 1));
 675
 676	/* Add new threshold */
 677	new->entries[size - 1].eventfd = eventfd;
 678	new->entries[size - 1].threshold = threshold;
 679
 680	/* Sort thresholds. Registering a new threshold isn't time-critical */
 681	sort(new->entries, size, sizeof(*new->entries),
 682			compare_thresholds, NULL);
 683
 684	/* Find current threshold */
 685	new->current_threshold = -1;
 686	for (i = 0; i < size; i++) {
 687		if (new->entries[i].threshold <= usage) {
 688			/*
 689			 * new->current_threshold will not be used until
 690			 * rcu_assign_pointer(), so it's safe to increment
 691			 * it here.
 692			 */
 693			++new->current_threshold;
 694		} else
 695			break;
 696	}
 697
 698	/* Free old spare buffer and save old primary buffer as spare */
 699	kfree(thresholds->spare);
 700	thresholds->spare = thresholds->primary;
 701
 702	rcu_assign_pointer(thresholds->primary, new);
 703
 704	/* To be sure that nobody uses thresholds */
 705	synchronize_rcu();
 706
 707unlock:
 708	mutex_unlock(&memcg->thresholds_lock);
 709
 710	return ret;
 711}
 712
 713static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
 714	struct eventfd_ctx *eventfd, const char *args)
 715{
 716	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
 717}
 718
 719static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
 720	struct eventfd_ctx *eventfd, const char *args)
 721{
 722	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
 723}
 724
 725static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
 726	struct eventfd_ctx *eventfd, enum res_type type)
 727{
 728	struct mem_cgroup_thresholds *thresholds;
 729	struct mem_cgroup_threshold_ary *new;
 730	unsigned long usage;
 731	int i, j, size, entries;
 732
 733	mutex_lock(&memcg->thresholds_lock);
 734
 735	if (type == _MEM) {
 736		thresholds = &memcg->thresholds;
 737		usage = mem_cgroup_usage(memcg, false);
 738	} else if (type == _MEMSWAP) {
 739		thresholds = &memcg->memsw_thresholds;
 740		usage = mem_cgroup_usage(memcg, true);
 741	} else
 742		BUG();
 743
 744	if (!thresholds->primary)
 745		goto unlock;
 746
 747	/* Check if a threshold was crossed before removing */
 748	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
 749
 750	/* Calculate the new number of thresholds */
 751	size = entries = 0;
 752	for (i = 0; i < thresholds->primary->size; i++) {
 753		if (thresholds->primary->entries[i].eventfd != eventfd)
 754			size++;
 755		else
 756			entries++;
 757	}
 758
 759	new = thresholds->spare;
 760
 761	/* If no items related to eventfd have been cleared, nothing to do */
 762	if (!entries)
 763		goto unlock;
 764
 765	/* Set thresholds array to NULL if we don't have thresholds */
 766	if (!size) {
 767		kfree(new);
 768		new = NULL;
 769		goto swap_buffers;
 770	}
 771
 772	new->size = size;
 773
 774	/* Copy thresholds and find current threshold */
 775	new->current_threshold = -1;
 776	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
 777		if (thresholds->primary->entries[i].eventfd == eventfd)
 778			continue;
 779
 780		new->entries[j] = thresholds->primary->entries[i];
 781		if (new->entries[j].threshold <= usage) {
 782			/*
 783			 * new->current_threshold will not be used
 784			 * until rcu_assign_pointer(), so it's safe to increment
 785			 * it here.
 786			 */
 787			++new->current_threshold;
 788		}
 789		j++;
 790	}
 791
 792swap_buffers:
 793	/* Swap primary and spare array */
 794	thresholds->spare = thresholds->primary;
 795
 796	rcu_assign_pointer(thresholds->primary, new);
 797
 798	/* To be sure that nobody uses thresholds */
 799	synchronize_rcu();
 800
 801	/* If all events are unregistered, free the spare array */
 802	if (!new) {
 803		kfree(thresholds->spare);
 804		thresholds->spare = NULL;
 805	}
 806unlock:
 807	mutex_unlock(&memcg->thresholds_lock);
 808}
 809
 810static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
 811	struct eventfd_ctx *eventfd)
 812{
 813	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
 814}
 815
 816static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
 817	struct eventfd_ctx *eventfd)
 818{
 819	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
 820}
 821
 822static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
 823	struct eventfd_ctx *eventfd, const char *args)
 824{
 825	struct mem_cgroup_eventfd_list *event;
 826
 827	event = kmalloc(sizeof(*event),	GFP_KERNEL);
 828	if (!event)
 829		return -ENOMEM;
 830
 831	spin_lock(&memcg_oom_lock);
 832
 833	event->eventfd = eventfd;
 834	list_add(&event->list, &memcg->oom_notify);
 835
 836	/* already in OOM ? */
 837	if (memcg->under_oom)
 838		eventfd_signal(eventfd);
 839	spin_unlock(&memcg_oom_lock);
 840
 841	return 0;
 842}
 843
 844static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
 845	struct eventfd_ctx *eventfd)
 846{
 847	struct mem_cgroup_eventfd_list *ev, *tmp;
 848
 849	spin_lock(&memcg_oom_lock);
 850
 851	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
 852		if (ev->eventfd == eventfd) {
 853			list_del(&ev->list);
 854			kfree(ev);
 855		}
 856	}
 857
 858	spin_unlock(&memcg_oom_lock);
 859}
 860
 861/*
 862 * DO NOT USE IN NEW FILES.
 863 *
 864 * "cgroup.event_control" implementation.
 865 *
 866 * This is way over-engineered.  It tries to support fully configurable
 867 * events for each user.  Such a level of flexibility is completely
 868 * unnecessary, especially in light of the planned unified hierarchy.
 869 *
 870 * Please deprecate this and replace with something simpler if at all
 871 * possible.
 872 */
 873
 874/*
 875 * Unregister event and free resources.
 876 *
 877 * Gets called from workqueue.
 878 */
 879static void memcg_event_remove(struct work_struct *work)
 880{
 881	struct mem_cgroup_event *event =
 882		container_of(work, struct mem_cgroup_event, remove);
 883	struct mem_cgroup *memcg = event->memcg;
 884
 885	remove_wait_queue(event->wqh, &event->wait);
 886
 887	event->unregister_event(memcg, event->eventfd);
 888
 889	/* Notify userspace the event is going away. */
 890	eventfd_signal(event->eventfd);
 891
 892	eventfd_ctx_put(event->eventfd);
 893	kfree(event);
 894	css_put(&memcg->css);
 895}
 896
 897/*
 898 * Gets called on EPOLLHUP on eventfd when user closes it.
 899 *
 900 * Called with wqh->lock held and interrupts disabled.
 901 */
 902static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
 903			    int sync, void *key)
 904{
 905	struct mem_cgroup_event *event =
 906		container_of(wait, struct mem_cgroup_event, wait);
 907	struct mem_cgroup *memcg = event->memcg;
 908	__poll_t flags = key_to_poll(key);
 909
 910	if (flags & EPOLLHUP) {
 911		/*
 912		 * If the event has been detached at cgroup removal, we
 913		 * can simply return knowing the other side will clean up
 914		 * for us.
 915		 *
 916		 * We can't race against event freeing since the other
 917		 * side will require wqh->lock via remove_wait_queue(),
 918		 * which we hold.
 919		 */
 920		spin_lock(&memcg->event_list_lock);
 921		if (!list_empty(&event->list)) {
 922			list_del_init(&event->list);
 923			/*
 924			 * We are in atomic context, but memcg_event_remove()
 925			 * may sleep, so we have to call it from a workqueue.
 926			 */
 927			schedule_work(&event->remove);
 928		}
 929		spin_unlock(&memcg->event_list_lock);
 930	}
 931
 932	return 0;
 933}
 934
 935static void memcg_event_ptable_queue_proc(struct file *file,
 936		wait_queue_head_t *wqh, poll_table *pt)
 937{
 938	struct mem_cgroup_event *event =
 939		container_of(pt, struct mem_cgroup_event, pt);
 940
 941	event->wqh = wqh;
 942	add_wait_queue(wqh, &event->wait);
 943}
 944
 945/*
 946 * DO NOT USE IN NEW FILES.
 947 *
 948 * Parse input and register new cgroup event handler.
 949 *
 950 * Input must be in format '<event_fd> <control_fd> <args>'.
 951 * Interpretation of args is defined by control file implementation.
 952 */
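/*
 * Editor's note, userspace usage sketch (illustrative only; the cgroup path
 * and the 50M threshold are arbitrary examples, and error handling is
 * omitted):
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ctl = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *		       O_WRONLY);
 *	char buf[64];
 *
 *	snprintf(buf, sizeof(buf), "%d %d 50M", efd, cfd);
 *	write(ctl, buf, strlen(buf));
 *	// a read() on efd now blocks until usage crosses 50M
 */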
 953static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 954					 char *buf, size_t nbytes, loff_t off)
 955{
 956	struct cgroup_subsys_state *css = of_css(of);
 957	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 958	struct mem_cgroup_event *event;
 959	struct cgroup_subsys_state *cfile_css;
 960	unsigned int efd, cfd;
 961	struct dentry *cdentry;
 962	const char *name;
 963	char *endp;
 964	int ret;
 965
 966	if (IS_ENABLED(CONFIG_PREEMPT_RT))
 967		return -EOPNOTSUPP;
 968
 969	buf = strstrip(buf);
 970
 971	efd = simple_strtoul(buf, &endp, 10);
 972	if (*endp != ' ')
 973		return -EINVAL;
 974	buf = endp + 1;
 975
 976	cfd = simple_strtoul(buf, &endp, 10);
 977	if (*endp == '\0')
 978		buf = endp;
 979	else if (*endp == ' ')
 980		buf = endp + 1;
 981	else
 982		return -EINVAL;
 983
 984	CLASS(fd, efile)(efd);
 985	if (fd_empty(efile))
 986		return -EBADF;
 987
 988	CLASS(fd, cfile)(cfd);
 989
 990	event = kzalloc(sizeof(*event), GFP_KERNEL);
 991	if (!event)
 992		return -ENOMEM;
 993
 994	event->memcg = memcg;
 995	INIT_LIST_HEAD(&event->list);
 996	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
 997	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
 998	INIT_WORK(&event->remove, memcg_event_remove);
 999
1000	event->eventfd = eventfd_ctx_fileget(fd_file(efile));
1001	if (IS_ERR(event->eventfd)) {
1002		ret = PTR_ERR(event->eventfd);
1003		goto out_kfree;
1004	}
1005
1006	if (fd_empty(cfile)) {
1007		ret = -EBADF;
1008		goto out_put_eventfd;
1009	}
1010
1011	/* the process needs read permission on the control file */
1012	/* AV: shouldn't we check that it's been opened for read instead? */
1013	ret = file_permission(fd_file(cfile), MAY_READ);
1014	if (ret < 0)
1015		goto out_put_eventfd;
1016
1017	/*
1018	 * The control file must be a regular cgroup1 file. As a regular cgroup
1019	 * file can't be renamed, it's safe to access its name afterwards.
1020	 */
1021	cdentry = fd_file(cfile)->f_path.dentry;
1022	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
1023		ret = -EINVAL;
1024		goto out_put_eventfd;
1025	}
1026
1027	/*
1028	 * Determine the event callbacks and set them in @event.  This used
1029	 * to be done via struct cftype but cgroup core no longer knows
1030	 * about these events.  The following is crude but the whole thing
1031	 * is for compatibility anyway.
1032	 *
1033	 * DO NOT ADD NEW FILES.
1034	 */
1035	name = cdentry->d_name.name;
1036
1037	if (!strcmp(name, "memory.usage_in_bytes")) {
1038		event->register_event = mem_cgroup_usage_register_event;
1039		event->unregister_event = mem_cgroup_usage_unregister_event;
1040	} else if (!strcmp(name, "memory.oom_control")) {
1041		pr_warn_once("oom_control is deprecated and will be removed. "
1042			     "Please report your usecase to linux-mm@kvack.org"
1043			     " if you depend on this functionality.\n");
1044		event->register_event = mem_cgroup_oom_register_event;
1045		event->unregister_event = mem_cgroup_oom_unregister_event;
1046	} else if (!strcmp(name, "memory.pressure_level")) {
1047		pr_warn_once("pressure_level is deprecated and will be removed. "
1048			     "Please report your usecase to linux-mm@kvack.org "
1049			     "if you depend on this functionality.\n");
1050		event->register_event = vmpressure_register_event;
1051		event->unregister_event = vmpressure_unregister_event;
1052	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
1053		event->register_event = memsw_cgroup_usage_register_event;
1054		event->unregister_event = memsw_cgroup_usage_unregister_event;
1055	} else {
1056		ret = -EINVAL;
1057		goto out_put_eventfd;
1058	}
1059
1060	/*
1061	 * Verify that @cfile belongs to @css.  Also, remaining events are
1062	 * automatically removed on cgroup destruction but the removal is
1063	 * asynchronous, so take an extra ref on @css.
1064	 */
1065	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
1066					       &memory_cgrp_subsys);
1067	ret = -EINVAL;
1068	if (IS_ERR(cfile_css))
1069		goto out_put_eventfd;
1070	if (cfile_css != css)
1071		goto out_put_css;
1072
1073	ret = event->register_event(memcg, event->eventfd, buf);
1074	if (ret)
1075		goto out_put_css;
1076
1077	vfs_poll(fd_file(efile), &event->pt);
1078
1079	spin_lock_irq(&memcg->event_list_lock);
1080	list_add(&event->list, &memcg->event_list);
1081	spin_unlock_irq(&memcg->event_list_lock);
1082	return nbytes;
1083
1084out_put_css:
1085	css_put(cfile_css);
1086out_put_eventfd:
1087	eventfd_ctx_put(event->eventfd);
1088out_kfree:
1089	kfree(event);
1090	return ret;
1091}
1092
1093void memcg1_memcg_init(struct mem_cgroup *memcg)
1094{
1095	INIT_LIST_HEAD(&memcg->oom_notify);
1096	mutex_init(&memcg->thresholds_lock);
1097	INIT_LIST_HEAD(&memcg->event_list);
1098	spin_lock_init(&memcg->event_list_lock);
1099}
1100
1101void memcg1_css_offline(struct mem_cgroup *memcg)
1102{
1103	struct mem_cgroup_event *event, *tmp;
1104
1105	/*
1106	 * Unregister events and notify userspace.
1107	 * Notify userspace about cgroup removal only after rmdir of the cgroup
1108	 * directory to avoid a race between userspace and kernel space.
1109	 */
1110	spin_lock_irq(&memcg->event_list_lock);
1111	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
1112		list_del_init(&event->list);
1113		schedule_work(&event->remove);
1114	}
1115	spin_unlock_irq(&memcg->event_list_lock);
1116}
1117
1118/*
1119 * Check whether the OOM killer is already running under our hierarchy.
1120 * If someone is already running it, return false.
1121 */
1122static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1123{
1124	struct mem_cgroup *iter, *failed = NULL;
1125
1126	spin_lock(&memcg_oom_lock);
1127
1128	for_each_mem_cgroup_tree(iter, memcg) {
1129		if (iter->oom_lock) {
1130			/*
1131			 * This subtree of our hierarchy is already locked,
1132			 * so we cannot take the lock.
1133			 */
1134			failed = iter;
1135			mem_cgroup_iter_break(memcg, iter);
1136			break;
1137		} else
1138			iter->oom_lock = true;
1139	}
1140
1141	if (failed) {
1142		/*
1143		 * OK, we failed to lock the whole subtree, so we have
1144		 * to clean up what we set up, up to the failing memcg.
1145		 */
1146		for_each_mem_cgroup_tree(iter, memcg) {
1147			if (iter == failed) {
1148				mem_cgroup_iter_break(memcg, iter);
1149				break;
1150			}
1151			iter->oom_lock = false;
1152		}
1153	} else
1154		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1155
1156	spin_unlock(&memcg_oom_lock);
1157
1158	return !failed;
1159}
1160
1161static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1162{
1163	struct mem_cgroup *iter;
1164
1165	spin_lock(&memcg_oom_lock);
1166	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1167	for_each_mem_cgroup_tree(iter, memcg)
1168		iter->oom_lock = false;
1169	spin_unlock(&memcg_oom_lock);
1170}
1171
1172static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1173{
1174	struct mem_cgroup *iter;
1175
1176	spin_lock(&memcg_oom_lock);
1177	for_each_mem_cgroup_tree(iter, memcg)
1178		iter->under_oom++;
1179	spin_unlock(&memcg_oom_lock);
1180}
1181
1182static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1183{
1184	struct mem_cgroup *iter;
1185
1186	/*
1187	 * Be careful about under_oom underflows because a child memcg
1188	 * could have been added after mem_cgroup_mark_under_oom.
1189	 */
1190	spin_lock(&memcg_oom_lock);
1191	for_each_mem_cgroup_tree(iter, memcg)
1192		if (iter->under_oom > 0)
1193			iter->under_oom--;
1194	spin_unlock(&memcg_oom_lock);
1195}
1196
1197static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1198
1199struct oom_wait_info {
1200	struct mem_cgroup *memcg;
1201	wait_queue_entry_t	wait;
1202};
1203
1204static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1205	unsigned mode, int sync, void *arg)
1206{
1207	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1208	struct mem_cgroup *oom_wait_memcg;
1209	struct oom_wait_info *oom_wait_info;
1210
1211	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1212	oom_wait_memcg = oom_wait_info->memcg;
1213
1214	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1215	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1216		return 0;
1217	return autoremove_wake_function(wait, mode, sync, arg);
1218}
1219
1220void memcg1_oom_recover(struct mem_cgroup *memcg)
1221{
1222	/*
1223	 * For the following lockless ->under_oom test, the only required
1224	 * guarantee is that it must see the state asserted by an OOM when
1225	 * this function is called as a result of userland actions
1226	 * triggered by the notification of the OOM.  This is trivially
1227	 * achieved by invoking mem_cgroup_mark_under_oom() before
1228	 * triggering notification.
1229	 */
1230	if (memcg && memcg->under_oom)
1231		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1232}
1233
1234/**
1235 * mem_cgroup_oom_synchronize - complete memcg OOM handling
1236 * @handle: actually kill/wait or just clean up the OOM state
1237 *
1238 * This has to be called at the end of a page fault if the memcg OOM
1239 * handler was enabled.
1240 *
1241 * Memcg supports userspace OOM handling where failed allocations must
1242 * sleep on a waitqueue until the userspace task resolves the
1243 * situation.  Sleeping directly in the charge context with all kinds
1244 * of locks held is not a good idea, instead we remember an OOM state
1245 * in the task and mem_cgroup_oom_synchronize() has to be called at
1246 * the end of the page fault to complete the OOM handling.
1247 *
1248 * Returns %true if an ongoing memcg OOM situation was detected and
1249 * completed, %false otherwise.
1250 */
1251bool mem_cgroup_oom_synchronize(bool handle)
1252{
1253	struct mem_cgroup *memcg = current->memcg_in_oom;
1254	struct oom_wait_info owait;
1255	bool locked;
1256
1257	/* OOM is global, do not handle */
1258	if (!memcg)
1259		return false;
1260
1261	if (!handle)
1262		goto cleanup;
1263
1264	owait.memcg = memcg;
1265	owait.wait.flags = 0;
1266	owait.wait.func = memcg_oom_wake_function;
1267	owait.wait.private = current;
1268	INIT_LIST_HEAD(&owait.wait.entry);
1269
1270	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1271	mem_cgroup_mark_under_oom(memcg);
1272
1273	locked = mem_cgroup_oom_trylock(memcg);
1274
1275	if (locked)
1276		mem_cgroup_oom_notify(memcg);
1277
1278	schedule();
1279	mem_cgroup_unmark_under_oom(memcg);
1280	finish_wait(&memcg_oom_waitq, &owait.wait);
1281
1282	if (locked)
1283		mem_cgroup_oom_unlock(memcg);
1284cleanup:
1285	current->memcg_in_oom = NULL;
1286	css_put(&memcg->css);
1287	return true;
1288}
1289
1290
1291bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
1292{
1293	/*
1294	 * We are in the middle of the charge context here, so we
1295	 * don't want to block when potentially sitting on a callstack
1296	 * that holds all kinds of filesystem and mm locks.
1297	 *
1298	 * cgroup1 allows disabling the OOM killer and waiting for outside
1299	 * handling until the charge can succeed; remember the context and put
1300	 * the task to sleep at the end of the page fault when all locks are
1301	 * released.
1302	 *
1303	 * On the other hand, in-kernel OOM killer allows for an async victim
1304	 * memory reclaim (oom_reaper), which means that we are not solely
1305	 * relying on the oom victim to make forward progress, so we can
1306	 * invoke the oom killer here.
1307	 *
1308	 * Please note that mem_cgroup_out_of_memory might fail to find a
1309	 * victim and then we have to bail out from the charge path.
1310	 */
1311	if (READ_ONCE(memcg->oom_kill_disable)) {
1312		if (current->in_user_fault) {
1313			css_get(&memcg->css);
1314			current->memcg_in_oom = memcg;
1315		}
1316		return false;
1317	}
1318
1319	mem_cgroup_mark_under_oom(memcg);
1320
1321	*locked = mem_cgroup_oom_trylock(memcg);
1322
1323	if (*locked)
1324		mem_cgroup_oom_notify(memcg);
1325
1326	mem_cgroup_unmark_under_oom(memcg);
1327
1328	return true;
1329}
1330
1331void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
1332{
1333	if (locked)
1334		mem_cgroup_oom_unlock(memcg);
1335}
1336
1337static DEFINE_MUTEX(memcg_max_mutex);
1338
1339static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
1340				 unsigned long max, bool memsw)
1341{
1342	bool enlarge = false;
1343	bool drained = false;
1344	int ret;
1345	bool limits_invariant;
1346	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
1347
1348	do {
1349		if (signal_pending(current)) {
1350			ret = -EINTR;
1351			break;
1352		}
1353
1354		mutex_lock(&memcg_max_mutex);
1355		/*
1356		 * Make sure that the new limit (memsw or memory limit) doesn't
1357		 * break our basic invariant rule memory.max <= memsw.max.
1358		 */
1359		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
1360					   max <= memcg->memsw.max;
1361		if (!limits_invariant) {
1362			mutex_unlock(&memcg_max_mutex);
1363			ret = -EINVAL;
1364			break;
1365		}
1366		if (max > counter->max)
1367			enlarge = true;
1368		ret = page_counter_set_max(counter, max);
1369		mutex_unlock(&memcg_max_mutex);
1370
1371		if (!ret)
1372			break;
1373
1374		if (!drained) {
1375			drain_all_stock(memcg);
1376			drained = true;
1377			continue;
1378		}
1379
1380		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
1381				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
1382			ret = -EBUSY;
1383			break;
1384		}
1385	} while (true);
1386
1387	if (!ret && enlarge)
1388		memcg1_oom_recover(memcg);
1389
1390	return ret;
1391}
1392
1393/*
1394 * Reclaims as many pages from the given memcg as possible.
1395 *
1396 * Caller is responsible for holding css reference for memcg.
1397 */
1398static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
1399{
1400	int nr_retries = MAX_RECLAIM_RETRIES;
1401
1402	/* we call try-to-free pages to make this cgroup empty */
1403	lru_add_drain_all();
1404
1405	drain_all_stock(memcg);
1406
1407	/* try to free all pages in this cgroup */
1408	while (nr_retries && page_counter_read(&memcg->memory)) {
1409		if (signal_pending(current))
1410			return -EINTR;
1411
1412		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
1413						  MEMCG_RECLAIM_MAY_SWAP, NULL))
1414			nr_retries--;
1415	}
1416
1417	return 0;
1418}
1419
1420static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
1421					    char *buf, size_t nbytes,
1422					    loff_t off)
1423{
1424	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1425
1426	if (mem_cgroup_is_root(memcg))
1427		return -EINVAL;
1428	return mem_cgroup_force_empty(memcg) ?: nbytes;
1429}
1430
1431static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
1432				     struct cftype *cft)
1433{
1434	return 1;
1435}
1436
1437static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
1438				      struct cftype *cft, u64 val)
1439{
1440	if (val == 1)
1441		return 0;
1442
1443	pr_warn_once("Non-hierarchical mode is deprecated. "
1444		     "Please report your usecase to linux-mm@kvack.org if you "
1445		     "depend on this functionality.\n");
1446
1447	return -EINVAL;
1448}
1449
1450static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
1451			       struct cftype *cft)
1452{
1453	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1454	struct page_counter *counter;
1455
1456	switch (MEMFILE_TYPE(cft->private)) {
1457	case _MEM:
1458		counter = &memcg->memory;
1459		break;
1460	case _MEMSWAP:
1461		counter = &memcg->memsw;
1462		break;
1463	case _KMEM:
1464		counter = &memcg->kmem;
1465		break;
1466	case _TCP:
1467		counter = &memcg->tcpmem;
1468		break;
1469	default:
1470		BUG();
1471	}
1472
1473	switch (MEMFILE_ATTR(cft->private)) {
1474	case RES_USAGE:
1475		if (counter == &memcg->memory)
1476			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
1477		if (counter == &memcg->memsw)
1478			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
1479		return (u64)page_counter_read(counter) * PAGE_SIZE;
1480	case RES_LIMIT:
1481		return (u64)counter->max * PAGE_SIZE;
1482	case RES_MAX_USAGE:
1483		return (u64)counter->watermark * PAGE_SIZE;
1484	case RES_FAILCNT:
1485		return counter->failcnt;
1486	case RES_SOFT_LIMIT:
1487		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
1488	default:
1489		BUG();
1490	}
1491}
1492
1493/*
1494 * This function doesn't do anything useful. Its only job is to provide a read
1495 * handler for a file so that cgroup_file_mode() will add read permissions.
1496 */
1497static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
1498				     __always_unused void *v)
1499{
1500	return -EINVAL;
1501}
1502
1503static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
1504{
1505	int ret;
1506
1507	mutex_lock(&memcg_max_mutex);
1508
1509	ret = page_counter_set_max(&memcg->tcpmem, max);
1510	if (ret)
1511		goto out;
1512
1513	if (!memcg->tcpmem_active) {
1514		/*
1515		 * The active flag needs to be written after the static_key
1516		 * update. This is what guarantees that the socket activation
1517		 * function is the last one to run. See mem_cgroup_sk_alloc()
1518		 * for details, and note that we don't mark any socket as
1519		 * belonging to this memcg until that flag is up.
1520		 *
1521		 * We need to do this, because static_keys will span multiple
1522		 * sites, but we can't control their order. If we mark a socket
1523		 * as accounted, but the accounting functions are not patched in
1524		 * yet, we'll lose accounting.
1525		 *
1526		 * We never race with the readers in mem_cgroup_sk_alloc(),
1527		 * because when this value changes, the code to process it is not
1528		 * patched in yet.
1529		 */
1530		static_branch_inc(&memcg_sockets_enabled_key);
1531		memcg->tcpmem_active = true;
1532	}
1533out:
1534	mutex_unlock(&memcg_max_mutex);
1535	return ret;
1536}
1537
1538/*
1539 * The user of this function is...
1540 * RES_LIMIT.
1541 */
1542static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
1543				char *buf, size_t nbytes, loff_t off)
1544{
1545	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1546	unsigned long nr_pages;
1547	int ret;
1548
1549	buf = strstrip(buf);
1550	ret = page_counter_memparse(buf, "-1", &nr_pages);
1551	if (ret)
1552		return ret;
1553
1554	switch (MEMFILE_ATTR(of_cft(of)->private)) {
1555	case RES_LIMIT:
1556		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
1557			ret = -EINVAL;
1558			break;
1559		}
1560		switch (MEMFILE_TYPE(of_cft(of)->private)) {
1561		case _MEM:
1562			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
1563			break;
1564		case _MEMSWAP:
1565			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
1566			break;
1567		case _KMEM:
1568			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
1569				     "Writing any value to this file has no effect. "
1570				     "Please report your usecase to linux-mm@kvack.org if you "
1571				     "depend on this functionality.\n");
1572			ret = 0;
1573			break;
1574		case _TCP:
1575			pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
1576				     "Please report your usecase to linux-mm@kvack.org if you "
1577				     "depend on this functionality.\n");
1578			ret = memcg_update_tcp_max(memcg, nr_pages);
1579			break;
1580		}
1581		break;
1582	case RES_SOFT_LIMIT:
1583		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
1584			ret = -EOPNOTSUPP;
1585		} else {
1586			pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
1587				     "Please report your usecase to linux-mm@kvack.org if you "
1588				     "depend on this functionality.\n");
1589			WRITE_ONCE(memcg->soft_limit, nr_pages);
1590			ret = 0;
1591		}
1592		break;
1593	}
1594	return ret ?: nbytes;
1595}
1596
1597static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
1598				size_t nbytes, loff_t off)
1599{
1600	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1601	struct page_counter *counter;
1602
1603	switch (MEMFILE_TYPE(of_cft(of)->private)) {
1604	case _MEM:
1605		counter = &memcg->memory;
1606		break;
1607	case _MEMSWAP:
1608		counter = &memcg->memsw;
1609		break;
1610	case _KMEM:
1611		counter = &memcg->kmem;
1612		break;
1613	case _TCP:
1614		counter = &memcg->tcpmem;
1615		break;
1616	default:
1617		BUG();
1618	}
1619
1620	switch (MEMFILE_ATTR(of_cft(of)->private)) {
1621	case RES_MAX_USAGE:
1622		page_counter_reset_watermark(counter);
1623		break;
1624	case RES_FAILCNT:
1625		counter->failcnt = 0;
1626		break;
1627	default:
1628		BUG();
1629	}
1630
1631	return nbytes;
1632}
1633
1634#ifdef CONFIG_NUMA
1635
1636#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
1637#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
1638#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
1639
1640static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1641				int nid, unsigned int lru_mask, bool tree)
1642{
1643	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
1644	unsigned long nr = 0;
1645	enum lru_list lru;
1646
1647	VM_BUG_ON((unsigned)nid >= nr_node_ids);
1648
1649	for_each_lru(lru) {
1650		if (!(BIT(lru) & lru_mask))
1651			continue;
1652		if (tree)
1653			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
1654		else
1655			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
1656	}
1657	return nr;
1658}
1659
1660static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
1661					     unsigned int lru_mask,
1662					     bool tree)
1663{
1664	unsigned long nr = 0;
1665	enum lru_list lru;
1666
1667	for_each_lru(lru) {
1668		if (!(BIT(lru) & lru_mask))
1669			continue;
1670		if (tree)
1671			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
1672		else
1673			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
1674	}
1675	return nr;
1676}
1677
1678static int memcg_numa_stat_show(struct seq_file *m, void *v)
1679{
1680	struct numa_stat {
1681		const char *name;
1682		unsigned int lru_mask;
1683	};
1684
1685	static const struct numa_stat stats[] = {
1686		{ "total", LRU_ALL },
1687		{ "file", LRU_ALL_FILE },
1688		{ "anon", LRU_ALL_ANON },
1689		{ "unevictable", BIT(LRU_UNEVICTABLE) },
1690	};
1691	const struct numa_stat *stat;
1692	int nid;
1693	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1694
1695	mem_cgroup_flush_stats(memcg);
1696
1697	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
1698		seq_printf(m, "%s=%lu", stat->name,
1699			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
1700						   false));
1701		for_each_node_state(nid, N_MEMORY)
1702			seq_printf(m, " N%d=%lu", nid,
1703				   mem_cgroup_node_nr_lru_pages(memcg, nid,
1704							stat->lru_mask, false));
1705		seq_putc(m, '\n');
1706	}
1707
1708	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
1709
1710		seq_printf(m, "hierarchical_%s=%lu", stat->name,
1711			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
1712						   true));
1713		for_each_node_state(nid, N_MEMORY)
1714			seq_printf(m, " N%d=%lu", nid,
1715				   mem_cgroup_node_nr_lru_pages(memcg, nid,
1716							stat->lru_mask, true));
1717		seq_putc(m, '\n');
1718	}
1719
1720	return 0;
1721}
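/*
 * Editor's note, example memory.numa_stat output (illustrative numbers,
 * two-node machine):
 *
 *	total=2048 N0=1024 N1=1024
 *	file=1536 N0=768 N1=768
 *	anon=512 N0=256 N1=256
 *	unevictable=0 N0=0 N1=0
 *	hierarchical_total=4096 N0=2048 N1=2048
 *	...
 */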
1722#endif /* CONFIG_NUMA */
1723
1724static const unsigned int memcg1_stats[] = {
1725	NR_FILE_PAGES,
1726	NR_ANON_MAPPED,
1727#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1728	NR_ANON_THPS,
1729#endif
1730	NR_SHMEM,
1731	NR_FILE_MAPPED,
1732	NR_FILE_DIRTY,
1733	NR_WRITEBACK,
1734	WORKINGSET_REFAULT_ANON,
1735	WORKINGSET_REFAULT_FILE,
1736#ifdef CONFIG_SWAP
1737	MEMCG_SWAP,
1738	NR_SWAPCACHE,
1739#endif
1740};
1741
1742static const char *const memcg1_stat_names[] = {
1743	"cache",
1744	"rss",
1745#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1746	"rss_huge",
1747#endif
1748	"shmem",
1749	"mapped_file",
1750	"dirty",
1751	"writeback",
1752	"workingset_refault_anon",
1753	"workingset_refault_file",
1754#ifdef CONFIG_SWAP
1755	"swap",
1756	"swapcached",
1757#endif
1758};
1759
1760/* Universal VM events cgroup1 shows, original sort order */
1761static const unsigned int memcg1_events[] = {
1762	PGPGIN,
1763	PGPGOUT,
1764	PGFAULT,
1765	PGMAJFAULT,
1766};
1767
1768void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1769{
1770	unsigned long memory, memsw;
1771	struct mem_cgroup *mi;
1772	unsigned int i;
1773
1774	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
1775
1776	mem_cgroup_flush_stats(memcg);
1777
1778	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1779		unsigned long nr;
1780
1781		nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
1782		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
1783	}
1784
1785	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
1786		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
1787			       memcg_events_local(memcg, memcg1_events[i]));
1788
1789	for (i = 0; i < NR_LRU_LISTS; i++)
1790		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
1791			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
1792			       PAGE_SIZE);
1793
1794	/* Hierarchical information */
1795	memory = memsw = PAGE_COUNTER_MAX;
1796	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
1797		memory = min(memory, READ_ONCE(mi->memory.max));
1798		memsw = min(memsw, READ_ONCE(mi->memsw.max));
1799	}
1800	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
1801		       (u64)memory * PAGE_SIZE);
1802	seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
1803		       (u64)memsw * PAGE_SIZE);
1804
1805	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1806		unsigned long nr;
1807
1808		nr = memcg_page_state_output(memcg, memcg1_stats[i]);
1809		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
1810			       (u64)nr);
1811	}
1812
1813	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
1814		seq_buf_printf(s, "total_%s %llu\n",
1815			       vm_event_name(memcg1_events[i]),
1816			       (u64)memcg_events(memcg, memcg1_events[i]));
1817
1818	for (i = 0; i < NR_LRU_LISTS; i++)
1819		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
1820			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1821			       PAGE_SIZE);
1822
1823#ifdef CONFIG_DEBUG_VM
1824	{
1825		pg_data_t *pgdat;
1826		struct mem_cgroup_per_node *mz;
1827		unsigned long anon_cost = 0;
1828		unsigned long file_cost = 0;
1829
1830		for_each_online_pgdat(pgdat) {
1831			mz = memcg->nodeinfo[pgdat->node_id];
1832
1833			anon_cost += mz->lruvec.anon_cost;
1834			file_cost += mz->lruvec.file_cost;
1835		}
1836		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
1837		seq_buf_printf(s, "file_cost %lu\n", file_cost);
1838	}
1839#endif
1840}
1841
1842static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
1843				      struct cftype *cft)
1844{
1845	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1846
1847	return mem_cgroup_swappiness(memcg);
1848}
1849
1850static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
1851				       struct cftype *cft, u64 val)
1852{
1853	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1854
1855	if (val > MAX_SWAPPINESS)
1856		return -EINVAL;
1857
1858	if (!mem_cgroup_is_root(memcg))
1859		WRITE_ONCE(memcg->swappiness, val);
1860	else
1861		WRITE_ONCE(vm_swappiness, val);
1862
1863	return 0;
1864}
1865
1866static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
1867{
1868	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
1869
1870	seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
1871	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
1872	seq_printf(sf, "oom_kill %lu\n",
1873		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
1874	return 0;
1875}
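/*
 * Editor's note, example memory.oom_control output (illustrative): a cgroup
 * with the OOM killer enabled, not currently under OOM, and no kills yet
 * reads back as:
 *
 *	oom_kill_disable 0
 *	under_oom 0
 *	oom_kill 0
 */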
1876
1877static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
1878	struct cftype *cft, u64 val)
1879{
1880	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1881
1882	pr_warn_once("oom_control is deprecated and will be removed. "
1883		     "Please report your usecase to linux-mm@kvack.org if you "
1884		     "depend on this functionality.\n");
1885
1886	/* cannot be set on the root cgroup and only 0 and 1 are allowed */
1887	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
1888		return -EINVAL;
1889
1890	WRITE_ONCE(memcg->oom_kill_disable, val);
1891	if (!val)
1892		memcg1_oom_recover(memcg);
1893
1894	return 0;
1895}
1896
1897#ifdef CONFIG_SLUB_DEBUG
1898static int mem_cgroup_slab_show(struct seq_file *m, void *p)
1899{
1900	/*
1901	 * Deprecated.
1902	 * Please take a look at tools/cgroup/memcg_slabinfo.py.
1903	 */
1904	return 0;
1905}
1906#endif
1907
1908struct cftype mem_cgroup_legacy_files[] = {
1909	{
1910		.name = "usage_in_bytes",
1911		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
1912		.read_u64 = mem_cgroup_read_u64,
1913	},
1914	{
1915		.name = "max_usage_in_bytes",
1916		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
1917		.write = mem_cgroup_reset,
1918		.read_u64 = mem_cgroup_read_u64,
1919	},
1920	{
1921		.name = "limit_in_bytes",
1922		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1923		.write = mem_cgroup_write,
1924		.read_u64 = mem_cgroup_read_u64,
1925	},
1926	{
1927		.name = "soft_limit_in_bytes",
1928		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
1929		.write = mem_cgroup_write,
1930		.read_u64 = mem_cgroup_read_u64,
1931	},
1932	{
1933		.name = "failcnt",
1934		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1935		.write = mem_cgroup_reset,
1936		.read_u64 = mem_cgroup_read_u64,
1937	},
1938	{
1939		.name = "stat",
1940		.seq_show = memory_stat_show,
1941	},
1942	{
1943		.name = "force_empty",
1944		.write = mem_cgroup_force_empty_write,
1945	},
1946	{
1947		.name = "use_hierarchy",
1948		.write_u64 = mem_cgroup_hierarchy_write,
1949		.read_u64 = mem_cgroup_hierarchy_read,
1950	},
1951	{
1952		.name = "cgroup.event_control",		/* XXX: for compat */
1953		.write = memcg_write_event_control,
1954		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
1955	},
1956	{
1957		.name = "swappiness",
1958		.read_u64 = mem_cgroup_swappiness_read,
1959		.write_u64 = mem_cgroup_swappiness_write,
1960	},
1961	{
1962		.name = "move_charge_at_immigrate",
1963		.read_u64 = mem_cgroup_move_charge_read,
1964		.write_u64 = mem_cgroup_move_charge_write,
1965	},
1966	{
1967		.name = "oom_control",
1968		.seq_show = mem_cgroup_oom_control_read,
1969		.write_u64 = mem_cgroup_oom_control_write,
1970	},
1971	{
1972		.name = "pressure_level",
1973		.seq_show = mem_cgroup_dummy_seq_show,
1974	},
1975#ifdef CONFIG_NUMA
1976	{
1977		.name = "numa_stat",
1978		.seq_show = memcg_numa_stat_show,
1979	},
1980#endif
1981	{
1982		.name = "kmem.limit_in_bytes",
1983		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
1984		.write = mem_cgroup_write,
1985		.read_u64 = mem_cgroup_read_u64,
1986	},
1987	{
1988		.name = "kmem.usage_in_bytes",
1989		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
1990		.read_u64 = mem_cgroup_read_u64,
1991	},
1992	{
1993		.name = "kmem.failcnt",
1994		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
1995		.write = mem_cgroup_reset,
1996		.read_u64 = mem_cgroup_read_u64,
1997	},
1998	{
1999		.name = "kmem.max_usage_in_bytes",
2000		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
2001		.write = mem_cgroup_reset,
2002		.read_u64 = mem_cgroup_read_u64,
2003	},
2004#ifdef CONFIG_SLUB_DEBUG
2005	{
2006		.name = "kmem.slabinfo",
2007		.seq_show = mem_cgroup_slab_show,
2008	},
2009#endif
2010	{
2011		.name = "kmem.tcp.limit_in_bytes",
2012		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
2013		.write = mem_cgroup_write,
2014		.read_u64 = mem_cgroup_read_u64,
2015	},
2016	{
2017		.name = "kmem.tcp.usage_in_bytes",
2018		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
2019		.read_u64 = mem_cgroup_read_u64,
2020	},
2021	{
2022		.name = "kmem.tcp.failcnt",
2023		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
2024		.write = mem_cgroup_reset,
2025		.read_u64 = mem_cgroup_read_u64,
2026	},
2027	{
2028		.name = "kmem.tcp.max_usage_in_bytes",
2029		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
2030		.write = mem_cgroup_reset,
2031		.read_u64 = mem_cgroup_read_u64,
2032	},
2033	{ },	/* terminate */
2034};
2035
2036struct cftype memsw_files[] = {
2037	{
2038		.name = "memsw.usage_in_bytes",
2039		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2040		.read_u64 = mem_cgroup_read_u64,
2041	},
2042	{
2043		.name = "memsw.max_usage_in_bytes",
2044		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2045		.write = mem_cgroup_reset,
2046		.read_u64 = mem_cgroup_read_u64,
2047	},
2048	{
2049		.name = "memsw.limit_in_bytes",
2050		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2051		.write = mem_cgroup_write,
2052		.read_u64 = mem_cgroup_read_u64,
2053	},
2054	{
2055		.name = "memsw.failcnt",
2056		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2057		.write = mem_cgroup_reset,
2058		.read_u64 = mem_cgroup_read_u64,
2059	},
2060	{ },	/* terminate */
2061};
2062
2063void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2064{
2065	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
2066		if (nr_pages > 0)
2067			page_counter_charge(&memcg->kmem, nr_pages);
2068		else
2069			page_counter_uncharge(&memcg->kmem, -nr_pages);
2070	}
2071}
2072
2073bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
2074			 gfp_t gfp_mask)
2075{
2076	struct page_counter *fail;
2077
2078	if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
2079		memcg->tcpmem_pressure = 0;
2080		return true;
2081	}
2082	memcg->tcpmem_pressure = 1;
2083	if (gfp_mask & __GFP_NOFAIL) {
2084		page_counter_charge(&memcg->tcpmem, nr_pages);
2085		return true;
2086	}
2087	return false;
2088}
2089
2090bool memcg1_alloc_events(struct mem_cgroup *memcg)
2091{
2092	memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
2093						GFP_KERNEL_ACCOUNT);
2094	return !!memcg->events_percpu;
2095}
2096
2097void memcg1_free_events(struct mem_cgroup *memcg)
2098{
2099	if (memcg->events_percpu)
2100		free_percpu(memcg->events_percpu);
2101}
2102
2103static int __init memcg1_init(void)
2104{
2105	int node;
2106
2107	for_each_node(node) {
2108		struct mem_cgroup_tree_per_node *rtpn;
2109
2110		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
2111
2112		rtpn->rb_root = RB_ROOT;
2113		rtpn->rb_rightmost = NULL;
2114		spin_lock_init(&rtpn->lock);
2115		soft_limit_tree.rb_tree_per_node[node] = rtpn;
2116	}
2117
2118	return 0;
2119}
2120subsys_initcall(memcg1_init);