// SPDX-License-Identifier: GPL-2.0-only
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
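 * For example, if parent/pids.max is 10 and parent/child/pids.max is 20, the
 * effective limit in parent/child is 10: every fork() in the child is also
 * charged against the parent's counter and checked against its limit.
 *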
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 */

#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/sched/task.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

struct pids_cgroup {
	struct cgroup_subsys_state	css;

	/*
	 * Use 64-bit types so that we can safely represent "max" as
	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	 */
	atomic64_t			counter;
	atomic64_t			limit;

	/* Handle for "pids.events" */
	struct cgroup_file		events_file;

	/* Number of times fork failed because limit was hit. */
	atomic64_t			events_limit;
};

static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
	return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
	return css_pids(pids->css.parent);
}

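/*
 * Allocate the per-cgroup pids state. A new cgroup starts with a zero counter
 * and its limit set to %PIDS_MAX, i.e. the "max" (unlimited) value.
 */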
static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
	struct pids_cgroup *pids;

	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
	if (!pids)
		return ERR_PTR(-ENOMEM);

	atomic64_set(&pids->counter, 0);
	atomic64_set(&pids->limit, PIDS_MAX);
	atomic64_set(&pids->events_limit, 0);
	return &pids->css;
}

static void pids_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_pids(css));
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
	/*
	 * A negative count (or overflow for that matter) is invalid,
	 * and indicates a bug in the `pids` controller proper.
	 */
	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p))
		pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is only used for reverting failed
 * attaches, where there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p))
		atomic64_add(num, &p->counter);
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p, *q;

	for (p = pids; parent_pids(p); p = parent_pids(p)) {
		int64_t new = atomic64_add_return(num, &p->counter);
		int64_t limit = atomic64_read(&p->limit);

		/*
		 * Since new is capped to the maximum number of pid_t, if
		 * p->limit is %PIDS_MAX then we know that this test will never
		 * fail.
		 */
		if (new > limit)
			goto revert;
	}

	return 0;

revert:
	for (q = pids; q != p; q = parent_pids(q))
		pids_cancel(q, num);
	pids_cancel(p, num);

	return -EAGAIN;
}

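/*
 * Charge the destination cgroup and uncharge the source when tasks are moved
 * between cgroups. Attaching is an organisational operation and is never
 * blocked by the limit, so it may push pids.current above pids.max.
 */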
static int pids_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		/*
		 * No need to pin @old_css between here and cancel_attach()
		 * because cgroup core protects it from being freed before
		 * the migration completes or fails.
		 */
		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(pids, 1);
		pids_uncharge(old_pids, 1);
	}

	return 0;
}

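/* Revert the charges made in pids_can_attach() if the migration is aborted. */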
static void pids_cancel_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(old_pids, 1);
		pids_uncharge(pids, 1);
	}
}

/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by the copy_process().
 */
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
	struct cgroup_subsys_state *css;
	struct pids_cgroup *pids;
	int err;

	if (cset)
		css = cset->subsys[pids_cgrp_id];
	else
		css = task_css_check(current, pids_cgrp_id, true);
	pids = css_pids(css);
	err = pids_try_charge(pids, 1);
	if (err) {
		/* Only log the first time events_limit is incremented. */
		if (atomic64_inc_return(&pids->events_limit) == 1) {
			pr_info("cgroup: fork rejected by pids controller in ");
			pr_cont_cgroup_path(css->cgroup);
			pr_cont("\n");
		}
		cgroup_file_notify(&pids->events_file);
	}
	return err;
}

static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
	struct cgroup_subsys_state *css;
	struct pids_cgroup *pids;

	if (cset)
		css = cset->subsys[pids_cgrp_id];
	else
		css = task_css_check(current, pids_cgrp_id, true);
	pids = css_pids(css);
	pids_uncharge(pids, 1);
}

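/*
 * Drop the charge taken at fork() time once the task's pid is released,
 * uncharging the task's cgroup and all of its ancestors.
 */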
static void pids_release(struct task_struct *task)
{
	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

	pids_uncharge(pids, 1);
}

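/*
 * Write handler for pids.max: accepts either the literal string "max"
 * (no limit) or a non-negative number below %PIDS_MAX.
 */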
static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit;
	int err;

	buf = strstrip(buf);
	if (!strcmp(buf, PIDS_MAX_STR)) {
		limit = PIDS_MAX;
		goto set_limit;
	}

	err = kstrtoll(buf, 0, &limit);
	if (err)
		return err;

	if (limit < 0 || limit >= PIDS_MAX)
		return -EINVAL;

set_limit:
	/*
	 * Limit updates don't need to be mutex'd, since it isn't
	 * critical that any racing fork()s follow the new limit.
	 */
	atomic64_set(&pids->limit, limit);
	return nbytes;
}

static int pids_max_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit = atomic64_read(&pids->limit);

	if (limit >= PIDS_MAX)
		seq_printf(sf, "%s\n", PIDS_MAX_STR);
	else
		seq_printf(sf, "%lld\n", limit);

	return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct pids_cgroup *pids = css_pids(css);

	return atomic64_read(&pids->counter);
}

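/*
 * pids.events reports a single "max" key: the number of times fork() failed
 * because the limit was hit.
 */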
static int pids_events_show(struct seq_file *sf, void *v)
{
	struct pids_cgroup *pids = css_pids(seq_css(sf));

	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
	return 0;
}

static struct cftype pids_files[] = {
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.read_s64 = pids_current_read,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events",
		.seq_show = pids_events_show,
		.file_offset = offsetof(struct pids_cgroup, events_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};

struct cgroup_subsys pids_cgrp_subsys = {
	.css_alloc	= pids_css_alloc,
	.css_free	= pids_css_free,
	.can_attach	= pids_can_attach,
	.cancel_attach	= pids_cancel_attach,
	.can_fork	= pids_can_fork,
	.cancel_fork	= pids_cancel_fork,
	.release	= pids_release,
	.legacy_cftypes	= pids_files,
	.dfl_cftypes	= pids_files,
	.threaded	= true,
};