// SPDX-License-Identifier: GPL-2.0-or-later

#include "cpuset-internal.h"

/*
 * The legacy hierarchy's call to cgroup_transfer_tasks() is handled
 * asynchronously, using the work item below.
 */
struct cpuset_remove_tasks_struct {
	struct work_struct work;
	struct cpuset *cs;
};

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */
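
/*
 * Worked example (illustrative only): for each elapsed second,
 * fmeter_update() applies
 *
 *	val = (FM_COEF * val) / FM_SCALE;	i.e. val *= 0.933
 *
 * and 0.933^10 is roughly 0.5, which is where the 10 second half-life
 * above comes from.  At a steady rate of N events per second, cnt
 * accumulates N * FM_SCALE per tick, so val converges to the fixed point of
 *
 *	val = 0.933 * val + 0.067 * (N * 1000),  i.e. val ~= N * 1000,
 *
 * matching the "stabilize to a value N*1000" statement above.
 */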

/* Initialize a frequency meter */
void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;
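
/*
 * Example usage from user space (the mount point below is only an example;
 * it assumes the v1 cpuset controller is mounted at /sys/fs/cgroup/cpuset):
 *
 *	# enable collection globally (root cpuset only)
 *	echo 1 > /sys/fs/cgroup/cpuset/memory_pressure_enabled
 *
 *	# read the recent direct-reclaim rate of a child cpuset
 *	cat /sys/fs/cgroup/cpuset/<name>/memory_pressure
 *
 * The value read is the fmeter output described above: a filtered
 * events-per-second rate, scaled by FM_SCALE.
 */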

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}

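/*
 * update_relax_domain_level - set the cpuset's sched_relax_domain_level.
 *
 * Values from -1 (no request, use the system default) up to
 * sched_domain_level_max + 1 are accepted on SMP; anything outside that
 * range returns -EINVAL.  When the level actually changes and this cpuset
 * has CPUs and participates in load balancing, the scheduler domains are
 * rebuilt.  Called under cpuset_lock().
 */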
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val > sched_domain_level_max + 1)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

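/*
 * cpuset_write_s64 - common write handler for the legacy s64 control files.
 * Only FILE_SCHED_RELAX_DOMAIN_LEVEL ("sched_relax_domain_level") is wired
 * up here; other types return -EINVAL.  Returns -ENODEV if the cpuset has
 * gone offline by the time the locks are taken.
 */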
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

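/*
 * cpuset_read_s64 - common read handler for the legacy s64 control files;
 * currently only sched_relax_domain_level is reported.
 */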
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
void cpuset1_update_task_spread_flags(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		return;

	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/**
 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset1_update_task_spread_flags(cs, task);
	css_task_iter_end(&it);
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

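/*
 * Workqueue callback scheduled from cpuset1_hotplug_update_tasks() below.
 * It migrates the tasks out of the now-empty cpuset, drops the css
 * reference taken when the work was queued, and frees the work item.
 */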
static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
	struct cpuset_remove_tasks_struct *s;

	s = container_of(work, struct cpuset_remove_tasks_struct, work);
	remove_tasks_in_empty_cpuset(s->cs);
	css_put(&s->cs->css);
	kfree(s);
}

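/**
 * cpuset1_hotplug_update_tasks - legacy-hierarchy handling of a hotplug event
 * @cs: cpuset whose masks need updating
 * @new_cpus: CPUs to be used by @cs after the hotplug event
 * @new_mems: memory nodes to be used by @cs after the hotplug event
 * @cpus_updated: true if the CPU mask changed
 * @mems_updated: true if the memory nodes changed
 *
 * Copy the new masks into @cs, propagate them to its member tasks and, if
 * the cpuset ended up with no CPUs or no memory nodes while still being
 * populated, schedule asynchronous migration of its tasks to the nearest
 * non-empty ancestor.
 */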
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	cpuset_callback_lock_irq();
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	cpuset_callback_unlock_irq();

	/*
	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		cpuset_update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		cpuset_update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * Move tasks to the nearest ancestor with execution resources.  This
	 * is a full cgroup operation which will also call back into cpuset.
	 * Execute it asynchronously using a workqueue.
	 */
	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
			css_put(&cs->css);
			return;
		}

		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
	}
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
 *                            behavior.
 */
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
		goto out;

	ret = 0;
out:
	return ret;
}

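/*
 * cpuset_read_u64 - common read handler for the legacy boolean and counter
 * control files; cft->private selects which value is reported.
 */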
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

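/*
 * cpuset_write_u64 - common write handler for the legacy boolean control
 * files.  Most types map onto a cpuset flag via cpuset_update_flag();
 * memory_pressure_enabled just toggles the global flag.  Returns -ENODEV
 * if the cpuset has gone offline by the time the locks are taken.
 */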
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

/*
 * for the common functions, 'private' gives the type of file
 */
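
/*
 * Note: these files only appear on the legacy (v1) cpuset hierarchy; the
 * table below is what the core cpuset code registers as the controller's
 * legacy_cftypes.
 */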

struct cftype cpuset1_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};