Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.15.
  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * RDMA resource limiting controller for cgroups.
  4 *
  5 * Used to allow a cgroup hierarchy to stop processes from consuming
  6 * additional RDMA resources after a certain limit is reached.
  7 *
  8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
  9 */
 10
 11#include <linux/bitops.h>
 12#include <linux/slab.h>
 13#include <linux/seq_file.h>
 14#include <linux/cgroup.h>
 15#include <linux/parser.h>
 16#include <linux/cgroup_rdma.h>
 17
 18#define RDMACG_MAX_STR "max"
 19
 20/*
 21 * Protects list of resource pools maintained on per cgroup basis
 22 * and rdma device list.
 23 */
 24static DEFINE_MUTEX(rdmacg_mutex);
 25static LIST_HEAD(rdmacg_devices);
 26
 27enum rdmacg_file_type {
 28	RDMACG_RESOURCE_TYPE_MAX,
 29	RDMACG_RESOURCE_TYPE_STAT,
 30};
 31
 32/*
 33 * resource table definition as to be seen by the user.
 34 * Need to add entries to it when more resources are
 35 * added/defined at IB verb/core layer.
 36 */
 37static char const *rdmacg_resource_names[] = {
 38	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
 39	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
 40};
 41
 42/* resource tracker for each resource of rdma cgroup */
 43struct rdmacg_resource {
 44	int max;
 45	int usage;
 46};
 47
 48/*
 49 * resource pool object which represents per cgroup, per device
 50 * resources. There are multiple instances of this object per cgroup,
 51 * therefore it cannot be embedded within rdma_cgroup structure. It
 52 * is maintained as list.
 53 */
 54struct rdmacg_resource_pool {
 55	struct rdmacg_device	*device;
 56	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
 57
 58	struct list_head	cg_node;
 59	struct list_head	dev_node;
 60
 61	/* count active user tasks of this pool */
 62	u64			usage_sum;
 63	/* total number counts which are set to max */
 64	int			num_max_cnt;
 65};
 66
 67static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
 68{
 69	return container_of(css, struct rdma_cgroup, css);
 70}
 71
 72static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
 73{
 74	return css_rdmacg(cg->css.parent);
 75}
 76
 77static inline struct rdma_cgroup *get_current_rdmacg(void)
 78{
 79	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
 80}
 81
 82static void set_resource_limit(struct rdmacg_resource_pool *rpool,
 83			       int index, int new_max)
 84{
 85	if (new_max == S32_MAX) {
 86		if (rpool->resources[index].max != S32_MAX)
 87			rpool->num_max_cnt++;
 88	} else {
 89		if (rpool->resources[index].max == S32_MAX)
 90			rpool->num_max_cnt--;
 91	}
 92	rpool->resources[index].max = new_max;
 93}
 94
 95static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
 96{
 97	int i;
 98
 99	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
100		set_resource_limit(rpool, i, S32_MAX);
101}
102
103static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
104{
105	lockdep_assert_held(&rdmacg_mutex);
106
107	list_del(&rpool->cg_node);
108	list_del(&rpool->dev_node);
109	kfree(rpool);
110}
111
112static struct rdmacg_resource_pool *
113find_cg_rpool_locked(struct rdma_cgroup *cg,
114		     struct rdmacg_device *device)
115
116{
117	struct rdmacg_resource_pool *pool;
118
119	lockdep_assert_held(&rdmacg_mutex);
120
121	list_for_each_entry(pool, &cg->rpools, cg_node)
122		if (pool->device == device)
123			return pool;
124
125	return NULL;
126}
127
128static struct rdmacg_resource_pool *
129get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
130{
131	struct rdmacg_resource_pool *rpool;
132
133	rpool = find_cg_rpool_locked(cg, device);
134	if (rpool)
135		return rpool;
136
137	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
138	if (!rpool)
139		return ERR_PTR(-ENOMEM);
140
141	rpool->device = device;
142	set_all_resource_max_limit(rpool);
143
144	INIT_LIST_HEAD(&rpool->cg_node);
145	INIT_LIST_HEAD(&rpool->dev_node);
146	list_add_tail(&rpool->cg_node, &cg->rpools);
147	list_add_tail(&rpool->dev_node, &device->rpools);
148	return rpool;
149}
150
151/**
152 * uncharge_cg_locked - uncharge resource for rdma cgroup
153 * @cg: pointer to cg to uncharge and all parents in hierarchy
154 * @device: pointer to rdmacg device
155 * @index: index of the resource to uncharge in cg (resource pool)
156 *
157 * It also frees the resource pool which was created as part of
158 * charging operation when there are no resources attached to
159 * resource pool.
160 */
161static void
162uncharge_cg_locked(struct rdma_cgroup *cg,
163		   struct rdmacg_device *device,
164		   enum rdmacg_resource_type index)
165{
166	struct rdmacg_resource_pool *rpool;
167
168	rpool = find_cg_rpool_locked(cg, device);
169
170	/*
171	 * rpool cannot be null at this stage. Let kernel operate in case
172	 * if there a bug in IB stack or rdma controller, instead of crashing
173	 * the system.
174	 */
175	if (unlikely(!rpool)) {
176		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
177		return;
178	}
179
180	rpool->resources[index].usage--;
181
182	/*
183	 * A negative count (or overflow) is invalid,
184	 * it indicates a bug in the rdma controller.
185	 */
186	WARN_ON_ONCE(rpool->resources[index].usage < 0);
187	rpool->usage_sum--;
188	if (rpool->usage_sum == 0 &&
189	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
190		/*
191		 * No user of the rpool and all entries are set to max, so
192		 * safe to delete this rpool.
193		 */
194		free_cg_rpool_locked(rpool);
195	}
196}
197
198/**
199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
200 * @cg: pointer to cg to uncharge and all parents in hierarchy
201 * @device: pointer to rdmacg device
202 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
203 *           stop uncharging
204 * @index: index of the resource to uncharge in cg in given resource pool
205 */
206static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
207				     struct rdmacg_device *device,
208				     struct rdma_cgroup *stop_cg,
209				     enum rdmacg_resource_type index)
210{
211	struct rdma_cgroup *p;
212
213	mutex_lock(&rdmacg_mutex);
214
215	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
216		uncharge_cg_locked(p, device, index);
217
218	mutex_unlock(&rdmacg_mutex);
219
220	css_put(&cg->css);
221}
222
223/**
224 * rdmacg_uncharge - hierarchically uncharge rdma resource count
225 * @cg: pointer to cg to uncharge and all parents in hierarchy
226 * @device: pointer to rdmacg device
227 * @index: index of the resource to uncharge in cgroup in given resource pool
228 */
229void rdmacg_uncharge(struct rdma_cgroup *cg,
230		     struct rdmacg_device *device,
231		     enum rdmacg_resource_type index)
232{
233	if (index >= RDMACG_RESOURCE_MAX)
234		return;
235
236	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
237}
238EXPORT_SYMBOL(rdmacg_uncharge);
239
240/**
241 * rdmacg_try_charge - hierarchically try to charge the rdma resource
242 * @rdmacg: pointer to rdma cgroup which will own this resource
243 * @device: pointer to rdmacg device
244 * @index: index of the resource to charge in cgroup (resource pool)
245 *
246 * This function follows charging resource in hierarchical way.
247 * It will fail if the charge would cause the new value to exceed the
248 * hierarchical limit.
249 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
250 * Returns pointer to rdmacg for this resource when charging is successful.
251 *
252 * Charger needs to account resources on two criteria.
253 * (a) per cgroup & (b) per device resource usage.
254 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
255 * the configured limits. Per device provides granular configuration
256 * in multi device usage. It allocates resource pool in the hierarchy
257 * for each parent it come across for first resource. Later on resource
258 * pool will be available. Therefore it will be much faster thereon
259 * to charge/uncharge.
260 */
261int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
262		      struct rdmacg_device *device,
263		      enum rdmacg_resource_type index)
264{
265	struct rdma_cgroup *cg, *p;
266	struct rdmacg_resource_pool *rpool;
267	s64 new;
268	int ret = 0;
269
270	if (index >= RDMACG_RESOURCE_MAX)
271		return -EINVAL;
272
273	/*
274	 * hold on to css, as cgroup can be removed but resource
275	 * accounting happens on css.
276	 */
277	cg = get_current_rdmacg();
278
279	mutex_lock(&rdmacg_mutex);
280	for (p = cg; p; p = parent_rdmacg(p)) {
281		rpool = get_cg_rpool_locked(p, device);
282		if (IS_ERR(rpool)) {
283			ret = PTR_ERR(rpool);
284			goto err;
285		} else {
286			new = rpool->resources[index].usage + 1;
287			if (new > rpool->resources[index].max) {
288				ret = -EAGAIN;
289				goto err;
290			} else {
291				rpool->resources[index].usage = new;
292				rpool->usage_sum++;
293			}
294		}
295	}
296	mutex_unlock(&rdmacg_mutex);
297
298	*rdmacg = cg;
299	return 0;
300
301err:
302	mutex_unlock(&rdmacg_mutex);
303	rdmacg_uncharge_hierarchy(cg, device, p, index);
304	return ret;
305}
306EXPORT_SYMBOL(rdmacg_try_charge);
307
308/**
309 * rdmacg_register_device - register rdmacg device to rdma controller.
310 * @device: pointer to rdmacg device whose resources need to be accounted.
311 *
312 * If IB stack wish a device to participate in rdma cgroup resource
313 * tracking, it must invoke this API to register with rdma cgroup before
314 * any user space application can start using the RDMA resources.
315 */
316void rdmacg_register_device(struct rdmacg_device *device)
317{
318	INIT_LIST_HEAD(&device->dev_node);
319	INIT_LIST_HEAD(&device->rpools);
320
321	mutex_lock(&rdmacg_mutex);
322	list_add_tail(&device->dev_node, &rdmacg_devices);
323	mutex_unlock(&rdmacg_mutex);
324}
325EXPORT_SYMBOL(rdmacg_register_device);
326
327/**
328 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
329 * @device: pointer to rdmacg device which was previously registered with rdma
330 *          controller using rdmacg_register_device().
331 *
332 * IB stack must invoke this after all the resources of the IB device
333 * are destroyed and after ensuring that no more resources will be created
334 * when this API is invoked.
335 */
336void rdmacg_unregister_device(struct rdmacg_device *device)
337{
338	struct rdmacg_resource_pool *rpool, *tmp;
339
340	/*
341	 * Synchronize with any active resource settings,
342	 * usage query happening via configfs.
343	 */
344	mutex_lock(&rdmacg_mutex);
345	list_del_init(&device->dev_node);
346
347	/*
348	 * Now that this device is off the cgroup list, its safe to free
349	 * all the rpool resources.
350	 */
351	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
352		free_cg_rpool_locked(rpool);
353
354	mutex_unlock(&rdmacg_mutex);
355}
356EXPORT_SYMBOL(rdmacg_unregister_device);
357
358static int parse_resource(char *c, int *intval)
359{
360	substring_t argstr;
361	char *name, *value = c;
362	size_t len;
363	int ret, i;
364
365	name = strsep(&value, "=");
366	if (!name || !value)
367		return -EINVAL;
368
369	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
370	if (i < 0)
371		return i;
372
373	len = strlen(value);
374
375	argstr.from = value;
376	argstr.to = value + len;
377
378	ret = match_int(&argstr, intval);
379	if (ret >= 0) {
380		if (*intval < 0)
381			return -EINVAL;
382		return i;
383	}
384	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
385		*intval = S32_MAX;
386		return i;
387	}
388	return -EINVAL;
389}
390
391static int rdmacg_parse_limits(char *options,
392			       int *new_limits, unsigned long *enables)
393{
394	char *c;
395	int err = -EINVAL;
396
397	/* parse resource options */
398	while ((c = strsep(&options, " ")) != NULL) {
399		int index, intval;
400
401		index = parse_resource(c, &intval);
402		if (index < 0)
403			goto err;
404
405		new_limits[index] = intval;
406		*enables |= BIT(index);
407	}
408	return 0;
409
410err:
411	return err;
412}
413
414static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
415{
416	struct rdmacg_device *device;
417
418	lockdep_assert_held(&rdmacg_mutex);
419
420	list_for_each_entry(device, &rdmacg_devices, dev_node)
421		if (!strcmp(name, device->name))
422			return device;
423
424	return NULL;
425}
426
427static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
428				       char *buf, size_t nbytes, loff_t off)
429{
430	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
431	const char *dev_name;
432	struct rdmacg_resource_pool *rpool;
433	struct rdmacg_device *device;
434	char *options = strstrip(buf);
435	int *new_limits;
436	unsigned long enables = 0;
437	int i = 0, ret = 0;
438
439	/* extract the device name first */
440	dev_name = strsep(&options, " ");
441	if (!dev_name) {
442		ret = -EINVAL;
443		goto err;
444	}
445
446	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
447	if (!new_limits) {
448		ret = -ENOMEM;
449		goto err;
450	}
451
452	ret = rdmacg_parse_limits(options, new_limits, &enables);
453	if (ret)
454		goto parse_err;
455
456	/* acquire lock to synchronize with hot plug devices */
457	mutex_lock(&rdmacg_mutex);
458
459	device = rdmacg_get_device_locked(dev_name);
460	if (!device) {
461		ret = -ENODEV;
462		goto dev_err;
463	}
464
465	rpool = get_cg_rpool_locked(cg, device);
466	if (IS_ERR(rpool)) {
467		ret = PTR_ERR(rpool);
468		goto dev_err;
469	}
470
471	/* now set the new limits of the rpool */
472	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
473		set_resource_limit(rpool, i, new_limits[i]);
474
475	if (rpool->usage_sum == 0 &&
476	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
477		/*
478		 * No user of the rpool and all entries are set to max, so
479		 * safe to delete this rpool.
480		 */
481		free_cg_rpool_locked(rpool);
482	}
483
484dev_err:
485	mutex_unlock(&rdmacg_mutex);
486
487parse_err:
488	kfree(new_limits);
489
490err:
491	return ret ?: nbytes;
492}
493
494static void print_rpool_values(struct seq_file *sf,
495			       struct rdmacg_resource_pool *rpool)
496{
497	enum rdmacg_file_type sf_type;
498	int i;
499	u32 value;
500
501	sf_type = seq_cft(sf)->private;
502
503	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
504		seq_puts(sf, rdmacg_resource_names[i]);
505		seq_putc(sf, '=');
506		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
507			if (rpool)
508				value = rpool->resources[i].max;
509			else
510				value = S32_MAX;
511		} else {
512			if (rpool)
513				value = rpool->resources[i].usage;
514			else
515				value = 0;
516		}
517
518		if (value == S32_MAX)
519			seq_puts(sf, RDMACG_MAX_STR);
520		else
521			seq_printf(sf, "%d", value);
522		seq_putc(sf, ' ');
523	}
524}
525
526static int rdmacg_resource_read(struct seq_file *sf, void *v)
527{
528	struct rdmacg_device *device;
529	struct rdmacg_resource_pool *rpool;
530	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
531
532	mutex_lock(&rdmacg_mutex);
533
534	list_for_each_entry(device, &rdmacg_devices, dev_node) {
535		seq_printf(sf, "%s ", device->name);
536
537		rpool = find_cg_rpool_locked(cg, device);
538		print_rpool_values(sf, rpool);
539
540		seq_putc(sf, '\n');
541	}
542
543	mutex_unlock(&rdmacg_mutex);
544	return 0;
545}
546
547static struct cftype rdmacg_files[] = {
548	{
549		.name = "max",
550		.write = rdmacg_resource_set_max,
551		.seq_show = rdmacg_resource_read,
552		.private = RDMACG_RESOURCE_TYPE_MAX,
553		.flags = CFTYPE_NOT_ON_ROOT,
554	},
555	{
556		.name = "current",
557		.seq_show = rdmacg_resource_read,
558		.private = RDMACG_RESOURCE_TYPE_STAT,
559		.flags = CFTYPE_NOT_ON_ROOT,
560	},
561	{ }	/* terminate */
562};
563
564static struct cgroup_subsys_state *
565rdmacg_css_alloc(struct cgroup_subsys_state *parent)
566{
567	struct rdma_cgroup *cg;
568
569	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
570	if (!cg)
571		return ERR_PTR(-ENOMEM);
572
573	INIT_LIST_HEAD(&cg->rpools);
574	return &cg->css;
575}
576
/* css_free callback: release the rdma cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
583
584/**
585 * rdmacg_css_offline - cgroup css_offline callback
586 * @css: css of interest
587 *
588 * This function is called when @css is about to go away and responsible
589 * for shooting down all rdmacg associated with @css. As part of that it
590 * marks all the resource pool entries to max value, so that when resources are
591 * uncharged, associated resource pool can be freed as well.
592 */
593static void rdmacg_css_offline(struct cgroup_subsys_state *css)
594{
595	struct rdma_cgroup *cg = css_rdmacg(css);
596	struct rdmacg_resource_pool *rpool;
597
598	mutex_lock(&rdmacg_mutex);
599
600	list_for_each_entry(rpool, &cg->rpools, cg_node)
601		set_all_resource_max_limit(rpool);
602
603	mutex_unlock(&rdmacg_mutex);
604}
605
606struct cgroup_subsys rdma_cgrp_subsys = {
607	.css_alloc	= rdmacg_css_alloc,
608	.css_free	= rdmacg_css_free,
609	.css_offline	= rdmacg_css_offline,
610	.legacy_cftypes	= rdmacg_files,
611	.dfl_cftypes	= rdmacg_files,
612};