Linux Audio

Check our new training course

Loading...
v6.2
  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * RDMA resource limiting controller for cgroups.
  4 *
  5 * Used to allow a cgroup hierarchy to stop processes from consuming
  6 * additional RDMA resources after a certain limit is reached.
  7 *
  8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 
 
 
 
  9 */
 10
 11#include <linux/bitops.h>
 12#include <linux/slab.h>
 13#include <linux/seq_file.h>
 14#include <linux/cgroup.h>
 15#include <linux/parser.h>
 16#include <linux/cgroup_rdma.h>
 17
 18#define RDMACG_MAX_STR "max"
 19
 20/*
 21 * Protects list of resource pools maintained on per cgroup basis
 22 * and rdma device list.
 23 */
 24static DEFINE_MUTEX(rdmacg_mutex);
 25static LIST_HEAD(rdmacg_devices);
 26
 27enum rdmacg_file_type {
 28	RDMACG_RESOURCE_TYPE_MAX,
 29	RDMACG_RESOURCE_TYPE_STAT,
 30};
 31
 32/*
 33 * resource table definition as to be seen by the user.
 34 * Need to add entries to it when more resources are
 35 * added/defined at IB verb/core layer.
 36 */
 37static char const *rdmacg_resource_names[] = {
 38	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
 39	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
 40};
 41
 42/* resource tracker for each resource of rdma cgroup */
 43struct rdmacg_resource {
 44	int max;
 45	int usage;
 46};
 47
 48/*
 49 * resource pool object which represents per cgroup, per device
 50 * resources. There are multiple instances of this object per cgroup,
 51 * therefore it cannot be embedded within rdma_cgroup structure. It
 52 * is maintained as list.
 53 */
 54struct rdmacg_resource_pool {
 55	struct rdmacg_device	*device;
 56	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
 57
 58	struct list_head	cg_node;
 59	struct list_head	dev_node;
 60
 61	/* count active user tasks of this pool */
 62	u64			usage_sum;
 63	/* total number counts which are set to max */
 64	int			num_max_cnt;
 65};
 66
 67static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
 68{
 69	return container_of(css, struct rdma_cgroup, css);
 70}
 71
 72static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
 73{
 74	return css_rdmacg(cg->css.parent);
 75}
 76
 77static inline struct rdma_cgroup *get_current_rdmacg(void)
 78{
 79	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
 80}
 81
 82static void set_resource_limit(struct rdmacg_resource_pool *rpool,
 83			       int index, int new_max)
 84{
 85	if (new_max == S32_MAX) {
 86		if (rpool->resources[index].max != S32_MAX)
 87			rpool->num_max_cnt++;
 88	} else {
 89		if (rpool->resources[index].max == S32_MAX)
 90			rpool->num_max_cnt--;
 91	}
 92	rpool->resources[index].max = new_max;
 93}
 94
 95static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
 96{
 97	int i;
 98
 99	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
100		set_resource_limit(rpool, i, S32_MAX);
101}
102
103static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
104{
105	lockdep_assert_held(&rdmacg_mutex);
106
107	list_del(&rpool->cg_node);
108	list_del(&rpool->dev_node);
109	kfree(rpool);
110}
111
112static struct rdmacg_resource_pool *
113find_cg_rpool_locked(struct rdma_cgroup *cg,
114		     struct rdmacg_device *device)
115
116{
117	struct rdmacg_resource_pool *pool;
118
119	lockdep_assert_held(&rdmacg_mutex);
120
121	list_for_each_entry(pool, &cg->rpools, cg_node)
122		if (pool->device == device)
123			return pool;
124
125	return NULL;
126}
127
128static struct rdmacg_resource_pool *
129get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
130{
131	struct rdmacg_resource_pool *rpool;
132
133	rpool = find_cg_rpool_locked(cg, device);
134	if (rpool)
135		return rpool;
136
137	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
138	if (!rpool)
139		return ERR_PTR(-ENOMEM);
140
141	rpool->device = device;
142	set_all_resource_max_limit(rpool);
143
144	INIT_LIST_HEAD(&rpool->cg_node);
145	INIT_LIST_HEAD(&rpool->dev_node);
146	list_add_tail(&rpool->cg_node, &cg->rpools);
147	list_add_tail(&rpool->dev_node, &device->rpools);
148	return rpool;
149}
150
151/**
152 * uncharge_cg_locked - uncharge resource for rdma cgroup
153 * @cg: pointer to cg to uncharge and all parents in hierarchy
154 * @device: pointer to rdmacg device
155 * @index: index of the resource to uncharge in cg (resource pool)
156 *
157 * It also frees the resource pool which was created as part of
158 * charging operation when there are no resources attached to
159 * resource pool.
160 */
161static void
162uncharge_cg_locked(struct rdma_cgroup *cg,
163		   struct rdmacg_device *device,
164		   enum rdmacg_resource_type index)
165{
166	struct rdmacg_resource_pool *rpool;
167
168	rpool = find_cg_rpool_locked(cg, device);
169
170	/*
171	 * rpool cannot be null at this stage. Let kernel operate in case
172	 * if there a bug in IB stack or rdma controller, instead of crashing
173	 * the system.
174	 */
175	if (unlikely(!rpool)) {
176		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
177		return;
178	}
179
180	rpool->resources[index].usage--;
181
182	/*
183	 * A negative count (or overflow) is invalid,
184	 * it indicates a bug in the rdma controller.
185	 */
186	WARN_ON_ONCE(rpool->resources[index].usage < 0);
187	rpool->usage_sum--;
188	if (rpool->usage_sum == 0 &&
189	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
190		/*
191		 * No user of the rpool and all entries are set to max, so
192		 * safe to delete this rpool.
193		 */
194		free_cg_rpool_locked(rpool);
195	}
196}
197
198/**
199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
200 * @device: pointer to rdmacg device
201 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
202 *           stop uncharging
203 * @index: index of the resource to uncharge in cg in given resource pool
204 */
205static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
206				     struct rdmacg_device *device,
207				     struct rdma_cgroup *stop_cg,
208				     enum rdmacg_resource_type index)
209{
210	struct rdma_cgroup *p;
211
212	mutex_lock(&rdmacg_mutex);
213
214	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
215		uncharge_cg_locked(p, device, index);
216
217	mutex_unlock(&rdmacg_mutex);
218
219	css_put(&cg->css);
220}
221
222/**
223 * rdmacg_uncharge - hierarchically uncharge rdma resource count
224 * @device: pointer to rdmacg device
225 * @index: index of the resource to uncharge in cgroup in given resource pool
226 */
227void rdmacg_uncharge(struct rdma_cgroup *cg,
228		     struct rdmacg_device *device,
229		     enum rdmacg_resource_type index)
230{
231	if (index >= RDMACG_RESOURCE_MAX)
232		return;
233
234	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
235}
236EXPORT_SYMBOL(rdmacg_uncharge);
237
238/**
239 * rdmacg_try_charge - hierarchically try to charge the rdma resource
240 * @rdmacg: pointer to rdma cgroup which will own this resource
241 * @device: pointer to rdmacg device
242 * @index: index of the resource to charge in cgroup (resource pool)
243 *
244 * This function follows charging resource in hierarchical way.
245 * It will fail if the charge would cause the new value to exceed the
246 * hierarchical limit.
247 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
248 * Returns pointer to rdmacg for this resource when charging is successful.
249 *
250 * Charger needs to account resources on two criteria.
251 * (a) per cgroup & (b) per device resource usage.
252 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
253 * the configured limits. Per device provides granular configuration
254 * in multi device usage. It allocates resource pool in the hierarchy
255 * for each parent it come across for first resource. Later on resource
256 * pool will be available. Therefore it will be much faster thereon
257 * to charge/uncharge.
258 */
259int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
260		      struct rdmacg_device *device,
261		      enum rdmacg_resource_type index)
262{
263	struct rdma_cgroup *cg, *p;
264	struct rdmacg_resource_pool *rpool;
265	s64 new;
266	int ret = 0;
267
268	if (index >= RDMACG_RESOURCE_MAX)
269		return -EINVAL;
270
271	/*
272	 * hold on to css, as cgroup can be removed but resource
273	 * accounting happens on css.
274	 */
275	cg = get_current_rdmacg();
276
277	mutex_lock(&rdmacg_mutex);
278	for (p = cg; p; p = parent_rdmacg(p)) {
279		rpool = get_cg_rpool_locked(p, device);
280		if (IS_ERR(rpool)) {
281			ret = PTR_ERR(rpool);
282			goto err;
283		} else {
284			new = rpool->resources[index].usage + 1;
285			if (new > rpool->resources[index].max) {
286				ret = -EAGAIN;
287				goto err;
288			} else {
289				rpool->resources[index].usage = new;
290				rpool->usage_sum++;
291			}
292		}
293	}
294	mutex_unlock(&rdmacg_mutex);
295
296	*rdmacg = cg;
297	return 0;
298
299err:
300	mutex_unlock(&rdmacg_mutex);
301	rdmacg_uncharge_hierarchy(cg, device, p, index);
302	return ret;
303}
304EXPORT_SYMBOL(rdmacg_try_charge);
305
306/**
307 * rdmacg_register_device - register rdmacg device to rdma controller.
308 * @device: pointer to rdmacg device whose resources need to be accounted.
309 *
310 * If IB stack wish a device to participate in rdma cgroup resource
311 * tracking, it must invoke this API to register with rdma cgroup before
312 * any user space application can start using the RDMA resources.
 
 
313 */
314void rdmacg_register_device(struct rdmacg_device *device)
315{
316	INIT_LIST_HEAD(&device->dev_node);
317	INIT_LIST_HEAD(&device->rpools);
318
319	mutex_lock(&rdmacg_mutex);
320	list_add_tail(&device->dev_node, &rdmacg_devices);
321	mutex_unlock(&rdmacg_mutex);
 
322}
323EXPORT_SYMBOL(rdmacg_register_device);
324
325/**
326 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
327 * @device: pointer to rdmacg device which was previously registered with rdma
328 *          controller using rdmacg_register_device().
329 *
330 * IB stack must invoke this after all the resources of the IB device
331 * are destroyed and after ensuring that no more resources will be created
332 * when this API is invoked.
333 */
334void rdmacg_unregister_device(struct rdmacg_device *device)
335{
336	struct rdmacg_resource_pool *rpool, *tmp;
337
338	/*
339	 * Synchronize with any active resource settings,
340	 * usage query happening via configfs.
341	 */
342	mutex_lock(&rdmacg_mutex);
343	list_del_init(&device->dev_node);
344
345	/*
346	 * Now that this device is off the cgroup list, its safe to free
347	 * all the rpool resources.
348	 */
349	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
350		free_cg_rpool_locked(rpool);
351
352	mutex_unlock(&rdmacg_mutex);
353}
354EXPORT_SYMBOL(rdmacg_unregister_device);
355
356static int parse_resource(char *c, int *intval)
357{
358	substring_t argstr;
 
359	char *name, *value = c;
360	size_t len;
361	int ret, i;
362
363	name = strsep(&value, "=");
364	if (!name || !value)
365		return -EINVAL;
366
367	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
368	if (i < 0)
369		return i;
370
371	len = strlen(value);
372
373	argstr.from = value;
374	argstr.to = value + len;
375
376	ret = match_int(&argstr, intval);
377	if (ret >= 0) {
378		if (*intval < 0)
379			return -EINVAL;
380		return i;
381	}
382	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
383		*intval = S32_MAX;
384		return i;
 
 
 
 
 
 
385	}
386	return -EINVAL;
387}
388
389static int rdmacg_parse_limits(char *options,
390			       int *new_limits, unsigned long *enables)
391{
392	char *c;
393	int err = -EINVAL;
394
395	/* parse resource options */
396	while ((c = strsep(&options, " ")) != NULL) {
397		int index, intval;
398
399		index = parse_resource(c, &intval);
400		if (index < 0)
401			goto err;
402
403		new_limits[index] = intval;
404		*enables |= BIT(index);
405	}
406	return 0;
407
408err:
409	return err;
410}
411
412static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
413{
414	struct rdmacg_device *device;
415
416	lockdep_assert_held(&rdmacg_mutex);
417
418	list_for_each_entry(device, &rdmacg_devices, dev_node)
419		if (!strcmp(name, device->name))
420			return device;
421
422	return NULL;
423}
424
425static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
426				       char *buf, size_t nbytes, loff_t off)
427{
428	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
429	const char *dev_name;
430	struct rdmacg_resource_pool *rpool;
431	struct rdmacg_device *device;
432	char *options = strstrip(buf);
433	int *new_limits;
434	unsigned long enables = 0;
435	int i = 0, ret = 0;
436
437	/* extract the device name first */
438	dev_name = strsep(&options, " ");
439	if (!dev_name) {
440		ret = -EINVAL;
441		goto err;
442	}
443
444	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
445	if (!new_limits) {
446		ret = -ENOMEM;
447		goto err;
448	}
449
450	ret = rdmacg_parse_limits(options, new_limits, &enables);
451	if (ret)
452		goto parse_err;
453
454	/* acquire lock to synchronize with hot plug devices */
455	mutex_lock(&rdmacg_mutex);
456
457	device = rdmacg_get_device_locked(dev_name);
458	if (!device) {
459		ret = -ENODEV;
460		goto dev_err;
461	}
462
463	rpool = get_cg_rpool_locked(cg, device);
464	if (IS_ERR(rpool)) {
465		ret = PTR_ERR(rpool);
466		goto dev_err;
467	}
468
469	/* now set the new limits of the rpool */
470	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
471		set_resource_limit(rpool, i, new_limits[i]);
472
473	if (rpool->usage_sum == 0 &&
474	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
475		/*
476		 * No user of the rpool and all entries are set to max, so
477		 * safe to delete this rpool.
478		 */
479		free_cg_rpool_locked(rpool);
480	}
481
482dev_err:
483	mutex_unlock(&rdmacg_mutex);
484
485parse_err:
486	kfree(new_limits);
487
488err:
489	return ret ?: nbytes;
490}
491
492static void print_rpool_values(struct seq_file *sf,
493			       struct rdmacg_resource_pool *rpool)
494{
495	enum rdmacg_file_type sf_type;
496	int i;
497	u32 value;
498
499	sf_type = seq_cft(sf)->private;
500
501	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
502		seq_puts(sf, rdmacg_resource_names[i]);
503		seq_putc(sf, '=');
504		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
505			if (rpool)
506				value = rpool->resources[i].max;
507			else
508				value = S32_MAX;
509		} else {
510			if (rpool)
511				value = rpool->resources[i].usage;
512			else
513				value = 0;
514		}
515
516		if (value == S32_MAX)
517			seq_puts(sf, RDMACG_MAX_STR);
518		else
519			seq_printf(sf, "%d", value);
520		seq_putc(sf, ' ');
521	}
522}
523
524static int rdmacg_resource_read(struct seq_file *sf, void *v)
525{
526	struct rdmacg_device *device;
527	struct rdmacg_resource_pool *rpool;
528	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
529
530	mutex_lock(&rdmacg_mutex);
531
532	list_for_each_entry(device, &rdmacg_devices, dev_node) {
533		seq_printf(sf, "%s ", device->name);
534
535		rpool = find_cg_rpool_locked(cg, device);
536		print_rpool_values(sf, rpool);
537
538		seq_putc(sf, '\n');
539	}
540
541	mutex_unlock(&rdmacg_mutex);
542	return 0;
543}
544
545static struct cftype rdmacg_files[] = {
546	{
547		.name = "max",
548		.write = rdmacg_resource_set_max,
549		.seq_show = rdmacg_resource_read,
550		.private = RDMACG_RESOURCE_TYPE_MAX,
551		.flags = CFTYPE_NOT_ON_ROOT,
552	},
553	{
554		.name = "current",
555		.seq_show = rdmacg_resource_read,
556		.private = RDMACG_RESOURCE_TYPE_STAT,
557		.flags = CFTYPE_NOT_ON_ROOT,
558	},
559	{ }	/* terminate */
560};
561
562static struct cgroup_subsys_state *
563rdmacg_css_alloc(struct cgroup_subsys_state *parent)
564{
565	struct rdma_cgroup *cg;
566
567	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
568	if (!cg)
569		return ERR_PTR(-ENOMEM);
570
571	INIT_LIST_HEAD(&cg->rpools);
572	return &cg->css;
573}
574
/* css_free callback: release the rdma cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
581
582/**
583 * rdmacg_css_offline - cgroup css_offline callback
584 * @css: css of interest
585 *
586 * This function is called when @css is about to go away and responsible
587 * for shooting down all rdmacg associated with @css. As part of that it
588 * marks all the resource pool entries to max value, so that when resources are
589 * uncharged, associated resource pool can be freed as well.
590 */
591static void rdmacg_css_offline(struct cgroup_subsys_state *css)
592{
593	struct rdma_cgroup *cg = css_rdmacg(css);
594	struct rdmacg_resource_pool *rpool;
595
596	mutex_lock(&rdmacg_mutex);
597
598	list_for_each_entry(rpool, &cg->rpools, cg_node)
599		set_all_resource_max_limit(rpool);
600
601	mutex_unlock(&rdmacg_mutex);
602}
603
604struct cgroup_subsys rdma_cgrp_subsys = {
605	.css_alloc	= rdmacg_css_alloc,
606	.css_free	= rdmacg_css_free,
607	.css_offline	= rdmacg_css_offline,
608	.legacy_cftypes	= rdmacg_files,
609	.dfl_cftypes	= rdmacg_files,
610};
v4.17
 
  1/*
  2 * RDMA resource limiting controller for cgroups.
  3 *
  4 * Used to allow a cgroup hierarchy to stop processes from consuming
  5 * additional RDMA resources after a certain limit is reached.
  6 *
  7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
  8 *
  9 * This file is subject to the terms and conditions of version 2 of the GNU
 10 * General Public License. See the file COPYING in the main directory of the
 11 * Linux distribution for more details.
 12 */
 13
 14#include <linux/bitops.h>
 15#include <linux/slab.h>
 16#include <linux/seq_file.h>
 17#include <linux/cgroup.h>
 18#include <linux/parser.h>
 19#include <linux/cgroup_rdma.h>
 20
 21#define RDMACG_MAX_STR "max"
 22
 23/*
 24 * Protects list of resource pools maintained on per cgroup basis
 25 * and rdma device list.
 26 */
 27static DEFINE_MUTEX(rdmacg_mutex);
 28static LIST_HEAD(rdmacg_devices);
 29
 30enum rdmacg_file_type {
 31	RDMACG_RESOURCE_TYPE_MAX,
 32	RDMACG_RESOURCE_TYPE_STAT,
 33};
 34
 35/*
 36 * resource table definition as to be seen by the user.
 37 * Need to add entries to it when more resources are
 38 * added/defined at IB verb/core layer.
 39 */
 40static char const *rdmacg_resource_names[] = {
 41	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
 42	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
 43};
 44
 45/* resource tracker for each resource of rdma cgroup */
 46struct rdmacg_resource {
 47	int max;
 48	int usage;
 49};
 50
 51/*
 52 * resource pool object which represents per cgroup, per device
 53 * resources. There are multiple instances of this object per cgroup,
 54 * therefore it cannot be embedded within rdma_cgroup structure. It
 55 * is maintained as list.
 56 */
 57struct rdmacg_resource_pool {
 58	struct rdmacg_device	*device;
 59	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
 60
 61	struct list_head	cg_node;
 62	struct list_head	dev_node;
 63
 64	/* count active user tasks of this pool */
 65	u64			usage_sum;
 66	/* total number counts which are set to max */
 67	int			num_max_cnt;
 68};
 69
 70static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
 71{
 72	return container_of(css, struct rdma_cgroup, css);
 73}
 74
 75static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
 76{
 77	return css_rdmacg(cg->css.parent);
 78}
 79
 80static inline struct rdma_cgroup *get_current_rdmacg(void)
 81{
 82	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
 83}
 84
 85static void set_resource_limit(struct rdmacg_resource_pool *rpool,
 86			       int index, int new_max)
 87{
 88	if (new_max == S32_MAX) {
 89		if (rpool->resources[index].max != S32_MAX)
 90			rpool->num_max_cnt++;
 91	} else {
 92		if (rpool->resources[index].max == S32_MAX)
 93			rpool->num_max_cnt--;
 94	}
 95	rpool->resources[index].max = new_max;
 96}
 97
 98static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
 99{
100	int i;
101
102	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
103		set_resource_limit(rpool, i, S32_MAX);
104}
105
106static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
107{
108	lockdep_assert_held(&rdmacg_mutex);
109
110	list_del(&rpool->cg_node);
111	list_del(&rpool->dev_node);
112	kfree(rpool);
113}
114
115static struct rdmacg_resource_pool *
116find_cg_rpool_locked(struct rdma_cgroup *cg,
117		     struct rdmacg_device *device)
118
119{
120	struct rdmacg_resource_pool *pool;
121
122	lockdep_assert_held(&rdmacg_mutex);
123
124	list_for_each_entry(pool, &cg->rpools, cg_node)
125		if (pool->device == device)
126			return pool;
127
128	return NULL;
129}
130
131static struct rdmacg_resource_pool *
132get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
133{
134	struct rdmacg_resource_pool *rpool;
135
136	rpool = find_cg_rpool_locked(cg, device);
137	if (rpool)
138		return rpool;
139
140	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
141	if (!rpool)
142		return ERR_PTR(-ENOMEM);
143
144	rpool->device = device;
145	set_all_resource_max_limit(rpool);
146
147	INIT_LIST_HEAD(&rpool->cg_node);
148	INIT_LIST_HEAD(&rpool->dev_node);
149	list_add_tail(&rpool->cg_node, &cg->rpools);
150	list_add_tail(&rpool->dev_node, &device->rpools);
151	return rpool;
152}
153
154/**
155 * uncharge_cg_locked - uncharge resource for rdma cgroup
156 * @cg: pointer to cg to uncharge and all parents in hierarchy
157 * @device: pointer to rdmacg device
158 * @index: index of the resource to uncharge in cg (resource pool)
159 *
160 * It also frees the resource pool which was created as part of
161 * charging operation when there are no resources attached to
162 * resource pool.
163 */
164static void
165uncharge_cg_locked(struct rdma_cgroup *cg,
166		   struct rdmacg_device *device,
167		   enum rdmacg_resource_type index)
168{
169	struct rdmacg_resource_pool *rpool;
170
171	rpool = find_cg_rpool_locked(cg, device);
172
173	/*
174	 * rpool cannot be null at this stage. Let kernel operate in case
175	 * if there a bug in IB stack or rdma controller, instead of crashing
176	 * the system.
177	 */
178	if (unlikely(!rpool)) {
179		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
180		return;
181	}
182
183	rpool->resources[index].usage--;
184
185	/*
186	 * A negative count (or overflow) is invalid,
187	 * it indicates a bug in the rdma controller.
188	 */
189	WARN_ON_ONCE(rpool->resources[index].usage < 0);
190	rpool->usage_sum--;
191	if (rpool->usage_sum == 0 &&
192	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
193		/*
194		 * No user of the rpool and all entries are set to max, so
195		 * safe to delete this rpool.
196		 */
197		free_cg_rpool_locked(rpool);
198	}
199}
200
201/**
202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
203 * @device: pointer to rdmacg device
204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
205 *           stop uncharging
206 * @index: index of the resource to uncharge in cg in given resource pool
207 */
208static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
209				     struct rdmacg_device *device,
210				     struct rdma_cgroup *stop_cg,
211				     enum rdmacg_resource_type index)
212{
213	struct rdma_cgroup *p;
214
215	mutex_lock(&rdmacg_mutex);
216
217	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
218		uncharge_cg_locked(p, device, index);
219
220	mutex_unlock(&rdmacg_mutex);
221
222	css_put(&cg->css);
223}
224
225/**
226 * rdmacg_uncharge - hierarchically uncharge rdma resource count
227 * @device: pointer to rdmacg device
228 * @index: index of the resource to uncharge in cgroup in given resource pool
229 */
230void rdmacg_uncharge(struct rdma_cgroup *cg,
231		     struct rdmacg_device *device,
232		     enum rdmacg_resource_type index)
233{
234	if (index >= RDMACG_RESOURCE_MAX)
235		return;
236
237	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
238}
239EXPORT_SYMBOL(rdmacg_uncharge);
240
241/**
242 * rdmacg_try_charge - hierarchically try to charge the rdma resource
243 * @rdmacg: pointer to rdma cgroup which will own this resource
244 * @device: pointer to rdmacg device
245 * @index: index of the resource to charge in cgroup (resource pool)
246 *
247 * This function follows charging resource in hierarchical way.
248 * It will fail if the charge would cause the new value to exceed the
249 * hierarchical limit.
250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
251 * Returns pointer to rdmacg for this resource when charging is successful.
252 *
253 * Charger needs to account resources on two criteria.
254 * (a) per cgroup & (b) per device resource usage.
255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
256 * the configured limits. Per device provides granular configuration
257 * in multi device usage. It allocates resource pool in the hierarchy
258 * for each parent it come across for first resource. Later on resource
259 * pool will be available. Therefore it will be much faster thereon
260 * to charge/uncharge.
261 */
262int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
263		      struct rdmacg_device *device,
264		      enum rdmacg_resource_type index)
265{
266	struct rdma_cgroup *cg, *p;
267	struct rdmacg_resource_pool *rpool;
268	s64 new;
269	int ret = 0;
270
271	if (index >= RDMACG_RESOURCE_MAX)
272		return -EINVAL;
273
274	/*
275	 * hold on to css, as cgroup can be removed but resource
276	 * accounting happens on css.
277	 */
278	cg = get_current_rdmacg();
279
280	mutex_lock(&rdmacg_mutex);
281	for (p = cg; p; p = parent_rdmacg(p)) {
282		rpool = get_cg_rpool_locked(p, device);
283		if (IS_ERR(rpool)) {
284			ret = PTR_ERR(rpool);
285			goto err;
286		} else {
287			new = rpool->resources[index].usage + 1;
288			if (new > rpool->resources[index].max) {
289				ret = -EAGAIN;
290				goto err;
291			} else {
292				rpool->resources[index].usage = new;
293				rpool->usage_sum++;
294			}
295		}
296	}
297	mutex_unlock(&rdmacg_mutex);
298
299	*rdmacg = cg;
300	return 0;
301
302err:
303	mutex_unlock(&rdmacg_mutex);
304	rdmacg_uncharge_hierarchy(cg, device, p, index);
305	return ret;
306}
307EXPORT_SYMBOL(rdmacg_try_charge);
308
309/**
310 * rdmacg_register_device - register rdmacg device to rdma controller.
311 * @device: pointer to rdmacg device whose resources need to be accounted.
312 *
313 * If IB stack wish a device to participate in rdma cgroup resource
314 * tracking, it must invoke this API to register with rdma cgroup before
315 * any user space application can start using the RDMA resources.
316 * Returns 0 on success or EINVAL when table length given is beyond
317 * supported size.
318 */
319int rdmacg_register_device(struct rdmacg_device *device)
320{
321	INIT_LIST_HEAD(&device->dev_node);
322	INIT_LIST_HEAD(&device->rpools);
323
324	mutex_lock(&rdmacg_mutex);
325	list_add_tail(&device->dev_node, &rdmacg_devices);
326	mutex_unlock(&rdmacg_mutex);
327	return 0;
328}
329EXPORT_SYMBOL(rdmacg_register_device);
330
331/**
332 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
333 * @device: pointer to rdmacg device which was previously registered with rdma
334 *          controller using rdmacg_register_device().
335 *
336 * IB stack must invoke this after all the resources of the IB device
337 * are destroyed and after ensuring that no more resources will be created
338 * when this API is invoked.
339 */
340void rdmacg_unregister_device(struct rdmacg_device *device)
341{
342	struct rdmacg_resource_pool *rpool, *tmp;
343
344	/*
345	 * Synchronize with any active resource settings,
346	 * usage query happening via configfs.
347	 */
348	mutex_lock(&rdmacg_mutex);
349	list_del_init(&device->dev_node);
350
351	/*
352	 * Now that this device is off the cgroup list, its safe to free
353	 * all the rpool resources.
354	 */
355	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
356		free_cg_rpool_locked(rpool);
357
358	mutex_unlock(&rdmacg_mutex);
359}
360EXPORT_SYMBOL(rdmacg_unregister_device);
361
362static int parse_resource(char *c, int *intval)
363{
364	substring_t argstr;
365	const char **table = &rdmacg_resource_names[0];
366	char *name, *value = c;
367	size_t len;
368	int ret, i = 0;
369
370	name = strsep(&value, "=");
371	if (!name || !value)
372		return -EINVAL;
373
 
 
 
 
374	len = strlen(value);
375
376	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
377		if (strcmp(table[i], name))
378			continue;
379
380		argstr.from = value;
381		argstr.to = value + len;
382
383		ret = match_int(&argstr, intval);
384		if (ret >= 0) {
385			if (*intval < 0)
386				break;
387			return i;
388		}
389		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
390			*intval = S32_MAX;
391			return i;
392		}
393		break;
394	}
395	return -EINVAL;
396}
397
398static int rdmacg_parse_limits(char *options,
399			       int *new_limits, unsigned long *enables)
400{
401	char *c;
402	int err = -EINVAL;
403
404	/* parse resource options */
405	while ((c = strsep(&options, " ")) != NULL) {
406		int index, intval;
407
408		index = parse_resource(c, &intval);
409		if (index < 0)
410			goto err;
411
412		new_limits[index] = intval;
413		*enables |= BIT(index);
414	}
415	return 0;
416
417err:
418	return err;
419}
420
421static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
422{
423	struct rdmacg_device *device;
424
425	lockdep_assert_held(&rdmacg_mutex);
426
427	list_for_each_entry(device, &rdmacg_devices, dev_node)
428		if (!strcmp(name, device->name))
429			return device;
430
431	return NULL;
432}
433
434static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
435				       char *buf, size_t nbytes, loff_t off)
436{
437	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
438	const char *dev_name;
439	struct rdmacg_resource_pool *rpool;
440	struct rdmacg_device *device;
441	char *options = strstrip(buf);
442	int *new_limits;
443	unsigned long enables = 0;
444	int i = 0, ret = 0;
445
446	/* extract the device name first */
447	dev_name = strsep(&options, " ");
448	if (!dev_name) {
449		ret = -EINVAL;
450		goto err;
451	}
452
453	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
454	if (!new_limits) {
455		ret = -ENOMEM;
456		goto err;
457	}
458
459	ret = rdmacg_parse_limits(options, new_limits, &enables);
460	if (ret)
461		goto parse_err;
462
463	/* acquire lock to synchronize with hot plug devices */
464	mutex_lock(&rdmacg_mutex);
465
466	device = rdmacg_get_device_locked(dev_name);
467	if (!device) {
468		ret = -ENODEV;
469		goto dev_err;
470	}
471
472	rpool = get_cg_rpool_locked(cg, device);
473	if (IS_ERR(rpool)) {
474		ret = PTR_ERR(rpool);
475		goto dev_err;
476	}
477
478	/* now set the new limits of the rpool */
479	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
480		set_resource_limit(rpool, i, new_limits[i]);
481
482	if (rpool->usage_sum == 0 &&
483	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
484		/*
485		 * No user of the rpool and all entries are set to max, so
486		 * safe to delete this rpool.
487		 */
488		free_cg_rpool_locked(rpool);
489	}
490
491dev_err:
492	mutex_unlock(&rdmacg_mutex);
493
494parse_err:
495	kfree(new_limits);
496
497err:
498	return ret ?: nbytes;
499}
500
501static void print_rpool_values(struct seq_file *sf,
502			       struct rdmacg_resource_pool *rpool)
503{
504	enum rdmacg_file_type sf_type;
505	int i;
506	u32 value;
507
508	sf_type = seq_cft(sf)->private;
509
510	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
511		seq_puts(sf, rdmacg_resource_names[i]);
512		seq_putc(sf, '=');
513		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
514			if (rpool)
515				value = rpool->resources[i].max;
516			else
517				value = S32_MAX;
518		} else {
519			if (rpool)
520				value = rpool->resources[i].usage;
521			else
522				value = 0;
523		}
524
525		if (value == S32_MAX)
526			seq_puts(sf, RDMACG_MAX_STR);
527		else
528			seq_printf(sf, "%d", value);
529		seq_putc(sf, ' ');
530	}
531}
532
533static int rdmacg_resource_read(struct seq_file *sf, void *v)
534{
535	struct rdmacg_device *device;
536	struct rdmacg_resource_pool *rpool;
537	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
538
539	mutex_lock(&rdmacg_mutex);
540
541	list_for_each_entry(device, &rdmacg_devices, dev_node) {
542		seq_printf(sf, "%s ", device->name);
543
544		rpool = find_cg_rpool_locked(cg, device);
545		print_rpool_values(sf, rpool);
546
547		seq_putc(sf, '\n');
548	}
549
550	mutex_unlock(&rdmacg_mutex);
551	return 0;
552}
553
554static struct cftype rdmacg_files[] = {
555	{
556		.name = "max",
557		.write = rdmacg_resource_set_max,
558		.seq_show = rdmacg_resource_read,
559		.private = RDMACG_RESOURCE_TYPE_MAX,
560		.flags = CFTYPE_NOT_ON_ROOT,
561	},
562	{
563		.name = "current",
564		.seq_show = rdmacg_resource_read,
565		.private = RDMACG_RESOURCE_TYPE_STAT,
566		.flags = CFTYPE_NOT_ON_ROOT,
567	},
568	{ }	/* terminate */
569};
570
571static struct cgroup_subsys_state *
572rdmacg_css_alloc(struct cgroup_subsys_state *parent)
573{
574	struct rdma_cgroup *cg;
575
576	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
577	if (!cg)
578		return ERR_PTR(-ENOMEM);
579
580	INIT_LIST_HEAD(&cg->rpools);
581	return &cg->css;
582}
583
/* css_free callback: release the rdma cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
590
591/**
592 * rdmacg_css_offline - cgroup css_offline callback
593 * @css: css of interest
594 *
595 * This function is called when @css is about to go away and responsible
596 * for shooting down all rdmacg associated with @css. As part of that it
597 * marks all the resource pool entries to max value, so that when resources are
598 * uncharged, associated resource pool can be freed as well.
599 */
600static void rdmacg_css_offline(struct cgroup_subsys_state *css)
601{
602	struct rdma_cgroup *cg = css_rdmacg(css);
603	struct rdmacg_resource_pool *rpool;
604
605	mutex_lock(&rdmacg_mutex);
606
607	list_for_each_entry(rpool, &cg->rpools, cg_node)
608		set_all_resource_max_limit(rpool);
609
610	mutex_unlock(&rdmacg_mutex);
611}
612
613struct cgroup_subsys rdma_cgrp_subsys = {
614	.css_alloc	= rdmacg_css_alloc,
615	.css_free	= rdmacg_css_free,
616	.css_offline	= rdmacg_css_offline,
617	.legacy_cftypes	= rdmacg_files,
618	.dfl_cftypes	= rdmacg_files,
619};