/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License.  See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>

DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
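
/*
 * Static key that gates the cgroup-BPF hooks: it is bumped once per attached
 * program (static_branch_inc() in __cgroup_bpf_attach()) and dropped again on
 * detach and on cgroup teardown, so the hook call sites cost only a
 * patched-out branch while no program is attached anywhere.
 */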

/**
 * cgroup_bpf_put() - put references of all bpf programs
 * @cgrp: the cgroup to modify
 */
void cgroup_bpf_put(struct cgroup *cgrp)
{
	unsigned int type;

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
		struct list_head *progs = &cgrp->bpf.progs[type];
		struct bpf_prog_list *pl, *tmp;

		list_for_each_entry_safe(pl, tmp, progs, node) {
			list_del(&pl->node);
			bpf_prog_put(pl->prog);
			kfree(pl);
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
		bpf_prog_array_free(cgrp->bpf.effective[type]);
	}
}

/* Count the number of elements in the list.
 * It's slow, but the list cannot be long.
 */
static u32 prog_list_length(struct list_head *head)
{
	struct bpf_prog_list *pl;
	u32 cnt = 0;

	list_for_each_entry(pl, head, node) {
		if (!pl->prog)
			continue;
		cnt++;
	}
	return cnt;
}

/* If the parent has a non-overridable prog attached,
 * disallow attaching new programs to the descendant cgroup.
 * If the parent is overridable or multi-prog, allow attaching.
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
				    enum bpf_attach_type type,
				    u32 new_flags)
{
	struct cgroup *p;

	p = cgroup_parent(cgrp);
	if (!p)
		return true;
	do {
		u32 flags = p->bpf.flags[type];
		u32 cnt;

		if (flags & BPF_F_ALLOW_MULTI)
			return true;
		cnt = prog_list_length(&p->bpf.progs[type]);
		WARN_ON_ONCE(cnt > 1);
		if (cnt == 1)
			return !!(flags & BPF_F_ALLOW_OVERRIDE);
		p = cgroup_parent(p);
	} while (p);
	return true;
}
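
/*
 * Concrete reading of the rules above: a parent that attached its single
 * program without BPF_F_ALLOW_OVERRIDE blocks attachment in every descendant
 * (the caller then returns -EPERM); with BPF_F_ALLOW_OVERRIDE a descendant
 * may attach its own program, which then replaces the parent's one in the
 * descendant's effective array; with BPF_F_ALLOW_MULTI further attachments
 * are always allowed.
 */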

/* Compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that a parent's BPF_F_ALLOW_OVERRIDE program yields
 * to programs attached to this cgroup.
 */
static int compute_effective_progs(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   struct bpf_prog_array __rcu **array)
{
	struct bpf_prog_array __rcu *progs;
	struct bpf_prog_list *pl;
	struct cgroup *p = cgrp;
	int cnt = 0;

	/* count number of effective programs by walking parents */
	do {
		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			cnt += prog_list_length(&p->bpf.progs[type]);
		p = cgroup_parent(p);
	} while (p);

	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
	if (!progs)
		return -ENOMEM;

	/* populate the array with effective progs */
	cnt = 0;
	p = cgrp;
	do {
		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			list_for_each_entry(pl,
					    &p->bpf.progs[type], node) {
				if (!pl->prog)
					continue;
				rcu_dereference_protected(progs, 1)->
					progs[cnt++] = pl->prog;
			}
		p = cgroup_parent(p);
	} while (p);

	*array = progs;
	return 0;
}
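
/*
 * Ordering note: the walk above starts at @cgrp and moves towards the root,
 * so the effective array holds this cgroup's own programs first, followed by
 * the ancestor programs that remain effective here; BPF_PROG_RUN_ARRAY()
 * later executes the array front to back.
 */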

static void activate_effective_progs(struct cgroup *cgrp,
				     enum bpf_attach_type type,
				     struct bpf_prog_array __rcu *array)
{
	struct bpf_prog_array __rcu *old_array;

	old_array = xchg(&cgrp->bpf.effective[type], array);
	/* free prog array after grace period, since __cgroup_bpf_run_*()
	 * might still be walking the array
	 */
	bpf_prog_array_free(old_array);
}

/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* Has to use a macro instead of a const int, since the compiler thinks
 * that the array below is variable length.
 */
#define	NR ARRAY_SIZE(cgrp->bpf.effective)
	struct bpf_prog_array __rcu *arrays[NR] = {};
	int i;

	for (i = 0; i < NR; i++)
		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

	for (i = 0; i < NR; i++)
		if (compute_effective_progs(cgrp, i, &arrays[i]))
			goto cleanup;

	for (i = 0; i < NR; i++)
		activate_effective_progs(cgrp, i, arrays[i]);

	return 0;
cleanup:
	for (i = 0; i < NR; i++)
		bpf_prog_array_free(arrays[i]);
	return -ENOMEM;
}

#define BPF_CGROUP_MAX_PROGS 64

/**
 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to attach
 * @type: Type of attach operation
 * @flags: Attach flags (BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI or none)
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type, u32 flags)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct bpf_prog *old_prog = NULL;
	struct cgroup_subsys_state *css;
	struct bpf_prog_list *pl;
	bool pl_was_allocated;
	int err;

	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
		/* invalid combination */
		return -EINVAL;

	if (!hierarchy_allows_attach(cgrp, type, flags))
		return -EPERM;

	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
		/* Disallow attaching non-overridable on top
		 * of existing overridable in this cgroup.
		 * Disallow attaching multi-prog if overridable or none
		 */
		return -EPERM;

	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
		return -E2BIG;

	if (flags & BPF_F_ALLOW_MULTI) {
		list_for_each_entry(pl, progs, node)
			if (pl->prog == prog)
				/* disallow attaching the same prog twice */
				return -EINVAL;

		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl)
			return -ENOMEM;
		pl_was_allocated = true;
		pl->prog = prog;
		list_add_tail(&pl->node, progs);
	} else {
		if (list_empty(progs)) {
			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
			if (!pl)
				return -ENOMEM;
			pl_was_allocated = true;
			list_add_tail(&pl->node, progs);
		} else {
			pl = list_first_entry(progs, typeof(*pl), node);
			old_prog = pl->prog;
			pl_was_allocated = false;
		}
		pl->prog = prog;
	}

	cgrp->bpf.flags[type] = flags;

	/* allocate and recompute effective prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
		if (err)
			goto cleanup;
	}

	/* all allocations were successful. Activate all prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		activate_effective_progs(desc, type, desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	static_branch_inc(&cgroup_bpf_enabled_key);
	if (old_prog) {
		bpf_prog_put(old_prog);
		static_branch_dec(&cgroup_bpf_enabled_key);
	}
	return 0;

cleanup:
	/* oom while computing effective. Free all computed effective arrays
	 * since they were not activated
	 */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		bpf_prog_array_free(desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	/* and cleanup the prog list */
	pl->prog = old_prog;
	if (pl_was_allocated) {
		list_del(&pl->node);
		kfree(pl);
	}
	return err;
}
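
/*
 * Illustrative sketch (not part of this file): user space reaches
 * __cgroup_bpf_attach() through the BPF_PROG_ATTACH bpf(2) command, roughly
 * like this, where cgroup_fd and prog_fd are hypothetical descriptors for a
 * cgroup directory and a loaded program:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd	   = cgroup_fd;
 *	attr.attach_bpf_fd = prog_fd;
 *	attr.attach_type   = BPF_CGROUP_INET_INGRESS;
 *	attr.attach_flags  = BPF_F_ALLOW_MULTI;
 *	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 */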

/**
 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to detach or NULL
 * @type: Type of detach operation
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type, u32 unused_flags)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog *old_prog = NULL;
	struct cgroup_subsys_state *css;
	struct bpf_prog_list *pl;
	int err;

	if (flags & BPF_F_ALLOW_MULTI) {
		if (!prog)
			/* to detach MULTI prog the user has to specify valid FD
			 * of the program to be detached
			 */
			return -EINVAL;
	} else {
		if (list_empty(progs))
			/* report error when trying to detach and nothing is attached */
			return -ENOENT;
	}

	if (flags & BPF_F_ALLOW_MULTI) {
		/* find the prog and detach it */
		list_for_each_entry(pl, progs, node) {
			if (pl->prog != prog)
				continue;
			old_prog = prog;
			/* mark it deleted, so it's ignored while
			 * recomputing effective
			 */
			pl->prog = NULL;
			break;
		}
		if (!old_prog)
			return -ENOENT;
	} else {
		/* to maintain backward compatibility NONE and OVERRIDE cgroups
		 * allow detaching with invalid FD (prog==NULL)
		 */
		pl = list_first_entry(progs, typeof(*pl), node);
		old_prog = pl->prog;
		pl->prog = NULL;
	}

	/* allocate and recompute effective prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
		if (err)
			goto cleanup;
	}

	/* all allocations were successful. Activate all prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		activate_effective_progs(desc, type, desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	/* now can actually delete it from this cgroup list */
	list_del(&pl->node);
	kfree(pl);
	if (list_empty(progs))
		/* last program was detached, reset flags to zero */
		cgrp->bpf.flags[type] = 0;

	bpf_prog_put(old_prog);
	static_branch_dec(&cgroup_bpf_enabled_key);
	return 0;

cleanup:
	/* oom while computing effective. Free all computed effective arrays
	 * since they were not activated
	 */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		bpf_prog_array_free(desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	/* and restore back old_prog */
	pl->prog = old_prog;
	return err;
}
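
/*
 * The matching user-space entry point is the BPF_PROG_DETACH bpf(2) command.
 * For BPF_F_ALLOW_MULTI cgroups the caller must pass the fd of the exact
 * program to remove; in the legacy NONE/OVERRIDE modes the fd may be omitted
 * and the single attached program is detached.
 */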

/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		       union bpf_attr __user *uattr)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	enum bpf_attach_type type = attr->query.attach_type;
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	int cnt, ret = 0, i;

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
	else
		cnt = prog_list_length(progs);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
		return -EFAULT;
	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
		/* return early if user requested only program count + flags */
		return 0;
	if (attr->query.prog_cnt < cnt) {
		cnt = attr->query.prog_cnt;
		ret = -ENOSPC;
	}

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
						   prog_ids, cnt);
	} else {
		struct bpf_prog_list *pl;
		u32 id;

		i = 0;
		list_for_each_entry(pl, progs, node) {
			id = pl->prog->aux->id;
			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
				return -EFAULT;
			if (++i == cnt)
				break;
		}
	}
	return ret;
}
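
/*
 * This is the cgroup backend of the BPF_PROG_QUERY bpf(2) command: without
 * BPF_F_QUERY_EFFECTIVE only the programs attached directly to @cgrp are
 * reported; with the flag the effective array, including programs inherited
 * from ancestors, is reported. -ENOSPC tells the caller that more programs
 * exist than the supplied prog_cnt allows.
 */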

/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
				struct sk_buff *skb,
				enum bpf_attach_type type)
{
	unsigned int offset = skb->data - skb_network_header(skb);
	struct sock *save_sk;
	struct cgroup *cgrp;
	int ret;

	if (!sk || !sk_fullsock(sk))
		return 0;

	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	save_sk = skb->sk;
	skb->sk = sk;
	__skb_push(skb, offset);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
				 bpf_prog_run_save_cb);
	__skb_pull(skb, offset);
	skb->sk = save_sk;
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
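
/*
 * This helper is not called directly by the stack; the inet ingress/egress
 * paths go through the BPF_CGROUP_RUN_PROG_INET_INGRESS()/
 * BPF_CGROUP_RUN_PROG_INET_EGRESS() wrappers in <linux/bpf-cgroup.h>, which
 * check the cgroup_bpf_enabled static key first so the common no-program
 * case stays cheap.
 */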

/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);

/**
 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
 *                                       a sockaddr provided by user space
 * @sk: sock struct that will use sockaddr
 * @uaddr: sockaddr struct provided by user
 * @type: The type of program to be executed
 *
 * The socket is expected to be of type INET or INET6.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
				      struct sockaddr *uaddr,
				      enum bpf_attach_type type)
{
	struct bpf_sock_addr_kern ctx = {
		.sk = sk,
		.uaddr = uaddr,
	};
	struct cgroup *cgrp;
	int ret;

	/* Check socket family since not all sockets represent network
	 * endpoint (e.g. AF_UNIX).
	 */
	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);

/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 * sk with connection information (IP addresses, etc.) May not contain
 * cgroup info if it is a req sock.
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock_ops
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
				     struct bpf_sock_ops_kern *sock_ops,
				     enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
				 BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);

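/*
 * __cgroup_bpf_check_dev_permission() - run the BPF_CGROUP_DEVICE programs
 * attached to the current task's cgroup for a device access request.
 *
 * Returns 0 when every attached program allows the access (or when none is
 * attached), and a non-zero value when at least one program denies it.
 */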
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
				      short access, enum bpf_attach_type type)
{
	struct cgroup *cgrp;
	struct bpf_cgroup_dev_ctx ctx = {
		.access_type = (access << 16) | dev_type,
		.major = major,
		.minor = minor,
	};
	int allow = 1;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
				   BPF_PROG_RUN);
	rcu_read_unlock();

	return !allow;
}
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);

static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
		/* fall through */
	default:
		return NULL;
	}
}

static bool cgroup_dev_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (type == BPF_WRITE)
		return false;

	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
		bpf_ctx_record_field_size(info, size_default);
		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
			return false;
		break;
	default:
		if (size != size_default)
			return false;
	}

	return true;
}

const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
	.get_func_proto		= cgroup_dev_func_proto,
	.is_valid_access	= cgroup_dev_is_valid_access,
};