v5.14.15
  1// SPDX-License-Identifier: GPL-2.0
  2/* Copyright (c) 2019 Facebook  */
  3#include <linux/rculist.h>
  4#include <linux/list.h>
  5#include <linux/hash.h>
  6#include <linux/types.h>
  7#include <linux/spinlock.h>
  8#include <linux/bpf.h>
  9#include <linux/btf.h>
 10#include <linux/btf_ids.h>
 11#include <linux/bpf_local_storage.h>
 12#include <net/bpf_sk_storage.h>
 13#include <net/sock.h>
 14#include <uapi/linux/sock_diag.h>
 15#include <uapi/linux/btf.h>
 16
 17DEFINE_BPF_STORAGE_CACHE(sk_cache);
 18
 19static struct bpf_local_storage_data *
 20bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
 21{
 22	struct bpf_local_storage *sk_storage;
 23	struct bpf_local_storage_map *smap;
 24
 25	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 26	if (!sk_storage)
 27		return NULL;
 28
 29	smap = (struct bpf_local_storage_map *)map;
 30	return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
 31}
 32
 33static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 34{
 35	struct bpf_local_storage_data *sdata;
 36
 37	sdata = bpf_sk_storage_lookup(sk, map, false);
 38	if (!sdata)
 39		return -ENOENT;
 40
 41	bpf_selem_unlink(SELEM(sdata));
 42
 43	return 0;
 44}
 45
 46/* Called by __sk_destruct() & bpf_sk_storage_clone() */
 47void bpf_sk_storage_free(struct sock *sk)
 48{
 49	struct bpf_local_storage_elem *selem;
 50	struct bpf_local_storage *sk_storage;
 51	bool free_sk_storage = false;
 52	struct hlist_node *n;
 53
 54	rcu_read_lock();
 55	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 56	if (!sk_storage) {
 57		rcu_read_unlock();
 58		return;
 59	}
 60
 61	/* Neither the bpf_prog nor the bpf-map's syscall
 62	 * could be modifying the sk_storage->list now.
 63	 * Thus, no elem can be added-to or deleted-from the
 64	 * sk_storage->list by the bpf_prog or by the bpf-map's syscall.
 65	 *
 66	 * It is racing with bpf_local_storage_map_free() alone
 67	 * when unlinking elem from the sk_storage->list and
 68	 * the map's bucket->list.
 69	 */
 70	raw_spin_lock_bh(&sk_storage->lock);
 71	hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) {
 72		/* Always unlink from map before unlinking from
 73		 * sk_storage.
 74		 */
 75		bpf_selem_unlink_map(selem);
 76		free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage,
 77								  selem, true);
 78	}
 79	raw_spin_unlock_bh(&sk_storage->lock);
 80	rcu_read_unlock();
 81
 82	if (free_sk_storage)
 83		kfree_rcu(sk_storage, rcu);
 84}
 85
 86static void bpf_sk_storage_map_free(struct bpf_map *map)
 87{
 88	struct bpf_local_storage_map *smap;
 89
 90	smap = (struct bpf_local_storage_map *)map;
 91	bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
 92	bpf_local_storage_map_free(smap, NULL);
 93}
 94
 95static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 96{
 97	struct bpf_local_storage_map *smap;
 98
 99	smap = bpf_local_storage_map_alloc(attr);
100	if (IS_ERR(smap))
101		return ERR_CAST(smap);
102
103	smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
104	return &smap->map;
105}
106
107static int notsupp_get_next_key(struct bpf_map *map, void *key,
108				void *next_key)
109{
110	return -ENOTSUPP;
111}
112
113static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
114{
115	struct bpf_local_storage_data *sdata;
116	struct socket *sock;
117	int fd, err;
118
119	fd = *(int *)key;
120	sock = sockfd_lookup(fd, &err);
121	if (sock) {
122		sdata = bpf_sk_storage_lookup(sock->sk, map, true);
123		sockfd_put(sock);
124		return sdata ? sdata->data : NULL;
125	}
126
127	return ERR_PTR(err);
128}
129
130static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
131					 void *value, u64 map_flags)
132{
133	struct bpf_local_storage_data *sdata;
134	struct socket *sock;
135	int fd, err;
136
137	fd = *(int *)key;
138	sock = sockfd_lookup(fd, &err);
139	if (sock) {
140		sdata = bpf_local_storage_update(
141			sock->sk, (struct bpf_local_storage_map *)map, value,
142			map_flags);
143		sockfd_put(sock);
144		return PTR_ERR_OR_ZERO(sdata);
145	}
146
147	return err;
148}
149
150static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
151{
152	struct socket *sock;
153	int fd, err;
154
155	fd = *(int *)key;
156	sock = sockfd_lookup(fd, &err);
157	if (sock) {
158		err = bpf_sk_storage_del(sock->sk, map);
159		sockfd_put(sock);
160		return err;
161	}
162
163	return err;
164}
165
166static struct bpf_local_storage_elem *
167bpf_sk_storage_clone_elem(struct sock *newsk,
168			  struct bpf_local_storage_map *smap,
169			  struct bpf_local_storage_elem *selem)
170{
171	struct bpf_local_storage_elem *copy_selem;
172
173	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
174	if (!copy_selem)
175		return NULL;
176
177	if (map_value_has_spin_lock(&smap->map))
178		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
179				      SDATA(selem)->data, true);
180	else
181		copy_map_value(&smap->map, SDATA(copy_selem)->data,
182			       SDATA(selem)->data);
183
184	return copy_selem;
185}
186
187int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
188{
189	struct bpf_local_storage *new_sk_storage = NULL;
190	struct bpf_local_storage *sk_storage;
191	struct bpf_local_storage_elem *selem;
192	int ret = 0;
193
194	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
195
196	rcu_read_lock();
197	sk_storage = rcu_dereference(sk->sk_bpf_storage);
198
199	if (!sk_storage || hlist_empty(&sk_storage->list))
200		goto out;
201
202	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
203		struct bpf_local_storage_elem *copy_selem;
204		struct bpf_local_storage_map *smap;
205		struct bpf_map *map;
206
207		smap = rcu_dereference(SDATA(selem)->smap);
208		if (!(smap->map.map_flags & BPF_F_CLONE))
209			continue;
210
211		/* Note that for lockless listeners adding new element
212		 * here can race with cleanup in bpf_local_storage_map_free.
213		 * Try to grab map refcnt to make sure that it's still
214		 * alive and prevent concurrent removal.
215		 */
216		map = bpf_map_inc_not_zero(&smap->map);
217		if (IS_ERR(map))
218			continue;
219
220		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
221		if (!copy_selem) {
222			ret = -ENOMEM;
223			bpf_map_put(map);
224			goto out;
225		}
226
227		if (new_sk_storage) {
228			bpf_selem_link_map(smap, copy_selem);
229			bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
230		} else {
231			ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
232			if (ret) {
233				kfree(copy_selem);
234				atomic_sub(smap->elem_size,
235					   &newsk->sk_omem_alloc);
236				bpf_map_put(map);
237				goto out;
238			}
239
240			new_sk_storage =
241				rcu_dereference(copy_selem->local_storage);
242		}
243		bpf_map_put(map);
244	}
245
246out:
247	rcu_read_unlock();
248
249	/* In case of an error, don't free anything explicitly here, the
250	 * caller is responsible to call bpf_sk_storage_free.
251	 */
252
253	return ret;
254}
255
256BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
257	   void *, value, u64, flags)
258{
259	struct bpf_local_storage_data *sdata;
260
261	if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
262		return (unsigned long)NULL;
263
264	sdata = bpf_sk_storage_lookup(sk, map, true);
265	if (sdata)
266		return (unsigned long)sdata->data;
267
268	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
269	    /* Cannot add new elem to a going away sk.
270	     * Otherwise, the new elem may become a leak
271	     * (and also other memory issues during map
272	     *  destruction).
273	     */
274	    refcount_inc_not_zero(&sk->sk_refcnt)) {
275		sdata = bpf_local_storage_update(
276			sk, (struct bpf_local_storage_map *)map, value,
277			BPF_NOEXIST);
278		/* sk must be a fullsock (guaranteed by verifier),
279		 * so sock_gen_put() is unnecessary.
280		 */
281		sock_put(sk);
282		return IS_ERR(sdata) ?
283			(unsigned long)NULL : (unsigned long)sdata->data;
284	}
285
286	return (unsigned long)NULL;
287}
288
289BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
290{
291	if (!sk || !sk_fullsock(sk))
292		return -EINVAL;
293
294	if (refcount_inc_not_zero(&sk->sk_refcnt)) {
295		int err;
296
297		err = bpf_sk_storage_del(sk, map);
298		sock_put(sk);
299		return err;
300	}
301
302	return -ENOENT;
303}
304
305static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
306				 void *owner, u32 size)
307{
308	struct sock *sk = (struct sock *)owner;
309
310	/* same check as in sock_kmalloc() */
311	if (size <= sysctl_optmem_max &&
312	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
313		atomic_add(size, &sk->sk_omem_alloc);
314		return 0;
315	}
316
317	return -ENOMEM;
318}
319
320static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap,
321				    void *owner, u32 size)
322{
323	struct sock *sk = owner;
324
325	atomic_sub(size, &sk->sk_omem_alloc);
326}
327
328static struct bpf_local_storage __rcu **
329bpf_sk_storage_ptr(void *owner)
330{
331	struct sock *sk = owner;
332
333	return &sk->sk_bpf_storage;
334}
335
336static int sk_storage_map_btf_id;
337const struct bpf_map_ops sk_storage_map_ops = {
338	.map_meta_equal = bpf_map_meta_equal,
339	.map_alloc_check = bpf_local_storage_map_alloc_check,
340	.map_alloc = bpf_sk_storage_map_alloc,
341	.map_free = bpf_sk_storage_map_free,
342	.map_get_next_key = notsupp_get_next_key,
343	.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
344	.map_update_elem = bpf_fd_sk_storage_update_elem,
345	.map_delete_elem = bpf_fd_sk_storage_delete_elem,
346	.map_check_btf = bpf_local_storage_map_check_btf,
347	.map_btf_name = "bpf_local_storage_map",
348	.map_btf_id = &sk_storage_map_btf_id,
349	.map_local_storage_charge = bpf_sk_storage_charge,
350	.map_local_storage_uncharge = bpf_sk_storage_uncharge,
351	.map_owner_storage_ptr = bpf_sk_storage_ptr,
352};
353
354const struct bpf_func_proto bpf_sk_storage_get_proto = {
355	.func		= bpf_sk_storage_get,
356	.gpl_only	= false,
357	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
358	.arg1_type	= ARG_CONST_MAP_PTR,
359	.arg2_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
360	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
361	.arg4_type	= ARG_ANYTHING,
362};
363
364const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = {
365	.func		= bpf_sk_storage_get,
366	.gpl_only	= false,
367	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
368	.arg1_type	= ARG_CONST_MAP_PTR,
369	.arg2_type	= ARG_PTR_TO_CTX, /* context is 'struct sock' */
370	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
371	.arg4_type	= ARG_ANYTHING,
372};
373
374const struct bpf_func_proto bpf_sk_storage_delete_proto = {
375	.func		= bpf_sk_storage_delete,
376	.gpl_only	= false,
377	.ret_type	= RET_INTEGER,
378	.arg1_type	= ARG_CONST_MAP_PTR,
379	.arg2_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
380};
381
382static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
383{
384	const struct btf *btf_vmlinux;
385	const struct btf_type *t;
386	const char *tname;
387	u32 btf_id;
388
389	if (prog->aux->dst_prog)
390		return false;
391
 392	/* Ensure the tracing program is not tracing
 393	 * any bpf_sk_storage*() function while also
 394	 * using the bpf_sk_storage_(get|delete) helpers.
395	 */
396	switch (prog->expected_attach_type) {
397	case BPF_TRACE_ITER:
398	case BPF_TRACE_RAW_TP:
399		/* bpf_sk_storage has no trace point */
400		return true;
401	case BPF_TRACE_FENTRY:
402	case BPF_TRACE_FEXIT:
403		btf_vmlinux = bpf_get_btf_vmlinux();
404		btf_id = prog->aux->attach_btf_id;
405		t = btf_type_by_id(btf_vmlinux, btf_id);
406		tname = btf_name_by_offset(btf_vmlinux, t->name_off);
407		return !!strncmp(tname, "bpf_sk_storage",
408				 strlen("bpf_sk_storage"));
409	default:
410		return false;
411	}
412
413	return false;
414}
415
416BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
417	   void *, value, u64, flags)
418{
419	if (in_irq() || in_nmi())
420		return (unsigned long)NULL;
421
422	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
423}
424
425BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
426	   struct sock *, sk)
427{
428	if (in_irq() || in_nmi())
429		return -EPERM;
430
431	return ____bpf_sk_storage_delete(map, sk);
432}
433
434const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = {
435	.func		= bpf_sk_storage_get_tracing,
436	.gpl_only	= false,
437	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
438	.arg1_type	= ARG_CONST_MAP_PTR,
439	.arg2_type	= ARG_PTR_TO_BTF_ID,
440	.arg2_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
441	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
442	.arg4_type	= ARG_ANYTHING,
443	.allowed	= bpf_sk_storage_tracing_allowed,
444};
445
446const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = {
447	.func		= bpf_sk_storage_delete_tracing,
448	.gpl_only	= false,
449	.ret_type	= RET_INTEGER,
450	.arg1_type	= ARG_CONST_MAP_PTR,
451	.arg2_type	= ARG_PTR_TO_BTF_ID,
452	.arg2_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
453	.allowed	= bpf_sk_storage_tracing_allowed,
454};
455
456struct bpf_sk_storage_diag {
457	u32 nr_maps;
458	struct bpf_map *maps[];
459};
460
461/* The reply will be like:
462 * INET_DIAG_BPF_SK_STORAGES (nla_nest)
463 *	SK_DIAG_BPF_STORAGE (nla_nest)
464 *		SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
465 *		SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
466 *	SK_DIAG_BPF_STORAGE (nla_nest)
467 *		SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
468 *		SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
469 *	....
470 */
471static int nla_value_size(u32 value_size)
472{
473	/* SK_DIAG_BPF_STORAGE (nla_nest)
474	 *	SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
475	 *	SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
476	 */
477	return nla_total_size(0) + nla_total_size(sizeof(u32)) +
478		nla_total_size_64bit(value_size);
479}
480
481void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
482{
483	u32 i;
484
485	if (!diag)
486		return;
487
488	for (i = 0; i < diag->nr_maps; i++)
489		bpf_map_put(diag->maps[i]);
490
491	kfree(diag);
492}
493EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
494
495static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
496			   const struct bpf_map *map)
497{
498	u32 i;
499
500	for (i = 0; i < diag->nr_maps; i++) {
501		if (diag->maps[i] == map)
502			return true;
503	}
504
505	return false;
506}
507
508struct bpf_sk_storage_diag *
509bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
510{
511	struct bpf_sk_storage_diag *diag;
512	struct nlattr *nla;
513	u32 nr_maps = 0;
514	int rem, err;
515
 516	/* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN,
 517	 * as the map_alloc_check() side also enforces.
518	 */
519	if (!bpf_capable())
520		return ERR_PTR(-EPERM);
521
522	nla_for_each_nested(nla, nla_stgs, rem) {
523		if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
524			nr_maps++;
525	}
526
527	diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
528	if (!diag)
529		return ERR_PTR(-ENOMEM);
530
531	nla_for_each_nested(nla, nla_stgs, rem) {
532		struct bpf_map *map;
533		int map_fd;
534
535		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
536			continue;
537
538		map_fd = nla_get_u32(nla);
539		map = bpf_map_get(map_fd);
540		if (IS_ERR(map)) {
541			err = PTR_ERR(map);
542			goto err_free;
543		}
544		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
545			bpf_map_put(map);
546			err = -EINVAL;
547			goto err_free;
548		}
549		if (diag_check_dup(diag, map)) {
550			bpf_map_put(map);
551			err = -EEXIST;
552			goto err_free;
553		}
554		diag->maps[diag->nr_maps++] = map;
555	}
556
557	return diag;
558
559err_free:
560	bpf_sk_storage_diag_free(diag);
561	return ERR_PTR(err);
562}
563EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
564
565static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
566{
567	struct nlattr *nla_stg, *nla_value;
568	struct bpf_local_storage_map *smap;
569
570	/* It cannot exceed max nlattr's payload */
571	BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
572
573	nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
574	if (!nla_stg)
575		return -EMSGSIZE;
576
577	smap = rcu_dereference(sdata->smap);
578	if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
579		goto errout;
580
581	nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
582				      smap->map.value_size,
583				      SK_DIAG_BPF_STORAGE_PAD);
584	if (!nla_value)
585		goto errout;
586
587	if (map_value_has_spin_lock(&smap->map))
588		copy_map_value_locked(&smap->map, nla_data(nla_value),
589				      sdata->data, true);
590	else
591		copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
592
593	nla_nest_end(skb, nla_stg);
594	return 0;
595
596errout:
597	nla_nest_cancel(skb, nla_stg);
598	return -EMSGSIZE;
599}
600
601static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
602				       int stg_array_type,
603				       unsigned int *res_diag_size)
604{
605	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
606	unsigned int diag_size = nla_total_size(0);
607	struct bpf_local_storage *sk_storage;
608	struct bpf_local_storage_elem *selem;
609	struct bpf_local_storage_map *smap;
610	struct nlattr *nla_stgs;
611	unsigned int saved_len;
612	int err = 0;
613
614	rcu_read_lock();
615
616	sk_storage = rcu_dereference(sk->sk_bpf_storage);
617	if (!sk_storage || hlist_empty(&sk_storage->list)) {
618		rcu_read_unlock();
619		return 0;
620	}
621
622	nla_stgs = nla_nest_start(skb, stg_array_type);
623	if (!nla_stgs)
624		/* Continue to learn diag_size */
625		err = -EMSGSIZE;
626
627	saved_len = skb->len;
628	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
629		smap = rcu_dereference(SDATA(selem)->smap);
630		diag_size += nla_value_size(smap->map.value_size);
631
632		if (nla_stgs && diag_get(SDATA(selem), skb))
633			/* Continue to learn diag_size */
634			err = -EMSGSIZE;
635	}
636
637	rcu_read_unlock();
638
639	if (nla_stgs) {
640		if (saved_len == skb->len)
641			nla_nest_cancel(skb, nla_stgs);
642		else
643			nla_nest_end(skb, nla_stgs);
644	}
645
646	if (diag_size == nla_total_size(0)) {
647		*res_diag_size = 0;
648		return 0;
649	}
650
651	*res_diag_size = diag_size;
652	return err;
653}
654
655int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
656			    struct sock *sk, struct sk_buff *skb,
657			    int stg_array_type,
658			    unsigned int *res_diag_size)
659{
660	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
661	unsigned int diag_size = nla_total_size(0);
662	struct bpf_local_storage *sk_storage;
663	struct bpf_local_storage_data *sdata;
664	struct nlattr *nla_stgs;
665	unsigned int saved_len;
666	int err = 0;
667	u32 i;
668
669	*res_diag_size = 0;
670
671	/* No map has been specified.  Dump all. */
672	if (!diag->nr_maps)
673		return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
674						   res_diag_size);
675
676	rcu_read_lock();
677	sk_storage = rcu_dereference(sk->sk_bpf_storage);
678	if (!sk_storage || hlist_empty(&sk_storage->list)) {
679		rcu_read_unlock();
680		return 0;
681	}
682
683	nla_stgs = nla_nest_start(skb, stg_array_type);
684	if (!nla_stgs)
685		/* Continue to learn diag_size */
686		err = -EMSGSIZE;
687
688	saved_len = skb->len;
689	for (i = 0; i < diag->nr_maps; i++) {
690		sdata = bpf_local_storage_lookup(sk_storage,
691				(struct bpf_local_storage_map *)diag->maps[i],
692				false);
693
694		if (!sdata)
695			continue;
696
697		diag_size += nla_value_size(diag->maps[i]->value_size);
698
699		if (nla_stgs && diag_get(sdata, skb))
700			/* Continue to learn diag_size */
701			err = -EMSGSIZE;
702	}
703	rcu_read_unlock();
704
705	if (nla_stgs) {
706		if (saved_len == skb->len)
707			nla_nest_cancel(skb, nla_stgs);
708		else
709			nla_nest_end(skb, nla_stgs);
710	}
711
712	if (diag_size == nla_total_size(0)) {
713		*res_diag_size = 0;
714		return 0;
715	}
716
717	*res_diag_size = diag_size;
718	return err;
719}
720EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);
721
722struct bpf_iter_seq_sk_storage_map_info {
723	struct bpf_map *map;
724	unsigned int bucket_id;
725	unsigned skip_elems;
726};
727
728static struct bpf_local_storage_elem *
729bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
730				 struct bpf_local_storage_elem *prev_selem)
731	__acquires(RCU) __releases(RCU)
732{
733	struct bpf_local_storage *sk_storage;
734	struct bpf_local_storage_elem *selem;
735	u32 skip_elems = info->skip_elems;
736	struct bpf_local_storage_map *smap;
737	u32 bucket_id = info->bucket_id;
738	u32 i, count, n_buckets;
739	struct bpf_local_storage_map_bucket *b;
740
741	smap = (struct bpf_local_storage_map *)info->map;
742	n_buckets = 1U << smap->bucket_log;
743	if (bucket_id >= n_buckets)
744		return NULL;
745
746	/* try to find next selem in the same bucket */
747	selem = prev_selem;
748	count = 0;
749	while (selem) {
750		selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)),
751					 struct bpf_local_storage_elem, map_node);
752		if (!selem) {
753			/* not found, unlock and go to the next bucket */
754			b = &smap->buckets[bucket_id++];
755			rcu_read_unlock();
756			skip_elems = 0;
757			break;
758		}
759		sk_storage = rcu_dereference(selem->local_storage);
760		if (sk_storage) {
761			info->skip_elems = skip_elems + count;
762			return selem;
763		}
764		count++;
765	}
766
767	for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
768		b = &smap->buckets[i];
769		rcu_read_lock();
770		count = 0;
771		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
772			sk_storage = rcu_dereference(selem->local_storage);
773			if (sk_storage && count >= skip_elems) {
774				info->bucket_id = i;
775				info->skip_elems = count;
776				return selem;
777			}
778			count++;
779		}
780		rcu_read_unlock();
781		skip_elems = 0;
782	}
783
784	info->bucket_id = i;
785	info->skip_elems = 0;
786	return NULL;
787}
788
789static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
790{
791	struct bpf_local_storage_elem *selem;
792
793	selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
794	if (!selem)
795		return NULL;
796
797	if (*pos == 0)
798		++*pos;
799	return selem;
800}
801
802static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v,
803					 loff_t *pos)
804{
805	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
806
807	++*pos;
808	++info->skip_elems;
809	return bpf_sk_storage_map_seq_find_next(seq->private, v);
810}
811
812struct bpf_iter__bpf_sk_storage_map {
813	__bpf_md_ptr(struct bpf_iter_meta *, meta);
814	__bpf_md_ptr(struct bpf_map *, map);
815	__bpf_md_ptr(struct sock *, sk);
816	__bpf_md_ptr(void *, value);
817};
818
819DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
820		     struct bpf_map *map, struct sock *sk,
821		     void *value)
822
823static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
824					 struct bpf_local_storage_elem *selem)
825{
826	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
827	struct bpf_iter__bpf_sk_storage_map ctx = {};
828	struct bpf_local_storage *sk_storage;
829	struct bpf_iter_meta meta;
830	struct bpf_prog *prog;
831	int ret = 0;
832
833	meta.seq = seq;
834	prog = bpf_iter_get_info(&meta, selem == NULL);
835	if (prog) {
836		ctx.meta = &meta;
837		ctx.map = info->map;
838		if (selem) {
839			sk_storage = rcu_dereference(selem->local_storage);
840			ctx.sk = sk_storage->owner;
841			ctx.value = SDATA(selem)->data;
842		}
843		ret = bpf_iter_run_prog(prog, &ctx);
844	}
845
846	return ret;
847}
848
849static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
850{
851	return __bpf_sk_storage_map_seq_show(seq, v);
852}
853
854static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
855	__releases(RCU)
856{
857	if (!v)
858		(void)__bpf_sk_storage_map_seq_show(seq, v);
859	else
860		rcu_read_unlock();
861}
862
863static int bpf_iter_init_sk_storage_map(void *priv_data,
864					struct bpf_iter_aux_info *aux)
865{
866	struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
867
868	seq_info->map = aux->map;
869	return 0;
870}
871
872static int bpf_iter_attach_map(struct bpf_prog *prog,
873			       union bpf_iter_link_info *linfo,
874			       struct bpf_iter_aux_info *aux)
875{
876	struct bpf_map *map;
877	int err = -EINVAL;
878
879	if (!linfo->map.map_fd)
880		return -EBADF;
881
882	map = bpf_map_get_with_uref(linfo->map.map_fd);
883	if (IS_ERR(map))
884		return PTR_ERR(map);
885
886	if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
887		goto put_map;
888
889	if (prog->aux->max_rdonly_access > map->value_size) {
890		err = -EACCES;
891		goto put_map;
892	}
893
894	aux->map = map;
895	return 0;
896
897put_map:
898	bpf_map_put_with_uref(map);
899	return err;
900}
901
902static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
903{
904	bpf_map_put_with_uref(aux->map);
905}
906
907static const struct seq_operations bpf_sk_storage_map_seq_ops = {
908	.start  = bpf_sk_storage_map_seq_start,
909	.next   = bpf_sk_storage_map_seq_next,
910	.stop   = bpf_sk_storage_map_seq_stop,
911	.show   = bpf_sk_storage_map_seq_show,
912};
913
914static const struct bpf_iter_seq_info iter_seq_info = {
915	.seq_ops		= &bpf_sk_storage_map_seq_ops,
916	.init_seq_private	= bpf_iter_init_sk_storage_map,
917	.fini_seq_private	= NULL,
918	.seq_priv_size		= sizeof(struct bpf_iter_seq_sk_storage_map_info),
919};
920
921static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
922	.target			= "bpf_sk_storage_map",
923	.attach_target		= bpf_iter_attach_map,
924	.detach_target		= bpf_iter_detach_map,
925	.show_fdinfo		= bpf_iter_map_show_fdinfo,
926	.fill_link_info		= bpf_iter_map_fill_link_info,
927	.ctx_arg_info_size	= 2,
928	.ctx_arg_info		= {
929		{ offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
930		  PTR_TO_BTF_ID_OR_NULL },
931		{ offsetof(struct bpf_iter__bpf_sk_storage_map, value),
932		  PTR_TO_RDWR_BUF_OR_NULL },
933	},
934	.seq_info		= &iter_seq_info,
935};
936
937static int __init bpf_sk_storage_map_iter_init(void)
938{
939	bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id =
940		btf_sock_ids[BTF_SOCK_TYPE_SOCK];
941	return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info);
942}
943late_initcall(bpf_sk_storage_map_iter_init);
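
For reference, below is a minimal sketch of how a BPF program consumes the helpers implemented above. It is an illustration rather than part of this file: the map name, section name and __u64 value layout are assumptions, while the map type, the BPF_F_NO_PREALLOC requirement, the int key and the bpf_sk_storage_get() signature follow the checks and func protos in the listing.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical per-socket storage map: the key must be an int and
 * BPF_F_NO_PREALLOC is mandatory for BPF_MAP_TYPE_SK_STORAGE.
 */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} sk_stg_map SEC(".maps");

SEC("cgroup/sock_create")
int sk_stg_demo(struct bpf_sock *sk)
{
	__u64 *val;

	/* Create the per-socket value on first access; NULL is returned
	 * if the value cannot be created (e.g. the sk_omem_alloc charge
	 * fails).
	 */
	val = bpf_sk_storage_get(&sk_stg_map, sk, NULL,
				 BPF_SK_STORAGE_GET_F_CREATE);
	if (!val)
		return 1;

	__sync_fetch_and_add(val, 1);	/* arbitrary per-socket state */
	return 1;
}

char _license[] SEC("license") = "GPL";

In program types where the helper is exposed, deletion is the symmetric bpf_sk_storage_delete(&sk_stg_map, sk). From user space the same storage is reached through the syscall paths above (bpf_fd_sk_storage_lookup_elem/update_elem/delete_elem), which take a socket fd as the map key.
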
v5.9
   1// SPDX-License-Identifier: GPL-2.0
   2/* Copyright (c) 2019 Facebook  */
   3#include <linux/rculist.h>
   4#include <linux/list.h>
   5#include <linux/hash.h>
   6#include <linux/types.h>
   7#include <linux/spinlock.h>
   8#include <linux/bpf.h>
   9#include <linux/btf_ids.h>
  10#include <net/bpf_sk_storage.h>
  11#include <net/sock.h>
  12#include <uapi/linux/sock_diag.h>
  13#include <uapi/linux/btf.h>
  14
  15#define SK_STORAGE_CREATE_FLAG_MASK					\
  16	(BPF_F_NO_PREALLOC | BPF_F_CLONE)
  17
  18struct bucket {
  19	struct hlist_head list;
  20	raw_spinlock_t lock;
  21};
  22
  23/* The map is not the primary owner of a bpf_sk_storage_elem.
  24 * Instead, the sk->sk_bpf_storage is.
  25 *
  26 * The map (bpf_sk_storage_map) is for two purposes
  27 * 1. Define the size of the "sk local storage".  It is
  28 *    the map's value_size.
  29 *
  30 * 2. Maintain a list to keep track of all elems such
  31 *    that they can be cleaned up during the map destruction.
  32 *
  33 * When a bpf local storage is being looked up for a
  34 * particular sk,  the "bpf_map" pointer is actually used
  35 * as the "key" to search in the list of elem in
  36 * sk->sk_bpf_storage.
  37 *
  38 * Hence, consider sk->sk_bpf_storage is the mini-map
  39 * with the "bpf_map" pointer as the searching key.
  40 */
  41struct bpf_sk_storage_map {
  42	struct bpf_map map;
  43	/* Lookup elem does not require accessing the map.
  44	 *
  45	 * Updating/Deleting requires a bucket lock to
  46	 * link/unlink the elem from the map.  Multiple
  47	 * buckets are used to reduce contention.
  48	 */
  49	struct bucket *buckets;
  50	u32 bucket_log;
  51	u16 elem_size;
  52	u16 cache_idx;
  53};
  54
  55struct bpf_sk_storage_data {
  56	/* smap is used as the searching key when looking up
  57	 * from sk->sk_bpf_storage.
  58	 *
  59	 * Put it in the same cacheline as the data to minimize
  60	 * the number of cacheline accesses during the cache hit case.
  61	 */
  62	struct bpf_sk_storage_map __rcu *smap;
  63	u8 data[] __aligned(8);
  64};
  65
  66/* Linked to bpf_sk_storage and bpf_sk_storage_map */
  67struct bpf_sk_storage_elem {
  68	struct hlist_node map_node;	/* Linked to bpf_sk_storage_map */
  69	struct hlist_node snode;	/* Linked to bpf_sk_storage */
  70	struct bpf_sk_storage __rcu *sk_storage;
  71	struct rcu_head rcu;
  72	/* 8 bytes hole */
  73	/* The data is stored in another cacheline to minimize
  74	 * the number of cacheline accesses during a cache hit.
  75	 */
  76	struct bpf_sk_storage_data sdata ____cacheline_aligned;
  77};
  78
  79#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
  80#define SDATA(_SELEM) (&(_SELEM)->sdata)
  81#define BPF_SK_STORAGE_CACHE_SIZE	16
  82
  83static DEFINE_SPINLOCK(cache_idx_lock);
  84static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE];
  85
  86struct bpf_sk_storage {
  87	struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
  88	struct hlist_head list;	/* List of bpf_sk_storage_elem */
  89	struct sock *sk;	/* The sk that owns the above "list" of
  90				 * bpf_sk_storage_elem.
  91				 */
  92	struct rcu_head rcu;
  93	raw_spinlock_t lock;	/* Protect adding/removing from the "list" */
  94};
  95
  96static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
  97				    struct bpf_sk_storage_elem *selem)
  98{
  99	return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
 100}
 101
 102static int omem_charge(struct sock *sk, unsigned int size)
 103{
 104	/* same check as in sock_kmalloc() */
 105	if (size <= sysctl_optmem_max &&
 106	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
 107		atomic_add(size, &sk->sk_omem_alloc);
 108		return 0;
 109	}
 110
 111	return -ENOMEM;
 112}
 113
 114static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
 115{
 116	return !hlist_unhashed(&selem->snode);
 117}
 118
 119static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
 120{
 121	return !hlist_unhashed(&selem->map_node);
 122}
 123
 124static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
 125					       struct sock *sk, void *value,
 126					       bool charge_omem)
 127{
 128	struct bpf_sk_storage_elem *selem;
 129
 130	if (charge_omem && omem_charge(sk, smap->elem_size))
 131		return NULL;
 132
 133	selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
 134	if (selem) {
 135		if (value)
 136			memcpy(SDATA(selem)->data, value, smap->map.value_size);
 137		return selem;
 138	}
 139
 140	if (charge_omem)
 141		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
 142
 143	return NULL;
 144}
 145
 146/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
 147 * The caller must ensure selem->smap is still valid to be
 148 * dereferenced for its smap->elem_size and smap->cache_idx.
 149 */
 150static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
 151			      struct bpf_sk_storage_elem *selem,
 152			      bool uncharge_omem)
 153{
 154	struct bpf_sk_storage_map *smap;
 155	bool free_sk_storage;
 156	struct sock *sk;
 157
 158	smap = rcu_dereference(SDATA(selem)->smap);
 159	sk = sk_storage->sk;
 160
 161	/* All uncharging on sk->sk_omem_alloc must be done first.
 162	 * sk may be freed once the last selem is unlinked from sk_storage.
 163	 */
 164	if (uncharge_omem)
 165		atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
 166
 167	free_sk_storage = hlist_is_singular_node(&selem->snode,
 168						 &sk_storage->list);
 169	if (free_sk_storage) {
 170		atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
 171		sk_storage->sk = NULL;
 172		/* After this RCU_INIT, sk may be freed and cannot be used */
 173		RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
 174
 175		/* sk_storage is not freed now.  sk_storage->lock is
 176		 * still held and raw_spin_unlock_bh(&sk_storage->lock)
 177		 * will be done by the caller.
 178		 *
 179		 * Although the unlock will be done under
 180		 * rcu_read_lock(),  it is more intuitive to
 181		 * read if kfree_rcu(sk_storage, rcu) is done
 182		 * after the raw_spin_unlock_bh(&sk_storage->lock).
 183		 *
 184		 * Hence, a "bool free_sk_storage" is returned
 185		 * to the caller which then calls the kfree_rcu()
 186		 * after unlock.
 187		 */
 188	}
 189	hlist_del_init_rcu(&selem->snode);
 190	if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
 191	    SDATA(selem))
 192		RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);
 193
 194	kfree_rcu(selem, rcu);
 195
 196	return free_sk_storage;
 197}
 198
 199static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
 200{
 201	struct bpf_sk_storage *sk_storage;
 202	bool free_sk_storage = false;
 203
 204	if (unlikely(!selem_linked_to_sk(selem)))
 205		/* selem has already been unlinked from sk */
 206		return;
 207
 208	sk_storage = rcu_dereference(selem->sk_storage);
 209	raw_spin_lock_bh(&sk_storage->lock);
 210	if (likely(selem_linked_to_sk(selem)))
 211		free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
 212	raw_spin_unlock_bh(&sk_storage->lock);
 213
 214	if (free_sk_storage)
 215		kfree_rcu(sk_storage, rcu);
 216}
 217
 218static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
 219			    struct bpf_sk_storage_elem *selem)
 220{
 221	RCU_INIT_POINTER(selem->sk_storage, sk_storage);
 222	hlist_add_head(&selem->snode, &sk_storage->list);
 223}
 224
 225static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
 226{
 227	struct bpf_sk_storage_map *smap;
 228	struct bucket *b;
 229
 230	if (unlikely(!selem_linked_to_map(selem)))
 231		/* selem has already been unlinked from smap */
 232		return;
 233
 234	smap = rcu_dereference(SDATA(selem)->smap);
 235	b = select_bucket(smap, selem);
 236	raw_spin_lock_bh(&b->lock);
 237	if (likely(selem_linked_to_map(selem)))
 238		hlist_del_init_rcu(&selem->map_node);
 239	raw_spin_unlock_bh(&b->lock);
 240}
 241
 242static void selem_link_map(struct bpf_sk_storage_map *smap,
 243			   struct bpf_sk_storage_elem *selem)
 244{
 245	struct bucket *b = select_bucket(smap, selem);
 246
 247	raw_spin_lock_bh(&b->lock);
 248	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 249	hlist_add_head_rcu(&selem->map_node, &b->list);
 250	raw_spin_unlock_bh(&b->lock);
 251}
 252
 253static void selem_unlink(struct bpf_sk_storage_elem *selem)
 254{
 255	/* Always unlink from map before unlinking from sk_storage
 256	 * because selem will be freed after it is successfully
 257	 * unlinked from the sk_storage.
 258	 */
 259	selem_unlink_map(selem);
 260	selem_unlink_sk(selem);
 261}
 262
 263static struct bpf_sk_storage_data *
 264__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
 265		    struct bpf_sk_storage_map *smap,
 266		    bool cacheit_lockit)
 267{
 268	struct bpf_sk_storage_data *sdata;
 269	struct bpf_sk_storage_elem *selem;
 270
 271	/* Fast path (cache hit) */
 272	sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
 273	if (sdata && rcu_access_pointer(sdata->smap) == smap)
 274		return sdata;
 275
 276	/* Slow path (cache miss) */
 277	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
 278		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
 279			break;
 280
 281	if (!selem)
 282		return NULL;
 283
 284	sdata = SDATA(selem);
 285	if (cacheit_lockit) {
 286		/* spinlock is needed to avoid racing with the
 287		 * parallel delete.  Otherwise, publishing an already
 288		 * deleted sdata to the cache will become a use-after-free
 289		 * problem in the next __sk_storage_lookup().
 290		 */
 291		raw_spin_lock_bh(&sk_storage->lock);
 292		if (selem_linked_to_sk(selem))
 293			rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
 294					   sdata);
 295		raw_spin_unlock_bh(&sk_storage->lock);
 296	}
 297
 298	return sdata;
 299}
 300
 301static struct bpf_sk_storage_data *
 302sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
 303{
 304	struct bpf_sk_storage *sk_storage;
 305	struct bpf_sk_storage_map *smap;
 306
 307	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 308	if (!sk_storage)
 309		return NULL;
 310
 311	smap = (struct bpf_sk_storage_map *)map;
 312	return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
 313}
 314
 315static int check_flags(const struct bpf_sk_storage_data *old_sdata,
 316		       u64 map_flags)
 317{
 318	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
 319		/* elem already exists */
 320		return -EEXIST;
 321
 322	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
 323		/* elem doesn't exist, cannot update it */
 324		return -ENOENT;
 325
 326	return 0;
 327}
 328
 329static int sk_storage_alloc(struct sock *sk,
 330			    struct bpf_sk_storage_map *smap,
 331			    struct bpf_sk_storage_elem *first_selem)
 332{
 333	struct bpf_sk_storage *prev_sk_storage, *sk_storage;
 334	int err;
 335
 336	err = omem_charge(sk, sizeof(*sk_storage));
 337	if (err)
 338		return err;
 339
 340	sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
 341	if (!sk_storage) {
 342		err = -ENOMEM;
 343		goto uncharge;
 344	}
 345	INIT_HLIST_HEAD(&sk_storage->list);
 346	raw_spin_lock_init(&sk_storage->lock);
 347	sk_storage->sk = sk;
 348
 349	__selem_link_sk(sk_storage, first_selem);
 350	selem_link_map(smap, first_selem);
 351	/* Publish sk_storage to sk.  sk->sk_lock cannot be acquired.
 352	 * Hence, an atomic op is used to set sk->sk_bpf_storage
 353	 * from NULL to the newly allocated sk_storage ptr.
 354	 *
 355	 * From now on, the sk->sk_bpf_storage pointer is protected
 356	 * by the sk_storage->lock.  Hence,  when freeing
 357	 * the sk->sk_bpf_storage, the sk_storage->lock must
 358	 * be held before setting sk->sk_bpf_storage to NULL.
 359	 */
 360	prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
 361				  NULL, sk_storage);
 362	if (unlikely(prev_sk_storage)) {
 363		selem_unlink_map(first_selem);
 364		err = -EAGAIN;
 365		goto uncharge;
 366
 367		/* Note that even though first_selem was linked to smap's
 368		 * bucket->list, first_selem can be freed immediately
 369		 * (instead of kfree_rcu) because
 370		 * bpf_sk_storage_map_free() does a
 371		 * synchronize_rcu() before walking the bucket->list.
 372		 * Hence, no one is accessing selem from the
 373		 * bucket->list under rcu_read_lock().
 374		 */
 375	}
 376
 377	return 0;
 378
 379uncharge:
 380	kfree(sk_storage);
 381	atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
 382	return err;
 383}
 384
 385/* sk cannot be going away because it is linking new elem
 386 * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
 387 * Otherwise, it will become a leak (and other memory issues
 388 * during map destruction).
 389 */
 390static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
 391						     struct bpf_map *map,
 392						     void *value,
 393						     u64 map_flags)
 394{
 395	struct bpf_sk_storage_data *old_sdata = NULL;
 396	struct bpf_sk_storage_elem *selem;
 397	struct bpf_sk_storage *sk_storage;
 398	struct bpf_sk_storage_map *smap;
 399	int err;
 400
 401	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
 402	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
 403	    /* BPF_F_LOCK can only be used in a value with spin_lock */
 404	    unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
 405		return ERR_PTR(-EINVAL);
 406
 407	smap = (struct bpf_sk_storage_map *)map;
 408	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 409	if (!sk_storage || hlist_empty(&sk_storage->list)) {
 410		/* Very first elem for this sk */
 411		err = check_flags(NULL, map_flags);
 412		if (err)
 413			return ERR_PTR(err);
 414
 415		selem = selem_alloc(smap, sk, value, true);
 416		if (!selem)
 417			return ERR_PTR(-ENOMEM);
 418
 419		err = sk_storage_alloc(sk, smap, selem);
 420		if (err) {
 421			kfree(selem);
 422			atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
 423			return ERR_PTR(err);
 424		}
 425
 426		return SDATA(selem);
 427	}
 428
 429	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
 430		/* Hoping to find an old_sdata to do inline update
 431		 * such that it can avoid taking the sk_storage->lock
 432		 * and changing the lists.
 433		 */
 434		old_sdata = __sk_storage_lookup(sk_storage, smap, false);
 435		err = check_flags(old_sdata, map_flags);
 436		if (err)
 437			return ERR_PTR(err);
 438		if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
 439			copy_map_value_locked(map, old_sdata->data,
 440					      value, false);
 441			return old_sdata;
 442		}
 443	}
 444
 445	raw_spin_lock_bh(&sk_storage->lock);
 446
 447	/* Recheck sk_storage->list under sk_storage->lock */
 448	if (unlikely(hlist_empty(&sk_storage->list))) {
 449		/* A parallel del is happening and sk_storage is going
 450		 * away.  It has just been checked before, so very
 451		 * unlikely.  Return instead of retry to keep things
 452		 * simple.
 453		 */
 454		err = -EAGAIN;
 455		goto unlock_err;
 456	}
 457
 458	old_sdata = __sk_storage_lookup(sk_storage, smap, false);
 459	err = check_flags(old_sdata, map_flags);
 460	if (err)
 461		goto unlock_err;
 462
 463	if (old_sdata && (map_flags & BPF_F_LOCK)) {
 464		copy_map_value_locked(map, old_sdata->data, value, false);
 465		selem = SELEM(old_sdata);
 466		goto unlock;
 467	}
 468
 469	/* sk_storage->lock is held.  Hence, we are sure
 470	 * we can unlink and uncharge the old_sdata successfully
 471	 * later.  So, instead of charging the new selem now
 472	 * and then uncharging the old selem later (which may cause
 473	 * a potential but unnecessary charge failure), avoid taking
 474	 * a charge at all here (the "!old_sdata" check); the
 475	 * old_sdata will then not be uncharged later during __selem_unlink_sk().
 476	 */
 477	selem = selem_alloc(smap, sk, value, !old_sdata);
 478	if (!selem) {
 479		err = -ENOMEM;
 480		goto unlock_err;
 481	}
 482
 483	/* First, link the new selem to the map */
 484	selem_link_map(smap, selem);
 485
 486	/* Second, link (and publish) the new selem to sk_storage */
 487	__selem_link_sk(sk_storage, selem);
 488
 489	/* Third, remove old selem, SELEM(old_sdata) */
 490	if (old_sdata) {
 491		selem_unlink_map(SELEM(old_sdata));
 492		__selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
 493	}
 494
 495unlock:
 496	raw_spin_unlock_bh(&sk_storage->lock);
 497	return SDATA(selem);
 498
 499unlock_err:
 500	raw_spin_unlock_bh(&sk_storage->lock);
 501	return ERR_PTR(err);
 502}
 503
 504static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
 505{
 506	struct bpf_sk_storage_data *sdata;
 507
 508	sdata = sk_storage_lookup(sk, map, false);
 509	if (!sdata)
 510		return -ENOENT;
 511
 512	selem_unlink(SELEM(sdata));
 513
 514	return 0;
 515}
 516
 517static u16 cache_idx_get(void)
 518{
 519	u64 min_usage = U64_MAX;
 520	u16 i, res = 0;
 521
 522	spin_lock(&cache_idx_lock);
 523
 524	for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) {
 525		if (cache_idx_usage_counts[i] < min_usage) {
 526			min_usage = cache_idx_usage_counts[i];
 527			res = i;
 528
 529			/* Found a free cache_idx */
 530			if (!min_usage)
 531				break;
 532		}
 533	}
 534	cache_idx_usage_counts[res]++;
 535
 536	spin_unlock(&cache_idx_lock);
 537
 538	return res;
 539}
 540
 541static void cache_idx_free(u16 idx)
 542{
 543	spin_lock(&cache_idx_lock);
 544	cache_idx_usage_counts[idx]--;
 545	spin_unlock(&cache_idx_lock);
 546}
 547
 548/* Called by __sk_destruct() & bpf_sk_storage_clone() */
 549void bpf_sk_storage_free(struct sock *sk)
 550{
 551	struct bpf_sk_storage_elem *selem;
 552	struct bpf_sk_storage *sk_storage;
 553	bool free_sk_storage = false;
 554	struct hlist_node *n;
 555
 556	rcu_read_lock();
 557	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 558	if (!sk_storage) {
 559		rcu_read_unlock();
 560		return;
 561	}
 562
 563	/* Neither the bpf_prog nor the bpf-map's syscall
 564	 * could be modifying the sk_storage->list now.
 565	 * Thus, no elem can be added-to or deleted-from the
 566	 * sk_storage->list by the bpf_prog or by the bpf-map's syscall.
 567	 *
 568	 * It is racing with bpf_sk_storage_map_free() alone
 569	 * when unlinking elem from the sk_storage->list and
 570	 * the map's bucket->list.
 571	 */
 572	raw_spin_lock_bh(&sk_storage->lock);
 573	hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) {
 574		/* Always unlink from map before unlinking from
 575		 * sk_storage.
 576		 */
 577		selem_unlink_map(selem);
 578		free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
 579	}
 580	raw_spin_unlock_bh(&sk_storage->lock);
 581	rcu_read_unlock();
 582
 583	if (free_sk_storage)
 584		kfree_rcu(sk_storage, rcu);
 585}
 586
 587static void bpf_sk_storage_map_free(struct bpf_map *map)
 588{
 589	struct bpf_sk_storage_elem *selem;
 590	struct bpf_sk_storage_map *smap;
 591	struct bucket *b;
 592	unsigned int i;
 593
 594	smap = (struct bpf_sk_storage_map *)map;
 595
 596	cache_idx_free(smap->cache_idx);
 597
 598	/* Note that this map might be concurrently cloned from
 599	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
 600	 * RCU read section to finish before proceeding. New RCU
 601	 * read sections should be prevented via bpf_map_inc_not_zero.
 602	 */
 603	synchronize_rcu();
 604
 605	/* bpf prog and the userspace can no longer access this map
 606	 * now.  No new selem (of this map) can be added
 607	 * to the sk->sk_bpf_storage or to the map bucket's list.
 608	 *
 609	 * The elem of this map can be cleaned up here
 610	 * or
 611	 * by bpf_sk_storage_free() during __sk_destruct().
 612	 */
 613	for (i = 0; i < (1U << smap->bucket_log); i++) {
 614		b = &smap->buckets[i];
 615
 616		rcu_read_lock();
 617		/* No one is adding to b->list now */
 618		while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
 619						 struct bpf_sk_storage_elem,
 620						 map_node))) {
 621			selem_unlink(selem);
 622			cond_resched_rcu();
 623		}
 624		rcu_read_unlock();
 625	}
 626
 627	/* bpf_sk_storage_free() may still need to access the map.
 628	 * e.g. bpf_sk_storage_free() has unlinked selem from the map
 629	 * which then made the above while((selem = ...)) loop
 630	 * exit immediately.
 631	 *
 632	 * However, the bpf_sk_storage_free() still needs to access
 633	 * the smap->elem_size to do the uncharging in
 634	 * __selem_unlink_sk().
 635	 *
 636	 * Hence, wait another rcu grace period for the
 637	 * bpf_sk_storage_free() to finish.
 638	 */
 639	synchronize_rcu();
 640
 641	kvfree(smap->buckets);
 642	kfree(map);
 643}
 644
 645/* U16_MAX is much more than enough for sk local storage
 646 * considering a tcp_sock is ~2k.
 647 */
 648#define MAX_VALUE_SIZE							\
 649	min_t(u32,							\
 650	      (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
 651	      (U16_MAX - sizeof(struct bpf_sk_storage_elem)))
 652
 653static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 654{
 655	if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
 656	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
 657	    attr->max_entries ||
 658	    attr->key_size != sizeof(int) || !attr->value_size ||
 659	    /* Enforce BTF for userspace sk dumping */
 660	    !attr->btf_key_type_id || !attr->btf_value_type_id)
 661		return -EINVAL;
 662
 663	if (!bpf_capable())
 664		return -EPERM;
 665
 666	if (attr->value_size > MAX_VALUE_SIZE)
 667		return -E2BIG;
 668
 669	return 0;
 670}
 671
 672static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 673{
 674	struct bpf_sk_storage_map *smap;
 675	unsigned int i;
 676	u32 nbuckets;
 677	u64 cost;
 678	int ret;
 679
 680	smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
 681	if (!smap)
 682		return ERR_PTR(-ENOMEM);
 683	bpf_map_init_from_attr(&smap->map, attr);
 684
 685	nbuckets = roundup_pow_of_two(num_possible_cpus());
 686	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
 687	nbuckets = max_t(u32, 2, nbuckets);
 688	smap->bucket_log = ilog2(nbuckets);
 689	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
 690
 691	ret = bpf_map_charge_init(&smap->map.memory, cost);
 692	if (ret < 0) {
 693		kfree(smap);
 694		return ERR_PTR(ret);
 695	}
 696
 697	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
 698				 GFP_USER | __GFP_NOWARN);
 699	if (!smap->buckets) {
 700		bpf_map_charge_finish(&smap->map.memory);
 701		kfree(smap);
 702		return ERR_PTR(-ENOMEM);
 703	}
 704
 705	for (i = 0; i < nbuckets; i++) {
 706		INIT_HLIST_HEAD(&smap->buckets[i].list);
 707		raw_spin_lock_init(&smap->buckets[i].lock);
 708	}
 709
 710	smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
 711	smap->cache_idx = cache_idx_get();
 712
 713	return &smap->map;
 714}
 715
 716static int notsupp_get_next_key(struct bpf_map *map, void *key,
 717				void *next_key)
 718{
 719	return -ENOTSUPP;
 720}
 721
 722static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
 723					const struct btf *btf,
 724					const struct btf_type *key_type,
 725					const struct btf_type *value_type)
 726{
 727	u32 int_data;
 728
 729	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
 730		return -EINVAL;
 731
 732	int_data = *(u32 *)(key_type + 1);
 733	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
 734		return -EINVAL;
 735
 736	return 0;
 737}
 738
 739static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
 740{
 741	struct bpf_sk_storage_data *sdata;
 742	struct socket *sock;
 743	int fd, err;
 744
 745	fd = *(int *)key;
 746	sock = sockfd_lookup(fd, &err);
 747	if (sock) {
 748		sdata = sk_storage_lookup(sock->sk, map, true);
 749		sockfd_put(sock);
 750		return sdata ? sdata->data : NULL;
 751	}
 752
 753	return ERR_PTR(err);
 754}
 755
 756static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 757					 void *value, u64 map_flags)
 758{
 759	struct bpf_sk_storage_data *sdata;
 760	struct socket *sock;
 761	int fd, err;
 762
 763	fd = *(int *)key;
 764	sock = sockfd_lookup(fd, &err);
 765	if (sock) {
 766		sdata = sk_storage_update(sock->sk, map, value, map_flags);
 767		sockfd_put(sock);
 768		return PTR_ERR_OR_ZERO(sdata);
 769	}
 770
 771	return err;
 772}
 773
 774static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
 775{
 776	struct socket *sock;
 777	int fd, err;
 778
 779	fd = *(int *)key;
 780	sock = sockfd_lookup(fd, &err);
 781	if (sock) {
 782		err = sk_storage_delete(sock->sk, map);
 783		sockfd_put(sock);
 784		return err;
 785	}
 786
 787	return err;
 788}
 789
 790static struct bpf_sk_storage_elem *
 791bpf_sk_storage_clone_elem(struct sock *newsk,
 792			  struct bpf_sk_storage_map *smap,
 793			  struct bpf_sk_storage_elem *selem)
 794{
 795	struct bpf_sk_storage_elem *copy_selem;
 796
 797	copy_selem = selem_alloc(smap, newsk, NULL, true);
 798	if (!copy_selem)
 799		return NULL;
 800
 801	if (map_value_has_spin_lock(&smap->map))
 802		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
 803				      SDATA(selem)->data, true);
 804	else
 805		copy_map_value(&smap->map, SDATA(copy_selem)->data,
 806			       SDATA(selem)->data);
 807
 808	return copy_selem;
 809}
 810
 811int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 812{
 813	struct bpf_sk_storage *new_sk_storage = NULL;
 814	struct bpf_sk_storage *sk_storage;
 815	struct bpf_sk_storage_elem *selem;
 816	int ret = 0;
 817
 818	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
 819
 820	rcu_read_lock();
 821	sk_storage = rcu_dereference(sk->sk_bpf_storage);
 822
 823	if (!sk_storage || hlist_empty(&sk_storage->list))
 824		goto out;
 825
 826	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
 827		struct bpf_sk_storage_elem *copy_selem;
 828		struct bpf_sk_storage_map *smap;
 829		struct bpf_map *map;
 830
 831		smap = rcu_dereference(SDATA(selem)->smap);
 832		if (!(smap->map.map_flags & BPF_F_CLONE))
 833			continue;
 834
 835		/* Note that for lockless listeners adding new element
 836		 * here can race with cleanup in bpf_sk_storage_map_free.
 837		 * Try to grab map refcnt to make sure that it's still
 838		 * alive and prevent concurrent removal.
 839		 */
 840		map = bpf_map_inc_not_zero(&smap->map);
 841		if (IS_ERR(map))
 842			continue;
 843
 844		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
 845		if (!copy_selem) {
 846			ret = -ENOMEM;
 847			bpf_map_put(map);
 848			goto out;
 849		}
 850
 851		if (new_sk_storage) {
 852			selem_link_map(smap, copy_selem);
 853			__selem_link_sk(new_sk_storage, copy_selem);
 854		} else {
 855			ret = sk_storage_alloc(newsk, smap, copy_selem);
 856			if (ret) {
 857				kfree(copy_selem);
 858				atomic_sub(smap->elem_size,
 859					   &newsk->sk_omem_alloc);
 860				bpf_map_put(map);
 861				goto out;
 862			}
 863
 864			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
 865		}
 866		bpf_map_put(map);
 867	}
 868
 869out:
 870	rcu_read_unlock();
 871
 872	/* In case of an error, don't free anything explicitly here, the
 873	 * caller is responsible to call bpf_sk_storage_free.
 874	 */
 875
 876	return ret;
 877}
 878
 879BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 880	   void *, value, u64, flags)
 881{
 882	struct bpf_sk_storage_data *sdata;
 883
 884	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
 885		return (unsigned long)NULL;
 886
 887	sdata = sk_storage_lookup(sk, map, true);
 888	if (sdata)
 889		return (unsigned long)sdata->data;
 890
 891	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
 892	    /* Cannot add new elem to a going away sk.
 893	     * Otherwise, the new elem may become a leak
 894	     * (and also other memory issues during map
 895	     *  destruction).
 896	     */
 897	    refcount_inc_not_zero(&sk->sk_refcnt)) {
 898		sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
 899		/* sk must be a fullsock (guaranteed by verifier),
 900		 * so sock_gen_put() is unnecessary.
 901		 */
 902		sock_put(sk);
 903		return IS_ERR(sdata) ?
 904			(unsigned long)NULL : (unsigned long)sdata->data;
 905	}
 906
 907	return (unsigned long)NULL;
 908}
 909
 910BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
 911{
 912	if (refcount_inc_not_zero(&sk->sk_refcnt)) {
 913		int err;
 914
 915		err = sk_storage_delete(sk, map);
 916		sock_put(sk);
 917		return err;
 918	}
 919
 920	return -ENOENT;
 921}
 922
 923static int sk_storage_map_btf_id;
 924const struct bpf_map_ops sk_storage_map_ops = {
 925	.map_alloc_check = bpf_sk_storage_map_alloc_check,
 926	.map_alloc = bpf_sk_storage_map_alloc,
 927	.map_free = bpf_sk_storage_map_free,
 928	.map_get_next_key = notsupp_get_next_key,
 929	.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
 930	.map_update_elem = bpf_fd_sk_storage_update_elem,
 931	.map_delete_elem = bpf_fd_sk_storage_delete_elem,
 932	.map_check_btf = bpf_sk_storage_map_check_btf,
 933	.map_btf_name = "bpf_sk_storage_map",
 934	.map_btf_id = &sk_storage_map_btf_id,
 935};
 936
 937const struct bpf_func_proto bpf_sk_storage_get_proto = {
 938	.func		= bpf_sk_storage_get,
 939	.gpl_only	= false,
 940	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 941	.arg1_type	= ARG_CONST_MAP_PTR,
 942	.arg2_type	= ARG_PTR_TO_SOCKET,
 943	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
 944	.arg4_type	= ARG_ANYTHING,
 945};
 946
 947const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = {
 948	.func		= bpf_sk_storage_get,
 949	.gpl_only	= false,
 950	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 951	.arg1_type	= ARG_CONST_MAP_PTR,
 952	.arg2_type	= ARG_PTR_TO_CTX, /* context is 'struct sock' */
 953	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,
 954	.arg4_type	= ARG_ANYTHING,
 955};
 956
 957const struct bpf_func_proto bpf_sk_storage_delete_proto = {
 958	.func		= bpf_sk_storage_delete,
 959	.gpl_only	= false,
 960	.ret_type	= RET_INTEGER,
 961	.arg1_type	= ARG_CONST_MAP_PTR,
 962	.arg2_type	= ARG_PTR_TO_SOCKET,
 963};
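A minimal BPF-side sketch (a separate program object, not part of this file) of bpf_sk_storage_get() as exposed through the cg_sock proto above, where the program context itself is the socket; the map name and the timestamp value are illustrative assumptions:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} sk_stg_map SEC(".maps");

SEC("cgroup/sock_create")
int stamp_sock(struct bpf_sock *ctx)
{
	__u64 *created_ns;

	/* Create the per-socket slot on first use (F_CREATE). */
	created_ns = bpf_sk_storage_get(&sk_stg_map, ctx, NULL,
					BPF_SK_STORAGE_GET_F_CREATE);
	if (created_ns)
		*created_ns = bpf_ktime_get_ns();

	return 1;	/* allow the socket */
}

char _license[] SEC("license") = "GPL";

Program types that receive a real socket pointer (ARG_PTR_TO_SOCKET above) can likewise call bpf_sk_storage_delete() to drop the element early; otherwise the storage is freed together with the socket.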
 964
 965struct bpf_sk_storage_diag {
 966	u32 nr_maps;
 967	struct bpf_map *maps[];
 968};
 969
 970/* The reply will be like:
 971 * INET_DIAG_BPF_SK_STORAGES (nla_nest)
 972 *	SK_DIAG_BPF_STORAGE (nla_nest)
 973 *		SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
 974 *		SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
 975 *	SK_DIAG_BPF_STORAGE (nla_nest)
 976 *		SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
 977 *		SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
 978 *	....
 979 */
 980static int nla_value_size(u32 value_size)
 981{
 982	/* SK_DIAG_BPF_STORAGE (nla_nest)
 983	 *	SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
 984	 *	SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
 985	 */
 986	return nla_total_size(0) + nla_total_size(sizeof(u32)) +
 987		nla_total_size_64bit(value_size);
 988}
 989
 990void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
 991{
 992	u32 i;
 993
 994	if (!diag)
 995		return;
 996
 997	for (i = 0; i < diag->nr_maps; i++)
 998		bpf_map_put(diag->maps[i]);
 999
1000	kfree(diag);
1001}
1002EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
1003
1004static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
1005			   const struct bpf_map *map)
1006{
1007	u32 i;
1008
1009	for (i = 0; i < diag->nr_maps; i++) {
1010		if (diag->maps[i] == map)
1011			return true;
1012	}
1013
1014	return false;
1015}
1016
1017struct bpf_sk_storage_diag *
1018bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
1019{
1020	struct bpf_sk_storage_diag *diag;
1021	struct nlattr *nla;
1022	u32 nr_maps = 0;
1023	int rem, err;
1024
1025	/* bpf_sk_storage_map is currently limited to bpf_capable()
1026	 * (CAP_BPF or CAP_SYS_ADMIN), matching the map_alloc_check() side.
1027	 */
1028	if (!bpf_capable())
1029		return ERR_PTR(-EPERM);
1030
1031	nla_for_each_nested(nla, nla_stgs, rem) {
1032		if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
1033			nr_maps++;
1034	}
1035
1036	diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
1037		       GFP_KERNEL);
1038	if (!diag)
1039		return ERR_PTR(-ENOMEM);
1040
1041	nla_for_each_nested(nla, nla_stgs, rem) {
1042		struct bpf_map *map;
1043		int map_fd;
1044
1045		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
1046			continue;
1047
1048		map_fd = nla_get_u32(nla);
1049		map = bpf_map_get(map_fd);
1050		if (IS_ERR(map)) {
1051			err = PTR_ERR(map);
1052			goto err_free;
1053		}
1054		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
1055			bpf_map_put(map);
1056			err = -EINVAL;
1057			goto err_free;
1058		}
1059		if (diag_check_dup(diag, map)) {
1060			bpf_map_put(map);
1061			err = -EEXIST;
1062			goto err_free;
1063		}
1064		diag->maps[diag->nr_maps++] = map;
1065	}
1066
1067	return diag;
1068
1069err_free:
1070	bpf_sk_storage_diag_free(diag);
1071	return ERR_PTR(err);
1072}
1073EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
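For reference, the nla_stgs nest parsed above carries one SK_DIAG_BPF_STORAGE_REQ_MAP_FD attribute per requested map. A userspace sketch of building that request attribute, assuming libmnl and an inet_diag request message already started in nlh:

#include <libmnl/libmnl.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>

static void put_bpf_stg_req(struct nlmsghdr *nlh, int map_fd)
{
	struct nlattr *nest;

	/* INET_DIAG_REQ_SK_BPF_STORAGES wraps the map-fd attributes. */
	nest = mnl_attr_nest_start(nlh, INET_DIAG_REQ_SK_BPF_STORAGES);
	mnl_attr_put_u32(nlh, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, map_fd);
	mnl_attr_nest_end(nlh, nest);
}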
1074
1075static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
1076{
1077	struct nlattr *nla_stg, *nla_value;
1078	struct bpf_sk_storage_map *smap;
1079
1080	/* It cannot exceed max nlattr's payload */
1081	BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
1082
1083	nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
1084	if (!nla_stg)
1085		return -EMSGSIZE;
1086
1087	smap = rcu_dereference(sdata->smap);
1088	if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
1089		goto errout;
1090
1091	nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
1092				      smap->map.value_size,
1093				      SK_DIAG_BPF_STORAGE_PAD);
1094	if (!nla_value)
1095		goto errout;
1096
1097	if (map_value_has_spin_lock(&smap->map))
1098		copy_map_value_locked(&smap->map, nla_data(nla_value),
1099				      sdata->data, true);
1100	else
1101		copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
1102
1103	nla_nest_end(skb, nla_stg);
1104	return 0;
1105
1106errout:
1107	nla_nest_cancel(skb, nla_stg);
1108	return -EMSGSIZE;
1109}
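On the receive side, each SK_DIAG_BPF_STORAGE nest emitted by diag_get() carries a map id and the raw map value, as the reply-layout comment above describes. A minimal userspace parsing sketch, assuming libmnl; what the caller does with map_id and value is left out:

#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/sock_diag.h>

static void parse_one_storage(const struct nlattr *stg_nest)
{
	struct nlattr *attr;
	const void *value = NULL;
	uint32_t map_id = 0;

	mnl_attr_for_each_nested(attr, stg_nest) {
		switch (mnl_attr_get_type(attr)) {
		case SK_DIAG_BPF_STORAGE_MAP_ID:
			map_id = mnl_attr_get_u32(attr);
			break;
		case SK_DIAG_BPF_STORAGE_MAP_VALUE:
			value = mnl_attr_get_payload(attr);
			break;
		}
	}
	/* map_id and value (mnl_attr_get_payload_len() bytes) are now usable. */
	(void)map_id;
	(void)value;
}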
1110
1111static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
1112				       int stg_array_type,
1113				       unsigned int *res_diag_size)
1114{
1115	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
1116	unsigned int diag_size = nla_total_size(0);
1117	struct bpf_sk_storage *sk_storage;
1118	struct bpf_sk_storage_elem *selem;
1119	struct bpf_sk_storage_map *smap;
1120	struct nlattr *nla_stgs;
1121	unsigned int saved_len;
1122	int err = 0;
1123
1124	rcu_read_lock();
1125
1126	sk_storage = rcu_dereference(sk->sk_bpf_storage);
1127	if (!sk_storage || hlist_empty(&sk_storage->list)) {
1128		rcu_read_unlock();
1129		return 0;
1130	}
1131
1132	nla_stgs = nla_nest_start(skb, stg_array_type);
1133	if (!nla_stgs)
1134		/* Continue to learn diag_size */
1135		err = -EMSGSIZE;
1136
1137	saved_len = skb->len;
1138	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
1139		smap = rcu_dereference(SDATA(selem)->smap);
1140		diag_size += nla_value_size(smap->map.value_size);
1141
1142		if (nla_stgs && diag_get(SDATA(selem), skb))
1143			/* Continue to learn diag_size */
1144			err = -EMSGSIZE;
1145	}
1146
1147	rcu_read_unlock();
1148
1149	if (nla_stgs) {
1150		if (saved_len == skb->len)
1151			nla_nest_cancel(skb, nla_stgs);
1152		else
1153			nla_nest_end(skb, nla_stgs);
1154	}
1155
1156	if (diag_size == nla_total_size(0)) {
1157		*res_diag_size = 0;
1158		return 0;
1159	}
1160
1161	*res_diag_size = diag_size;
1162	return err;
1163}
1164
1165int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
1166			    struct sock *sk, struct sk_buff *skb,
1167			    int stg_array_type,
1168			    unsigned int *res_diag_size)
1169{
1170	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
1171	unsigned int diag_size = nla_total_size(0);
1172	struct bpf_sk_storage *sk_storage;
1173	struct bpf_sk_storage_data *sdata;
1174	struct nlattr *nla_stgs;
1175	unsigned int saved_len;
1176	int err = 0;
1177	u32 i;
1178
1179	*res_diag_size = 0;
1180
1181	/* No map has been specified.  Dump all. */
1182	if (!diag->nr_maps)
1183		return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
1184						   res_diag_size);
1185
1186	rcu_read_lock();
1187	sk_storage = rcu_dereference(sk->sk_bpf_storage);
1188	if (!sk_storage || hlist_empty(&sk_storage->list)) {
1189		rcu_read_unlock();
1190		return 0;
1191	}
1192
1193	nla_stgs = nla_nest_start(skb, stg_array_type);
1194	if (!nla_stgs)
1195		/* Continue to learn diag_size */
1196		err = -EMSGSIZE;
1197
1198	saved_len = skb->len;
1199	for (i = 0; i < diag->nr_maps; i++) {
1200		sdata = __sk_storage_lookup(sk_storage,
1201				(struct bpf_sk_storage_map *)diag->maps[i],
1202				false);
1203
1204		if (!sdata)
1205			continue;
1206
1207		diag_size += nla_value_size(diag->maps[i]->value_size);
1208
1209		if (nla_stgs && diag_get(sdata, skb))
1210			/* Continue to learn diag_size */
1211			err = -EMSGSIZE;
1212	}
1213	rcu_read_unlock();
1214
1215	if (nla_stgs) {
1216		if (saved_len == skb->len)
1217			nla_nest_cancel(skb, nla_stgs);
1218		else
1219			nla_nest_end(skb, nla_stgs);
1220	}
1221
1222	if (diag_size == nla_total_size(0)) {
1223		*res_diag_size = 0;
1224		return 0;
1225	}
1226
1227	*res_diag_size = diag_size;
1228	return err;
1229}
1230EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);
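A simplified sketch of how a sock_diag dumper might consume the three exported entry points above (see inet_diag for the in-tree user); the handler shape and the per-socket allocation here are illustrative only, not the real inet_diag flow, and INET_DIAG_BPF_SK_STORAGES comes from <linux/inet_diag.h>:

static int example_dump_bpf_stgs(struct sock *sk, struct sk_buff *skb,
				 const struct nlattr *bpf_stgs_req)
{
	struct bpf_sk_storage_diag *diag;
	unsigned int size = 0;
	int err;

	diag = bpf_sk_storage_diag_alloc(bpf_stgs_req);
	if (IS_ERR(diag))
		return PTR_ERR(diag);

	/* Emits one INET_DIAG_BPF_SK_STORAGES nest for this socket;
	 * on -EMSGSIZE, "size" still reports the space that is needed.
	 */
	err = bpf_sk_storage_diag_put(diag, sk, skb,
				      INET_DIAG_BPF_SK_STORAGES, &size);

	bpf_sk_storage_diag_free(diag);
	return err;
}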
1231
1232struct bpf_iter_seq_sk_storage_map_info {
1233	struct bpf_map *map;
1234	unsigned int bucket_id;
1235	unsigned int skip_elems;
1236};
1237
1238static struct bpf_sk_storage_elem *
1239bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
1240				 struct bpf_sk_storage_elem *prev_selem)
1241{
1242	struct bpf_sk_storage *sk_storage;
1243	struct bpf_sk_storage_elem *selem;
1244	u32 skip_elems = info->skip_elems;
1245	struct bpf_sk_storage_map *smap;
1246	u32 bucket_id = info->bucket_id;
1247	u32 i, count, n_buckets;
1248	struct bucket *b;
1249
1250	smap = (struct bpf_sk_storage_map *)info->map;
1251	n_buckets = 1U << smap->bucket_log;
1252	if (bucket_id >= n_buckets)
1253		return NULL;
1254
1255	/* try to find next selem in the same bucket */
1256	selem = prev_selem;
1257	count = 0;
1258	while (selem) {
1259		selem = hlist_entry_safe(selem->map_node.next,
1260					 struct bpf_sk_storage_elem, map_node);
1261		if (!selem) {
1262			/* not found, unlock and go to the next bucket */
1263			b = &smap->buckets[bucket_id++];
1264			raw_spin_unlock_bh(&b->lock);
1265			skip_elems = 0;
1266			break;
1267		}
1268		sk_storage = rcu_dereference_raw(selem->sk_storage);
1269		if (sk_storage) {
1270			info->skip_elems = skip_elems + count;
1271			return selem;
1272		}
1273		count++;
1274	}
1275
1276	for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
1277		b = &smap->buckets[i];
1278		raw_spin_lock_bh(&b->lock);
1279		count = 0;
1280		hlist_for_each_entry(selem, &b->list, map_node) {
1281			sk_storage = rcu_dereference_raw(selem->sk_storage);
1282			if (sk_storage && count >= skip_elems) {
1283				info->bucket_id = i;
1284				info->skip_elems = count;
1285				return selem;
1286			}
1287			count++;
1288		}
1289		raw_spin_unlock_bh(&b->lock);
1290		skip_elems = 0;
1291	}
1292
1293	info->bucket_id = i;
1294	info->skip_elems = 0;
1295	return NULL;
1296}
1297
1298static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
1299{
1300	struct bpf_sk_storage_elem *selem;
1301
1302	selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
1303	if (!selem)
1304		return NULL;
1305
1306	if (*pos == 0)
1307		++*pos;
1308	return selem;
1309}
1310
1311static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v,
1312					 loff_t *pos)
1313{
1314	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
1315
1316	++*pos;
1317	++info->skip_elems;
1318	return bpf_sk_storage_map_seq_find_next(seq->private, v);
1319}
1320
1321struct bpf_iter__bpf_sk_storage_map {
1322	__bpf_md_ptr(struct bpf_iter_meta *, meta);
1323	__bpf_md_ptr(struct bpf_map *, map);
1324	__bpf_md_ptr(struct sock *, sk);
1325	__bpf_md_ptr(void *, value);
1326};
1327
1328DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
1329		     struct bpf_map *map, struct sock *sk,
1330		     void *value)
1331
1332static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
1333					 struct bpf_sk_storage_elem *selem)
1334{
1335	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
1336	struct bpf_iter__bpf_sk_storage_map ctx = {};
1337	struct bpf_sk_storage *sk_storage;
1338	struct bpf_iter_meta meta;
1339	struct bpf_prog *prog;
1340	int ret = 0;
1341
1342	meta.seq = seq;
1343	prog = bpf_iter_get_info(&meta, selem == NULL);
1344	if (prog) {
1345		ctx.meta = &meta;
1346		ctx.map = info->map;
1347		if (selem) {
1348			sk_storage = rcu_dereference_raw(selem->sk_storage);
1349			ctx.sk = sk_storage->sk;
1350			ctx.value = SDATA(selem)->data;
1351		}
1352		ret = bpf_iter_run_prog(prog, &ctx);
1353	}
1354
1355	return ret;
1356}
1357
1358static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
1359{
1360	return __bpf_sk_storage_map_seq_show(seq, v);
1361}
1362
1363static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
1364{
1365	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
1366	struct bpf_sk_storage_map *smap;
1367	struct bucket *b;
1368
1369	if (!v) {
1370		(void)__bpf_sk_storage_map_seq_show(seq, v);
1371	} else {
1372		smap = (struct bpf_sk_storage_map *)info->map;
1373		b = &smap->buckets[info->bucket_id];
1374		raw_spin_unlock_bh(&b->lock);
1375	}
1376}
1377
1378static int bpf_iter_init_sk_storage_map(void *priv_data,
1379					struct bpf_iter_aux_info *aux)
1380{
1381	struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
1382
1383	seq_info->map = aux->map;
1384	return 0;
1385}
1386
1387static int bpf_iter_attach_map(struct bpf_prog *prog,
1388			       union bpf_iter_link_info *linfo,
1389			       struct bpf_iter_aux_info *aux)
1390{
1391	struct bpf_map *map;
1392	int err = -EINVAL;
1393
1394	if (!linfo->map.map_fd)
1395		return -EBADF;
1396
1397	map = bpf_map_get_with_uref(linfo->map.map_fd);
1398	if (IS_ERR(map))
1399		return PTR_ERR(map);
1400
1401	if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
1402		goto put_map;
1403
1404	if (prog->aux->max_rdonly_access > map->value_size) {
1405		err = -EACCES;
1406		goto put_map;
1407	}
1408
1409	aux->map = map;
1410	return 0;
1411
1412put_map:
1413	bpf_map_put_with_uref(map);
1414	return err;
1415}
1416
1417static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
1418{
1419	bpf_map_put_with_uref(aux->map);
1420}
1421
1422static const struct seq_operations bpf_sk_storage_map_seq_ops = {
1423	.start  = bpf_sk_storage_map_seq_start,
1424	.next   = bpf_sk_storage_map_seq_next,
1425	.stop   = bpf_sk_storage_map_seq_stop,
1426	.show   = bpf_sk_storage_map_seq_show,
1427};
1428
1429static const struct bpf_iter_seq_info iter_seq_info = {
1430	.seq_ops		= &bpf_sk_storage_map_seq_ops,
1431	.init_seq_private	= bpf_iter_init_sk_storage_map,
1432	.fini_seq_private	= NULL,
1433	.seq_priv_size		= sizeof(struct bpf_iter_seq_sk_storage_map_info),
1434};
1435
1436static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
1437	.target			= "bpf_sk_storage_map",
1438	.attach_target		= bpf_iter_attach_map,
1439	.detach_target		= bpf_iter_detach_map,
1440	.ctx_arg_info_size	= 2,
1441	.ctx_arg_info		= {
1442		{ offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
1443		  PTR_TO_BTF_ID_OR_NULL },
1444		{ offsetof(struct bpf_iter__bpf_sk_storage_map, value),
1445		  PTR_TO_RDWR_BUF_OR_NULL },
1446	},
1447	.seq_info		= &iter_seq_info,
1448};
1449
1450static int __init bpf_sk_storage_map_iter_init(void)
1451{
1452	bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id =
1453		btf_sock_ids[BTF_SOCK_TYPE_SOCK];
1454	return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info);
1455}
1456late_initcall(bpf_sk_storage_map_iter_init);
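Finally, a minimal sketch of a BPF iterator program for the target registered above; it assumes a BTF-generated "vmlinux.h" for the context struct and treats the map value as a __u64, which is purely illustrative. Userspace parameterizes the iterator with a map fd (the bpf_iter_link_info handled by bpf_iter_attach_map() above), e.g. via libbpf's bpf_program__attach_iter().

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("iter/bpf_sk_storage_map")
int dump_sk_storage(struct bpf_iter__bpf_sk_storage_map *ctx)
{
	static const char fmt[] = "value %llu\n";
	struct seq_file *seq = ctx->meta->seq;
	__u64 *val = ctx->value;
	struct sock *sk = ctx->sk;
	__u64 v;

	/* The program is also invoked once with NULL sk/value at the end
	 * of the dump (from the seq_stop() callback above).
	 */
	if (!sk || !val)
		return 0;

	v = *val;
	bpf_seq_printf(seq, fmt, sizeof(fmt), &v, sizeof(v));
	return 0;
}

char _license[] SEC("license") = "GPL";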