net/ipv6/ip6_flowlabel.c (Linux v6.8)
  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/*
  3 *	ip6_flowlabel.c		IPv6 flowlabel manager.
  4 *
  5 *	Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  6 */
  7
  8#include <linux/capability.h>
  9#include <linux/errno.h>
 10#include <linux/types.h>
 11#include <linux/socket.h>
 12#include <linux/net.h>
 13#include <linux/netdevice.h>
 14#include <linux/in6.h>
 15#include <linux/proc_fs.h>
 16#include <linux/seq_file.h>
 17#include <linux/slab.h>
 18#include <linux/export.h>
 19#include <linux/pid_namespace.h>
 20#include <linux/jump_label_ratelimit.h>
 21
 22#include <net/net_namespace.h>
 23#include <net/sock.h>
 24
 25#include <net/ipv6.h>
 26#include <net/rawv6.h>
 27#include <net/transp_v6.h>
 28
 29#include <linux/uaccess.h>
 30
  31#define FL_MIN_LINGER	6	/* Minimal linger. It is set to the 6 sec value
  32				   specified in the old IPv6 RFC; a reasonable
  33				   value at the time. */
 34#define FL_MAX_LINGER	150	/* Maximal linger timeout */
 35
 36/* FL hash table */
 37
 38#define FL_MAX_PER_SOCK	32
 39#define FL_MAX_SIZE	4096
 40#define FL_HASH_MASK	255
 41#define FL_HASH(l)	(ntohl(l)&FL_HASH_MASK)
 42
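/* Worked example (editor's note): a label stored as htonl(0x12345) hashes
 * to bucket FL_HASH(htonl(0x12345)) == 0x12345 & 255 == 0x45; the low
 * eight bits of the host-order label select one of the 256 chains in
 * fl_ht[]. */
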
 43static atomic_t fl_size = ATOMIC_INIT(0);
 44static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
 45
 46static void ip6_fl_gc(struct timer_list *unused);
 47static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc);
 48
  49/* FL hash table lock: it protects only the GC */
 50
 51static DEFINE_SPINLOCK(ip6_fl_lock);
 52
  53/* Lock for the per-socket flowlabel lists */
 54
 55static DEFINE_SPINLOCK(ip6_sk_fl_lock);
 56
 57DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ);
 58EXPORT_SYMBOL(ipv6_flowlabel_exclusive);
 59
 60#define for_each_fl_rcu(hash, fl)				\
 61	for (fl = rcu_dereference(fl_ht[(hash)]);		\
 62	     fl != NULL;					\
 63	     fl = rcu_dereference(fl->next))
 64#define for_each_fl_continue_rcu(fl)				\
 65	for (fl = rcu_dereference(fl->next);			\
 66	     fl != NULL;					\
 67	     fl = rcu_dereference(fl->next))
 68
 69#define for_each_sk_fl_rcu(np, sfl)				\
 70	for (sfl = rcu_dereference(np->ipv6_fl_list);	\
 71	     sfl != NULL;					\
 72	     sfl = rcu_dereference(sfl->next))
 73
 74static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label)
 75{
 76	struct ip6_flowlabel *fl;
 77
 78	for_each_fl_rcu(FL_HASH(label), fl) {
 79		if (fl->label == label && net_eq(fl->fl_net, net))
 80			return fl;
 81	}
 82	return NULL;
 83}
 84
 85static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
 86{
 87	struct ip6_flowlabel *fl;
 88
 89	rcu_read_lock();
 90	fl = __fl_lookup(net, label);
 91	if (fl && !atomic_inc_not_zero(&fl->users))
 92		fl = NULL;
 93	rcu_read_unlock();
 94	return fl;
 95}
 96
 97static bool fl_shared_exclusive(struct ip6_flowlabel *fl)
 98{
 99	return fl->share == IPV6_FL_S_EXCL ||
100	       fl->share == IPV6_FL_S_PROCESS ||
101	       fl->share == IPV6_FL_S_USER;
102}
103
104static void fl_free_rcu(struct rcu_head *head)
105{
106	struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
107
108	if (fl->share == IPV6_FL_S_PROCESS)
109		put_pid(fl->owner.pid);
110	kfree(fl->opt);
111	kfree(fl);
112}
113
114
115static void fl_free(struct ip6_flowlabel *fl)
116{
117	if (!fl)
118		return;
119
120	if (fl_shared_exclusive(fl) || fl->opt)
121		static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive);
122
123	call_rcu(&fl->rcu, fl_free_rcu);
124}
125
126static void fl_release(struct ip6_flowlabel *fl)
127{
128	spin_lock_bh(&ip6_fl_lock);
129
130	fl->lastuse = jiffies;
131	if (atomic_dec_and_test(&fl->users)) {
132		unsigned long ttd = fl->lastuse + fl->linger;
133		if (time_after(ttd, fl->expires))
134			fl->expires = ttd;
135		ttd = fl->expires;
136		if (fl->opt && fl->share == IPV6_FL_S_EXCL) {
137			struct ipv6_txoptions *opt = fl->opt;
138			fl->opt = NULL;
139			kfree(opt);
140		}
141		if (!timer_pending(&ip6_fl_gc_timer) ||
142		    time_after(ip6_fl_gc_timer.expires, ttd))
143			mod_timer(&ip6_fl_gc_timer, ttd);
144	}
145	spin_unlock_bh(&ip6_fl_lock);
146}
147
148static void ip6_fl_gc(struct timer_list *unused)
149{
150	int i;
151	unsigned long now = jiffies;
152	unsigned long sched = 0;
153
154	spin_lock(&ip6_fl_lock);
155
156	for (i = 0; i <= FL_HASH_MASK; i++) {
157		struct ip6_flowlabel *fl;
158		struct ip6_flowlabel __rcu **flp;
159
160		flp = &fl_ht[i];
161		while ((fl = rcu_dereference_protected(*flp,
162						       lockdep_is_held(&ip6_fl_lock))) != NULL) {
163			if (atomic_read(&fl->users) == 0) {
164				unsigned long ttd = fl->lastuse + fl->linger;
165				if (time_after(ttd, fl->expires))
166					fl->expires = ttd;
167				ttd = fl->expires;
168				if (time_after_eq(now, ttd)) {
169					*flp = fl->next;
170					fl_free(fl);
171					atomic_dec(&fl_size);
172					continue;
173				}
174				if (!sched || time_before(ttd, sched))
175					sched = ttd;
176			}
177			flp = &fl->next;
178		}
179	}
180	if (!sched && atomic_read(&fl_size))
181		sched = now + FL_MAX_LINGER;
182	if (sched) {
183		mod_timer(&ip6_fl_gc_timer, sched);
184	}
185	spin_unlock(&ip6_fl_lock);
186}
187
188static void __net_exit ip6_fl_purge(struct net *net)
189{
190	int i;
191
192	spin_lock_bh(&ip6_fl_lock);
193	for (i = 0; i <= FL_HASH_MASK; i++) {
194		struct ip6_flowlabel *fl;
195		struct ip6_flowlabel __rcu **flp;
196
197		flp = &fl_ht[i];
198		while ((fl = rcu_dereference_protected(*flp,
199						       lockdep_is_held(&ip6_fl_lock))) != NULL) {
200			if (net_eq(fl->fl_net, net) &&
201			    atomic_read(&fl->users) == 0) {
202				*flp = fl->next;
203				fl_free(fl);
204				atomic_dec(&fl_size);
205				continue;
206			}
207			flp = &fl->next;
208		}
209	}
210	spin_unlock_bh(&ip6_fl_lock);
211}
212
213static struct ip6_flowlabel *fl_intern(struct net *net,
214				       struct ip6_flowlabel *fl, __be32 label)
215{
216	struct ip6_flowlabel *lfl;
217
218	fl->label = label & IPV6_FLOWLABEL_MASK;
219
220	rcu_read_lock();
221	spin_lock_bh(&ip6_fl_lock);
222	if (label == 0) {
223		for (;;) {
224			fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
225			if (fl->label) {
226				lfl = __fl_lookup(net, fl->label);
227				if (!lfl)
228					break;
229			}
230		}
231	} else {
 232		/*
 233		 * we dropped the ip6_fl_lock, so this entry could have
 234		 * reappeared and we need to recheck for it.
 235		 *
 236		 * OTOH there is no need to search the active sockets first,
 237		 * as is done in ipv6_flowlabel_opt - the sock is locked, so
 238		 * a new entry with the same label can only appear on another sock
 239		 */
240		lfl = __fl_lookup(net, fl->label);
241		if (lfl) {
242			atomic_inc(&lfl->users);
243			spin_unlock_bh(&ip6_fl_lock);
244			rcu_read_unlock();
245			return lfl;
246		}
247	}
248
249	fl->lastuse = jiffies;
250	fl->next = fl_ht[FL_HASH(fl->label)];
251	rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
252	atomic_inc(&fl_size);
253	spin_unlock_bh(&ip6_fl_lock);
254	rcu_read_unlock();
255	return NULL;
256}
257
258
259
260/* Socket flowlabel lists */
261
262struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
263{
264	struct ipv6_fl_socklist *sfl;
265	struct ipv6_pinfo *np = inet6_sk(sk);
266
267	label &= IPV6_FLOWLABEL_MASK;
268
269	rcu_read_lock();
270	for_each_sk_fl_rcu(np, sfl) {
271		struct ip6_flowlabel *fl = sfl->fl;
272
273		if (fl->label == label && atomic_inc_not_zero(&fl->users)) {
274			fl->lastuse = jiffies;
275			rcu_read_unlock();
276			return fl;
277		}
278	}
279	rcu_read_unlock();
280	return NULL;
281}
282EXPORT_SYMBOL_GPL(__fl6_sock_lookup);
283
284void fl6_free_socklist(struct sock *sk)
285{
286	struct ipv6_pinfo *np = inet6_sk(sk);
287	struct ipv6_fl_socklist *sfl;
288
289	if (!rcu_access_pointer(np->ipv6_fl_list))
290		return;
291
292	spin_lock_bh(&ip6_sk_fl_lock);
293	while ((sfl = rcu_dereference_protected(np->ipv6_fl_list,
294						lockdep_is_held(&ip6_sk_fl_lock))) != NULL) {
295		np->ipv6_fl_list = sfl->next;
296		spin_unlock_bh(&ip6_sk_fl_lock);
297
298		fl_release(sfl->fl);
299		kfree_rcu(sfl, rcu);
300
301		spin_lock_bh(&ip6_sk_fl_lock);
302	}
303	spin_unlock_bh(&ip6_sk_fl_lock);
304}
305
306/* Service routines */
307
308
 309/*
 310   This is the only difficult place: a flow label enforces identical
 311   headers up to and including the routing header, but the user may
 312   still supply options following the rthdr.
 313 */
314
315struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
316					 struct ip6_flowlabel *fl,
317					 struct ipv6_txoptions *fopt)
318{
319	struct ipv6_txoptions *fl_opt = fl->opt;
320
321	if (!fopt || fopt->opt_flen == 0)
322		return fl_opt;
323
324	if (fl_opt) {
325		opt_space->hopopt = fl_opt->hopopt;
326		opt_space->dst0opt = fl_opt->dst0opt;
327		opt_space->srcrt = fl_opt->srcrt;
328		opt_space->opt_nflen = fl_opt->opt_nflen;
329	} else {
330		if (fopt->opt_nflen == 0)
331			return fopt;
332		opt_space->hopopt = NULL;
333		opt_space->dst0opt = NULL;
334		opt_space->srcrt = NULL;
335		opt_space->opt_nflen = 0;
336	}
337	opt_space->dst1opt = fopt->dst1opt;
338	opt_space->opt_flen = fopt->opt_flen;
339	opt_space->tot_len = fopt->tot_len;
340	return opt_space;
341}
342EXPORT_SYMBOL_GPL(fl6_merge_options);
343
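/* Concretely (editor's note): the label's stored options supply everything
 * before and including the routing header (hopopt, dst0opt, srcrt,
 * opt_nflen), while the caller's fopt may only contribute what follows the
 * rthdr (dst1opt, opt_flen). fl_create() below rejects label options with
 * a non-zero opt_flen, which keeps the two sources disjoint. */
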
344static unsigned long check_linger(unsigned long ttl)
345{
346	if (ttl < FL_MIN_LINGER)
347		return FL_MIN_LINGER*HZ;
348	if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN))
349		return 0;
350	return ttl*HZ;
351}
352
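/* Worked example (editor's note): check_linger(3) rounds up to
 * FL_MIN_LINGER*HZ (6 seconds); check_linger(151) exceeds FL_MAX_LINGER
 * and yields 0 for an unprivileged caller, which fl6_renew() turns into
 * -EPERM, but returns 151*HZ when the caller has CAP_NET_ADMIN. */
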
353static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires)
354{
355	linger = check_linger(linger);
356	if (!linger)
357		return -EPERM;
358	expires = check_linger(expires);
359	if (!expires)
360		return -EPERM;
361
362	spin_lock_bh(&ip6_fl_lock);
363	fl->lastuse = jiffies;
364	if (time_before(fl->linger, linger))
365		fl->linger = linger;
366	if (time_before(expires, fl->linger))
367		expires = fl->linger;
368	if (time_before(fl->expires, fl->lastuse + expires))
369		fl->expires = fl->lastuse + expires;
370	spin_unlock_bh(&ip6_fl_lock);
371
372	return 0;
373}
374
375static struct ip6_flowlabel *
376fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
377	  sockptr_t optval, int optlen, int *err_p)
378{
379	struct ip6_flowlabel *fl = NULL;
380	int olen;
381	int addr_type;
382	int err;
383
384	olen = optlen - CMSG_ALIGN(sizeof(*freq));
385	err = -EINVAL;
386	if (olen > 64 * 1024)
387		goto done;
388
389	err = -ENOMEM;
390	fl = kzalloc(sizeof(*fl), GFP_KERNEL);
391	if (!fl)
392		goto done;
393
394	if (olen > 0) {
395		struct msghdr msg;
396		struct flowi6 flowi6;
397		struct ipcm6_cookie ipc6;
398
399		err = -ENOMEM;
400		fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL);
401		if (!fl->opt)
402			goto done;
403
404		memset(fl->opt, 0, sizeof(*fl->opt));
405		fl->opt->tot_len = sizeof(*fl->opt) + olen;
406		err = -EFAULT;
407		if (copy_from_sockptr_offset(fl->opt + 1, optval,
408				CMSG_ALIGN(sizeof(*freq)), olen))
409			goto done;
410
411		msg.msg_controllen = olen;
412		msg.msg_control = (void *)(fl->opt+1);
413		memset(&flowi6, 0, sizeof(flowi6));
414
415		ipc6.opt = fl->opt;
416		err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6);
417		if (err)
418			goto done;
419		err = -EINVAL;
420		if (fl->opt->opt_flen)
421			goto done;
422		if (fl->opt->opt_nflen == 0) {
423			kfree(fl->opt);
424			fl->opt = NULL;
425		}
426	}
427
428	fl->fl_net = net;
429	fl->expires = jiffies;
430	err = fl6_renew(fl, freq->flr_linger, freq->flr_expires);
431	if (err)
432		goto done;
433	fl->share = freq->flr_share;
434	addr_type = ipv6_addr_type(&freq->flr_dst);
435	if ((addr_type & IPV6_ADDR_MAPPED) ||
436	    addr_type == IPV6_ADDR_ANY) {
437		err = -EINVAL;
438		goto done;
439	}
440	fl->dst = freq->flr_dst;
441	atomic_set(&fl->users, 1);
442	switch (fl->share) {
443	case IPV6_FL_S_EXCL:
444	case IPV6_FL_S_ANY:
445		break;
446	case IPV6_FL_S_PROCESS:
447		fl->owner.pid = get_task_pid(current, PIDTYPE_PID);
448		break;
449	case IPV6_FL_S_USER:
450		fl->owner.uid = current_euid();
451		break;
452	default:
453		err = -EINVAL;
454		goto done;
455	}
456	if (fl_shared_exclusive(fl) || fl->opt) {
457		WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1);
458		static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
459	}
460	return fl;
461
462done:
463	if (fl) {
464		kfree(fl->opt);
465		kfree(fl);
466	}
467	*err_p = err;
468	return NULL;
469}
470
471static int mem_check(struct sock *sk)
472{
473	struct ipv6_pinfo *np = inet6_sk(sk);
474	struct ipv6_fl_socklist *sfl;
475	int room = FL_MAX_SIZE - atomic_read(&fl_size);
476	int count = 0;
477
478	if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
479		return 0;
480
481	rcu_read_lock();
482	for_each_sk_fl_rcu(np, sfl)
483		count++;
484	rcu_read_unlock();
485
486	if (room <= 0 ||
487	    ((count >= FL_MAX_PER_SOCK ||
488	      (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
489	     !capable(CAP_NET_ADMIN)))
490		return -ENOBUFS;
491
492	return 0;
493}
494
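/* mem_check() in numbers (editor's note): with FL_MAX_SIZE 4096 and
 * FL_MAX_PER_SOCK 32, nothing is counted while fewer than 32 labels exist
 * system-wide (room > 4064). Beyond that, a full table (room <= 0) refuses
 * everyone; an unprivileged socket is additionally refused once it holds
 * 32 labels, once it holds any label while the table is half full
 * (room < 2048), or once the table is three-quarters full (room < 1024). */
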
495static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl,
496		struct ip6_flowlabel *fl)
497{
498	spin_lock_bh(&ip6_sk_fl_lock);
499	sfl->fl = fl;
500	sfl->next = np->ipv6_fl_list;
501	rcu_assign_pointer(np->ipv6_fl_list, sfl);
502	spin_unlock_bh(&ip6_sk_fl_lock);
503}
504
505int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
506			   int flags)
507{
508	struct ipv6_pinfo *np = inet6_sk(sk);
509	struct ipv6_fl_socklist *sfl;
510
511	if (flags & IPV6_FL_F_REMOTE) {
512		freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK;
513		return 0;
514	}
515
516	if (inet6_test_bit(REPFLOW, sk)) {
517		freq->flr_label = np->flow_label;
518		return 0;
519	}
520
521	rcu_read_lock();
522
523	for_each_sk_fl_rcu(np, sfl) {
524		if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) {
525			spin_lock_bh(&ip6_fl_lock);
526			freq->flr_label = sfl->fl->label;
527			freq->flr_dst = sfl->fl->dst;
528			freq->flr_share = sfl->fl->share;
529			freq->flr_expires = (sfl->fl->expires - jiffies) / HZ;
530			freq->flr_linger = sfl->fl->linger / HZ;
531
532			spin_unlock_bh(&ip6_fl_lock);
533			rcu_read_unlock();
534			return 0;
535		}
536	}
537	rcu_read_unlock();
538
539	return -ENOENT;
540}
541
542#define socklist_dereference(__sflp) \
543	rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock))
544
545static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
546{
547	struct ipv6_pinfo *np = inet6_sk(sk);
548	struct ipv6_fl_socklist __rcu **sflp;
549	struct ipv6_fl_socklist *sfl;
550
551	if (freq->flr_flags & IPV6_FL_F_REFLECT) {
552		if (sk->sk_protocol != IPPROTO_TCP)
553			return -ENOPROTOOPT;
554		if (!inet6_test_bit(REPFLOW, sk))
555			return -ESRCH;
556		np->flow_label = 0;
557		inet6_clear_bit(REPFLOW, sk);
558		return 0;
559	}
560
561	spin_lock_bh(&ip6_sk_fl_lock);
562	for (sflp = &np->ipv6_fl_list;
563	     (sfl = socklist_dereference(*sflp)) != NULL;
564	     sflp = &sfl->next) {
565		if (sfl->fl->label == freq->flr_label)
566			goto found;
567	}
568	spin_unlock_bh(&ip6_sk_fl_lock);
569	return -ESRCH;
570found:
571	if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK))
572		np->flow_label &= ~IPV6_FLOWLABEL_MASK;
573	*sflp = sfl->next;
574	spin_unlock_bh(&ip6_sk_fl_lock);
575	fl_release(sfl->fl);
576	kfree_rcu(sfl, rcu);
577	return 0;
578}
579
580static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq)
581{
582	struct ipv6_pinfo *np = inet6_sk(sk);
583	struct net *net = sock_net(sk);
584	struct ipv6_fl_socklist *sfl;
585	int err;
586
587	rcu_read_lock();
588	for_each_sk_fl_rcu(np, sfl) {
589		if (sfl->fl->label == freq->flr_label) {
590			err = fl6_renew(sfl->fl, freq->flr_linger,
591					freq->flr_expires);
592			rcu_read_unlock();
593			return err;
594		}
595	}
596	rcu_read_unlock();
597
598	if (freq->flr_share == IPV6_FL_S_NONE &&
599	    ns_capable(net->user_ns, CAP_NET_ADMIN)) {
600		struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label);
601
602		if (fl) {
603			err = fl6_renew(fl, freq->flr_linger,
604					freq->flr_expires);
605			fl_release(fl);
606			return err;
607		}
608	}
609	return -ESRCH;
610}
611
612static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
613		sockptr_t optval, int optlen)
614{
615	struct ipv6_fl_socklist *sfl, *sfl1 = NULL;
616	struct ip6_flowlabel *fl, *fl1 = NULL;
617	struct ipv6_pinfo *np = inet6_sk(sk);
618	struct net *net = sock_net(sk);
619	int err;
620
621	if (freq->flr_flags & IPV6_FL_F_REFLECT) {
622		if (net->ipv6.sysctl.flowlabel_consistency) {
 623			net_info_ratelimited("Cannot set IPV6_FL_F_REFLECT if the flowlabel_consistency sysctl is enabled\n");
624			return -EPERM;
625		}
626
627		if (sk->sk_protocol != IPPROTO_TCP)
628			return -ENOPROTOOPT;
629		inet6_set_bit(REPFLOW, sk);
630		return 0;
631	}
632
633	if (freq->flr_label & ~IPV6_FLOWLABEL_MASK)
634		return -EINVAL;
635	if (net->ipv6.sysctl.flowlabel_state_ranges &&
636	    (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
637		return -ERANGE;
638
639	fl = fl_create(net, sk, freq, optval, optlen, &err);
640	if (!fl)
641		return err;
642
643	sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
644
645	if (freq->flr_label) {
646		err = -EEXIST;
647		rcu_read_lock();
648		for_each_sk_fl_rcu(np, sfl) {
649			if (sfl->fl->label == freq->flr_label) {
650				if (freq->flr_flags & IPV6_FL_F_EXCL) {
651					rcu_read_unlock();
652					goto done;
653				}
654				fl1 = sfl->fl;
655				if (!atomic_inc_not_zero(&fl1->users))
656					fl1 = NULL;
657				break;
658			}
659		}
660		rcu_read_unlock();
661
662		if (!fl1)
663			fl1 = fl_lookup(net, freq->flr_label);
664		if (fl1) {
665recheck:
666			err = -EEXIST;
667			if (freq->flr_flags&IPV6_FL_F_EXCL)
668				goto release;
669			err = -EPERM;
670			if (fl1->share == IPV6_FL_S_EXCL ||
671			    fl1->share != fl->share ||
672			    ((fl1->share == IPV6_FL_S_PROCESS) &&
673			     (fl1->owner.pid != fl->owner.pid)) ||
674			    ((fl1->share == IPV6_FL_S_USER) &&
675			     !uid_eq(fl1->owner.uid, fl->owner.uid)))
676				goto release;
677
678			err = -ENOMEM;
679			if (!sfl1)
680				goto release;
681			if (fl->linger > fl1->linger)
682				fl1->linger = fl->linger;
683			if ((long)(fl->expires - fl1->expires) > 0)
684				fl1->expires = fl->expires;
685			fl_link(np, sfl1, fl1);
686			fl_free(fl);
687			return 0;
688
689release:
690			fl_release(fl1);
691			goto done;
692		}
693	}
694	err = -ENOENT;
695	if (!(freq->flr_flags & IPV6_FL_F_CREATE))
696		goto done;
697
698	err = -ENOMEM;
699	if (!sfl1)
700		goto done;
701
702	err = mem_check(sk);
703	if (err != 0)
704		goto done;
705
706	fl1 = fl_intern(net, fl, freq->flr_label);
707	if (fl1)
708		goto recheck;
709
710	if (!freq->flr_label) {
711		size_t offset = offsetof(struct in6_flowlabel_req, flr_label);
712
713		if (copy_to_sockptr_offset(optval, offset, &fl->label,
714				sizeof(fl->label))) {
715			/* Intentionally ignore fault. */
716		}
717	}
718
719	fl_link(np, sfl1, fl);
720	return 0;
721done:
722	fl_free(fl);
723	kfree(sfl1);
724	return err;
725}
726
727int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen)
728{
729	struct in6_flowlabel_req freq;
730
731	if (optlen < sizeof(freq))
732		return -EINVAL;
733	if (copy_from_sockptr(&freq, optval, sizeof(freq)))
734		return -EFAULT;
735
736	switch (freq.flr_action) {
737	case IPV6_FL_A_PUT:
738		return ipv6_flowlabel_put(sk, &freq);
739	case IPV6_FL_A_RENEW:
740		return ipv6_flowlabel_renew(sk, &freq);
741	case IPV6_FL_A_GET:
742		return ipv6_flowlabel_get(sk, &freq, optval, optlen);
743	default:
744		return -EINVAL;
745	}
746}
747
748#ifdef CONFIG_PROC_FS
749
750struct ip6fl_iter_state {
751	struct seq_net_private p;
752	struct pid_namespace *pid_ns;
753	int bucket;
754};
755
756#define ip6fl_seq_private(seq)	((struct ip6fl_iter_state *)(seq)->private)
757
758static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq)
759{
760	struct ip6_flowlabel *fl = NULL;
761	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
762	struct net *net = seq_file_net(seq);
763
764	for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) {
765		for_each_fl_rcu(state->bucket, fl) {
766			if (net_eq(fl->fl_net, net))
767				goto out;
768		}
769	}
770	fl = NULL;
771out:
772	return fl;
773}
774
775static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl)
776{
777	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
778	struct net *net = seq_file_net(seq);
779
780	for_each_fl_continue_rcu(fl) {
781		if (net_eq(fl->fl_net, net))
782			goto out;
783	}
784
785try_again:
786	if (++state->bucket <= FL_HASH_MASK) {
787		for_each_fl_rcu(state->bucket, fl) {
788			if (net_eq(fl->fl_net, net))
789				goto out;
790		}
791		goto try_again;
792	}
793	fl = NULL;
794
795out:
796	return fl;
797}
798
799static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos)
800{
801	struct ip6_flowlabel *fl = ip6fl_get_first(seq);
802	if (fl)
803		while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL)
804			--pos;
805	return pos ? NULL : fl;
806}
807
808static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
809	__acquires(RCU)
810{
811	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
812
813	state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb);
814
815	rcu_read_lock();
816	return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
817}
818
819static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos)
820{
821	struct ip6_flowlabel *fl;
822
823	if (v == SEQ_START_TOKEN)
824		fl = ip6fl_get_first(seq);
825	else
826		fl = ip6fl_get_next(seq, v);
827	++*pos;
828	return fl;
829}
830
831static void ip6fl_seq_stop(struct seq_file *seq, void *v)
832	__releases(RCU)
833{
834	rcu_read_unlock();
835}
836
837static int ip6fl_seq_show(struct seq_file *seq, void *v)
838{
839	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
840	if (v == SEQ_START_TOKEN) {
841		seq_puts(seq, "Label S Owner  Users  Linger Expires  Dst                              Opt\n");
842	} else {
843		struct ip6_flowlabel *fl = v;
844		seq_printf(seq,
845			   "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n",
846			   (unsigned int)ntohl(fl->label),
847			   fl->share,
848			   ((fl->share == IPV6_FL_S_PROCESS) ?
849			    pid_nr_ns(fl->owner.pid, state->pid_ns) :
850			    ((fl->share == IPV6_FL_S_USER) ?
851			     from_kuid_munged(seq_user_ns(seq), fl->owner.uid) :
852			     0)),
853			   atomic_read(&fl->users),
854			   fl->linger/HZ,
855			   (long)(fl->expires - jiffies)/HZ,
856			   &fl->dst,
857			   fl->opt ? fl->opt->opt_nflen : 0);
858	}
859	return 0;
860}
861
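/* Illustrative (hypothetical) /proc/net/ip6_flowlabel contents, following
 * the format string above; note that %pi6 prints the address uncompressed:
 *
 * Label S Owner  Users  Linger Expires  Dst                              Opt
 * 00045 1 0      1      6      29       2001:0db8:0000:0000:0000:0000:0000:0001 0
 */
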
862static const struct seq_operations ip6fl_seq_ops = {
863	.start	=	ip6fl_seq_start,
864	.next	=	ip6fl_seq_next,
865	.stop	=	ip6fl_seq_stop,
866	.show	=	ip6fl_seq_show,
867};
868
869static int __net_init ip6_flowlabel_proc_init(struct net *net)
870{
871	if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net,
872			&ip6fl_seq_ops, sizeof(struct ip6fl_iter_state)))
873		return -ENOMEM;
874	return 0;
875}
876
877static void __net_exit ip6_flowlabel_proc_fini(struct net *net)
878{
879	remove_proc_entry("ip6_flowlabel", net->proc_net);
880}
881#else
882static inline int ip6_flowlabel_proc_init(struct net *net)
883{
884	return 0;
885}
886static inline void ip6_flowlabel_proc_fini(struct net *net)
887{
888}
889#endif
890
891static void __net_exit ip6_flowlabel_net_exit(struct net *net)
892{
893	ip6_fl_purge(net);
894	ip6_flowlabel_proc_fini(net);
895}
896
897static struct pernet_operations ip6_flowlabel_net_ops = {
898	.init = ip6_flowlabel_proc_init,
899	.exit = ip6_flowlabel_net_exit,
900};
901
902int ip6_flowlabel_init(void)
903{
904	return register_pernet_subsys(&ip6_flowlabel_net_ops);
905}
906
907void ip6_flowlabel_cleanup(void)
908{
909	static_key_deferred_flush(&ipv6_flowlabel_exclusive);
910	del_timer(&ip6_fl_gc_timer);
911	unregister_pernet_subsys(&ip6_flowlabel_net_ops);
912}
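
For reference, here is a minimal userspace sketch of the IPV6_FLOWLABEL_MGR interface implemented above. It asks the kernel for a new exclusive label (flr_label == 0 makes fl_intern() pick a random one, and ipv6_flowlabel_get() copies the result back into the caller's buffer), then transmits one datagram carrying that label. The destination 2001:db8::1 and the discard port are placeholders; struct in6_flowlabel_req and the IPV6_FL_* values mirror the <linux/in6.h> UAPI and are redeclared locally only because that header clashes with <netinet/in.h>.

/* Minimal sketch: obtain a flow label via IPV6_FLOWLABEL_MGR and use it.
 * Assumes a Linux host; 2001:db8::1 is a placeholder destination. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Local mirror of struct in6_flowlabel_req from <linux/in6.h>. */
struct in6_flowlabel_req {
	struct in6_addr	flr_dst;
	uint32_t	flr_label;	/* network byte order */
	uint8_t		flr_action;
	uint8_t		flr_share;
	uint16_t	flr_flags;
	uint16_t	flr_expires;
	uint16_t	flr_linger;
	uint32_t	__flr_pad;
};
#ifndef IPV6_FLOWLABEL_MGR
#define IPV6_FLOWLABEL_MGR	32
#define IPV6_FLOWINFO_SEND	33
#endif
#define IPV6_FL_A_GET		0
#define IPV6_FL_F_CREATE	1
#define IPV6_FL_S_EXCL		1

int main(void)
{
	struct in6_flowlabel_req freq;
	struct sockaddr_in6 dst;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0), on = 1;

	if (fd < 0)
		return 1;
	memset(&freq, 0, sizeof(freq));
	inet_pton(AF_INET6, "2001:db8::1", &freq.flr_dst);
	freq.flr_label = 0;			/* 0: kernel picks a random label */
	freq.flr_action = IPV6_FL_A_GET;
	freq.flr_flags = IPV6_FL_F_CREATE;
	freq.flr_share = IPV6_FL_S_EXCL;
	freq.flr_linger = 6;			/* seconds, clamped by check_linger() */
	freq.flr_expires = 30;
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_FLOWLABEL_MGR,
		       &freq, sizeof(freq)) < 0) {
		perror("IPV6_FLOWLABEL_MGR");
		return 1;
	}
	/* fl_intern() chose a label and ipv6_flowlabel_get() copied it back. */
	printf("got label %05x\n", ntohl(freq.flr_label));

	/* Ask the stack to honour sin6_flowinfo on transmit. */
	setsockopt(fd, IPPROTO_IPV6, IPV6_FLOWINFO_SEND, &on, sizeof(on));

	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	dst.sin6_port = htons(9);		/* discard port, illustrative */
	dst.sin6_addr = freq.flr_dst;
	dst.sin6_flowinfo = freq.flr_label;
	sendto(fd, "x", 1, 0, (struct sockaddr *)&dst, sizeof(dst));
	close(fd);
	return 0;
}

Releasing is symmetric: a second IPV6_FLOWLABEL_MGR call with flr_action = IPV6_FL_A_PUT and the same flr_label drops the socket's reference, after which the GC timer above reclaims the label once its linger and expiry times allow.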