// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/* Stochastic Fairness Queuing algorithm.
   =======================================

   Source:
   Paul E. McKenney "Stochastic Fairness Queuing",
   IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

   Paul E. McKenney "Stochastic Fairness Queuing",
   "Interworking: Research and Experience", v.2, 1991, p.113-131.


   See also:
   M. Shreedhar and George Varghese "Efficient Fair
   Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


   This is not the thing that is usually called (W)FQ nowadays.
   It does not use any timestamp mechanism, but instead
   processes queues in round-robin order.

   ADVANTAGE:

   - It is very cheap. Both CPU and memory requirements are minimal.

   DRAWBACKS:

   - "Stochastic" -> It is not 100% fair.
     When hash collisions occur, several flows are considered as one.

   - "Round-robin" -> It introduces larger delays than virtual clock
     based schemes, and should not be used for isolating interactive
     traffic from non-interactive. This means that this scheduler
     should be used as a leaf of CBQ or P3, which put interactive
     traffic to a higher priority band.

   We still need true WFQ for the top level CSZ, but using WFQ
   for best-effort traffic is absolutely pointless:
   SFQ is superior for this purpose.

   IMPLEMENTATION:
   This implementation limits:
   - the maximal queue length per flow to 127 packets;
   - the max mtu to 2^18-1;
   - the number of flows to 65408;
   - the number of hash buckets to 65536.

   It is easy to increase these values, but not in flight.  */

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
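/* Example, derived from the constants above: SFQ_MAX_FLOWS =
 * 0x10000 - 127 - 1 = 65408, so the combined index space is 65408 flow
 * slots plus 128 depth anchors (dep[0] ... dep[127]) = 65536 values,
 * exactly the range of a u16 sfq_index. Index 100 refers to slots[100],
 * while index SFQ_MAX_FLOWS + 3 = 65411 refers to dep[3], the list of
 * flows that currently hold 3 packets each.
 */
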
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	int		allot; /* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t	perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
	/* Linked lists of slots, indexed by depth
	 * dep[0] : list of unused flows
	 * dep[1] : list of flows with 1 packet
	 * dep[X] : list of flows with X packets
	 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
	struct Qdisc	*sch;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
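
/* Example: the mask above works because q->divisor is validated to be a
 * power of two in sfq_change(). With the default divisor of 1024, the
 * perturbed flow hash is masked with 0x3ff, yielding a bucket index in
 * [0, 1023]. Re-keying the siphash (see sfq_perturbation()) moves flows
 * to new buckets, so a persistent hash collision does not pin two flows
 * together forever.
 */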

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
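
/* Example: when a flow's qlen grows from 2 to 3, sfq_inc() unlinks the
 * slot from the dep[2] list and relinks it at the head of dep[3],
 * raising q->cur_depth to 3 if it was lower. sfq_drop() below can then
 * find the longest flow in O(1) by reading q->dep[q->cur_depth].next.
 */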

/* helper functions: might be changed when/if skbs use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
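
/* Note on the (struct sk_buff *)slot casts above: the helpers implement
 * a circular doubly-linked list with the slot itself acting as the dummy
 * head. This works because skblist_next/skblist_prev are the first two
 * members of struct sfq_slot and line up with the next/prev pointers at
 * the start of struct sk_buff; only those two fields of the fake "skb"
 * are ever touched. An empty queue is the state set up by
 * slot_queue_init(), where both pointers refer back to the slot.
 */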

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop(skb, sch, to_free);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
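
/* Example of the flag combinations tested by the two RED helpers above:
 * with q->flags == TC_RED_ECN, both return true, so packets are
 * CE-marked both in the probabilistic region and above qth_max. With
 * TC_RED_ECN | TC_RED_HARDDROP, sfq_hard_mark() is false: packets are
 * still marked probabilistically, but anything over the hard threshold
 * is dropped instead of marked.
 */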

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could end up servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= qdisc_pkt_len(skb);
	}
	return skb;
}
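
/* Worked example of the deficit round robin accounting above, assuming
 * quantum = 1514 bytes and a flow sending 900-byte packets: the flow
 * starts with allot = 1514, sends one packet (allot = 614), sends
 * another (allot = -286), and is then skipped until the round robin
 * pointer comes back and tops it up by another quantum (allot = 1228).
 * Over time each active flow thus averages one quantum's worth of bytes
 * per round, independent of its packet sizes.
 */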

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to
 * change counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock;
	siphash_key_t nkey;
	int period;

	get_random_bytes(&nkey, sizeof(nkey));
	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);
	q->perturbation = nkey;
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	/* q->perturb_period can change under us from
	 * sfq_change() and sfq_destroy().
	 */
	period = READ_ONCE(q->perturb_period);
	if (period)
		mod_timer(&q->perturb_timer, jiffies + period);
	rcu_read_unlock();
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	if ((int)ctl->quantum < 0) {
		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
		return -EINVAL;
	}
	/* Check the limit before allocating red_parms, so an invalid
	 * limit cannot leak the allocation on the early return below.
	 */
	if (ctl->limit == 1) {
		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
		return -EINVAL;
	}
	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum)
		q->quantum = ctl->quantum;
	WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}
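
/* Illustrative configuration from user space (see tc-sfq(8); the option
 * names below are those of the iproute2 tc utility):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10 quantum 1514 \
 *		divisor 1024 limit 3000 depth 127 headdrop
 *
 * Since ->change is NULL in sfq_qdisc_ops below, these parameters are
 * only parsed here via sfq_init() when the qdisc is created or
 * replaced; "perturb 10" arms perturb_timer to re-key the flow hash
 * every 10 seconds.
 */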

static void *sfq_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	WRITE_ONCE(q->perturb_period, 0);
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	q->sch = sch;
	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		return err;

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->perturb_period = 0;
	get_random_bytes(&q->perturbation, sizeof(q->perturbation));

	if (opt) {
		int err = sfq_change(sch, opt, extack);

		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT) {
			arg->count++;
			continue;
		}
		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
			break;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("sfq");

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Stochastic Fairness qdisc");