v3.5.6
  1/*
  2 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
  3 *
  4 *		This program is free software; you can redistribute it and/or
  5 *		modify it under the terms of the GNU General Public License
  6 *		as published by the Free Software Foundation; either version
  7 *		2 of the License, or (at your option) any later version.
  8 *
  9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 10 */
 11
 12#include <linux/module.h>
 13#include <linux/types.h>
 14#include <linux/kernel.h>
 15#include <linux/jiffies.h>
 16#include <linux/string.h>
 17#include <linux/in.h>
 18#include <linux/errno.h>
 19#include <linux/init.h>
 20#include <linux/skbuff.h>
 21#include <linux/jhash.h>
 22#include <linux/slab.h>
 23#include <linux/vmalloc.h>
 24#include <net/netlink.h>
 25#include <net/pkt_sched.h>
 26#include <net/flow_keys.h>
 27#include <net/red.h>
 28
 29
 30/*	Stochastic Fairness Queuing algorithm.
 31	=======================================
 32
 33	Source:
 34	Paul E. McKenney "Stochastic Fairness Queuing",
 35	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
 36
 37	Paul E. McKenney "Stochastic Fairness Queuing",
 38	"Interworking: Research and Experience", v.2, 1991, p.113-131.
 39
 40
 41	See also:
 42	M. Shreedhar and George Varghese "Efficient Fair
 43	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
 44
 45
 46	This is not the thing that is usually called (W)FQ nowadays.
 47	It does not use any timestamp mechanism, but instead
 48	processes queues in round-robin order.
 49
 50	ADVANTAGE:
 51
 52	- It is very cheap. Both CPU and memory requirements are minimal.
 53
 54	DRAWBACKS:
 55
 56	- "Stochastic" -> It is not 100% fair.
 57	When hash collisions occur, several flows are considered as one.
 58
 59	- "Round-robin" -> It introduces larger delays than virtual clock
 60	based schemes, and should not be used for isolating interactive
 61	traffic from non-interactive. This means that this scheduler
 62	should be used as a leaf of CBQ or P3, which put interactive traffic
 63	into a higher priority band.
 64
 65	We still need true WFQ for top level CSZ, but using WFQ
 66	for the best effort traffic is absolutely pointless:
 67	SFQ is superior for this purpose.
 68
 69	IMPLEMENTATION:
 70	This implementation limits:
 71	- maximal queue length per flow to 127 packets.
 72	- max mtu to 2^18-1;
 73	- max 65408 flows,
 74	- number of hash buckets to 65536.
 75
 76	It is easy to increase these values, but not in flight.  */
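/* Editor's illustration (not part of the kernel source): a standalone,
 * miniature sketch of the idea described above. Flows are hashed into a
 * small fixed set of buckets, so colliding flows share a queue (hence
 * "stochastic"), and the buckets are then served round-robin. All names
 * and sizes below are hypothetical.
 */
#include <stdio.h>

#define NBUCKETS 4			/* must be a power of two */

static unsigned int toy_hash(unsigned int flow_id, unsigned int perturb)
{
	/* stand-in for the kernel's jhash: any uniform hash will do here */
	return (flow_id * 2654435761u ^ perturb) & (NBUCKETS - 1);
}

int main(void)
{
	unsigned int queued[NBUCKETS] = {0};
	unsigned int flows[] = {101, 202, 303, 404, 505};
	int remaining = 5;
	unsigned int i;

	for (i = 0; i < 5; i++)		/* enqueue one packet per flow */
		queued[toy_hash(flows[i], 42)]++;

	while (remaining > 0)		/* dequeue: one packet per bucket per round */
		for (i = 0; i < NBUCKETS; i++)
			if (queued[i] > 0) {
				queued[i]--;
				remaining--;
				printf("serve one packet from bucket %u\n", i);
			}
	return 0;
}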
 77
 78#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
 79#define SFQ_DEFAULT_FLOWS	128
 80#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
 81#define SFQ_EMPTY_SLOT		0xffff
 82#define SFQ_DEFAULT_HASH_DIVISOR 1024
 83
 84/* We use 16 bits to store allot, and want to handle packets up to 64K
 85 * Scale allot by 8 (1<<3) so that no overflow occurs.
 86 */
 87#define SFQ_ALLOT_SHIFT		3
 88#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
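/* Editor's note: a worked example of the scaling above (standalone sketch,
 * same arithmetic as SFQ_ALLOT_SIZE). allot is a signed 16-bit short, so a
 * 64KB packet charged at full size would overflow it; divided by 1 << 3 it
 * fits comfortably. The 1514-byte frame size below is hypothetical.
 */
#include <stdio.h>

#define ALLOT_SHIFT	3
#define ALLOT_SIZE(X)	(((X) + (1 << ALLOT_SHIFT) - 1) >> ALLOT_SHIFT)	/* DIV_ROUND_UP */

int main(void)
{
	printf("%d\n", ALLOT_SIZE(65536));	/* 8192: fits in a short (max 32767) */
	printf("%d\n", ALLOT_SIZE(1514));	/* 190: a typical Ethernet frame */
	return 0;
}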
 89
 90/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
 91typedef u16 sfq_index;
 92
 93/*
 94 * We don't use pointers to save space.
 95 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 96 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 97 * are 'pointers' to dep[] array
 98 */
 99struct sfq_head {
100	sfq_index	next;
101	sfq_index	prev;
102};
103
104struct sfq_slot {
105	struct sk_buff	*skblist_next;
106	struct sk_buff	*skblist_prev;
107	sfq_index	qlen; /* number of skbs in skblist */
108	sfq_index	next; /* next slot in sfq RR chain */
109	struct sfq_head dep; /* anchor in dep[] chains */
110	unsigned short	hash; /* hash value (index in ht[]) */
111	short		allot; /* credit for this slot */
112
113	unsigned int    backlog;
114	struct red_vars vars;
115};
116
117struct sfq_sched_data {
118/* frequently used fields */
119	int		limit;		/* limit of total number of packets in this qdisc */
120	unsigned int	divisor;	/* number of slots in hash table */
121	u8		headdrop;
122	u8		maxdepth;	/* limit of packets per flow */
123
124	u32		perturbation;
125	u8		cur_depth;	/* depth of longest slot */
126	u8		flags;
127	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128	struct tcf_proto *filter_list;
129	sfq_index	*ht;		/* Hash table ('divisor' slots) */
130	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
131
132	struct red_parms *red_parms;
133	struct tc_sfqred_stats stats;
134	struct sfq_slot *tail;		/* current slot in round */
135
136	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
137					/* Linked lists of slots, indexed by depth
138					 * dep[0] : list of unused flows
139					 * dep[1] : list of flows with 1 packet
140					 * dep[X] : list of flows with X packets
141					 */
142
143	unsigned int	maxflows;	/* number of flows in flows array */
144	int		perturb_period;
145	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
146	struct timer_list perturb_timer;
147};
148
149/*
150 * sfq_head are either in a sfq_slot or in dep[] array
151 */
152static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
153{
154	if (val < SFQ_MAX_FLOWS)
155		return &q->slots[val].dep;
156	return &q->dep[val - SFQ_MAX_FLOWS];
157}
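/* Editor's illustration of the 16-bit 'pointer' encoding decoded by
 * sfq_dep_head() above, shrunk to a runnable standalone sketch. An index
 * below MAX_FLOWS names a flow slot; anything at or above it names a
 * dep[] list head. MAX_FLOWS here is a miniature stand-in for
 * SFQ_MAX_FLOWS.
 */
#include <stdio.h>

#define MAX_FLOWS 8

static void decode(unsigned int val)
{
	if (val < MAX_FLOWS)
		printf("%u -> slots[%u].dep\n", val, val);
	else
		printf("%u -> dep[%u]\n", val, val - MAX_FLOWS);
}

int main(void)
{
	decode(3);	/* slots[3].dep: a flow slot's anchor           */
	decode(8);	/* dep[0]: head of the list of unused flows     */
	decode(9);	/* dep[1]: head of the flows-with-1-packet list */
	return 0;
}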
158
159/*
160 * In order to be able to quickly rehash our queue when timer changes
161 * q->perturbation, we store flow_keys in skb->cb[]
162 */
163struct sfq_skb_cb {
164       struct flow_keys        keys;
165};
166
167static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
168{
169	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
170	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
171}
172
173static unsigned int sfq_hash(const struct sfq_sched_data *q,
174			     const struct sk_buff *skb)
175{
176	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
177	unsigned int hash;
178
179	hash = jhash_3words((__force u32)keys->dst,
180			    (__force u32)keys->src ^ keys->ip_proto,
181			    (__force u32)keys->ports, q->perturbation);
182	return hash & (q->divisor - 1);
183}
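/* Editor's note: the '& (q->divisor - 1)' above only picks buckets
 * uniformly because the divisor is a power of two (sfq_change() rejects
 * anything else via is_power_of_2()). A standalone check with the default
 * divisor; the hash value is made up:
 */
#include <stdio.h>

int main(void)
{
	unsigned int divisor = 1024;		/* SFQ_DEFAULT_HASH_DIVISOR */
	unsigned int hash = 0xdeadbeefu;	/* pretend jhash output */

	/* equivalent to 'hash % divisor' exactly when divisor is 2^n */
	printf("bucket = %u\n", hash & (divisor - 1));
	printf("bucket = %u\n", hash % divisor);
	return 0;
}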
184
185static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
186				 int *qerr)
187{
188	struct sfq_sched_data *q = qdisc_priv(sch);
189	struct tcf_result res;
190	int result;
191
192	if (TC_H_MAJ(skb->priority) == sch->handle &&
193	    TC_H_MIN(skb->priority) > 0 &&
194	    TC_H_MIN(skb->priority) <= q->divisor)
195		return TC_H_MIN(skb->priority);
196
197	if (!q->filter_list) {
198		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
199		return sfq_hash(q, skb) + 1;
200	}
201
202	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
203	result = tc_classify(skb, q->filter_list, &res);
204	if (result >= 0) {
205#ifdef CONFIG_NET_CLS_ACT
206		switch (result) {
207		case TC_ACT_STOLEN:
208		case TC_ACT_QUEUED:
209			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
210		case TC_ACT_SHOT:
211			return 0;
212		}
213#endif
214		if (TC_H_MIN(res.classid) <= q->divisor)
215			return TC_H_MIN(res.classid);
216	}
217	return 0;
218}
219
220/*
221 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
222 */
223static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
224{
225	sfq_index p, n;
226	struct sfq_slot *slot = &q->slots[x];
227	int qlen = slot->qlen;
228
229	p = qlen + SFQ_MAX_FLOWS;
230	n = q->dep[qlen].next;
231
232	slot->dep.next = n;
233	slot->dep.prev = p;
234
235	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
236	sfq_dep_head(q, n)->prev = x;
237}
238
239#define sfq_unlink(q, x, n, p)			\
240	n = q->slots[x].dep.next;		\
241	p = q->slots[x].dep.prev;		\
242	sfq_dep_head(q, p)->next = n;		\
243	sfq_dep_head(q, n)->prev = p
244
245
246static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
247{
248	sfq_index p, n;
249	int d;
250
251	sfq_unlink(q, x, n, p);
252
253	d = q->slots[x].qlen--;
254	if (n == p && q->cur_depth == d)
255		q->cur_depth--;
256	sfq_link(q, x);
257}
258
259static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
260{
261	sfq_index p, n;
262	int d;
263
264	sfq_unlink(q, x, n, p);
265
266	d = ++q->slots[x].qlen;
267	if (q->cur_depth < d)
268		q->cur_depth = d;
269	sfq_link(q, x);
270}
271
272/* helper functions : might be changed when/if skb use a standard list_head */
273
274/* remove one skb from tail of slot queue */
275static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
276{
277	struct sk_buff *skb = slot->skblist_prev;
278
279	slot->skblist_prev = skb->prev;
280	skb->prev->next = (struct sk_buff *)slot;
281	skb->next = skb->prev = NULL;
282	return skb;
283}
284
285/* remove one skb from head of slot queue */
286static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
287{
288	struct sk_buff *skb = slot->skblist_next;
289
290	slot->skblist_next = skb->next;
291	skb->next->prev = (struct sk_buff *)slot;
292	skb->next = skb->prev = NULL;
293	return skb;
294}
295
296static inline void slot_queue_init(struct sfq_slot *slot)
297{
298	memset(slot, 0, sizeof(*slot));
299	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
300}
301
302/* add skb to slot queue (tail add) */
303static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
304{
305	skb->prev = slot->skblist_prev;
306	skb->next = (struct sk_buff *)slot;
307	slot->skblist_prev->next = skb;
308	slot->skblist_prev = skb;
309}
310
311#define	slot_queue_walk(slot, skb)		\
312	for (skb = slot->skblist_next;		\
313	     skb != (struct sk_buff *)slot;	\
314	     skb = skb->next)
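/* Editor's illustration of the sentinel trick behind the helpers above:
 * struct sfq_slot begins with the same next/prev pointer pair as the skb
 * list linkage, so the slot itself is cast to (struct sk_buff *) and used
 * as the list head, and an empty list is next == prev == slot. The toy
 * structs below mirror that layout; like the kernel code, this relies on
 * the two structs sharing their first fields.
 */
#include <stdio.h>

struct node {			/* stand-in for sk_buff's next/prev */
	struct node *next;
	struct node *prev;
};

struct toy_slot {		/* first fields line up with struct node */
	struct node *list_next;
	struct node *list_prev;
};

int main(void)
{
	struct toy_slot slot;
	struct node pkt;

	/* slot_queue_init(): both ends point at the slot itself */
	slot.list_next = slot.list_prev = (struct node *)&slot;
	printf("empty? %d\n", slot.list_next == (struct node *)&slot);

	/* slot_queue_add(): tail insert just before the sentinel */
	pkt.prev = slot.list_prev;
	pkt.next = (struct node *)&slot;
	slot.list_prev->next = &pkt;	/* the sentinel's fields alias the slot's */
	slot.list_prev = &pkt;
	printf("empty? %d\n", slot.list_next == (struct node *)&slot);
	return 0;
}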
315
316static unsigned int sfq_drop(struct Qdisc *sch)
317{
318	struct sfq_sched_data *q = qdisc_priv(sch);
319	sfq_index x, d = q->cur_depth;
320	struct sk_buff *skb;
321	unsigned int len;
322	struct sfq_slot *slot;
323
324	/* Queue is full! Find the longest slot and drop tail packet from it */
325	if (d > 1) {
326		x = q->dep[d].next;
327		slot = &q->slots[x];
328drop:
329		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
330		len = qdisc_pkt_len(skb);
331		slot->backlog -= len;
332		sfq_dec(q, x);
333		kfree_skb(skb);
334		sch->q.qlen--;
335		sch->qstats.drops++;
336		sch->qstats.backlog -= len;
337		return len;
338	}
339
340	if (d == 1) {
341		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
342		x = q->tail->next;
343		slot = &q->slots[x];
344		q->tail->next = slot->next;
345		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
346		goto drop;
347	}
348
349	return 0;
350}
351
352/* Is ECN parameter configured */
353static int sfq_prob_mark(const struct sfq_sched_data *q)
354{
355	return q->flags & TC_RED_ECN;
356}
357
358/* Should packets over max threshold just be marked */
359static int sfq_hard_mark(const struct sfq_sched_data *q)
360{
361	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
362}
363
364static int sfq_headdrop(const struct sfq_sched_data *q)
365{
366	return q->headdrop;
367}
368
369static int
370sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
371{
372	struct sfq_sched_data *q = qdisc_priv(sch);
373	unsigned int hash;
374	sfq_index x, qlen;
375	struct sfq_slot *slot;
376	int uninitialized_var(ret);
377	struct sk_buff *head;
378	int delta;
379
380	hash = sfq_classify(skb, sch, &ret);
381	if (hash == 0) {
382		if (ret & __NET_XMIT_BYPASS)
383			sch->qstats.drops++;
384		kfree_skb(skb);
385		return ret;
386	}
387	hash--;
388
389	x = q->ht[hash];
390	slot = &q->slots[x];
391	if (x == SFQ_EMPTY_SLOT) {
392		x = q->dep[0].next; /* get a free slot */
393		if (x >= SFQ_MAX_FLOWS)
394			return qdisc_drop(skb, sch);
395		q->ht[hash] = x;
396		slot = &q->slots[x];
397		slot->hash = hash;
398		slot->backlog = 0; /* should already be 0 anyway... */
399		red_set_vars(&slot->vars);
400		goto enqueue;
401	}
402	if (q->red_parms) {
403		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
404							&slot->vars,
405							slot->backlog);
406		switch (red_action(q->red_parms,
407				   &slot->vars,
408				   slot->vars.qavg)) {
409		case RED_DONT_MARK:
410			break;
411
412		case RED_PROB_MARK:
413			sch->qstats.overlimits++;
414			if (sfq_prob_mark(q)) {
415				/* We know we have at least one packet in queue */
416				if (sfq_headdrop(q) &&
417				    INET_ECN_set_ce(slot->skblist_next)) {
418					q->stats.prob_mark_head++;
419					break;
420				}
421				if (INET_ECN_set_ce(skb)) {
422					q->stats.prob_mark++;
423					break;
424				}
425			}
426			q->stats.prob_drop++;
427			goto congestion_drop;
428
429		case RED_HARD_MARK:
430			sch->qstats.overlimits++;
431			if (sfq_hard_mark(q)) {
432				/* We know we have at least one packet in queue */
433				if (sfq_headdrop(q) &&
434				    INET_ECN_set_ce(slot->skblist_next)) {
435					q->stats.forced_mark_head++;
436					break;
437				}
438				if (INET_ECN_set_ce(skb)) {
439					q->stats.forced_mark++;
440					break;
441				}
442			}
443			q->stats.forced_drop++;
444			goto congestion_drop;
445		}
446	}
447
448	if (slot->qlen >= q->maxdepth) {
449congestion_drop:
450		if (!sfq_headdrop(q))
451			return qdisc_drop(skb, sch);
452
453		/* We know we have at least one packet in queue */
454		head = slot_dequeue_head(slot);
455		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
456		sch->qstats.backlog -= delta;
457		slot->backlog -= delta;
458		qdisc_drop(head, sch);
459
460		slot_queue_add(slot, skb);
461		return NET_XMIT_CN;
462	}
463
464enqueue:
465	sch->qstats.backlog += qdisc_pkt_len(skb);
466	slot->backlog += qdisc_pkt_len(skb);
467	slot_queue_add(slot, skb);
468	sfq_inc(q, x);
469	if (slot->qlen == 1) {		/* The flow is new */
470		if (q->tail == NULL) {	/* It is the first flow */
471			slot->next = x;
472		} else {
473			slot->next = q->tail->next;
474			q->tail->next = x;
475		}
476		/* We put this flow at the end of our flow list.
477		 * It might sound unfair for a new flow to wait behind old ones,
478		 * but we could end up servicing only new flows and freezing old ones.
479		 */
480		q->tail = slot;
481		/* We could use a bigger initial quantum for new flows */
482		slot->allot = q->scaled_quantum;
483	}
484	if (++sch->q.qlen <= q->limit)
485		return NET_XMIT_SUCCESS;
486
487	qlen = slot->qlen;
488	sfq_drop(sch);
489	/* Return Congestion Notification only if we dropped a packet
490	 * from this flow.
491	 */
492	if (qlen != slot->qlen)
493		return NET_XMIT_CN;
494
495	/* As we dropped a packet, better let upper stack know this */
496	qdisc_tree_decrease_qlen(sch, 1);
497	return NET_XMIT_SUCCESS;
498}
499
500static struct sk_buff *
501sfq_dequeue(struct Qdisc *sch)
502{
503	struct sfq_sched_data *q = qdisc_priv(sch);
504	struct sk_buff *skb;
505	sfq_index a, next_a;
506	struct sfq_slot *slot;
507
508	/* No active slots */
509	if (q->tail == NULL)
510		return NULL;
511
512next_slot:
513	a = q->tail->next;
514	slot = &q->slots[a];
515	if (slot->allot <= 0) {
516		q->tail = slot;
517		slot->allot += q->scaled_quantum;
518		goto next_slot;
519	}
520	skb = slot_dequeue_head(slot);
521	sfq_dec(q, a);
522	qdisc_bstats_update(sch, skb);
523	sch->q.qlen--;
524	sch->qstats.backlog -= qdisc_pkt_len(skb);
525	slot->backlog -= qdisc_pkt_len(skb);
526	/* Is the slot empty? */
527	if (slot->qlen == 0) {
528		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
529		next_a = slot->next;
530		if (a == next_a) {
531			q->tail = NULL; /* no more active slots */
532			return skb;
533		}
534		q->tail->next = next_a;
535	} else {
536		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
537	}
538	return skb;
539}
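/* Editor's note: a worked, standalone sketch of the allot accounting in
 * sfq_dequeue() above (deficit round robin). A slot pays the scaled size
 * of each packet it sends out of its allot; once the allot is spent, the
 * turn passes and the slot is re-credited one scaled quantum. The packet
 * sizes are hypothetical.
 */
#include <stdio.h>

#define ALLOT_SHIFT	3
#define ALLOT_SIZE(X)	(((X) + (1 << ALLOT_SHIFT) - 1) >> ALLOT_SHIFT)

int main(void)
{
	int quantum = 1514;			/* one MTU-sized packet per round */
	int allot = ALLOT_SIZE(quantum);	/* initial credit: 190 */
	int sizes[] = {1514, 1514, 60};
	int i;

	for (i = 0; i < 3; i++) {
		if (allot <= 0) {		/* out of credit: yield, re-credit */
			allot += ALLOT_SIZE(quantum);
			printf("turn ends, allot re-credited to %d\n", allot);
		}
		allot -= ALLOT_SIZE(sizes[i]);
		printf("sent %4d bytes, allot now %d\n", sizes[i], allot);
	}
	return 0;
}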
540
541static void
542sfq_reset(struct Qdisc *sch)
543{
544	struct sk_buff *skb;
545
546	while ((skb = sfq_dequeue(sch)) != NULL)
547		kfree_skb(skb);
548}
549
550/*
551 * When q->perturbation is changed, we rehash all queued skbs
552 * to avoid OOO (Out Of Order) effects.
553 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
554 * counters.
555 */
556static void sfq_rehash(struct Qdisc *sch)
557{
558	struct sfq_sched_data *q = qdisc_priv(sch);
559	struct sk_buff *skb;
560	int i;
561	struct sfq_slot *slot;
562	struct sk_buff_head list;
563	int dropped = 0;
564
565	__skb_queue_head_init(&list);
566
567	for (i = 0; i < q->maxflows; i++) {
568		slot = &q->slots[i];
569		if (!slot->qlen)
570			continue;
571		while (slot->qlen) {
572			skb = slot_dequeue_head(slot);
573			sfq_dec(q, i);
574			__skb_queue_tail(&list, skb);
575		}
576		slot->backlog = 0;
577		red_set_vars(&slot->vars);
578		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
579	}
580	q->tail = NULL;
581
582	while ((skb = __skb_dequeue(&list)) != NULL) {
583		unsigned int hash = sfq_hash(q, skb);
584		sfq_index x = q->ht[hash];
585
586		slot = &q->slots[x];
587		if (x == SFQ_EMPTY_SLOT) {
588			x = q->dep[0].next; /* get a free slot */
589			if (x >= SFQ_MAX_FLOWS) {
590drop:				sch->qstats.backlog -= qdisc_pkt_len(skb);
591				kfree_skb(skb);
592				dropped++;
593				continue;
594			}
595			q->ht[hash] = x;
596			slot = &q->slots[x];
597			slot->hash = hash;
598		}
599		if (slot->qlen >= q->maxdepth)
600			goto drop;
601		slot_queue_add(slot, skb);
602		if (q->red_parms)
603			slot->vars.qavg = red_calc_qavg(q->red_parms,
604							&slot->vars,
605							slot->backlog);
606		slot->backlog += qdisc_pkt_len(skb);
607		sfq_inc(q, x);
608		if (slot->qlen == 1) {		/* The flow is new */
609			if (q->tail == NULL) {	/* It is the first flow */
610				slot->next = x;
611			} else {
612				slot->next = q->tail->next;
613				q->tail->next = x;
614			}
615			q->tail = slot;
616			slot->allot = q->scaled_quantum;
617		}
618	}
619	sch->q.qlen -= dropped;
620	qdisc_tree_decrease_qlen(sch, dropped);
621}
622
623static void sfq_perturbation(unsigned long arg)
624{
625	struct Qdisc *sch = (struct Qdisc *)arg;
626	struct sfq_sched_data *q = qdisc_priv(sch);
627	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
628
629	spin_lock(root_lock);
630	q->perturbation = net_random();
631	if (!q->filter_list && q->tail)
632		sfq_rehash(sch);
633	spin_unlock(root_lock);
634
635	if (q->perturb_period)
636		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
637}
638
639static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
640{
641	struct sfq_sched_data *q = qdisc_priv(sch);
642	struct tc_sfq_qopt *ctl = nla_data(opt);
643	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
644	unsigned int qlen;
645	struct red_parms *p = NULL;
646
647	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
648		return -EINVAL;
649	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
650		ctl_v1 = nla_data(opt);
651	if (ctl->divisor &&
652	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
653		return -EINVAL;
654	if (ctl_v1 && ctl_v1->qth_min) {
655		p = kmalloc(sizeof(*p), GFP_KERNEL);
656		if (!p)
657			return -ENOMEM;
658	}
659	sch_tree_lock(sch);
660	if (ctl->quantum) {
661		q->quantum = ctl->quantum;
662		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
663	}
664	q->perturb_period = ctl->perturb_period * HZ;
665	if (ctl->flows)
666		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
667	if (ctl->divisor) {
668		q->divisor = ctl->divisor;
669		q->maxflows = min_t(u32, q->maxflows, q->divisor);
670	}
671	if (ctl_v1) {
672		if (ctl_v1->depth)
673			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
674		if (p) {
675			swap(q->red_parms, p);
676			red_set_parms(q->red_parms,
677				      ctl_v1->qth_min, ctl_v1->qth_max,
678				      ctl_v1->Wlog,
679				      ctl_v1->Plog, ctl_v1->Scell_log,
680				      NULL,
681				      ctl_v1->max_P);
682		}
683		q->flags = ctl_v1->flags;
684		q->headdrop = ctl_v1->headdrop;
685	}
686	if (ctl->limit) {
687		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
688		q->maxflows = min_t(u32, q->maxflows, q->limit);
689	}
690
691	qlen = sch->q.qlen;
692	while (sch->q.qlen > q->limit)
693		sfq_drop(sch);
694	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
695
696	del_timer(&q->perturb_timer);
697	if (q->perturb_period) {
698		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
699		q->perturbation = net_random();
700	}
701	sch_tree_unlock(sch);
702	kfree(p);
703	return 0;
704}
705
706static void *sfq_alloc(size_t sz)
707{
708	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
709
710	if (!ptr)
711		ptr = vmalloc(sz);
712	return ptr;
713}
714
715static void sfq_free(void *addr)
716{
717	if (addr) {
718		if (is_vmalloc_addr(addr))
719			vfree(addr);
720		else
721			kfree(addr);
722	}
723}
724
725static void sfq_destroy(struct Qdisc *sch)
726{
727	struct sfq_sched_data *q = qdisc_priv(sch);
728
729	tcf_destroy_chain(&q->filter_list);
730	q->perturb_period = 0;
731	del_timer_sync(&q->perturb_timer);
732	sfq_free(q->ht);
733	sfq_free(q->slots);
734	kfree(q->red_parms);
735}
736
737static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
738{
739	struct sfq_sched_data *q = qdisc_priv(sch);
740	int i;
741
742	q->perturb_timer.function = sfq_perturbation;
743	q->perturb_timer.data = (unsigned long)sch;
744	init_timer_deferrable(&q->perturb_timer);
745
746	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
747		q->dep[i].next = i + SFQ_MAX_FLOWS;
748		q->dep[i].prev = i + SFQ_MAX_FLOWS;
749	}
750
751	q->limit = SFQ_MAX_DEPTH;
752	q->maxdepth = SFQ_MAX_DEPTH;
753	q->cur_depth = 0;
754	q->tail = NULL;
755	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
756	q->maxflows = SFQ_DEFAULT_FLOWS;
757	q->quantum = psched_mtu(qdisc_dev(sch));
758	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
759	q->perturb_period = 0;
760	q->perturbation = net_random();
761
762	if (opt) {
763		int err = sfq_change(sch, opt);
764		if (err)
765			return err;
766	}
767
768	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
769	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
770	if (!q->ht || !q->slots) {
771		sfq_destroy(sch);
772		return -ENOMEM;
773	}
774	for (i = 0; i < q->divisor; i++)
775		q->ht[i] = SFQ_EMPTY_SLOT;
776
777	for (i = 0; i < q->maxflows; i++) {
778		slot_queue_init(&q->slots[i]);
779		sfq_link(q, i);
780	}
781	if (q->limit >= 1)
782		sch->flags |= TCQ_F_CAN_BYPASS;
783	else
784		sch->flags &= ~TCQ_F_CAN_BYPASS;
785	return 0;
786}
787
788static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
789{
790	struct sfq_sched_data *q = qdisc_priv(sch);
791	unsigned char *b = skb_tail_pointer(skb);
792	struct tc_sfq_qopt_v1 opt;
793	struct red_parms *p = q->red_parms;
794
795	memset(&opt, 0, sizeof(opt));
796	opt.v0.quantum	= q->quantum;
797	opt.v0.perturb_period = q->perturb_period / HZ;
798	opt.v0.limit	= q->limit;
799	opt.v0.divisor	= q->divisor;
800	opt.v0.flows	= q->maxflows;
801	opt.depth	= q->maxdepth;
802	opt.headdrop	= q->headdrop;
803
804	if (p) {
805		opt.qth_min	= p->qth_min >> p->Wlog;
806		opt.qth_max	= p->qth_max >> p->Wlog;
807		opt.Wlog	= p->Wlog;
808		opt.Plog	= p->Plog;
809		opt.Scell_log	= p->Scell_log;
810		opt.max_P	= p->max_P;
811	}
812	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
813	opt.flags	= q->flags;
814
815	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
816		goto nla_put_failure;
817
818	return skb->len;
819
820nla_put_failure:
821	nlmsg_trim(skb, b);
822	return -1;
823}
824
825static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
826{
827	return NULL;
828}
829
830static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
831{
832	return 0;
833}
834
835static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
836			      u32 classid)
837{
838	/* we cannot bypass queue discipline anymore */
839	sch->flags &= ~TCQ_F_CAN_BYPASS;
840	return 0;
841}
842
843static void sfq_put(struct Qdisc *q, unsigned long cl)
844{
845}
846
847static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
848{
849	struct sfq_sched_data *q = qdisc_priv(sch);
850
851	if (cl)
852		return NULL;
853	return &q->filter_list;
854}
855
856static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
857			  struct sk_buff *skb, struct tcmsg *tcm)
858{
859	tcm->tcm_handle |= TC_H_MIN(cl);
860	return 0;
861}
862
863static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
864				struct gnet_dump *d)
865{
866	struct sfq_sched_data *q = qdisc_priv(sch);
867	sfq_index idx = q->ht[cl - 1];
868	struct gnet_stats_queue qs = { 0 };
869	struct tc_sfq_xstats xstats = { 0 };
870
871	if (idx != SFQ_EMPTY_SLOT) {
872		const struct sfq_slot *slot = &q->slots[idx];
873
874		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
875		qs.qlen = slot->qlen;
876		qs.backlog = slot->backlog;
877	}
878	if (gnet_stats_copy_queue(d, &qs) < 0)
879		return -1;
880	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
881}
882
883static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
884{
885	struct sfq_sched_data *q = qdisc_priv(sch);
886	unsigned int i;
887
888	if (arg->stop)
889		return;
890
891	for (i = 0; i < q->divisor; i++) {
892		if (q->ht[i] == SFQ_EMPTY_SLOT ||
893		    arg->count < arg->skip) {
894			arg->count++;
895			continue;
896		}
897		if (arg->fn(sch, i + 1, arg) < 0) {
898			arg->stop = 1;
899			break;
900		}
901		arg->count++;
902	}
903}
904
905static const struct Qdisc_class_ops sfq_class_ops = {
906	.leaf		=	sfq_leaf,
907	.get		=	sfq_get,
908	.put		=	sfq_put,
909	.tcf_chain	=	sfq_find_tcf,
910	.bind_tcf	=	sfq_bind,
911	.unbind_tcf	=	sfq_put,
912	.dump		=	sfq_dump_class,
913	.dump_stats	=	sfq_dump_class_stats,
914	.walk		=	sfq_walk,
915};
916
917static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
918	.cl_ops		=	&sfq_class_ops,
919	.id		=	"sfq",
920	.priv_size	=	sizeof(struct sfq_sched_data),
921	.enqueue	=	sfq_enqueue,
922	.dequeue	=	sfq_dequeue,
923	.peek		=	qdisc_peek_dequeued,
924	.drop		=	sfq_drop,
925	.init		=	sfq_init,
926	.reset		=	sfq_reset,
927	.destroy	=	sfq_destroy,
928	.change		=	NULL,
929	.dump		=	sfq_dump,
930	.owner		=	THIS_MODULE,
931};
932
933static int __init sfq_module_init(void)
934{
935	return register_qdisc(&sfq_qdisc_ops);
936}
937static void __exit sfq_module_exit(void)
938{
939	unregister_qdisc(&sfq_qdisc_ops);
940}
941module_init(sfq_module_init)
942module_exit(sfq_module_exit)
943MODULE_LICENSE("GPL");
v5.14.15
  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/*
  3 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
  4 *
  5 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  6 */
  7
  8#include <linux/module.h>
  9#include <linux/types.h>
 10#include <linux/kernel.h>
 11#include <linux/jiffies.h>
 12#include <linux/string.h>
 13#include <linux/in.h>
 14#include <linux/errno.h>
 15#include <linux/init.h>
 16#include <linux/skbuff.h>
 17#include <linux/siphash.h>
 18#include <linux/slab.h>
 19#include <linux/vmalloc.h>
 20#include <net/netlink.h>
 21#include <net/pkt_sched.h>
 22#include <net/pkt_cls.h>
 23#include <net/red.h>
 24
 25
 26/*	Stochastic Fairness Queuing algorithm.
 27	=======================================
 28
 29	Source:
 30	Paul E. McKenney "Stochastic Fairness Queuing",
 31	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
 32
 33	Paul E. McKenney "Stochastic Fairness Queuing",
 34	"Interworking: Research and Experience", v.2, 1991, p.113-131.
 35
 36
 37	See also:
 38	M. Shreedhar and George Varghese "Efficient Fair
 39	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
 40
 41
 42	This is not the thing that is usually called (W)FQ nowadays.
 43	It does not use any timestamp mechanism, but instead
 44	processes queues in round-robin order.
 45
 46	ADVANTAGE:
 47
 48	- It is very cheap. Both CPU and memory requirements are minimal.
 49
 50	DRAWBACKS:
 51
 52	- "Stochastic" -> It is not 100% fair.
 53	When hash collisions occur, several flows are considered as one.
 54
 55	- "Round-robin" -> It introduces larger delays than virtual clock
 56	based schemes, and should not be used for isolating interactive
 57	traffic from non-interactive. This means that this scheduler
 58	should be used as a leaf of CBQ or P3, which put interactive traffic
 59	into a higher priority band.
 60
 61	We still need true WFQ for top level CSZ, but using WFQ
 62	for the best effort traffic is absolutely pointless:
 63	SFQ is superior for this purpose.
 64
 65	IMPLEMENTATION:
 66	This implementation limits:
 67	- maximal queue length per flow to 127 packets.
 68	- max mtu to 2^18-1;
 69	- max 65408 flows,
 70	- number of hash buckets to 65536.
 71
 72	It is easy to increase these values, but not in flight.  */
 73
 74#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
 75#define SFQ_DEFAULT_FLOWS	128
 76#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
 77#define SFQ_EMPTY_SLOT		0xffff
 78#define SFQ_DEFAULT_HASH_DIVISOR 1024
 79
 80/* We use 16 bits to store allot, and want to handle packets up to 64K
 81 * Scale allot by 8 (1<<3) so that no overflow occurs.
 82 */
 83#define SFQ_ALLOT_SHIFT		3
 84#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
 85
 86/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
 87typedef u16 sfq_index;
 88
 89/*
 90 * We don't use pointers to save space.
 91 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 92 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 93 * are 'pointers' to dep[] array
 94 */
 95struct sfq_head {
 96	sfq_index	next;
 97	sfq_index	prev;
 98};
 99
100struct sfq_slot {
101	struct sk_buff	*skblist_next;
102	struct sk_buff	*skblist_prev;
103	sfq_index	qlen; /* number of skbs in skblist */
104	sfq_index	next; /* next slot in sfq RR chain */
105	struct sfq_head dep; /* anchor in dep[] chains */
106	unsigned short	hash; /* hash value (index in ht[]) */
107	short		allot; /* credit for this slot */
108
109	unsigned int    backlog;
110	struct red_vars vars;
111};
112
113struct sfq_sched_data {
114/* frequently used fields */
115	int		limit;		/* limit of total number of packets in this qdisc */
116	unsigned int	divisor;	/* number of slots in hash table */
117	u8		headdrop;
118	u8		maxdepth;	/* limit of packets per flow */
119
120	siphash_key_t 	perturbation;
121	u8		cur_depth;	/* depth of longest slot */
122	u8		flags;
123	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
124	struct tcf_proto __rcu *filter_list;
125	struct tcf_block *block;
126	sfq_index	*ht;		/* Hash table ('divisor' slots) */
127	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
128
129	struct red_parms *red_parms;
130	struct tc_sfqred_stats stats;
131	struct sfq_slot *tail;		/* current slot in round */
132
133	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
134					/* Linked lists of slots, indexed by depth
135					 * dep[0] : list of unused flows
136					 * dep[1] : list of flows with 1 packet
137					 * dep[X] : list of flows with X packets
138					 */
139
140	unsigned int	maxflows;	/* number of flows in flows array */
141	int		perturb_period;
142	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
143	struct timer_list perturb_timer;
144	struct Qdisc	*sch;
145};
146
147/*
148 * sfq_head are either in a sfq_slot or in dep[] array
149 */
150static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
151{
152	if (val < SFQ_MAX_FLOWS)
153		return &q->slots[val].dep;
154	return &q->dep[val - SFQ_MAX_FLOWS];
155}
156
157static unsigned int sfq_hash(const struct sfq_sched_data *q,
158			     const struct sk_buff *skb)
159{
160	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
161}
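/* Editor's note: v5 keys the flow hash with a random 128-bit siphash key
 * (q->perturbation) via skb_get_hash_perturb() instead of seeding jhash
 * with a u32. The property SFQ relies on is unchanged: a fixed key maps a
 * flow to a stable bucket, and changing the key reshuffles collisions.
 * Standalone sketch with a toy keyed hash (NOT siphash); all values are
 * made up.
 */
#include <stdio.h>

static unsigned int keyed_hash(unsigned int flow, unsigned long long key)
{
	unsigned long long h = (flow ^ key) * 0x9e3779b97f4a7c15ull;
	return (unsigned int)(h >> 32) & 1023;	/* divisor = 1024 */
}

int main(void)
{
	printf("key A: flow 7 -> bucket %u\n", keyed_hash(7, 0x1111));
	printf("key A: flow 7 -> bucket %u\n", keyed_hash(7, 0x1111));	/* stable */
	printf("key B: flow 7 -> bucket %u\n", keyed_hash(7, 0x2222));	/* reshuffled */
	return 0;
}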
162
163static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
164				 int *qerr)
165{
166	struct sfq_sched_data *q = qdisc_priv(sch);
167	struct tcf_result res;
168	struct tcf_proto *fl;
169	int result;
170
171	if (TC_H_MAJ(skb->priority) == sch->handle &&
172	    TC_H_MIN(skb->priority) > 0 &&
173	    TC_H_MIN(skb->priority) <= q->divisor)
174		return TC_H_MIN(skb->priority);
175
176	fl = rcu_dereference_bh(q->filter_list);
177	if (!fl)
178		return sfq_hash(q, skb) + 1;
179
180	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
181	result = tcf_classify(skb, fl, &res, false);
182	if (result >= 0) {
183#ifdef CONFIG_NET_CLS_ACT
184		switch (result) {
185		case TC_ACT_STOLEN:
186		case TC_ACT_QUEUED:
187		case TC_ACT_TRAP:
188			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
189			fallthrough;
190		case TC_ACT_SHOT:
191			return 0;
192		}
193#endif
194		if (TC_H_MIN(res.classid) <= q->divisor)
195			return TC_H_MIN(res.classid);
196	}
197	return 0;
198}
199
200/*
201 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
202 */
203static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
204{
205	sfq_index p, n;
206	struct sfq_slot *slot = &q->slots[x];
207	int qlen = slot->qlen;
208
209	p = qlen + SFQ_MAX_FLOWS;
210	n = q->dep[qlen].next;
211
212	slot->dep.next = n;
213	slot->dep.prev = p;
214
215	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
216	sfq_dep_head(q, n)->prev = x;
217}
218
219#define sfq_unlink(q, x, n, p)			\
220	do {					\
221		n = q->slots[x].dep.next;	\
222		p = q->slots[x].dep.prev;	\
223		sfq_dep_head(q, p)->next = n;	\
224		sfq_dep_head(q, n)->prev = p;	\
225	} while (0)
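/* Editor's note: the do { } while (0) wrapper added in this version makes
 * the multi-statement macro act as one statement, so it composes with an
 * unbraced if/else. Without the wrapper, 'if (c) sfq_unlink(...); else ...'
 * would not even compile: only the first statement would be guarded and
 * the stray ';' would orphan the else. A minimal standalone demonstration:
 */
#include <stdio.h>

#define TWO_STEPS(x)	do { (x) += 1; (x) *= 2; } while (0)

int main(void)
{
	int a = 0, cond = 0;

	if (cond)
		TWO_STEPS(a);	/* both statements guarded, ';' is harmless */
	else
		a = -1;
	printf("a = %d\n", a);	/* -1 */
	return 0;
}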
226
227
228static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
229{
230	sfq_index p, n;
231	int d;
232
233	sfq_unlink(q, x, n, p);
234
235	d = q->slots[x].qlen--;
236	if (n == p && q->cur_depth == d)
237		q->cur_depth--;
238	sfq_link(q, x);
239}
240
241static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
242{
243	sfq_index p, n;
244	int d;
245
246	sfq_unlink(q, x, n, p);
247
248	d = ++q->slots[x].qlen;
249	if (q->cur_depth < d)
250		q->cur_depth = d;
251	sfq_link(q, x);
252}
253
254/* helper functions : might be changed when/if skb use a standard list_head */
255
256/* remove one skb from tail of slot queue */
257static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
258{
259	struct sk_buff *skb = slot->skblist_prev;
260
261	slot->skblist_prev = skb->prev;
262	skb->prev->next = (struct sk_buff *)slot;
263	skb->next = skb->prev = NULL;
264	return skb;
265}
266
267/* remove one skb from head of slot queue */
268static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
269{
270	struct sk_buff *skb = slot->skblist_next;
271
272	slot->skblist_next = skb->next;
273	skb->next->prev = (struct sk_buff *)slot;
274	skb->next = skb->prev = NULL;
275	return skb;
276}
277
278static inline void slot_queue_init(struct sfq_slot *slot)
279{
280	memset(slot, 0, sizeof(*slot));
281	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
282}
283
284/* add skb to slot queue (tail add) */
285static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
286{
287	skb->prev = slot->skblist_prev;
288	skb->next = (struct sk_buff *)slot;
289	slot->skblist_prev->next = skb;
290	slot->skblist_prev = skb;
291}
292
293static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
294{
295	struct sfq_sched_data *q = qdisc_priv(sch);
296	sfq_index x, d = q->cur_depth;
297	struct sk_buff *skb;
298	unsigned int len;
299	struct sfq_slot *slot;
300
301	/* Queue is full! Find the longest slot and drop tail packet from it */
302	if (d > 1) {
303		x = q->dep[d].next;
304		slot = &q->slots[x];
305drop:
306		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
307		len = qdisc_pkt_len(skb);
308		slot->backlog -= len;
309		sfq_dec(q, x);
310		sch->q.qlen--;
311		qdisc_qstats_backlog_dec(sch, skb);
312		qdisc_drop(skb, sch, to_free);
313		return len;
314	}
315
316	if (d == 1) {
317		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
318		x = q->tail->next;
319		slot = &q->slots[x];
320		q->tail->next = slot->next;
321		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
322		goto drop;
323	}
324
325	return 0;
326}
327
328/* Is ECN parameter configured */
329static int sfq_prob_mark(const struct sfq_sched_data *q)
330{
331	return q->flags & TC_RED_ECN;
332}
333
334/* Should packets over max threshold just be marked */
335static int sfq_hard_mark(const struct sfq_sched_data *q)
336{
337	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
338}
339
340static int sfq_headdrop(const struct sfq_sched_data *q)
341{
342	return q->headdrop;
343}
344
345static int
346sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
347{
348	struct sfq_sched_data *q = qdisc_priv(sch);
349	unsigned int hash, dropped;
350	sfq_index x, qlen;
351	struct sfq_slot *slot;
352	int ret;
353	struct sk_buff *head;
354	int delta;
355
356	hash = sfq_classify(skb, sch, &ret);
357	if (hash == 0) {
358		if (ret & __NET_XMIT_BYPASS)
359			qdisc_qstats_drop(sch);
360		__qdisc_drop(skb, to_free);
361		return ret;
362	}
363	hash--;
364
365	x = q->ht[hash];
366	slot = &q->slots[x];
367	if (x == SFQ_EMPTY_SLOT) {
368		x = q->dep[0].next; /* get a free slot */
369		if (x >= SFQ_MAX_FLOWS)
370			return qdisc_drop(skb, sch, to_free);
371		q->ht[hash] = x;
372		slot = &q->slots[x];
373		slot->hash = hash;
374		slot->backlog = 0; /* should already be 0 anyway... */
375		red_set_vars(&slot->vars);
376		goto enqueue;
377	}
378	if (q->red_parms) {
379		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
380							&slot->vars,
381							slot->backlog);
382		switch (red_action(q->red_parms,
383				   &slot->vars,
384				   slot->vars.qavg)) {
385		case RED_DONT_MARK:
386			break;
387
388		case RED_PROB_MARK:
389			qdisc_qstats_overlimit(sch);
390			if (sfq_prob_mark(q)) {
391				/* We know we have at least one packet in queue */
392				if (sfq_headdrop(q) &&
393				    INET_ECN_set_ce(slot->skblist_next)) {
394					q->stats.prob_mark_head++;
395					break;
396				}
397				if (INET_ECN_set_ce(skb)) {
398					q->stats.prob_mark++;
399					break;
400				}
401			}
402			q->stats.prob_drop++;
403			goto congestion_drop;
404
405		case RED_HARD_MARK:
406			qdisc_qstats_overlimit(sch);
407			if (sfq_hard_mark(q)) {
408				/* We know we have at least one packet in queue */
409				if (sfq_headdrop(q) &&
410				    INET_ECN_set_ce(slot->skblist_next)) {
411					q->stats.forced_mark_head++;
412					break;
413				}
414				if (INET_ECN_set_ce(skb)) {
415					q->stats.forced_mark++;
416					break;
417				}
418			}
419			q->stats.forced_drop++;
420			goto congestion_drop;
421		}
422	}
423
424	if (slot->qlen >= q->maxdepth) {
425congestion_drop:
426		if (!sfq_headdrop(q))
427			return qdisc_drop(skb, sch, to_free);
428
429		/* We know we have at least one packet in queue */
430		head = slot_dequeue_head(slot);
431		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
432		sch->qstats.backlog -= delta;
433		slot->backlog -= delta;
434		qdisc_drop(head, sch, to_free);
435
436		slot_queue_add(slot, skb);
437		qdisc_tree_reduce_backlog(sch, 0, delta);
438		return NET_XMIT_CN;
439	}
440
441enqueue:
442	qdisc_qstats_backlog_inc(sch, skb);
443	slot->backlog += qdisc_pkt_len(skb);
444	slot_queue_add(slot, skb);
445	sfq_inc(q, x);
446	if (slot->qlen == 1) {		/* The flow is new */
447		if (q->tail == NULL) {	/* It is the first flow */
448			slot->next = x;
449		} else {
450			slot->next = q->tail->next;
451			q->tail->next = x;
452		}
453		/* We put this flow at the end of our flow list.
454		 * It might sound unfair for a new flow to wait behind old ones,
455		 * but we could end up servicing only new flows and freezing old ones.
456		 */
457		q->tail = slot;
458		/* We could use a bigger initial quantum for new flows */
459		slot->allot = q->scaled_quantum;
460	}
461	if (++sch->q.qlen <= q->limit)
462		return NET_XMIT_SUCCESS;
463
464	qlen = slot->qlen;
465	dropped = sfq_drop(sch, to_free);
466	/* Return Congestion Notification only if we dropped a packet
467	 * from this flow.
468	 */
469	if (qlen != slot->qlen) {
470		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
471		return NET_XMIT_CN;
472	}
473
474	/* As we dropped a packet, better let upper stack know this */
475	qdisc_tree_reduce_backlog(sch, 1, dropped);
476	return NET_XMIT_SUCCESS;
477}
478
479static struct sk_buff *
480sfq_dequeue(struct Qdisc *sch)
481{
482	struct sfq_sched_data *q = qdisc_priv(sch);
483	struct sk_buff *skb;
484	sfq_index a, next_a;
485	struct sfq_slot *slot;
486
487	/* No active slots */
488	if (q->tail == NULL)
489		return NULL;
490
491next_slot:
492	a = q->tail->next;
493	slot = &q->slots[a];
494	if (slot->allot <= 0) {
495		q->tail = slot;
496		slot->allot += q->scaled_quantum;
497		goto next_slot;
498	}
499	skb = slot_dequeue_head(slot);
500	sfq_dec(q, a);
501	qdisc_bstats_update(sch, skb);
502	sch->q.qlen--;
503	qdisc_qstats_backlog_dec(sch, skb);
504	slot->backlog -= qdisc_pkt_len(skb);
505	/* Is the slot empty? */
506	if (slot->qlen == 0) {
507		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
508		next_a = slot->next;
509		if (a == next_a) {
510			q->tail = NULL; /* no more active slots */
511			return skb;
512		}
513		q->tail->next = next_a;
514	} else {
515		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
516	}
517	return skb;
518}
519
520static void
521sfq_reset(struct Qdisc *sch)
522{
523	struct sk_buff *skb;
524
525	while ((skb = sfq_dequeue(sch)) != NULL)
526		rtnl_kfree_skbs(skb, skb);
527}
528
529/*
530 * When q->perturbation is changed, we rehash all queued skbs
531 * to avoid OOO (Out Of Order) effects.
532 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
533 * counters.
534 */
535static void sfq_rehash(struct Qdisc *sch)
536{
537	struct sfq_sched_data *q = qdisc_priv(sch);
538	struct sk_buff *skb;
539	int i;
540	struct sfq_slot *slot;
541	struct sk_buff_head list;
542	int dropped = 0;
543	unsigned int drop_len = 0;
544
545	__skb_queue_head_init(&list);
546
547	for (i = 0; i < q->maxflows; i++) {
548		slot = &q->slots[i];
549		if (!slot->qlen)
550			continue;
551		while (slot->qlen) {
552			skb = slot_dequeue_head(slot);
553			sfq_dec(q, i);
554			__skb_queue_tail(&list, skb);
555		}
556		slot->backlog = 0;
557		red_set_vars(&slot->vars);
558		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
559	}
560	q->tail = NULL;
561
562	while ((skb = __skb_dequeue(&list)) != NULL) {
563		unsigned int hash = sfq_hash(q, skb);
564		sfq_index x = q->ht[hash];
565
566		slot = &q->slots[x];
567		if (x == SFQ_EMPTY_SLOT) {
568			x = q->dep[0].next; /* get a free slot */
569			if (x >= SFQ_MAX_FLOWS) {
570drop:
571				qdisc_qstats_backlog_dec(sch, skb);
572				drop_len += qdisc_pkt_len(skb);
573				kfree_skb(skb);
574				dropped++;
575				continue;
576			}
577			q->ht[hash] = x;
578			slot = &q->slots[x];
579			slot->hash = hash;
580		}
581		if (slot->qlen >= q->maxdepth)
582			goto drop;
583		slot_queue_add(slot, skb);
584		if (q->red_parms)
585			slot->vars.qavg = red_calc_qavg(q->red_parms,
586							&slot->vars,
587							slot->backlog);
588		slot->backlog += qdisc_pkt_len(skb);
589		sfq_inc(q, x);
590		if (slot->qlen == 1) {		/* The flow is new */
591			if (q->tail == NULL) {	/* It is the first flow */
592				slot->next = x;
593			} else {
594				slot->next = q->tail->next;
595				q->tail->next = x;
596			}
597			q->tail = slot;
598			slot->allot = q->scaled_quantum;
599		}
600	}
601	sch->q.qlen -= dropped;
602	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
603}
604
605static void sfq_perturbation(struct timer_list *t)
606{
607	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
608	struct Qdisc *sch = q->sch;
609	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
610	siphash_key_t nkey;
611
612	get_random_bytes(&nkey, sizeof(nkey));
613	spin_lock(root_lock);
614	q->perturbation = nkey;
615	if (!q->filter_list && q->tail)
616		sfq_rehash(sch);
617	spin_unlock(root_lock);
618
619	if (q->perturb_period)
620		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
621}
622
623static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
624{
625	struct sfq_sched_data *q = qdisc_priv(sch);
626	struct tc_sfq_qopt *ctl = nla_data(opt);
627	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
628	unsigned int qlen, dropped = 0;
629	struct red_parms *p = NULL;
630	struct sk_buff *to_free = NULL;
631	struct sk_buff *tail = NULL;
632
633	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
634		return -EINVAL;
635	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
636		ctl_v1 = nla_data(opt);
637	if (ctl->divisor &&
638	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
639		return -EINVAL;
640
641	/* slot->allot is a short, make sure quantum is not too big. */
642	if (ctl->quantum) {
643		unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum);
644
645		if (scaled <= 0 || scaled > SHRT_MAX)
646			return -EINVAL;
647	}
648
649	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
650					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
651		return -EINVAL;
652	if (ctl_v1 && ctl_v1->qth_min) {
653		p = kmalloc(sizeof(*p), GFP_KERNEL);
654		if (!p)
655			return -ENOMEM;
656	}
657	sch_tree_lock(sch);
658	if (ctl->quantum) {
659		q->quantum = ctl->quantum;
660		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
661	}
662	q->perturb_period = ctl->perturb_period * HZ;
663	if (ctl->flows)
664		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
665	if (ctl->divisor) {
666		q->divisor = ctl->divisor;
667		q->maxflows = min_t(u32, q->maxflows, q->divisor);
668	}
669	if (ctl_v1) {
670		if (ctl_v1->depth)
671			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
672		if (p) {
673			swap(q->red_parms, p);
674			red_set_parms(q->red_parms,
675				      ctl_v1->qth_min, ctl_v1->qth_max,
676				      ctl_v1->Wlog,
677				      ctl_v1->Plog, ctl_v1->Scell_log,
678				      NULL,
679				      ctl_v1->max_P);
680		}
681		q->flags = ctl_v1->flags;
682		q->headdrop = ctl_v1->headdrop;
683	}
684	if (ctl->limit) {
685		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
686		q->maxflows = min_t(u32, q->maxflows, q->limit);
687	}
688
689	qlen = sch->q.qlen;
690	while (sch->q.qlen > q->limit) {
691		dropped += sfq_drop(sch, &to_free);
692		if (!tail)
693			tail = to_free;
694	}
695
696	rtnl_kfree_skbs(to_free, tail);
697	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
698
699	del_timer(&q->perturb_timer);
700	if (q->perturb_period) {
701		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
702		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
703	}
704	sch_tree_unlock(sch);
705	kfree(p);
706	return 0;
707}
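/* Editor's note: worked numbers for the quantum bounds check near the top
 * of sfq_change() (a standalone sketch of the same arithmetic). With
 * SFQ_ALLOT_SHIFT == 3 and a signed 16-bit allot, the largest usable
 * quantum is SHRT_MAX << 3; also note how DIV_ROUND_UP's '+ 7' wraps for
 * huge u32 quanta, which the 'scaled <= 0' test catches.
 */
#include <stdio.h>

#define ALLOT_SIZE(X)	(((X) + 7U) >> 3)	/* DIV_ROUND_UP(X, 1 << 3) */

int main(void)
{
	printf("%u\n", ALLOT_SIZE(262136u));	 /* 32767 == SHRT_MAX: accepted */
	printf("%u\n", ALLOT_SIZE(262144u));	 /* 32768 >  SHRT_MAX: -EINVAL  */
	printf("%u\n", ALLOT_SIZE(0xffffffffu)); /* wraps to 0:        -EINVAL  */
	return 0;
}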
708
709static void *sfq_alloc(size_t sz)
710{
711	return  kvmalloc(sz, GFP_KERNEL);
712}
713
714static void sfq_free(void *addr)
715{
716	kvfree(addr);
717}
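/* Editor's note: kvmalloc()/kvfree() above replace the open-coded helpers
 * seen in the v3.5.6 listing, which tried kmalloc(GFP_KERNEL | __GFP_NOWARN)
 * and fell back to vmalloc(), then chose kfree() or vfree() with
 * is_vmalloc_addr(). kvfree() accepts memory from either allocator.
 */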
718
719static void sfq_destroy(struct Qdisc *sch)
720{
721	struct sfq_sched_data *q = qdisc_priv(sch);
722
723	tcf_block_put(q->block);
724	q->perturb_period = 0;
725	del_timer_sync(&q->perturb_timer);
726	sfq_free(q->ht);
727	sfq_free(q->slots);
728	kfree(q->red_parms);
729}
730
731static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
732		    struct netlink_ext_ack *extack)
733{
734	struct sfq_sched_data *q = qdisc_priv(sch);
735	int i;
736	int err;
737
738	q->sch = sch;
739	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
740
741	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
742	if (err)
743		return err;
744
745	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
746		q->dep[i].next = i + SFQ_MAX_FLOWS;
747		q->dep[i].prev = i + SFQ_MAX_FLOWS;
748	}
749
750	q->limit = SFQ_MAX_DEPTH;
751	q->maxdepth = SFQ_MAX_DEPTH;
752	q->cur_depth = 0;
753	q->tail = NULL;
754	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
755	q->maxflows = SFQ_DEFAULT_FLOWS;
756	q->quantum = psched_mtu(qdisc_dev(sch));
757	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
758	q->perturb_period = 0;
759	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
760
761	if (opt) {
762		int err = sfq_change(sch, opt);
763		if (err)
764			return err;
765	}
766
767	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
768	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
769	if (!q->ht || !q->slots) {
770		/* Note: sfq_destroy() will be called by our caller */
771		return -ENOMEM;
772	}
773
774	for (i = 0; i < q->divisor; i++)
775		q->ht[i] = SFQ_EMPTY_SLOT;
776
777	for (i = 0; i < q->maxflows; i++) {
778		slot_queue_init(&q->slots[i]);
779		sfq_link(q, i);
780	}
781	if (q->limit >= 1)
782		sch->flags |= TCQ_F_CAN_BYPASS;
783	else
784		sch->flags &= ~TCQ_F_CAN_BYPASS;
785	return 0;
786}
787
788static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
789{
790	struct sfq_sched_data *q = qdisc_priv(sch);
791	unsigned char *b = skb_tail_pointer(skb);
792	struct tc_sfq_qopt_v1 opt;
793	struct red_parms *p = q->red_parms;
794
795	memset(&opt, 0, sizeof(opt));
796	opt.v0.quantum	= q->quantum;
797	opt.v0.perturb_period = q->perturb_period / HZ;
798	opt.v0.limit	= q->limit;
799	opt.v0.divisor	= q->divisor;
800	opt.v0.flows	= q->maxflows;
801	opt.depth	= q->maxdepth;
802	opt.headdrop	= q->headdrop;
803
804	if (p) {
805		opt.qth_min	= p->qth_min >> p->Wlog;
806		opt.qth_max	= p->qth_max >> p->Wlog;
807		opt.Wlog	= p->Wlog;
808		opt.Plog	= p->Plog;
809		opt.Scell_log	= p->Scell_log;
810		opt.max_P	= p->max_P;
811	}
812	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
813	opt.flags	= q->flags;
814
815	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
816		goto nla_put_failure;
817
818	return skb->len;
819
820nla_put_failure:
821	nlmsg_trim(skb, b);
822	return -1;
823}
824
825static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
826{
827	return NULL;
828}
829
830static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
831{
832	return 0;
833}
834
835static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
836			      u32 classid)
837{
838	return 0;
839}
840
841static void sfq_unbind(struct Qdisc *q, unsigned long cl)
842{
843}
844
845static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
846				       struct netlink_ext_ack *extack)
847{
848	struct sfq_sched_data *q = qdisc_priv(sch);
849
850	if (cl)
851		return NULL;
852	return q->block;
853}
854
855static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
856			  struct sk_buff *skb, struct tcmsg *tcm)
857{
858	tcm->tcm_handle |= TC_H_MIN(cl);
859	return 0;
860}
861
862static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
863				struct gnet_dump *d)
864{
865	struct sfq_sched_data *q = qdisc_priv(sch);
866	sfq_index idx = q->ht[cl - 1];
867	struct gnet_stats_queue qs = { 0 };
868	struct tc_sfq_xstats xstats = { 0 };
869
870	if (idx != SFQ_EMPTY_SLOT) {
871		const struct sfq_slot *slot = &q->slots[idx];
872
873		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
874		qs.qlen = slot->qlen;
875		qs.backlog = slot->backlog;
876	}
877	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
878		return -1;
879	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
880}
881
882static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
883{
884	struct sfq_sched_data *q = qdisc_priv(sch);
885	unsigned int i;
886
887	if (arg->stop)
888		return;
889
890	for (i = 0; i < q->divisor; i++) {
891		if (q->ht[i] == SFQ_EMPTY_SLOT ||
892		    arg->count < arg->skip) {
893			arg->count++;
894			continue;
895		}
896		if (arg->fn(sch, i + 1, arg) < 0) {
897			arg->stop = 1;
898			break;
899		}
900		arg->count++;
901	}
902}
903
904static const struct Qdisc_class_ops sfq_class_ops = {
905	.leaf		=	sfq_leaf,
906	.find		=	sfq_find,
907	.tcf_block	=	sfq_tcf_block,
908	.bind_tcf	=	sfq_bind,
909	.unbind_tcf	=	sfq_unbind,
910	.dump		=	sfq_dump_class,
911	.dump_stats	=	sfq_dump_class_stats,
912	.walk		=	sfq_walk,
913};
914
915static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
916	.cl_ops		=	&sfq_class_ops,
917	.id		=	"sfq",
918	.priv_size	=	sizeof(struct sfq_sched_data),
919	.enqueue	=	sfq_enqueue,
920	.dequeue	=	sfq_dequeue,
921	.peek		=	qdisc_peek_dequeued,
922	.init		=	sfq_init,
923	.reset		=	sfq_reset,
924	.destroy	=	sfq_destroy,
925	.change		=	NULL,
926	.dump		=	sfq_dump,
927	.owner		=	THIS_MODULE,
928};
929
930static int __init sfq_module_init(void)
931{
932	return register_qdisc(&sfq_qdisc_ops);
933}
934static void __exit sfq_module_exit(void)
935{
936	unregister_qdisc(&sfq_qdisc_ops);
937}
938module_init(sfq_module_init)
939module_exit(sfq_module_exit)
940MODULE_LICENSE("GPL");