v3.15
  1/*
  2 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
  3 *
  4 *		This program is free software; you can redistribute it and/or
  5 *		modify it under the terms of the GNU General Public License
  6 *		as published by the Free Software Foundation; either version
  7 *		2 of the License, or (at your option) any later version.
  8 *
  9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 10 */
 11
 12#include <linux/module.h>
 13#include <linux/types.h>
 14#include <linux/kernel.h>
 15#include <linux/jiffies.h>
 16#include <linux/string.h>
 17#include <linux/in.h>
 18#include <linux/errno.h>
 19#include <linux/init.h>
 
 20#include <linux/skbuff.h>
 21#include <linux/jhash.h>
 22#include <linux/slab.h>
 23#include <linux/vmalloc.h>
 
 24#include <net/netlink.h>
 25#include <net/pkt_sched.h>
 26#include <net/flow_keys.h>
 27#include <net/red.h>
 28
 29
 30/*	Stochastic Fairness Queuing algorithm.
 31	=======================================
 32
 33	Source:
 34	Paul E. McKenney "Stochastic Fairness Queuing",
 35	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
 36
 37	Paul E. McKenney "Stochastic Fairness Queuing",
 38	"Interworking: Research and Experience", v.2, 1991, p.113-131.
 39
 40
 41	See also:
 42	M. Shreedhar and George Varghese "Efficient Fair
 43	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
 44
 45
 46	This is not the thing that is usually called (W)FQ nowadays.
 47	It does not use any timestamp mechanism, but instead
 48	processes queues in round-robin order.
 49
 50	ADVANTAGE:
 51
 52	- It is very cheap. Both CPU and memory requirements are minimal.
 53
 54	DRAWBACKS:
 55
 56	- "Stochastic" -> It is not 100% fair.
 57	When hash collisions occur, several flows are considered as one.
 58
 59	- "Round-robin" -> It introduces larger delays than virtual clock
 60	based schemes, and should not be used for isolating interactive
 61	traffic	from non-interactive. It means, that this scheduler
 62	should be used as leaf of CBQ or P3, which put interactive traffic
 63	to higher priority band.
 64
 65	We still need true WFQ for top level CSZ, but using WFQ
 66	for the best effort traffic is absolutely pointless:
 67	SFQ is superior for this purpose.
 68
 69	IMPLEMENTATION:
 70	This implementation limits :
 71	- maximal queue length per flow to 127 packets.
 72	- max mtu to 2^18-1;
 73	- max 65408 flows,
 74	- number of hash buckets to 65536.
 75
 76	It is easy to increase these values, but not in flight.  */
 77
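/* Editorial example (not part of the original source): with the defaults
 * set up in sfq_init() below -- 128 flows, a per-flow depth of 127, 1024
 * hash buckets and a quantum of one device MTU -- each packet is hashed on
 * its flow keys into one of the 1024 buckets, and flows whose keys collide
 * in a bucket share a single queue; if a perturb period is configured, the
 * timer re-randomizes the hash and sfq_rehash() redistributes queued skbs.
 */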
 78#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
 79#define SFQ_DEFAULT_FLOWS	128
 80#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
 81#define SFQ_EMPTY_SLOT		0xffff
 82#define SFQ_DEFAULT_HASH_DIVISOR 1024
 83
 84/* We use 16 bits to store allot, and want to handle packets up to 64K
 85 * Scale allot by 8 (1<<3) so that no overflow occurs.
 86 */
 87#define SFQ_ALLOT_SHIFT		3
 88#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
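/* Editorial example (not part of the original source): for a typical
 * Ethernet quantum of 1514 bytes, SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8)
 * = 190 credits per round, and even a 64KB packet costs only
 * SFQ_ALLOT_SIZE(65535) = 8192 credits, so the signed 16-bit 'allot' field
 * in struct sfq_slot has ample headroom.
 */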
 89
 90/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
 91typedef u16 sfq_index;
 92
 93/*
 94 * We dont use pointers to save space.
 95 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 96 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 97 * are 'pointers' to dep[] array
 98 */
 99struct sfq_head {
100	sfq_index	next;
101	sfq_index	prev;
102};
103
104struct sfq_slot {
105	struct sk_buff	*skblist_next;
106	struct sk_buff	*skblist_prev;
107	sfq_index	qlen; /* number of skbs in skblist */
108	sfq_index	next; /* next slot in sfq RR chain */
109	struct sfq_head dep; /* anchor in dep[] chains */
110	unsigned short	hash; /* hash value (index in ht[]) */
111	short		allot; /* credit for this slot */
112
113	unsigned int    backlog;
114	struct red_vars vars;
115};
116
117struct sfq_sched_data {
118/* frequently used fields */
119	int		limit;		/* limit of total number of packets in this qdisc */
120	unsigned int	divisor;	/* number of slots in hash table */
121	u8		headdrop;
122	u8		maxdepth;	/* limit of packets per flow */
123
124	u32		perturbation;
125	u8		cur_depth;	/* depth of longest slot */
126	u8		flags;
127	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128	struct tcf_proto *filter_list;
129	sfq_index	*ht;		/* Hash table ('divisor' slots) */
130	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
131
132	struct red_parms *red_parms;
133	struct tc_sfqred_stats stats;
134	struct sfq_slot *tail;		/* current slot in round */
135
136	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
137					/* Linked lists of slots, indexed by depth
138					 * dep[0] : list of unused flows
139					 * dep[1] : list of flows with 1 packet
140					 * dep[X] : list of flows with X packets
141					 */
142
143	unsigned int	maxflows;	/* number of flows in flows array */
144	int		perturb_period;
145	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
146	struct timer_list perturb_timer;
147};
148
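/*
 * Editorial sketch (hypothetical helper, not part of the original source):
 * because dep[d] heads the list of flows currently holding exactly d
 * packets, one of the longest backlogged flows can be found in O(1), which
 * is what sfq_drop() relies on below.
 */
static inline struct sfq_slot *sfq_longest_slot(struct sfq_sched_data *q)
{
	sfq_index x = q->dep[q->cur_depth].next;

	return &q->slots[x];
}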
149/*
150 * sfq_head are either in a sfq_slot or in dep[] array
151 */
152static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
153{
154	if (val < SFQ_MAX_FLOWS)
155		return &q->slots[val].dep;
156	return &q->dep[val - SFQ_MAX_FLOWS];
157}
158
159/*
160 * In order to be able to quickly rehash our queue when timer changes
161 * q->perturbation, we store flow_keys in skb->cb[]
162 */
163struct sfq_skb_cb {
164       struct flow_keys        keys;
165};
166
167static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
168{
169	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
170	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
171}
172
173static unsigned int sfq_hash(const struct sfq_sched_data *q,
174			     const struct sk_buff *skb)
175{
176	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
177	unsigned int hash;
178
179	hash = jhash_3words((__force u32)keys->dst,
180			    (__force u32)keys->src ^ keys->ip_proto,
181			    (__force u32)keys->ports, q->perturbation);
182	return hash & (q->divisor - 1);
183}
184
185static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
186				 int *qerr)
187{
188	struct sfq_sched_data *q = qdisc_priv(sch);
189	struct tcf_result res;
190	int result;
191
192	if (TC_H_MAJ(skb->priority) == sch->handle &&
193	    TC_H_MIN(skb->priority) > 0 &&
194	    TC_H_MIN(skb->priority) <= q->divisor)
195		return TC_H_MIN(skb->priority);
196
197	if (!q->filter_list) {
198		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
199		return sfq_hash(q, skb) + 1;
200	}
201
202	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
203	result = tc_classify(skb, q->filter_list, &res);
204	if (result >= 0) {
205#ifdef CONFIG_NET_CLS_ACT
206		switch (result) {
207		case TC_ACT_STOLEN:
208		case TC_ACT_QUEUED:
209			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
210		case TC_ACT_SHOT:
211			return 0;
212		}
213#endif
214		if (TC_H_MIN(res.classid) <= q->divisor)
215			return TC_H_MIN(res.classid);
216	}
217	return 0;
218}
219
220/*
221 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
222 */
223static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
224{
225	sfq_index p, n;
226	struct sfq_slot *slot = &q->slots[x];
227	int qlen = slot->qlen;
228
229	p = qlen + SFQ_MAX_FLOWS;
230	n = q->dep[qlen].next;
231
232	slot->dep.next = n;
233	slot->dep.prev = p;
234
235	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
236	sfq_dep_head(q, n)->prev = x;
237}
238
239#define sfq_unlink(q, x, n, p)			\
240	do {					\
241		n = q->slots[x].dep.next;	\
242		p = q->slots[x].dep.prev;	\
243		sfq_dep_head(q, p)->next = n;	\
244		sfq_dep_head(q, n)->prev = p;	\
245	} while (0)
246
247
248static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
249{
250	sfq_index p, n;
251	int d;
252
253	sfq_unlink(q, x, n, p);
254
255	d = q->slots[x].qlen--;
256	if (n == p && q->cur_depth == d)
257		q->cur_depth--;
258	sfq_link(q, x);
259}
260
261static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
262{
263	sfq_index p, n;
264	int d;
265
266	sfq_unlink(q, x, n, p);
267
268	d = ++q->slots[x].qlen;
269	if (q->cur_depth < d)
270		q->cur_depth = d;
271	sfq_link(q, x);
272}
273
274/* helper functions : might be changed when/if skb use a standard list_head */
275
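/* Editorial note (not part of the original source): the slot itself doubles
 * as the list head -- skblist_next/skblist_prev are the first members of
 * struct sfq_slot, mirroring the next/prev pointers at the start of
 * struct sk_buff, so casting the slot to (struct sk_buff *) gives a valid
 * sentinel for this circular list.
 */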
276/* remove one skb from tail of slot queue */
277static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
278{
279	struct sk_buff *skb = slot->skblist_prev;
280
281	slot->skblist_prev = skb->prev;
282	skb->prev->next = (struct sk_buff *)slot;
283	skb->next = skb->prev = NULL;
284	return skb;
285}
286
287/* remove one skb from head of slot queue */
288static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
289{
290	struct sk_buff *skb = slot->skblist_next;
291
292	slot->skblist_next = skb->next;
293	skb->next->prev = (struct sk_buff *)slot;
294	skb->next = skb->prev = NULL;
295	return skb;
296}
297
298static inline void slot_queue_init(struct sfq_slot *slot)
299{
300	memset(slot, 0, sizeof(*slot));
301	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
302}
303
304/* add skb to slot queue (tail add) */
305static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
306{
307	skb->prev = slot->skblist_prev;
308	skb->next = (struct sk_buff *)slot;
309	slot->skblist_prev->next = skb;
310	slot->skblist_prev = skb;
311}
312
313#define	slot_queue_walk(slot, skb)		\
314	for (skb = slot->skblist_next;		\
315	     skb != (struct sk_buff *)slot;	\
316	     skb = skb->next)
317
318static unsigned int sfq_drop(struct Qdisc *sch)
319{
320	struct sfq_sched_data *q = qdisc_priv(sch);
321	sfq_index x, d = q->cur_depth;
322	struct sk_buff *skb;
323	unsigned int len;
324	struct sfq_slot *slot;
325
326	/* Queue is full! Find the longest slot and drop tail packet from it */
327	if (d > 1) {
328		x = q->dep[d].next;
329		slot = &q->slots[x];
330drop:
331		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
332		len = qdisc_pkt_len(skb);
333		slot->backlog -= len;
334		sfq_dec(q, x);
335		kfree_skb(skb);
336		sch->q.qlen--;
337		sch->qstats.drops++;
338		sch->qstats.backlog -= len;
339		return len;
340	}
341
342	if (d == 1) {
343		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
344		x = q->tail->next;
345		slot = &q->slots[x];
346		q->tail->next = slot->next;
347		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
348		goto drop;
349	}
350
351	return 0;
352}
353
354/* Is ECN parameter configured */
355static int sfq_prob_mark(const struct sfq_sched_data *q)
356{
357	return q->flags & TC_RED_ECN;
358}
359
360/* Should packets over max threshold just be marked */
361static int sfq_hard_mark(const struct sfq_sched_data *q)
362{
363	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
364}
365
366static int sfq_headdrop(const struct sfq_sched_data *q)
367{
368	return q->headdrop;
369}
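/* Editorial note (not part of the original source): with TC_RED_ECN set and
 * TC_RED_HARDDROP clear, packets selected by RED are ECN-marked rather than
 * dropped at both the probabilistic threshold (sfq_prob_mark()) and the hard
 * threshold (sfq_hard_mark()); setting TC_RED_HARDDROP turns the
 * hard-threshold case back into a drop.
 */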
370
371static int
372sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
373{
374	struct sfq_sched_data *q = qdisc_priv(sch);
375	unsigned int hash;
376	sfq_index x, qlen;
377	struct sfq_slot *slot;
378	int uninitialized_var(ret);
379	struct sk_buff *head;
380	int delta;
381
382	hash = sfq_classify(skb, sch, &ret);
383	if (hash == 0) {
384		if (ret & __NET_XMIT_BYPASS)
385			sch->qstats.drops++;
386		kfree_skb(skb);
387		return ret;
388	}
389	hash--;
390
391	x = q->ht[hash];
392	slot = &q->slots[x];
393	if (x == SFQ_EMPTY_SLOT) {
394		x = q->dep[0].next; /* get a free slot */
395		if (x >= SFQ_MAX_FLOWS)
396			return qdisc_drop(skb, sch);
397		q->ht[hash] = x;
398		slot = &q->slots[x];
399		slot->hash = hash;
400		slot->backlog = 0; /* should already be 0 anyway... */
401		red_set_vars(&slot->vars);
402		goto enqueue;
403	}
404	if (q->red_parms) {
405		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
406							&slot->vars,
407							slot->backlog);
408		switch (red_action(q->red_parms,
409				   &slot->vars,
410				   slot->vars.qavg)) {
411		case RED_DONT_MARK:
412			break;
413
414		case RED_PROB_MARK:
415			sch->qstats.overlimits++;
416			if (sfq_prob_mark(q)) {
417				/* We know we have at least one packet in queue */
418				if (sfq_headdrop(q) &&
419				    INET_ECN_set_ce(slot->skblist_next)) {
420					q->stats.prob_mark_head++;
421					break;
422				}
423				if (INET_ECN_set_ce(skb)) {
424					q->stats.prob_mark++;
425					break;
426				}
427			}
428			q->stats.prob_drop++;
429			goto congestion_drop;
430
431		case RED_HARD_MARK:
432			sch->qstats.overlimits++;
433			if (sfq_hard_mark(q)) {
434				/* We know we have at least one packet in queue */
435				if (sfq_headdrop(q) &&
436				    INET_ECN_set_ce(slot->skblist_next)) {
437					q->stats.forced_mark_head++;
438					break;
439				}
440				if (INET_ECN_set_ce(skb)) {
441					q->stats.forced_mark++;
442					break;
443				}
444			}
445			q->stats.forced_drop++;
446			goto congestion_drop;
447		}
448	}
449
450	if (slot->qlen >= q->maxdepth) {
451congestion_drop:
452		if (!sfq_headdrop(q))
453			return qdisc_drop(skb, sch);
454
455		/* We know we have at least one packet in queue */
456		head = slot_dequeue_head(slot);
457		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
458		sch->qstats.backlog -= delta;
459		slot->backlog -= delta;
460		qdisc_drop(head, sch);
461
462		slot_queue_add(slot, skb);
463		return NET_XMIT_CN;
464	}
465
466enqueue:
467	sch->qstats.backlog += qdisc_pkt_len(skb);
468	slot->backlog += qdisc_pkt_len(skb);
469	slot_queue_add(slot, skb);
470	sfq_inc(q, x);
471	if (slot->qlen == 1) {		/* The flow is new */
472		if (q->tail == NULL) {	/* It is the first flow */
473			slot->next = x;
474		} else {
475			slot->next = q->tail->next;
476			q->tail->next = x;
477		}
478		/* We put this flow at the end of our flow list.
479		 * This might sound unfair for a new flow to wait after old ones,
480		 * but we could endup servicing new flows only, and freeze old ones.
481		 */
482		q->tail = slot;
483		/* We could use a bigger initial quantum for new flows */
484		slot->allot = q->scaled_quantum;
485	}
486	if (++sch->q.qlen <= q->limit)
487		return NET_XMIT_SUCCESS;
488
489	qlen = slot->qlen;
490	sfq_drop(sch);
491	/* Return Congestion Notification only if we dropped a packet
492	 * from this flow.
493	 */
494	if (qlen != slot->qlen)
495		return NET_XMIT_CN;
496
497	/* As we dropped a packet, better let upper stack know this */
498	qdisc_tree_decrease_qlen(sch, 1);
499	return NET_XMIT_SUCCESS;
500}
501
502static struct sk_buff *
503sfq_dequeue(struct Qdisc *sch)
504{
505	struct sfq_sched_data *q = qdisc_priv(sch);
506	struct sk_buff *skb;
507	sfq_index a, next_a;
508	struct sfq_slot *slot;
509
510	/* No active slots */
511	if (q->tail == NULL)
512		return NULL;
513
514next_slot:
515	a = q->tail->next;
516	slot = &q->slots[a];
517	if (slot->allot <= 0) {
518		q->tail = slot;
519		slot->allot += q->scaled_quantum;
520		goto next_slot;
521	}
522	skb = slot_dequeue_head(slot);
523	sfq_dec(q, a);
524	qdisc_bstats_update(sch, skb);
525	sch->q.qlen--;
526	sch->qstats.backlog -= qdisc_pkt_len(skb);
527	slot->backlog -= qdisc_pkt_len(skb);
528	/* Is the slot empty? */
529	if (slot->qlen == 0) {
530		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
531		next_a = slot->next;
532		if (a == next_a) {
533			q->tail = NULL; /* no more active slots */
534			return skb;
535		}
536		q->tail->next = next_a;
537	} else {
538		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
539	}
540	return skb;
541}
542
543static void
544sfq_reset(struct Qdisc *sch)
545{
546	struct sk_buff *skb;
547
548	while ((skb = sfq_dequeue(sch)) != NULL)
549		kfree_skb(skb);
550}
551
552/*
553 * When q->perturbation is changed, we rehash all queued skbs
554 * to avoid OOO (Out Of Order) effects.
555 * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
556 * counters.
557 */
558static void sfq_rehash(struct Qdisc *sch)
559{
560	struct sfq_sched_data *q = qdisc_priv(sch);
561	struct sk_buff *skb;
562	int i;
563	struct sfq_slot *slot;
564	struct sk_buff_head list;
565	int dropped = 0;
566
567	__skb_queue_head_init(&list);
568
569	for (i = 0; i < q->maxflows; i++) {
570		slot = &q->slots[i];
571		if (!slot->qlen)
572			continue;
573		while (slot->qlen) {
574			skb = slot_dequeue_head(slot);
575			sfq_dec(q, i);
576			__skb_queue_tail(&list, skb);
577		}
578		slot->backlog = 0;
579		red_set_vars(&slot->vars);
580		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
581	}
582	q->tail = NULL;
583
584	while ((skb = __skb_dequeue(&list)) != NULL) {
585		unsigned int hash = sfq_hash(q, skb);
586		sfq_index x = q->ht[hash];
587
588		slot = &q->slots[x];
589		if (x == SFQ_EMPTY_SLOT) {
590			x = q->dep[0].next; /* get a free slot */
591			if (x >= SFQ_MAX_FLOWS) {
592drop:				sch->qstats.backlog -= qdisc_pkt_len(skb);
593				kfree_skb(skb);
594				dropped++;
595				continue;
596			}
597			q->ht[hash] = x;
598			slot = &q->slots[x];
599			slot->hash = hash;
600		}
601		if (slot->qlen >= q->maxdepth)
602			goto drop;
603		slot_queue_add(slot, skb);
604		if (q->red_parms)
605			slot->vars.qavg = red_calc_qavg(q->red_parms,
606							&slot->vars,
607							slot->backlog);
608		slot->backlog += qdisc_pkt_len(skb);
609		sfq_inc(q, x);
610		if (slot->qlen == 1) {		/* The flow is new */
611			if (q->tail == NULL) {	/* It is the first flow */
612				slot->next = x;
613			} else {
614				slot->next = q->tail->next;
615				q->tail->next = x;
616			}
617			q->tail = slot;
618			slot->allot = q->scaled_quantum;
619		}
620	}
621	sch->q.qlen -= dropped;
622	qdisc_tree_decrease_qlen(sch, dropped);
623}
624
625static void sfq_perturbation(unsigned long arg)
626{
627	struct Qdisc *sch = (struct Qdisc *)arg;
628	struct sfq_sched_data *q = qdisc_priv(sch);
629	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
630
631	spin_lock(root_lock);
632	q->perturbation = prandom_u32();
633	if (!q->filter_list && q->tail)
634		sfq_rehash(sch);
635	spin_unlock(root_lock);
636
637	if (q->perturb_period)
638		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
639}
640
641static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
642{
643	struct sfq_sched_data *q = qdisc_priv(sch);
644	struct tc_sfq_qopt *ctl = nla_data(opt);
645	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
646	unsigned int qlen;
647	struct red_parms *p = NULL;
648
649	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
650		return -EINVAL;
651	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
652		ctl_v1 = nla_data(opt);
653	if (ctl->divisor &&
654	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
655		return -EINVAL;
656	if (ctl_v1 && ctl_v1->qth_min) {
657		p = kmalloc(sizeof(*p), GFP_KERNEL);
658		if (!p)
659			return -ENOMEM;
660	}
661	sch_tree_lock(sch);
662	if (ctl->quantum) {
663		q->quantum = ctl->quantum;
664		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
665	}
666	q->perturb_period = ctl->perturb_period * HZ;
667	if (ctl->flows)
668		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
669	if (ctl->divisor) {
670		q->divisor = ctl->divisor;
671		q->maxflows = min_t(u32, q->maxflows, q->divisor);
672	}
673	if (ctl_v1) {
674		if (ctl_v1->depth)
675			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
676		if (p) {
677			swap(q->red_parms, p);
678			red_set_parms(q->red_parms,
679				      ctl_v1->qth_min, ctl_v1->qth_max,
680				      ctl_v1->Wlog,
681				      ctl_v1->Plog, ctl_v1->Scell_log,
682				      NULL,
683				      ctl_v1->max_P);
684		}
685		q->flags = ctl_v1->flags;
686		q->headdrop = ctl_v1->headdrop;
687	}
688	if (ctl->limit) {
689		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
690		q->maxflows = min_t(u32, q->maxflows, q->limit);
691	}
692
693	qlen = sch->q.qlen;
694	while (sch->q.qlen > q->limit)
695		sfq_drop(sch);
696	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
697
698	del_timer(&q->perturb_timer);
699	if (q->perturb_period) {
700		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
701		q->perturbation = prandom_u32();
702	}
703	sch_tree_unlock(sch);
704	kfree(p);
705	return 0;
706}
707
708static void *sfq_alloc(size_t sz)
709{
710	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
711
712	if (!ptr)
713		ptr = vmalloc(sz);
714	return ptr;
715}
716
717static void sfq_free(void *addr)
718{
719	if (addr) {
720		if (is_vmalloc_addr(addr))
721			vfree(addr);
722		else
723			kfree(addr);
724	}
725}
726
727static void sfq_destroy(struct Qdisc *sch)
728{
729	struct sfq_sched_data *q = qdisc_priv(sch);
730
731	tcf_destroy_chain(&q->filter_list);
732	q->perturb_period = 0;
733	del_timer_sync(&q->perturb_timer);
734	sfq_free(q->ht);
735	sfq_free(q->slots);
736	kfree(q->red_parms);
737}
738
739static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
740{
741	struct sfq_sched_data *q = qdisc_priv(sch);
 
742	int i;
743
744	q->perturb_timer.function = sfq_perturbation;
745	q->perturb_timer.data = (unsigned long)sch;
746	init_timer_deferrable(&q->perturb_timer);
747
748	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
749		q->dep[i].next = i + SFQ_MAX_FLOWS;
750		q->dep[i].prev = i + SFQ_MAX_FLOWS;
751	}
752
753	q->limit = SFQ_MAX_DEPTH;
754	q->maxdepth = SFQ_MAX_DEPTH;
755	q->cur_depth = 0;
756	q->tail = NULL;
757	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
758	q->maxflows = SFQ_DEFAULT_FLOWS;
759	q->quantum = psched_mtu(qdisc_dev(sch));
760	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
761	q->perturb_period = 0;
762	q->perturbation = prandom_u32();
763
764	if (opt) {
765		int err = sfq_change(sch, opt);
766		if (err)
767			return err;
768	}
769
770	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
771	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
772	if (!q->ht || !q->slots) {
773		sfq_destroy(sch);
 
774		return -ENOMEM;
775	}
776	for (i = 0; i < q->divisor; i++)
777		q->ht[i] = SFQ_EMPTY_SLOT;
778
779	for (i = 0; i < q->maxflows; i++) {
780		slot_queue_init(&q->slots[i]);
781		sfq_link(q, i);
782	}
783	if (q->limit >= 1)
784		sch->flags |= TCQ_F_CAN_BYPASS;
785	else
786		sch->flags &= ~TCQ_F_CAN_BYPASS;
787	return 0;
788}
789
790static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
791{
792	struct sfq_sched_data *q = qdisc_priv(sch);
793	unsigned char *b = skb_tail_pointer(skb);
794	struct tc_sfq_qopt_v1 opt;
795	struct red_parms *p = q->red_parms;
796
797	memset(&opt, 0, sizeof(opt));
798	opt.v0.quantum	= q->quantum;
799	opt.v0.perturb_period = q->perturb_period / HZ;
800	opt.v0.limit	= q->limit;
801	opt.v0.divisor	= q->divisor;
802	opt.v0.flows	= q->maxflows;
803	opt.depth	= q->maxdepth;
804	opt.headdrop	= q->headdrop;
805
806	if (p) {
807		opt.qth_min	= p->qth_min >> p->Wlog;
808		opt.qth_max	= p->qth_max >> p->Wlog;
809		opt.Wlog	= p->Wlog;
810		opt.Plog	= p->Plog;
811		opt.Scell_log	= p->Scell_log;
812		opt.max_P	= p->max_P;
813	}
814	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
815	opt.flags	= q->flags;
816
817	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
818		goto nla_put_failure;
819
820	return skb->len;
821
822nla_put_failure:
823	nlmsg_trim(skb, b);
824	return -1;
825}
826
827static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
828{
829	return NULL;
830}
831
832static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
833{
834	return 0;
835}
836
837static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
838			      u32 classid)
839{
840	/* we cannot bypass queue discipline anymore */
841	sch->flags &= ~TCQ_F_CAN_BYPASS;
842	return 0;
843}
844
845static void sfq_put(struct Qdisc *q, unsigned long cl)
846{
847}
848
849static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
850{
851	struct sfq_sched_data *q = qdisc_priv(sch);
852
853	if (cl)
854		return NULL;
855	return &q->filter_list;
856}
857
858static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
859			  struct sk_buff *skb, struct tcmsg *tcm)
860{
861	tcm->tcm_handle |= TC_H_MIN(cl);
862	return 0;
863}
864
865static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
866				struct gnet_dump *d)
867{
868	struct sfq_sched_data *q = qdisc_priv(sch);
869	sfq_index idx = q->ht[cl - 1];
870	struct gnet_stats_queue qs = { 0 };
871	struct tc_sfq_xstats xstats = { 0 };
 
872
873	if (idx != SFQ_EMPTY_SLOT) {
874		const struct sfq_slot *slot = &q->slots[idx];
875
876		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
877		qs.qlen = slot->qlen;
878		qs.backlog = slot->backlog;
 
879	}
880	if (gnet_stats_copy_queue(d, &qs) < 0)
881		return -1;
882	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
883}
884
885static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
886{
887	struct sfq_sched_data *q = qdisc_priv(sch);
888	unsigned int i;
889
890	if (arg->stop)
891		return;
892
893	for (i = 0; i < q->divisor; i++) {
894		if (q->ht[i] == SFQ_EMPTY_SLOT ||
895		    arg->count < arg->skip) {
896			arg->count++;
897			continue;
898		}
899		if (arg->fn(sch, i + 1, arg) < 0) {
900			arg->stop = 1;
901			break;
902		}
903		arg->count++;
904	}
905}
906
907static const struct Qdisc_class_ops sfq_class_ops = {
908	.leaf		=	sfq_leaf,
909	.get		=	sfq_get,
910	.put		=	sfq_put,
911	.tcf_chain	=	sfq_find_tcf,
912	.bind_tcf	=	sfq_bind,
913	.unbind_tcf	=	sfq_put,
914	.dump		=	sfq_dump_class,
915	.dump_stats	=	sfq_dump_class_stats,
916	.walk		=	sfq_walk,
917};
918
919static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
920	.cl_ops		=	&sfq_class_ops,
921	.id		=	"sfq",
922	.priv_size	=	sizeof(struct sfq_sched_data),
923	.enqueue	=	sfq_enqueue,
924	.dequeue	=	sfq_dequeue,
925	.peek		=	qdisc_peek_dequeued,
926	.drop		=	sfq_drop,
927	.init		=	sfq_init,
928	.reset		=	sfq_reset,
929	.destroy	=	sfq_destroy,
930	.change		=	NULL,
931	.dump		=	sfq_dump,
932	.owner		=	THIS_MODULE,
933};
934
935static int __init sfq_module_init(void)
936{
937	return register_qdisc(&sfq_qdisc_ops);
938}
939static void __exit sfq_module_exit(void)
940{
941	unregister_qdisc(&sfq_qdisc_ops);
942}
943module_init(sfq_module_init)
944module_exit(sfq_module_exit)
945MODULE_LICENSE("GPL");
v3.1
  1/*
  2 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
  3 *
  4 *		This program is free software; you can redistribute it and/or
  5 *		modify it under the terms of the GNU General Public License
  6 *		as published by the Free Software Foundation; either version
  7 *		2 of the License, or (at your option) any later version.
  8 *
  9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 10 */
 11
 12#include <linux/module.h>
 13#include <linux/types.h>
 14#include <linux/kernel.h>
 15#include <linux/jiffies.h>
 16#include <linux/string.h>
 17#include <linux/in.h>
 18#include <linux/errno.h>
 19#include <linux/init.h>
 20#include <linux/ipv6.h>
 21#include <linux/skbuff.h>
 22#include <linux/jhash.h>
 23#include <linux/slab.h>
 24#include <linux/vmalloc.h>
 25#include <net/ip.h>
 26#include <net/netlink.h>
 27#include <net/pkt_sched.h>
 28
 29
 30/*	Stochastic Fairness Queuing algorithm.
 31	=======================================
 32
 33	Source:
 34	Paul E. McKenney "Stochastic Fairness Queuing",
 35	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
 36
 37	Paul E. McKenney "Stochastic Fairness Queuing",
 38	"Interworking: Research and Experience", v.2, 1991, p.113-131.
 39
 40
 41	See also:
 42	M. Shreedhar and George Varghese "Efficient Fair
 43	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
 44
 45
 46	This is not the thing that is usually called (W)FQ nowadays.
 47	It does not use any timestamp mechanism, but instead
 48	processes queues in round-robin order.
 49
 50	ADVANTAGE:
 51
 52	- It is very cheap. Both CPU and memory requirements are minimal.
 53
 54	DRAWBACKS:
 55
 56	- "Stochastic" -> It is not 100% fair.
 57	When hash collisions occur, several flows are considered as one.
 58
 59	- "Round-robin" -> It introduces larger delays than virtual clock
 60	based schemes, and should not be used for isolating interactive
 61	traffic	from non-interactive. It means, that this scheduler
 62	should be used as leaf of CBQ or P3, which put interactive traffic
 63	to higher priority band.
 64
 65	We still need true WFQ for top level CSZ, but using WFQ
 66	for the best effort traffic is absolutely pointless:
 67	SFQ is superior for this purpose.
 68
 69	IMPLEMENTATION:
 70	This implementation limits maximal queue length to 128;
 71	max mtu to 2^18-1; max 128 flows, number of hash buckets to 1024.
 72	The only goal of this restrictions was that all data
 73	fit into one 4K page on 32bit arches.
 
 74
 75	It is easy to increase these values, but not in flight.  */
 76
 77#define SFQ_DEPTH		128 /* max number of packets per flow */
 78#define SFQ_SLOTS		128 /* max number of flows */
 79#define SFQ_EMPTY_SLOT		255
 
 80#define SFQ_DEFAULT_HASH_DIVISOR 1024
 81
 82/* We use 16 bits to store allot, and want to handle packets up to 64K
 83 * Scale allot by 8 (1<<3) so that no overflow occurs.
 84 */
 85#define SFQ_ALLOT_SHIFT		3
 86#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
 87
 88/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */
 89typedef unsigned char sfq_index;
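/* Editorial example (not part of the original source): SFQ_SLOTS + SFQ_DEPTH
 * = 256, so every slot index (0..127), every dep[] 'pointer' (128..255) and
 * the SFQ_EMPTY_SLOT sentinel (255) fits in this unsigned char type.
 */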
 90
 91/*
 92 * We dont use pointers to save space.
 93 * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array
 94 * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
 95 * are 'pointers' to dep[] array
 96 */
 97struct sfq_head {
 98	sfq_index	next;
 99	sfq_index	prev;
100};
101
102struct sfq_slot {
103	struct sk_buff	*skblist_next;
104	struct sk_buff	*skblist_prev;
105	sfq_index	qlen; /* number of skbs in skblist */
106	sfq_index	next; /* next slot in sfq chain */
107	struct sfq_head dep; /* anchor in dep[] chains */
108	unsigned short	hash; /* hash value (index in ht[]) */
109	short		allot; /* credit for this slot */
110};
111
112struct sfq_sched_data {
113/* Parameters */
114	int		perturb_period;
115	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
116	int		limit;
117	unsigned int	divisor;	/* number of slots in hash table */
118/* Variables */
119	struct tcf_proto *filter_list;
120	struct timer_list perturb_timer;
121	u32		perturbation;
122	sfq_index	cur_depth;	/* depth of longest slot */
 
123	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
124	struct sfq_slot *tail;		/* current slot in round */
125	sfq_index	*ht;		/* Hash table (divisor slots) */
126	struct sfq_slot	slots[SFQ_SLOTS];
127	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
128};
129
130/*
131 * sfq_head are either in a sfq_slot or in dep[] array
132 */
133static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
134{
135	if (val < SFQ_SLOTS)
136		return &q->slots[val].dep;
137	return &q->dep[val - SFQ_SLOTS];
138}
139
140static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
141{
142	return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
 
143}
144
145static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 
146{
147	u32 h, h2;
148
149	switch (skb->protocol) {
150	case htons(ETH_P_IP):
151	{
152		const struct iphdr *iph;
153		int poff;
154
155		if (!pskb_network_may_pull(skb, sizeof(*iph)))
156			goto err;
157		iph = ip_hdr(skb);
158		h = (__force u32)iph->daddr;
159		h2 = (__force u32)iph->saddr ^ iph->protocol;
160		if (ip_is_fragment(iph))
161			break;
162		poff = proto_ports_offset(iph->protocol);
163		if (poff >= 0 &&
164		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
165			iph = ip_hdr(skb);
166			h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
167		}
168		break;
169	}
170	case htons(ETH_P_IPV6):
171	{
172		const struct ipv6hdr *iph;
173		int poff;
174
175		if (!pskb_network_may_pull(skb, sizeof(*iph)))
176			goto err;
177		iph = ipv6_hdr(skb);
178		h = (__force u32)iph->daddr.s6_addr32[3];
179		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
180		poff = proto_ports_offset(iph->nexthdr);
181		if (poff >= 0 &&
182		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
183			iph = ipv6_hdr(skb);
184			h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
185		}
186		break;
187	}
188	default:
189err:
190		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
191		h2 = (unsigned long)skb->sk;
192	}
193
194	return sfq_fold_hash(q, h, h2);
195}
196
197static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
198				 int *qerr)
199{
200	struct sfq_sched_data *q = qdisc_priv(sch);
201	struct tcf_result res;
202	int result;
203
204	if (TC_H_MAJ(skb->priority) == sch->handle &&
205	    TC_H_MIN(skb->priority) > 0 &&
206	    TC_H_MIN(skb->priority) <= q->divisor)
207		return TC_H_MIN(skb->priority);
208
209	if (!q->filter_list)
 
210		return sfq_hash(q, skb) + 1;
 
211
212	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
213	result = tc_classify(skb, q->filter_list, &res);
214	if (result >= 0) {
215#ifdef CONFIG_NET_CLS_ACT
216		switch (result) {
217		case TC_ACT_STOLEN:
218		case TC_ACT_QUEUED:
219			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
220		case TC_ACT_SHOT:
221			return 0;
222		}
223#endif
224		if (TC_H_MIN(res.classid) <= q->divisor)
225			return TC_H_MIN(res.classid);
226	}
227	return 0;
228}
229
230/*
231 * x : slot number [0 .. SFQ_SLOTS - 1]
232 */
233static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
234{
235	sfq_index p, n;
236	int qlen = q->slots[x].qlen;
 
237
238	p = qlen + SFQ_SLOTS;
239	n = q->dep[qlen].next;
240
241	q->slots[x].dep.next = n;
242	q->slots[x].dep.prev = p;
243
244	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
245	sfq_dep_head(q, n)->prev = x;
246}
247
248#define sfq_unlink(q, x, n, p)			\
249	n = q->slots[x].dep.next;		\
250	p = q->slots[x].dep.prev;		\
251	sfq_dep_head(q, p)->next = n;		\
252	sfq_dep_head(q, n)->prev = p
253
254
255static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
256{
257	sfq_index p, n;
258	int d;
259
260	sfq_unlink(q, x, n, p);
261
262	d = q->slots[x].qlen--;
263	if (n == p && q->cur_depth == d)
264		q->cur_depth--;
265	sfq_link(q, x);
266}
267
268static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
269{
270	sfq_index p, n;
271	int d;
272
273	sfq_unlink(q, x, n, p);
274
275	d = ++q->slots[x].qlen;
276	if (q->cur_depth < d)
277		q->cur_depth = d;
278	sfq_link(q, x);
279}
280
281/* helper functions : might be changed when/if skb use a standard list_head */
282
283/* remove one skb from tail of slot queue */
284static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
285{
286	struct sk_buff *skb = slot->skblist_prev;
287
288	slot->skblist_prev = skb->prev;
289	skb->prev->next = (struct sk_buff *)slot;
290	skb->next = skb->prev = NULL;
291	return skb;
292}
293
294/* remove one skb from head of slot queue */
295static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
296{
297	struct sk_buff *skb = slot->skblist_next;
298
299	slot->skblist_next = skb->next;
300	skb->next->prev = (struct sk_buff *)slot;
301	skb->next = skb->prev = NULL;
302	return skb;
303}
304
305static inline void slot_queue_init(struct sfq_slot *slot)
306{
 
307	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
308}
309
310/* add skb to slot queue (tail add) */
311static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
312{
313	skb->prev = slot->skblist_prev;
314	skb->next = (struct sk_buff *)slot;
315	slot->skblist_prev->next = skb;
316	slot->skblist_prev = skb;
317}
318
319#define	slot_queue_walk(slot, skb)		\
320	for (skb = slot->skblist_next;		\
321	     skb != (struct sk_buff *)slot;	\
322	     skb = skb->next)
323
324static unsigned int sfq_drop(struct Qdisc *sch)
325{
326	struct sfq_sched_data *q = qdisc_priv(sch);
327	sfq_index x, d = q->cur_depth;
328	struct sk_buff *skb;
329	unsigned int len;
330	struct sfq_slot *slot;
331
332	/* Queue is full! Find the longest slot and drop tail packet from it */
333	if (d > 1) {
334		x = q->dep[d].next;
335		slot = &q->slots[x];
336drop:
337		skb = slot_dequeue_tail(slot);
338		len = qdisc_pkt_len(skb);
 
339		sfq_dec(q, x);
340		kfree_skb(skb);
341		sch->q.qlen--;
342		sch->qstats.drops++;
343		sch->qstats.backlog -= len;
344		return len;
345	}
346
347	if (d == 1) {
348		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
349		x = q->tail->next;
350		slot = &q->slots[x];
351		q->tail->next = slot->next;
352		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
353		goto drop;
354	}
355
356	return 0;
357}
358
359static int
360sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
361{
362	struct sfq_sched_data *q = qdisc_priv(sch);
363	unsigned int hash;
364	sfq_index x, qlen;
365	struct sfq_slot *slot;
366	int uninitialized_var(ret);
367
368	hash = sfq_classify(skb, sch, &ret);
369	if (hash == 0) {
370		if (ret & __NET_XMIT_BYPASS)
371			sch->qstats.drops++;
372		kfree_skb(skb);
373		return ret;
374	}
375	hash--;
376
377	x = q->ht[hash];
378	slot = &q->slots[x];
379	if (x == SFQ_EMPTY_SLOT) {
380		x = q->dep[0].next; /* get a free slot */
381		q->ht[hash] = x;
382		slot = &q->slots[x];
383		slot->hash = hash;
384	}
385
386	/* If selected queue has length q->limit, do simple tail drop,
387	 * i.e. drop _this_ packet.
388	 */
389	if (slot->qlen >= q->limit)
390		return qdisc_drop(skb, sch);
391
 
392	sch->qstats.backlog += qdisc_pkt_len(skb);
 
393	slot_queue_add(slot, skb);
394	sfq_inc(q, x);
395	if (slot->qlen == 1) {		/* The flow is new */
396		if (q->tail == NULL) {	/* It is the first flow */
397			slot->next = x;
398		} else {
399			slot->next = q->tail->next;
400			q->tail->next = x;
401		}
402		q->tail = slot;
 
403		slot->allot = q->scaled_quantum;
404	}
405	if (++sch->q.qlen <= q->limit)
406		return NET_XMIT_SUCCESS;
407
408	qlen = slot->qlen;
409	sfq_drop(sch);
410	/* Return Congestion Notification only if we dropped a packet
411	 * from this flow.
412	 */
413	if (qlen != slot->qlen)
414		return NET_XMIT_CN;
415
416	/* As we dropped a packet, better let upper stack know this */
417	qdisc_tree_decrease_qlen(sch, 1);
418	return NET_XMIT_SUCCESS;
419}
420
421static struct sk_buff *
422sfq_dequeue(struct Qdisc *sch)
423{
424	struct sfq_sched_data *q = qdisc_priv(sch);
425	struct sk_buff *skb;
426	sfq_index a, next_a;
427	struct sfq_slot *slot;
428
429	/* No active slots */
430	if (q->tail == NULL)
431		return NULL;
432
433next_slot:
434	a = q->tail->next;
435	slot = &q->slots[a];
436	if (slot->allot <= 0) {
437		q->tail = slot;
438		slot->allot += q->scaled_quantum;
439		goto next_slot;
440	}
441	skb = slot_dequeue_head(slot);
442	sfq_dec(q, a);
443	qdisc_bstats_update(sch, skb);
444	sch->q.qlen--;
445	sch->qstats.backlog -= qdisc_pkt_len(skb);
446
447	/* Is the slot empty? */
448	if (slot->qlen == 0) {
449		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
450		next_a = slot->next;
451		if (a == next_a) {
452			q->tail = NULL; /* no more active slots */
453			return skb;
454		}
455		q->tail->next = next_a;
456	} else {
457		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
458	}
459	return skb;
460}
461
462static void
463sfq_reset(struct Qdisc *sch)
464{
465	struct sk_buff *skb;
466
467	while ((skb = sfq_dequeue(sch)) != NULL)
468		kfree_skb(skb);
469}
470
471static void sfq_perturbation(unsigned long arg)
472{
473	struct Qdisc *sch = (struct Qdisc *)arg;
474	struct sfq_sched_data *q = qdisc_priv(sch);
 
475
476	q->perturbation = net_random();
477
478	if (q->perturb_period)
479		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
480}
481
482static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
483{
484	struct sfq_sched_data *q = qdisc_priv(sch);
485	struct tc_sfq_qopt *ctl = nla_data(opt);
 
486	unsigned int qlen;
 
487
488	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
489		return -EINVAL;
490
 
491	if (ctl->divisor &&
492	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
493		return -EINVAL;
494
495	sch_tree_lock(sch);
496	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
497	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
498	q->perturb_period = ctl->perturb_period * HZ;
499	if (ctl->limit)
500		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
501	if (ctl->divisor)
502		q->divisor = ctl->divisor;
503	qlen = sch->q.qlen;
504	while (sch->q.qlen > q->limit)
505		sfq_drop(sch);
506	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
507
508	del_timer(&q->perturb_timer);
509	if (q->perturb_period) {
510		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
511		q->perturbation = net_random();
512	}
513	sch_tree_unlock(sch);
 
514	return 0;
515}
516
517static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
518{
519	struct sfq_sched_data *q = qdisc_priv(sch);
520	size_t sz;
521	int i;
522
523	q->perturb_timer.function = sfq_perturbation;
524	q->perturb_timer.data = (unsigned long)sch;
525	init_timer_deferrable(&q->perturb_timer);
526
527	for (i = 0; i < SFQ_DEPTH; i++) {
528		q->dep[i].next = i + SFQ_SLOTS;
529		q->dep[i].prev = i + SFQ_SLOTS;
530	}
531
532	q->limit = SFQ_DEPTH - 1;
 
533	q->cur_depth = 0;
534	q->tail = NULL;
535	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
536	if (opt == NULL) {
537		q->quantum = psched_mtu(qdisc_dev(sch));
538		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
539		q->perturb_period = 0;
540		q->perturbation = net_random();
541	} else {
 
542		int err = sfq_change(sch, opt);
543		if (err)
544			return err;
545	}
546
547	sz = sizeof(q->ht[0]) * q->divisor;
548	q->ht = kmalloc(sz, GFP_KERNEL);
549	if (!q->ht && sz > PAGE_SIZE)
550		q->ht = vmalloc(sz);
551	if (!q->ht)
552		return -ENOMEM;
 
553	for (i = 0; i < q->divisor; i++)
554		q->ht[i] = SFQ_EMPTY_SLOT;
555
556	for (i = 0; i < SFQ_SLOTS; i++) {
557		slot_queue_init(&q->slots[i]);
558		sfq_link(q, i);
559	}
560	if (q->limit >= 1)
561		sch->flags |= TCQ_F_CAN_BYPASS;
562	else
563		sch->flags &= ~TCQ_F_CAN_BYPASS;
564	return 0;
565}
566
567static void sfq_destroy(struct Qdisc *sch)
568{
569	struct sfq_sched_data *q = qdisc_priv(sch);
570
571	tcf_destroy_chain(&q->filter_list);
572	q->perturb_period = 0;
573	del_timer_sync(&q->perturb_timer);
574	if (is_vmalloc_addr(q->ht))
575		vfree(q->ht);
576	else
577		kfree(q->ht);
578}
579
580static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
581{
582	struct sfq_sched_data *q = qdisc_priv(sch);
583	unsigned char *b = skb_tail_pointer(skb);
584	struct tc_sfq_qopt opt;
 
585
586	opt.quantum = q->quantum;
587	opt.perturb_period = q->perturb_period / HZ;
588
589	opt.limit = q->limit;
590	opt.divisor = q->divisor;
591	opt.flows = q->limit;
592
593	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 
594
595	return skb->len;
596
597nla_put_failure:
598	nlmsg_trim(skb, b);
599	return -1;
600}
601
602static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
603{
604	return NULL;
605}
606
607static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
608{
609	return 0;
610}
611
612static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
613			      u32 classid)
614{
615	/* we cannot bypass queue discipline anymore */
616	sch->flags &= ~TCQ_F_CAN_BYPASS;
617	return 0;
618}
619
620static void sfq_put(struct Qdisc *q, unsigned long cl)
621{
622}
623
624static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
625{
626	struct sfq_sched_data *q = qdisc_priv(sch);
627
628	if (cl)
629		return NULL;
630	return &q->filter_list;
631}
632
633static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
634			  struct sk_buff *skb, struct tcmsg *tcm)
635{
636	tcm->tcm_handle |= TC_H_MIN(cl);
637	return 0;
638}
639
640static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
641				struct gnet_dump *d)
642{
643	struct sfq_sched_data *q = qdisc_priv(sch);
644	sfq_index idx = q->ht[cl - 1];
645	struct gnet_stats_queue qs = { 0 };
646	struct tc_sfq_xstats xstats = { 0 };
647	struct sk_buff *skb;
648
649	if (idx != SFQ_EMPTY_SLOT) {
650		const struct sfq_slot *slot = &q->slots[idx];
651
652		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
653		qs.qlen = slot->qlen;
654		slot_queue_walk(slot, skb)
655			qs.backlog += qdisc_pkt_len(skb);
656	}
657	if (gnet_stats_copy_queue(d, &qs) < 0)
658		return -1;
659	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
660}
661
662static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
663{
664	struct sfq_sched_data *q = qdisc_priv(sch);
665	unsigned int i;
666
667	if (arg->stop)
668		return;
669
670	for (i = 0; i < q->divisor; i++) {
671		if (q->ht[i] == SFQ_EMPTY_SLOT ||
672		    arg->count < arg->skip) {
673			arg->count++;
674			continue;
675		}
676		if (arg->fn(sch, i + 1, arg) < 0) {
677			arg->stop = 1;
678			break;
679		}
680		arg->count++;
681	}
682}
683
684static const struct Qdisc_class_ops sfq_class_ops = {
685	.leaf		=	sfq_leaf,
686	.get		=	sfq_get,
687	.put		=	sfq_put,
688	.tcf_chain	=	sfq_find_tcf,
689	.bind_tcf	=	sfq_bind,
690	.unbind_tcf	=	sfq_put,
691	.dump		=	sfq_dump_class,
692	.dump_stats	=	sfq_dump_class_stats,
693	.walk		=	sfq_walk,
694};
695
696static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
697	.cl_ops		=	&sfq_class_ops,
698	.id		=	"sfq",
699	.priv_size	=	sizeof(struct sfq_sched_data),
700	.enqueue	=	sfq_enqueue,
701	.dequeue	=	sfq_dequeue,
702	.peek		=	qdisc_peek_dequeued,
703	.drop		=	sfq_drop,
704	.init		=	sfq_init,
705	.reset		=	sfq_reset,
706	.destroy	=	sfq_destroy,
707	.change		=	NULL,
708	.dump		=	sfq_dump,
709	.owner		=	THIS_MODULE,
710};
711
712static int __init sfq_module_init(void)
713{
714	return register_qdisc(&sfq_qdisc_ops);
715}
716static void __exit sfq_module_exit(void)
717{
718	unregister_qdisc(&sfq_qdisc_ops);
719}
720module_init(sfq_module_init)
721module_exit(sfq_module_exit)
722MODULE_LICENSE("GPL");