// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	to the higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- maximal queue length per flow to 127 packets.
	- max mtu to 2^18-1;
	- max 65408 flows,
	- number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */
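
/* Example usage (assuming a standard iproute2 'tc' binary; syntax per
 * tc-sfq(8), values purely illustrative):
 *
 *	tc qdisc add dev eth0 root handle 1: sfq perturb 10 quantum 1514
 *
 * "perturb" maps to perturb_period below, and "quantum" to the per-round
 * allotment used by the deficit round robin loop in sfq_dequeue().
 */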

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
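
/* Worked example: with SFQ_ALLOT_SHIFT == 3, a 1514-byte packet costs
 * SFQ_ALLOT_SIZE(1514) == DIV_ROUND_UP(1514, 8) == 190 scaled units, and a
 * worst-case 64K packet costs 8192, which still fits in the 16-bit signed
 * 'allot' field of struct sfq_slot below.
 */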

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head	dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t	perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
	/* Linked lists of slots, indexed by depth
	 * dep[0] : list of unused flows
	 * dep[1] : list of flows with 1 packet
	 * dep[X] : list of flows with X packets
	 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
	struct Qdisc	*sch;
};
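
/* The dep[] lists give O(1) access to a flow with the most queued packets:
 * q->dep[q->cur_depth].next is always a deepest slot, which is what
 * sfq_drop() below relies on when the qdisc is full.
 */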

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
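
/* Masking with (q->divisor - 1) assumes the divisor is a power of two;
 * sfq_change() enforces this via is_power_of_2(). With the default divisor
 * of 1024, the mask is 0x3ff, i.e. 10 bits of the perturbed flow hash.
 */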

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
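
/* sfq_dec()/sfq_inc() relink a slot onto the dep[] list matching its new
 * qlen. Note that in sfq_dec() 'd' is the qlen *before* the decrement: if
 * the slot was alone at depth d (n == p after unlinking) and d was the
 * maximum, cur_depth drops by one. A single enqueue or dequeue changes any
 * slot's depth by at most one, so cur_depth stays exact.
 */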

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
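
/* The (struct sk_buff *) casts above work because skblist_next and
 * skblist_prev are the first two fields of struct sfq_slot, mirroring the
 * next/prev pointers at the start of struct sk_buff: the slot itself
 * doubles as the circular list head, much like an sk_buff_head would.
 */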

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop(skb, sch, to_free);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could end up servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}
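
/* This is the deficit round robin step: a slot whose allot is exhausted
 * (<= 0) is skipped, becomes the new tail and gets scaled_quantum of fresh
 * credit for the next round, while a served slot pays
 * SFQ_ALLOT_SIZE(pkt_len) per packet. With the default quantum of one MTU,
 * each active flow therefore sends roughly one full-sized packet per round.
 */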

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	siphash_key_t nkey;

	get_random_bytes(&nkey, sizeof(nkey));
	spin_lock(root_lock);
	q->perturbation = nkey;
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}
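
/* Example of driving the v1 options above from userspace (assuming a recent
 * iproute2; option names per tc-sfq(8), values purely illustrative and worth
 * double-checking against your iproute2 version):
 *
 *	tc qdisc change dev eth0 handle 1: sfq limit 3000 depth 64 \
 *		divisor 2048 headdrop redflowlimit 100000 min 8000 \
 *		max 60000 probability 0.2 ecn
 */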

static void *sfq_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	q->sch = sch;
	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		return err;

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	get_random_bytes(&q->perturbation, sizeof(q->perturbation));

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");