/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>


/* Stochastic Fairness Queuing algorithm.
   =======================================

   Source:
   Paul E. McKenney "Stochastic Fairness Queuing",
   IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

   Paul E. McKenney "Stochastic Fairness Queuing",
   "Interworking: Research and Experience", v.2, 1991, p.113-131.


   See also:
   M. Shreedhar and George Varghese "Efficient Fair
   Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


   This is not the thing that is usually called (W)FQ nowadays.
   It does not use any timestamp mechanism, but instead
   processes queues in round-robin order.

   ADVANTAGE:

   - It is very cheap. Both CPU and memory requirements are minimal.

   DRAWBACKS:

   - "Stochastic" -> It is not 100% fair.
     When hash collisions occur, several flows are considered as one.

   - "Round-robin" -> It introduces larger delays than virtual clock
     based schemes, and should not be used for isolating interactive
     traffic from non-interactive traffic. This means that this scheduler
     should be used as a leaf of CBQ or P3, which put interactive traffic
     into a higher priority band.

   We still need true WFQ for the top level CSZ, but using WFQ
   for best-effort traffic is absolutely pointless:
   SFQ is superior for this purpose.

   IMPLEMENTATION:
   This implementation limits:
   - the maximal queue length per flow to 127 packets,
   - the max mtu to 2^18-1,
   - the number of flows to at most 65408,
   - the number of hash buckets to 65536.

   It is easy to increase these values, but not in flight.  */
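
/* Editor's note, illustrative user-space usage (not part of this file):
 * SFQ is typically attached with the tc utility from iproute2, e.g.
 *
 *	tc qdisc add dev eth0 root sfq perturb 10 quantum 1514 limit 127
 *
 * where "perturb" is the hash perturbation period in seconds, "quantum" the
 * per-round byte allotment and "limit" the total packet limit; exact option
 * names and defaults depend on the iproute2 version in use.
 */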

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024
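
/* Worked numbers (editor's note, derived from the definitions above):
 * SFQ_MAX_FLOWS = 0x10000 - 127 - 1 = 65408, matching the "65408 flows"
 * quoted in the header comment. A 16-bit sfq_index therefore covers the
 * 65408 flow slots plus the SFQ_MAX_DEPTH + 1 = 128 dep[] anchors that
 * follow them (see the typedef comment below); SFQ_EMPTY_SLOT (0xffff) is
 * used only as a sentinel value stored in ht[].
 */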

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
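
/* Worked example (editor's note): SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8)
 * = 190 scaled units for a full-size Ethernet frame, and even a 64K packet
 * costs only DIV_ROUND_UP(65535, 8) = 8192 units, which fits comfortably in
 * the signed 16-bit 'allot' field of struct sfq_slot.
 */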

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen;		/* number of skbs in skblist */
	sfq_index	next;		/* next slot in sfq RR chain */
	struct sfq_head	dep;		/* anchor in dep[] chains */
	unsigned short	hash;		/* hash value (index in ht[]) */
	short		allot;		/* credit for this slot */

	unsigned int	backlog;
	struct red_vars	vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum;	/* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot	*tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
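
/* Example (editor's note): with SFQ_MAX_FLOWS = 65408, val = 5 resolves to
 * &q->slots[5].dep, while val = 65408 + 3 resolves to &q->dep[3], i.e. the
 * anchor of the list of flows currently holding 3 packets.
 */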

/*
 * In order to be able to quickly rehash our queue when the timer changes
 * q->perturbation, we store flow_keys in skb->cb[]
 */
struct sfq_skb_cb {
	struct flow_keys	keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
	unsigned int hash;

	hash = jhash_3words((__force u32)keys->dst,
			    (__force u32)keys->src ^ keys->ip_proto,
			    (__force u32)keys->ports, q->perturbation);
	return hash & (q->divisor - 1);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list) {
		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
		return sfq_hash(q, skb) + 1;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
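
/* Invariant maintained by sfq_link()/sfq_inc()/sfq_dec() (editor's note):
 * a slot holding k packets is always linked on the q->dep[k] list, with
 * dep[0] acting as the free list. q->cur_depth tracks the largest occupied
 * depth, so sfq_drop() can locate a longest flow in O(1) as
 * q->dep[q->cur_depth].next.
 */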

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

#define slot_queue_walk(slot, skb)		\
	for (skb = slot->skblist_next;		\
	     skb != (struct sk_buff *)slot;	\
	     skb = skb->next)
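
/* Editor's note: the per-slot skb queue is a circular doubly linked list in
 * which the slot itself, cast to (struct sk_buff *), acts as the sentinel
 * head node. An empty slot therefore has
 * slot->skblist_next == slot->skblist_prev == (struct sk_buff *)slot,
 * which is exactly what slot_queue_init() sets up and what
 * slot_queue_walk() uses as its termination condition.
 */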

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		kfree_skb(skb);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
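
/* Editor's note on the flag combinations above: with TC_RED_ECN set, packets
 * hitting the probabilistic RED threshold are ECN-marked instead of dropped
 * (sfq_prob_mark()); packets over the hard maximum are marked only when
 * TC_RED_HARDDROP is not also set (sfq_hard_mark()), otherwise they are
 * dropped.
 */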

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			sch->qstats.overlimits++;
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			sch->qstats.overlimits++;
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	sch->qstats.backlog += qdisc_pkt_len(skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could end up servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}
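
/* Worked example of the deficit round-robin above (editor's note): with
 * quantum = 1514 the scaled quantum is SFQ_ALLOT_SIZE(1514) = 190. Each
 * dequeued packet costs SFQ_ALLOT_SIZE(len) credits from the slot's allot;
 * once allot drops to zero or below, the slot is skipped for the rest of
 * the round and recharged with another scaled quantum, so over time every
 * active flow receives roughly 'quantum' bytes per round.
 */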

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_decrease_qlen(sch, dropped);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	if (addr) {
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		sfq_destroy(sch);
		return -ENOMEM;
	}
	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
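
/* Editor's note: sfq_dump() always emits the v1 option structure, whose 'v0'
 * member (as the field accesses above show) is the original struct
 * tc_sfq_qopt, so the legacy fields are still dumped at the front of the
 * attribute; sfq_change() accepts either size and only reads the v1 fields
 * when the supplied attribute is large enough.
 */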

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");