// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady state burst size, the other
	one, with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10 Mbit ethernet and HZ=100 the minimal allowed B is ~10 Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150 Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/

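/* Worked example of the latency estimate above (illustrative numbers,
 * not taken from this file): with rate R = 125000 bytes/s (1 Mbit/s),
 * bucket depth B = 5000 bytes, no peak rate and a backlog limit
 * L = 30000 bytes,
 *
 *	lat = (L - B)/R = (30000 - 5000)/125000 = 0.2 s = 200 ms
 *
 * A typical way to request such a shaper from user space is via tc(8),
 * e.g. (shown only for orientation, not part of this file):
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 32kb latency 400ms
 */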
struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64		tokens;		/* Current number of B tokens */
	s64		ptokens;	/* Current number of P tokens */
	s64		t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is :
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
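/* Numeric sketch of the conversion above (illustrative values, not taken
 * from this file): with r->rate_bytes_ps = 125000000 (1 Gbit/s) and
 * time_in_ns = 1000000 (1 ms),
 *
 *	len = 1000000 * 125000000 / NSEC_PER_SEC = 125000 bytes
 *
 * For TC_LINKLAYER_ATM the result is further scaled by 48/53, since each
 * 53 byte ATM cell carries only 48 bytes of payload; the configured
 * link-layer overhead is subtracted at the end.
 */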

static void tbf_offload_change(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_REPLACE;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.replace_params.rate = q->rate;
	qopt.replace_params.max_size = q->max_size;
	qopt.replace_params.qstats = &sch->qstats;

	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_DESTROY;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static int tbf_offload_dump(struct Qdisc *sch)
{
	struct tc_tbf_qopt_offload qopt;

	qopt.command = TC_TBF_STATS;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.stats.bstats = &sch->bstats;
	qopt.stats.qstats = &sch->qstats;

	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	skb_list_walk_safe(segs, segs, nskb) {
		skb_mark_not_on_list(segs);
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		len += segs->len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->qstats.backlog += len;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}
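/* Example of the size check above (illustrative numbers): with
 * q->max_size = 1514 bytes, a 45000 byte TSO skb whose on-wire segments
 * each fit in 1514 bytes is handed to tbf_segment() and enqueued as
 * individual packets, whereas a single non-GSO 9000 byte jumbo frame is
 * dropped.
 */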

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}
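/* Token bookkeeping sketch for the dequeue path above (illustrative
 * numbers, not taken from this file): at 125000000 bytes/s a 1500 byte
 * packet costs psched_l2t_ns() = 12000 ns worth of tokens.  With
 * q->buffer set to 10000000 ns (10 ms) and a full bucket, roughly 833
 * such packets can go out back to back before toks turns negative and
 * the watchdog is armed for -toks nanoseconds into the future.
 */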

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->qstats.backlog = 0;
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST]		= { .type = NLA_U32 },
	[TCA_TBF_PBURST]	= { .type = NLA_U32 },
};
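/* Attribute usage, as consumed by tbf_change() below: TCA_TBF_PARMS
 * carries struct tc_tbf_qopt (legacy rate/peakrate specs plus limit,
 * buffer and mtu); TCA_TBF_BURST and TCA_TBF_PBURST are burst sizes in
 * bytes; TCA_TBF_RATE64 and TCA_TBF_PRATE64 carry 64-bit rates in bytes
 * per second for configurations that overflow the 32-bit legacy fields.
 */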

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
					  NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_flush_backlog(q->qdisc);
		qdisc_put(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;

	tbf_offload_change(sch);
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	tbf_offload_destroy(sch);
	qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;
	int err;

	err = tbf_offload_dump(sch);
	if (err)
		return err;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");