sch_tbf.c - net/sched/sch_tbf.c - Linux diff v3.5.6

 
  1/*
  2 * net/sched/sch_tbf.c	Token Bucket Filter queue.
  3 *
  4 *		This program is free software; you can redistribute it and/or
  5 *		modify it under the terms of the GNU General Public License
  6 *		as published by the Free Software Foundation; either version
  7 *		2 of the License, or (at your option) any later version.
  8 *
  9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 10 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 11 *						 original idea by Martin Devera
 12 *
 13 */
 14
 15#include <linux/module.h>
 16#include <linux/types.h>
 17#include <linux/kernel.h>
 18#include <linux/string.h>
 19#include <linux/errno.h>
 20#include <linux/skbuff.h>
 
 21#include <net/netlink.h>
 
 
 22#include <net/pkt_sched.h>
 23
 24
 25/*	Simple Token Bucket Filter.
 26	=======================================
 27
 28	SOURCE.
 29	-------
 30
 31	None.
 32
 33	Description.
 34	------------
 35
 36	A data flow obeys TBF with rate R and depth B, if for any
 37	time interval t_i...t_f the number of transmitted bits
 38	does not exceed B + R*(t_f-t_i).
 39
 40	Packetized version of this definition:
 41	The sequence of packets of sizes s_i served at moments t_i
 42	obeys TBF, if for any i<=k:
 43
 44	s_i+....+s_k <= B + R*(t_k - t_i)
 45
 46	Algorithm.
 47	----------
 48
 49	Let N(t_i) be B/R initially and N(t) grow continuously with time as:
 50
 51	N(t+delta) = min{B/R, N(t) + delta}
 52
 53	If the first packet in queue has length S, it may be
 54	transmitted only at the time t_* when S/R <= N(t_*),
 55	and in this case N(t) jumps:
 56
 57	N(t_* + 0) = N(t_* - 0) - S/R.
 58
 59
 60
 61	Actually, QoS requires two TBF to be applied to a data stream.
 62	One of them controls steady state burst size, another
 63	one with rate P (peak rate) and depth M (equal to link MTU)
 64	limits bursts at a smaller time scale.
 65
 66	It is easy to see that P>R, and B>M. If P is infinity, this double
 67	TBF is equivalent to a single one.
 68
 69	When TBF works in reshaping mode, latency is estimated as:
 70
 71	lat = max ((L-B)/R, (L-M)/P)
 72
 73
 74	NOTES.
 75	------
 76
 77	If TBF throttles, it starts a watchdog timer, which will wake it up
 78	when it is ready to transmit.
 79	Note that the minimal timer resolution is 1/HZ.
 80	If no new packets arrive during this period,
 81	or if the device is not awaken by EOI for some previous packet,
 82	TBF can stop its activity for 1/HZ.
 83
 84
 85	This means, that with depth B, the maximal rate is
 86
 87	R_crit = B*HZ
 88
 89	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
 90
 91	Note that the peak rate TBF is much more tough: with MTU 1500
 92	P_crit = 150Kbytes/sec. So, if you need greater peak
 93	rates, use alpha with HZ=1000 :-)
 94
 95	With classful TBF, limit is just kept for backwards compatibility.
 96	It is passed to the default bfifo qdisc - if the inner qdisc is
 97	changed the limit is not effective anymore.
 98*/
 99
100struct tbf_sched_data {
101/* Parameters */
102	u32		limit;		/* Maximal length of backlog: bytes */
103	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
104	u32		mtu;
105	u32		max_size;
106	struct qdisc_rate_table	*R_tab;
107	struct qdisc_rate_table	*P_tab;
 
 
108
109/* Variables */
110	long	tokens;			/* Current number of B tokens */
111	long	ptokens;		/* Current number of P tokens */
112	psched_time_t	t_c;		/* Time check-point */
113	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
114	struct qdisc_watchdog watchdog;	/* Watchdog timer */
115};
116
117#define L2T(q, L)   qdisc_l2t((q)->R_tab, L)
118#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L)
119
120static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121{
122	struct tbf_sched_data *q = qdisc_priv(sch);
123	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
125	if (qdisc_pkt_len(skb) > q->max_size)
126		return qdisc_reshape_fail(skb, sch);
 
 
 
 
127
128	ret = qdisc_enqueue(skb, q->qdisc);
 
 
 
 
 
 
129	if (ret != NET_XMIT_SUCCESS) {
130		if (net_xmit_drop_count(ret))
131			sch->qstats.drops++;
132		return ret;
133	}
134
 
135	sch->q.qlen++;
136	return NET_XMIT_SUCCESS;
137}
138
139static unsigned int tbf_drop(struct Qdisc *sch)
140{
141	struct tbf_sched_data *q = qdisc_priv(sch);
142	unsigned int len = 0;
143
144	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
145		sch->q.qlen--;
146		sch->qstats.drops++;
147	}
148	return len;
149}
150
151static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
152{
153	struct tbf_sched_data *q = qdisc_priv(sch);
154	struct sk_buff *skb;
155
156	skb = q->qdisc->ops->peek(q->qdisc);
157
158	if (skb) {
159		psched_time_t now;
160		long toks;
161		long ptoks = 0;
162		unsigned int len = qdisc_pkt_len(skb);
163
164		now = psched_get_time();
165		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
166
167		if (q->P_tab) {
168			ptoks = toks + q->ptokens;
169			if (ptoks > (long)q->mtu)
170				ptoks = q->mtu;
171			ptoks -= L2T_P(q, len);
172		}
173		toks += q->tokens;
174		if (toks > (long)q->buffer)
175			toks = q->buffer;
176		toks -= L2T(q, len);
177
178		if ((toks|ptoks) >= 0) {
179			skb = qdisc_dequeue_peeked(q->qdisc);
180			if (unlikely(!skb))
181				return NULL;
182
183			q->t_c = now;
184			q->tokens = toks;
185			q->ptokens = ptoks;
 
186			sch->q.qlen--;
187			qdisc_unthrottled(sch);
188			qdisc_bstats_update(sch, skb);
189			return skb;
190		}
191
192		qdisc_watchdog_schedule(&q->watchdog,
193					now + max_t(long, -toks, -ptoks));
194
195		/* Maybe we have a shorter packet in the queue,
196		   which can be sent now. It sounds cool,
197		   but, however, this is wrong in principle.
198		   We MUST NOT reorder packets under these circumstances.
199
200		   Really, if we split the flow into independent
201		   subflows, it would be a very good solution.
202		   This is the main idea of all FQ algorithms
203		   (cf. CSZ, HPFQ, HFSC)
204		 */
205
206		sch->qstats.overlimits++;
207	}
208	return NULL;
209}
210
211static void tbf_reset(struct Qdisc *sch)
212{
213	struct tbf_sched_data *q = qdisc_priv(sch);
214
215	qdisc_reset(q->qdisc);
216	sch->q.qlen = 0;
217	q->t_c = psched_get_time();
218	q->tokens = q->buffer;
219	q->ptokens = q->mtu;
220	qdisc_watchdog_cancel(&q->watchdog);
221}
222
223static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
224	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
225	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
226	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 
 
 
 
227};
228
229static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 
230{
231	int err;
232	struct tbf_sched_data *q = qdisc_priv(sch);
233	struct nlattr *tb[TCA_TBF_PTAB + 1];
234	struct tc_tbf_qopt *qopt;
235	struct qdisc_rate_table *rtab = NULL;
236	struct qdisc_rate_table *ptab = NULL;
237	struct Qdisc *child = NULL;
238	int max_size, n;
 
 
 
 
 
239
240	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
 
241	if (err < 0)
242		return err;
243
244	err = -EINVAL;
245	if (tb[TCA_TBF_PARMS] == NULL)
246		goto done;
247
248	qopt = nla_data(tb[TCA_TBF_PARMS]);
249	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
250	if (rtab == NULL)
251		goto done;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
253	if (qopt->peakrate.rate) {
254		if (qopt->peakrate.rate > qopt->rate.rate)
255			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
256		if (ptab == NULL)
 
 
 
 
257			goto done;
258	}
259
260	for (n = 0; n < 256; n++)
261		if (rtab->data[n] > qopt->buffer)
262			break;
263	max_size = (n << qopt->rate.cell_log) - 1;
264	if (ptab) {
265		int size;
266
267		for (n = 0; n < 256; n++)
268			if (ptab->data[n] > qopt->mtu)
269				break;
270		size = (n << qopt->peakrate.cell_log) - 1;
271		if (size < max_size)
272			max_size = size;
273	}
274	if (max_size < 0)
 
 
 
 
 
 
 
275		goto done;
 
276
277	if (q->qdisc != &noop_qdisc) {
278		err = fifo_set_limit(q->qdisc, qopt->limit);
279		if (err)
280			goto done;
281	} else if (qopt->limit > 0) {
282		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
 
283		if (IS_ERR(child)) {
284			err = PTR_ERR(child);
285			goto done;
286		}
 
 
 
287	}
288
289	sch_tree_lock(sch);
290	if (child) {
291		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
292		qdisc_destroy(q->qdisc);
293		q->qdisc = child;
294	}
295	q->limit = qopt->limit;
296	q->mtu = qopt->mtu;
 
 
 
297	q->max_size = max_size;
298	q->buffer = qopt->buffer;
 
 
 
299	q->tokens = q->buffer;
300	q->ptokens = q->mtu;
301
302	swap(q->R_tab, rtab);
303	swap(q->P_tab, ptab);
304
305	sch_tree_unlock(sch);
 
306	err = 0;
 
 
307done:
308	if (rtab)
309		qdisc_put_rtab(rtab);
310	if (ptab)
311		qdisc_put_rtab(ptab);
312	return err;
313}
314
315static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
 
316{
317	struct tbf_sched_data *q = qdisc_priv(sch);
318
319	if (opt == NULL)
320		return -EINVAL;
321
322	q->t_c = psched_get_time();
323	qdisc_watchdog_init(&q->watchdog, sch);
324	q->qdisc = &noop_qdisc;
325
326	return tbf_change(sch, opt);
 
 
 
 
 
327}
328
329static void tbf_destroy(struct Qdisc *sch)
330{
331	struct tbf_sched_data *q = qdisc_priv(sch);
332
333	qdisc_watchdog_cancel(&q->watchdog);
334
335	if (q->P_tab)
336		qdisc_put_rtab(q->P_tab);
337	if (q->R_tab)
338		qdisc_put_rtab(q->R_tab);
339
340	qdisc_destroy(q->qdisc);
341}
342
343static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
344{
345	struct tbf_sched_data *q = qdisc_priv(sch);
346	struct nlattr *nest;
347	struct tc_tbf_qopt opt;
 
348
349	sch->qstats.backlog = q->qdisc->qstats.backlog;
350	nest = nla_nest_start(skb, TCA_OPTIONS);
 
 
 
351	if (nest == NULL)
352		goto nla_put_failure;
353
354	opt.limit = q->limit;
355	opt.rate = q->R_tab->rate;
356	if (q->P_tab)
357		opt.peakrate = q->P_tab->rate;
358	else
359		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
360	opt.mtu = q->mtu;
361	opt.buffer = q->buffer;
362	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
363		goto nla_put_failure;
 
 
 
 
 
 
 
 
 
364
365	nla_nest_end(skb, nest);
366	return skb->len;
367
368nla_put_failure:
369	nla_nest_cancel(skb, nest);
370	return -1;
371}
372
373static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
374			  struct sk_buff *skb, struct tcmsg *tcm)
375{
376	struct tbf_sched_data *q = qdisc_priv(sch);
377
378	tcm->tcm_handle |= TC_H_MIN(1);
379	tcm->tcm_info = q->qdisc->handle;
380
381	return 0;
382}
383
384static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
385		     struct Qdisc **old)
386{
387	struct tbf_sched_data *q = qdisc_priv(sch);
388
389	if (new == NULL)
390		new = &noop_qdisc;
391
392	sch_tree_lock(sch);
393	*old = q->qdisc;
394	q->qdisc = new;
395	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
396	qdisc_reset(*old);
397	sch_tree_unlock(sch);
398
 
399	return 0;
400}
401
402static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
403{
404	struct tbf_sched_data *q = qdisc_priv(sch);
405	return q->qdisc;
406}
407
408static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
409{
410	return 1;
411}
412
413static void tbf_put(struct Qdisc *sch, unsigned long arg)
414{
415}
416
417static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
418{
419	if (!walker->stop) {
420		if (walker->count >= walker->skip)
421			if (walker->fn(sch, 1, walker) < 0) {
422				walker->stop = 1;
423				return;
424			}
425		walker->count++;
426	}
427}
428
429static const struct Qdisc_class_ops tbf_class_ops = {
430	.graft		=	tbf_graft,
431	.leaf		=	tbf_leaf,
432	.get		=	tbf_get,
433	.put		=	tbf_put,
434	.walk		=	tbf_walk,
435	.dump		=	tbf_dump_class,
436};
437
438static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
439	.next		=	NULL,
440	.cl_ops		=	&tbf_class_ops,
441	.id		=	"tbf",
442	.priv_size	=	sizeof(struct tbf_sched_data),
443	.enqueue	=	tbf_enqueue,
444	.dequeue	=	tbf_dequeue,
445	.peek		=	qdisc_peek_dequeued,
446	.drop		=	tbf_drop,
447	.init		=	tbf_init,
448	.reset		=	tbf_reset,
449	.destroy	=	tbf_destroy,
450	.change		=	tbf_change,
451	.dump		=	tbf_dump,
452	.owner		=	THIS_MODULE,
453};
 
454
455static int __init tbf_module_init(void)
456{
457	return register_qdisc(&tbf_qdisc_ops);
458}
459
460static void __exit tbf_module_exit(void)
461{
462	unregister_qdisc(&tbf_qdisc_ops);
463}
464module_init(tbf_module_init)
465module_exit(tbf_module_exit)
466MODULE_LICENSE("GPL");

  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/*
  3 * net/sched/sch_tbf.c	Token Bucket Filter queue.
  4 *
 
 
 
 
 
  5 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  6 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
  7 *						 original idea by Martin Devera
 
  8 */
  9
 10#include <linux/module.h>
 11#include <linux/types.h>
 12#include <linux/kernel.h>
 13#include <linux/string.h>
 14#include <linux/errno.h>
 15#include <linux/skbuff.h>
 16#include <net/gso.h>
 17#include <net/netlink.h>
 18#include <net/sch_generic.h>
 19#include <net/pkt_cls.h>
 20#include <net/pkt_sched.h>
 21
 22
 23/*	Simple Token Bucket Filter.
 24	=======================================
 25
 26	SOURCE.
 27	-------
 28
 29	None.
 30
 31	Description.
 32	------------
 33
 34	A data flow obeys TBF with rate R and depth B, if for any
 35	time interval t_i...t_f the number of transmitted bits
 36	does not exceed B + R*(t_f-t_i).
 37
 38	Packetized version of this definition:
 39	The sequence of packets of sizes s_i served at moments t_i
 40	obeys TBF, if for any i<=k:
 41
 42	s_i+....+s_k <= B + R*(t_k - t_i)
 43
 44	Algorithm.
 45	----------
 46
 47	Let N(t_i) be B/R initially and N(t) grow continuously with time as:
 48
 49	N(t+delta) = min{B/R, N(t) + delta}
 50
 51	If the first packet in queue has length S, it may be
 52	transmitted only at the time t_* when S/R <= N(t_*),
 53	and in this case N(t) jumps:
 54
 55	N(t_* + 0) = N(t_* - 0) - S/R.
 56
 57
 58
 59	Actually, QoS requires two TBF to be applied to a data stream.
 60	One of them controls steady state burst size, another
 61	one with rate P (peak rate) and depth M (equal to link MTU)
 62	limits bursts at a smaller time scale.
 63
 64	It is easy to see that P>R, and B>M. If P is infinity, this double
 65	TBF is equivalent to a single one.
 66
 67	When TBF works in reshaping mode, latency is estimated as:
 68
 69	lat = max ((L-B)/R, (L-M)/P)
 70
 71
 72	NOTES.
 73	------
 74
 75	If TBF throttles, it starts a watchdog timer, which will wake it up
 76	when it is ready to transmit.
 77	Note that the minimal timer resolution is 1/HZ.
 78	If no new packets arrive during this period,
 79	or if the device is not awaken by EOI for some previous packet,
 80	TBF can stop its activity for 1/HZ.
 81
 82
 83	This means, that with depth B, the maximal rate is
 84
 85	R_crit = B*HZ
 86
 87	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
 88
 89	Note that the peak rate TBF is much more tough: with MTU 1500
 90	P_crit = 150Kbytes/sec. So, if you need greater peak
 91	rates, use alpha with HZ=1000 :-)
 92
 93	With classful TBF, limit is just kept for backwards compatibility.
 94	It is passed to the default bfifo qdisc - if the inner qdisc is
 95	changed the limit is not effective anymore.
 96*/
 97
 98struct tbf_sched_data {
 99/* Parameters */
100	u32		limit;		/* Maximal length of backlog: bytes */
 
 
101	u32		max_size;
102	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
103	s64		mtu;
104	struct psched_ratecfg rate;
105	struct psched_ratecfg peak;
106
107/* Variables */
108	s64	tokens;			/* Current number of B tokens */
109	s64	ptokens;		/* Current number of P tokens */
110	s64	t_c;			/* Time check-point */
111	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
112	struct qdisc_watchdog watchdog;	/* Watchdog timer */
113};
114
 
 
115
116/* Time to Length, convert time in ns to length in bytes
117 * to determinate how many bytes can be sent in given time.
118 */
119static u64 psched_ns_t2l(const struct psched_ratecfg *r,
120			 u64 time_in_ns)
121{
122	/* The formula is :
123	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
124	 */
125	u64 len = time_in_ns * r->rate_bytes_ps;
126
127	do_div(len, NSEC_PER_SEC);
128
129	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
130		do_div(len, 53);
131		len = len * 48;
132	}
133
134	if (len > r->overhead)
135		len -= r->overhead;
136	else
137		len = 0;
138
139	return len;
140}
141
142static void tbf_offload_change(struct Qdisc *sch)
143{
144	struct tbf_sched_data *q = qdisc_priv(sch);
145	struct net_device *dev = qdisc_dev(sch);
146	struct tc_tbf_qopt_offload qopt;
147
148	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
149		return;
150
151	qopt.command = TC_TBF_REPLACE;
152	qopt.handle = sch->handle;
153	qopt.parent = sch->parent;
154	qopt.replace_params.rate = q->rate;
155	qopt.replace_params.max_size = q->max_size;
156	qopt.replace_params.qstats = &sch->qstats;
157
158	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
159}
160
161static void tbf_offload_destroy(struct Qdisc *sch)
162{
163	struct net_device *dev = qdisc_dev(sch);
164	struct tc_tbf_qopt_offload qopt;
165
166	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
167		return;
168
169	qopt.command = TC_TBF_DESTROY;
170	qopt.handle = sch->handle;
171	qopt.parent = sch->parent;
172	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
173}
174
175static int tbf_offload_dump(struct Qdisc *sch)
176{
177	struct tc_tbf_qopt_offload qopt;
178
179	qopt.command = TC_TBF_STATS;
180	qopt.handle = sch->handle;
181	qopt.parent = sch->parent;
182	qopt.stats.bstats = &sch->bstats;
183	qopt.stats.qstats = &sch->qstats;
184
185	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
186}
187
188static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
189			      struct Qdisc *old, struct netlink_ext_ack *extack)
190{
191	struct tc_tbf_qopt_offload graft_offload = {
192		.handle		= sch->handle,
193		.parent		= sch->parent,
194		.child_handle	= new->handle,
195		.command	= TC_TBF_GRAFT,
196	};
197
198	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
199				   TC_SETUP_QDISC_TBF, &graft_offload, extack);
200}
201
202/* GSO packet is too big, segment it so that tbf can transmit
203 * each segment in time
204 */
205static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
206		       struct sk_buff **to_free)
207{
208	struct tbf_sched_data *q = qdisc_priv(sch);
209	struct sk_buff *segs, *nskb;
210	netdev_features_t features = netif_skb_features(skb);
211	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
212	int ret, nb;
213
214	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
215
216	if (IS_ERR_OR_NULL(segs))
217		return qdisc_drop(skb, sch, to_free);
218
219	nb = 0;
220	skb_list_walk_safe(segs, segs, nskb) {
221		skb_mark_not_on_list(segs);
222		qdisc_skb_cb(segs)->pkt_len = segs->len;
223		len += segs->len;
224		ret = qdisc_enqueue(segs, q->qdisc, to_free);
225		if (ret != NET_XMIT_SUCCESS) {
226			if (net_xmit_drop_count(ret))
227				qdisc_qstats_drop(sch);
228		} else {
229			nb++;
230		}
231	}
232	sch->q.qlen += nb;
233	if (nb > 1)
234		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
235	consume_skb(skb);
236	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
237}
238
239static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
240		       struct sk_buff **to_free)
241{
242	struct tbf_sched_data *q = qdisc_priv(sch);
243	unsigned int len = qdisc_pkt_len(skb);
244	int ret;
245
246	if (qdisc_pkt_len(skb) > q->max_size) {
247		if (skb_is_gso(skb) &&
248		    skb_gso_validate_mac_len(skb, q->max_size))
249			return tbf_segment(skb, sch, to_free);
250		return qdisc_drop(skb, sch, to_free);
251	}
252	ret = qdisc_enqueue(skb, q->qdisc, to_free);
253	if (ret != NET_XMIT_SUCCESS) {
254		if (net_xmit_drop_count(ret))
255			qdisc_qstats_drop(sch);
256		return ret;
257	}
258
259	sch->qstats.backlog += len;
260	sch->q.qlen++;
261	return NET_XMIT_SUCCESS;
262}
263
264static bool tbf_peak_present(const struct tbf_sched_data *q)
265{
266	return q->peak.rate_bytes_ps;
 
 
 
 
 
 
 
267}
268
269static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
270{
271	struct tbf_sched_data *q = qdisc_priv(sch);
272	struct sk_buff *skb;
273
274	skb = q->qdisc->ops->peek(q->qdisc);
275
276	if (skb) {
277		s64 now;
278		s64 toks;
279		s64 ptoks = 0;
280		unsigned int len = qdisc_pkt_len(skb);
281
282		now = ktime_get_ns();
283		toks = min_t(s64, now - q->t_c, q->buffer);
284
285		if (tbf_peak_present(q)) {
286			ptoks = toks + q->ptokens;
287			if (ptoks > q->mtu)
288				ptoks = q->mtu;
289			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
290		}
291		toks += q->tokens;
292		if (toks > q->buffer)
293			toks = q->buffer;
294		toks -= (s64) psched_l2t_ns(&q->rate, len);
295
296		if ((toks|ptoks) >= 0) {
297			skb = qdisc_dequeue_peeked(q->qdisc);
298			if (unlikely(!skb))
299				return NULL;
300
301			q->t_c = now;
302			q->tokens = toks;
303			q->ptokens = ptoks;
304			qdisc_qstats_backlog_dec(sch, skb);
305			sch->q.qlen--;
 
306			qdisc_bstats_update(sch, skb);
307			return skb;
308		}
309
310		qdisc_watchdog_schedule_ns(&q->watchdog,
311					   now + max_t(long, -toks, -ptoks));
312
313		/* Maybe we have a shorter packet in the queue,
314		   which can be sent now. It sounds cool,
315		   but, however, this is wrong in principle.
316		   We MUST NOT reorder packets under these circumstances.
317
318		   Really, if we split the flow into independent
319		   subflows, it would be a very good solution.
320		   This is the main idea of all FQ algorithms
321		   (cf. CSZ, HPFQ, HFSC)
322		 */
323
324		qdisc_qstats_overlimit(sch);
325	}
326	return NULL;
327}
328
329static void tbf_reset(struct Qdisc *sch)
330{
331	struct tbf_sched_data *q = qdisc_priv(sch);
332
333	qdisc_reset(q->qdisc);
334	q->t_c = ktime_get_ns();
 
335	q->tokens = q->buffer;
336	q->ptokens = q->mtu;
337	qdisc_watchdog_cancel(&q->watchdog);
338}
339
340static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
341	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
342	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
343	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
344	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
345	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
346	[TCA_TBF_BURST] = { .type = NLA_U32 },
347	[TCA_TBF_PBURST] = { .type = NLA_U32 },
348};
349
350static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
351		      struct netlink_ext_ack *extack)
352{
353	int err;
354	struct tbf_sched_data *q = qdisc_priv(sch);
355	struct nlattr *tb[TCA_TBF_MAX + 1];
356	struct tc_tbf_qopt *qopt;
 
 
357	struct Qdisc *child = NULL;
358	struct Qdisc *old = NULL;
359	struct psched_ratecfg rate;
360	struct psched_ratecfg peak;
361	u64 max_size;
362	s64 buffer, mtu;
363	u64 rate64 = 0, prate64 = 0;
364
365	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
366					  NULL);
367	if (err < 0)
368		return err;
369
370	err = -EINVAL;
371	if (tb[TCA_TBF_PARMS] == NULL)
372		goto done;
373
374	qopt = nla_data(tb[TCA_TBF_PARMS]);
375	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
376		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
377					      tb[TCA_TBF_RTAB],
378					      NULL));
379
380	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
381			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
382						      tb[TCA_TBF_PTAB],
383						      NULL));
384
385	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
386	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
387
388	if (tb[TCA_TBF_RATE64])
389		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
390	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
391
392	if (tb[TCA_TBF_BURST]) {
393		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
394		buffer = psched_l2t_ns(&rate, max_size);
395	} else {
396		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
397	}
398
399	if (qopt->peakrate.rate) {
400		if (tb[TCA_TBF_PRATE64])
401			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
402		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
403		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
404			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
405					peak.rate_bytes_ps, rate.rate_bytes_ps);
406			err = -EINVAL;
407			goto done;
408		}
409
410		if (tb[TCA_TBF_PBURST]) {
411			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
412			max_size = min_t(u32, max_size, pburst);
413			mtu = psched_l2t_ns(&peak, pburst);
414		} else {
415			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
416		}
417	} else {
418		memset(&peak, 0, sizeof(peak));
 
 
 
 
419	}
420
421	if (max_size < psched_mtu(qdisc_dev(sch)))
422		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
423				    max_size, qdisc_dev(sch)->name,
424				    psched_mtu(qdisc_dev(sch)));
425
426	if (!max_size) {
427		err = -EINVAL;
428		goto done;
429	}
430
431	if (q->qdisc != &noop_qdisc) {
432		err = fifo_set_limit(q->qdisc, qopt->limit);
433		if (err)
434			goto done;
435	} else if (qopt->limit > 0) {
436		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
437					 extack);
438		if (IS_ERR(child)) {
439			err = PTR_ERR(child);
440			goto done;
441		}
442
443		/* child is fifo, no need to check for noop_qdisc */
444		qdisc_hash_add(child, true);
445	}
446
447	sch_tree_lock(sch);
448	if (child) {
449		qdisc_tree_flush_backlog(q->qdisc);
450		old = q->qdisc;
451		q->qdisc = child;
452	}
453	q->limit = qopt->limit;
454	if (tb[TCA_TBF_PBURST])
455		q->mtu = mtu;
456	else
457		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
458	q->max_size = max_size;
459	if (tb[TCA_TBF_BURST])
460		q->buffer = buffer;
461	else
462		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
463	q->tokens = q->buffer;
464	q->ptokens = q->mtu;
465
466	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
467	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
468
469	sch_tree_unlock(sch);
470	qdisc_put(old);
471	err = 0;
472
473	tbf_offload_change(sch);
474done:
 
 
 
 
475	return err;
476}
477
478static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
479		    struct netlink_ext_ack *extack)
480{
481	struct tbf_sched_data *q = qdisc_priv(sch);
482
 
 
 
 
483	qdisc_watchdog_init(&q->watchdog, sch);
484	q->qdisc = &noop_qdisc;
485
486	if (!opt)
487		return -EINVAL;
488
489	q->t_c = ktime_get_ns();
490
491	return tbf_change(sch, opt, extack);
492}
493
494static void tbf_destroy(struct Qdisc *sch)
495{
496	struct tbf_sched_data *q = qdisc_priv(sch);
497
498	qdisc_watchdog_cancel(&q->watchdog);
499	tbf_offload_destroy(sch);
500	qdisc_put(q->qdisc);
 
 
 
 
 
501}
502
503static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
504{
505	struct tbf_sched_data *q = qdisc_priv(sch);
506	struct nlattr *nest;
507	struct tc_tbf_qopt opt;
508	int err;
509
510	err = tbf_offload_dump(sch);
511	if (err)
512		return err;
513
514	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
515	if (nest == NULL)
516		goto nla_put_failure;
517
518	opt.limit = q->limit;
519	psched_ratecfg_getrate(&opt.rate, &q->rate);
520	if (tbf_peak_present(q))
521		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
522	else
523		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
524	opt.mtu = PSCHED_NS2TICKS(q->mtu);
525	opt.buffer = PSCHED_NS2TICKS(q->buffer);
526	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
527		goto nla_put_failure;
528	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
529	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
530			      TCA_TBF_PAD))
531		goto nla_put_failure;
532	if (tbf_peak_present(q) &&
533	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
534	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
535			      TCA_TBF_PAD))
536		goto nla_put_failure;
537
538	return nla_nest_end(skb, nest);
 
539
540nla_put_failure:
541	nla_nest_cancel(skb, nest);
542	return -1;
543}
544
545static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
546			  struct sk_buff *skb, struct tcmsg *tcm)
547{
548	struct tbf_sched_data *q = qdisc_priv(sch);
549
550	tcm->tcm_handle |= TC_H_MIN(1);
551	tcm->tcm_info = q->qdisc->handle;
552
553	return 0;
554}
555
556static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
557		     struct Qdisc **old, struct netlink_ext_ack *extack)
558{
559	struct tbf_sched_data *q = qdisc_priv(sch);
560
561	if (new == NULL)
562		new = &noop_qdisc;
563
564	*old = qdisc_replace(sch, new, &q->qdisc);
 
 
 
 
 
565
566	tbf_offload_graft(sch, new, *old, extack);
567	return 0;
568}
569
570static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
571{
572	struct tbf_sched_data *q = qdisc_priv(sch);
573	return q->qdisc;
574}
575
576static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
577{
578	return 1;
579}
580
 
 
 
 
581static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
582{
583	if (!walker->stop) {
584		tc_qdisc_stats_dump(sch, 1, walker);
 
 
 
 
 
585	}
586}
587
588static const struct Qdisc_class_ops tbf_class_ops = {
589	.graft		=	tbf_graft,
590	.leaf		=	tbf_leaf,
591	.find		=	tbf_find,
 
592	.walk		=	tbf_walk,
593	.dump		=	tbf_dump_class,
594};
595
596static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
597	.next		=	NULL,
598	.cl_ops		=	&tbf_class_ops,
599	.id		=	"tbf",
600	.priv_size	=	sizeof(struct tbf_sched_data),
601	.enqueue	=	tbf_enqueue,
602	.dequeue	=	tbf_dequeue,
603	.peek		=	qdisc_peek_dequeued,
 
604	.init		=	tbf_init,
605	.reset		=	tbf_reset,
606	.destroy	=	tbf_destroy,
607	.change		=	tbf_change,
608	.dump		=	tbf_dump,
609	.owner		=	THIS_MODULE,
610};
611MODULE_ALIAS_NET_SCH("tbf");
612
613static int __init tbf_module_init(void)
614{
615	return register_qdisc(&tbf_qdisc_ops);
616}
617
618static void __exit tbf_module_exit(void)
619{
620	unregister_qdisc(&tbf_qdisc_ops);
621}
622module_init(tbf_module_init)
623module_exit(tbf_module_exit)
624MODULE_LICENSE("GPL");
625MODULE_DESCRIPTION("Token Bucket Filter qdisc");