1/*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/skbuff.h>
20#include <net/netlink.h>
21#include <net/pkt_sched.h>
22
23
24/* Class-Based Queueing (CBQ) algorithm.
25 =======================================
26
27 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28 Management Models for Packet Networks",
29 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30
31 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32
33 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34 Parameters", 1996
35
36 [4] Sally Floyd and Michael Speer, "Experimental Results
37 for Class-Based Queueing", 1998, not published.
38
39 -----------------------------------------------------------------------
40
41 Algorithm skeleton was taken from NS simulator cbq.cc.
42 If someone wants to check this code against the LBL version,
43 he should take into account that ONLY the skeleton was borrowed,
44 the implementation is different. Particularly:
45
46 --- The WRR algorithm is different. Our version looks more
47 reasonable (I hope) and works when quanta are allowed to be
48 less than MTU, which is always the case when real time classes
49 have small rates. Note that the statement of [3] is
50 incomplete: delay may actually be estimated even if the class
51 per-round allotment is less than MTU. Namely, if the per-round
52 allotment is W*r_i, and r_1+...+r_k = r < 1
53
54 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55
56 In the worst case we have IntServ estimate with D = W*r+k*MTU
57 and C = MTU*r. The proof (if correct at all) is trivial.
58
59
60 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61 interpret some places, which look like wrong translations
62 from NS. Anyone is advised to find these differences
63 and explain to me, why I am wrong 8).
64
65 --- Linux has no EOI event, so we cannot estimate true class
66 idle time. The workaround is to treat the next dequeue event
67 as a sign that the previous packet has finished. This is wrong because of
68 internal device queueing, but on a permanently loaded link it is true.
69 Moreover, combined with clock integrator, this scheme looks
70 very close to an ideal solution. */
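/* A worked instance of the delay bound above -- the numbers are chosen
 * purely for illustration and do not come from the references: take
 * B = 10 Mbit/s, MTU = 1500 bytes, k = 2 real-time classes with
 * r_1 = r_2 = 0.25 (so r = 0.5) and W = 2000 bytes, giving a per-round
 * allotment W*r_1 = 500 bytes, i.e. well below the MTU.  Then
 *
 *	delay_1 <= ([1500/500]*2000*0.5 + 2000*0.5 + 2*1500)/B
 *		 = (3000 + 1000 + 3000) bytes / 10 Mbit/s  ~  5.6 ms
 *
 * so a finite delay estimate exists even with sub-MTU quanta, which is
 * the point made above about the statement in [3] being incomplete.
 */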
71
72struct cbq_sched_data;
73
74
75struct cbq_class {
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
78
79/* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83 unsigned char ovl_strategy;
84#ifdef CONFIG_NET_CLS_ACT
85 unsigned char police;
86#endif
87
88 u32 defmap;
89
90 /* Link-sharing scheduler parameters */
91 long maxidle; /* Class parameters: see below. */
92 long offtime;
93 long minidle;
94 u32 avpkt;
95 struct qdisc_rate_table *R_tab;
96
97 /* Overlimit strategy parameters */
98 void (*overlimit)(struct cbq_class *cl);
99 psched_tdiff_t penalty;
100
101 /* General scheduler (WRR) parameters */
102 long allot;
103 long quantum; /* Allotment per WRR round */
104 long weight; /* Relative allotment: see below */
105
106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107 struct cbq_class *split; /* Ptr to split node */
108 struct cbq_class *share; /* Ptr to LS parent in the class tree */
109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111 parent otherwise */
112 struct cbq_class *sibling; /* Sibling chain */
113 struct cbq_class *children; /* Pointer to children chain */
114
115 struct Qdisc *q; /* Elementary queueing discipline */
116
117
118/* Variables */
119 unsigned char cpriority; /* Effective priority */
120 unsigned char delayed;
121 unsigned char level; /* level of the class in hierarchy:
122 0 for leaf classes, and maximal
123 level of children + 1 for nodes.
124 */
125
126 psched_time_t last; /* Last end of service */
127 psched_time_t undertime;
128 long avgidle;
129 long deficit; /* Saved deficit for WRR */
130 psched_time_t penalized;
131 struct gnet_stats_basic_packed bstats;
132 struct gnet_stats_queue qstats;
133 struct gnet_stats_rate_est rate_est;
134 struct tc_cbq_xstats xstats;
135
136 struct tcf_proto *filter_list;
137
138 int refcnt;
139 int filters;
140
141 struct cbq_class *defaults[TC_PRIO_MAX + 1];
142};
143
144struct cbq_sched_data {
145 struct Qdisc_class_hash clhash; /* Hash table of all classes */
146 int nclasses[TC_CBQ_MAXPRIO + 1];
147 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
148
149 struct cbq_class link;
150
151 unsigned int activemask;
152 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
153 with backlog */
154
155#ifdef CONFIG_NET_CLS_ACT
156 struct cbq_class *rx_class;
157#endif
158 struct cbq_class *tx_class;
159 struct cbq_class *tx_borrowed;
160 int tx_len;
161 psched_time_t now; /* Cached timestamp */
162 psched_time_t now_rt; /* Cached real time */
163 unsigned int pmask;
164
165 struct hrtimer delay_timer;
166 struct qdisc_watchdog watchdog; /* Watchdog timer,
167 started when CBQ has
168 backlog, but cannot
169 transmit just now */
170 psched_tdiff_t wd_expires;
171 int toplevel;
172 u32 hgenerator;
173};
174
175
176#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
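/* L2T ("length to time") maps a packet length to the transmission time,
 * in psched ticks, given by the per-class rate table.  The table itself
 * is computed by user space and handed to the kernel via TCA_CBQ_RTAB
 * (see cbq_policy below), so only a table lookup happens on the fast path.
 */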
177
178static inline struct cbq_class *
179cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
180{
181 struct Qdisc_class_common *clc;
182
183 clc = qdisc_class_find(&q->clhash, classid);
184 if (clc == NULL)
185 return NULL;
186 return container_of(clc, struct cbq_class, common);
187}
188
189#ifdef CONFIG_NET_CLS_ACT
190
191static struct cbq_class *
192cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
193{
194 struct cbq_class *cl;
195
196 for (cl = this->tparent; cl; cl = cl->tparent) {
197 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
198
199 if (new != NULL && new != this)
200 return new;
201 }
202 return NULL;
203}
204
205#endif
206
207/* Classify packet. The procedure is pretty complicated, but
208 * it allows us to combine link sharing and priority scheduling
209 * transparently.
210 *
211 * Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
212 * so that they resolve to split nodes. Then packets are classified
213 * by logical priority, or a more specific classifier may be attached
214 * to the split node.
215 */
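/* Illustrative walk-through (class IDs are made up for the example):
 * with qdisc handle 1: and skb->priority set to 1:2, step 1 below finds
 * class 1:2 directly and no filters run at all.  Otherwise the loop
 * starts at the root class and, at every split node it reaches, consults
 * that node's filter list and falls back to its defmap / best-effort
 * default before descending, until a leaf (level 0) class is returned.
 */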
216
217static struct cbq_class *
218cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
219{
220 struct cbq_sched_data *q = qdisc_priv(sch);
221 struct cbq_class *head = &q->link;
222 struct cbq_class **defmap;
223 struct cbq_class *cl = NULL;
224 u32 prio = skb->priority;
225 struct tcf_result res;
226
227 /*
228 * Step 1. If skb->priority points to one of our classes, use it.
229 */
230 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
231 (cl = cbq_class_lookup(q, prio)) != NULL)
232 return cl;
233
234 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
235 for (;;) {
236 int result = 0;
237 defmap = head->defaults;
238
239 /*
240 * Step 2+n. Apply classifier.
241 */
242 if (!head->filter_list ||
243 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
244 goto fallback;
245
246 cl = (void *)res.class;
247 if (!cl) {
248 if (TC_H_MAJ(res.classid))
249 cl = cbq_class_lookup(q, res.classid);
250 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
251 cl = defmap[TC_PRIO_BESTEFFORT];
252
253 if (cl == NULL || cl->level >= head->level)
254 goto fallback;
255 }
256
257#ifdef CONFIG_NET_CLS_ACT
258 switch (result) {
259 case TC_ACT_QUEUED:
260 case TC_ACT_STOLEN:
261 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
262 case TC_ACT_SHOT:
263 return NULL;
264 case TC_ACT_RECLASSIFY:
265 return cbq_reclassify(skb, cl);
266 }
267#endif
268 if (cl->level == 0)
269 return cl;
270
271 /*
272 * Step 3+n. If classifier selected a link sharing class,
273 * apply the agency-specific classifier.
274 * Repeat this procedure until we hit a leaf node.
275 */
276 head = cl;
277 }
278
279fallback:
280 cl = head;
281
282 /*
283 * Step 4. No success...
284 */
285 if (TC_H_MAJ(prio) == 0 &&
286 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
287 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
288 return head;
289
290 return cl;
291}
292
293/*
294 * A packet has just been enqueued into an empty class.
295 * cbq_activate_class adds the class to the tail of the active class list
296 * of its priority band.
297 */
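/* The active list of a band is a circular, singly linked list threaded
 * through ->next_alive, with q->active[prio] pointing at the tail (the
 * most recently activated class), so the head is always
 * q->active[prio]->next_alive and tail insertion below is O(1).
 * For example (names illustrative), with classes A then B activated:
 *
 *	q->active[prio] --> B --> A --> B --> ...
 */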
298
299static inline void cbq_activate_class(struct cbq_class *cl)
300{
301 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
302 int prio = cl->cpriority;
303 struct cbq_class *cl_tail;
304
305 cl_tail = q->active[prio];
306 q->active[prio] = cl;
307
308 if (cl_tail != NULL) {
309 cl->next_alive = cl_tail->next_alive;
310 cl_tail->next_alive = cl;
311 } else {
312 cl->next_alive = cl;
313 q->activemask |= (1<<prio);
314 }
315}
316
317/*
318 * Unlink class from active chain.
319 * Note that this same procedure is done directly in cbq_dequeue*
320 * during round-robin procedure.
321 */
322
323static void cbq_deactivate_class(struct cbq_class *this)
324{
325 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
326 int prio = this->cpriority;
327 struct cbq_class *cl;
328 struct cbq_class *cl_prev = q->active[prio];
329
330 do {
331 cl = cl_prev->next_alive;
332 if (cl == this) {
333 cl_prev->next_alive = cl->next_alive;
334 cl->next_alive = NULL;
335
336 if (cl == q->active[prio]) {
337 q->active[prio] = cl_prev;
338 if (cl == q->active[prio]) {
339 q->active[prio] = NULL;
340 q->activemask &= ~(1<<prio);
341 return;
342 }
343 }
344 return;
345 }
346 } while ((cl_prev = cl) != q->active[prio]);
347}
348
349static void
350cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
351{
352 int toplevel = q->toplevel;
353
354 if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
355 psched_time_t now;
356 psched_tdiff_t incr;
357
358 now = psched_get_time();
359 incr = now - q->now_rt;
360 now = q->now + incr;
361
362 do {
363 if (cl->undertime < now) {
364 q->toplevel = cl->level;
365 return;
366 }
367 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
368 }
369}
370
371static int
372cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
373{
374 struct cbq_sched_data *q = qdisc_priv(sch);
375 int uninitialized_var(ret);
376 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
377
378#ifdef CONFIG_NET_CLS_ACT
379 q->rx_class = cl;
380#endif
381 if (cl == NULL) {
382 if (ret & __NET_XMIT_BYPASS)
383 sch->qstats.drops++;
384 kfree_skb(skb);
385 return ret;
386 }
387
388#ifdef CONFIG_NET_CLS_ACT
389 cl->q->__parent = sch;
390#endif
391 ret = qdisc_enqueue(skb, cl->q);
392 if (ret == NET_XMIT_SUCCESS) {
393 sch->q.qlen++;
394 cbq_mark_toplevel(q, cl);
395 if (!cl->next_alive)
396 cbq_activate_class(cl);
397 return ret;
398 }
399
400 if (net_xmit_drop_count(ret)) {
401 sch->qstats.drops++;
402 cbq_mark_toplevel(q, cl);
403 cl->qstats.drops++;
404 }
405 return ret;
406}
407
408/* Overlimit actions */
409
410/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
411
412static void cbq_ovl_classic(struct cbq_class *cl)
413{
414 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
415 psched_tdiff_t delay = cl->undertime - q->now;
416
417 if (!cl->delayed) {
418 delay += cl->offtime;
419
420 /*
421 * Class goes to sleep, so it will have no
422 * chance to work off its avgidle. Let's forgive it 8)
423 *
424 * BTW cbq-2.0 has a bug in this
425 * place: apparently they forgot to shift it by cl->ewma_log.
426 */
427 if (cl->avgidle < 0)
428 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
429 if (cl->avgidle < cl->minidle)
430 cl->avgidle = cl->minidle;
431 if (delay <= 0)
432 delay = 1;
433 cl->undertime = q->now + delay;
434
435 cl->xstats.overactions++;
436 cl->delayed = 1;
437 }
438 if (q->wd_expires == 0 || q->wd_expires > delay)
439 q->wd_expires = delay;
440
441 /* Dirty work! We must schedule wakeups based on
442 * real available rate, rather than leaf rate,
443 * which may be tiny (even zero).
444 */
445 if (q->toplevel == TC_CBQ_MAXLEVEL) {
446 struct cbq_class *b;
447 psched_tdiff_t base_delay = q->wd_expires;
448
449 for (b = cl->borrow; b; b = b->borrow) {
450 delay = b->undertime - q->now;
451 if (delay < base_delay) {
452 if (delay <= 0)
453 delay = 1;
454 base_delay = delay;
455 }
456 }
457
458 q->wd_expires = base_delay;
459 }
460}
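/* Numeric illustration of the penalty above (example values only): if the
 * class is 100 ticks short of going underlimit, offtime = 300 ticks,
 * avgidle = -64 and ewma_log = 5, the first branch computes
 *
 *	delay = 100 + 300 - (64 - (64 >> 5)) = 338 ticks
 *
 * i.e. the offtime penalty is added, while the waiting time that the
 * negative avgidle had already earned is "forgiven", exactly as the
 * comment inside the function says.
 */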
461
462/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
463 * they go overlimit
464 */
465
466static void cbq_ovl_rclassic(struct cbq_class *cl)
467{
468 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
469 struct cbq_class *this = cl;
470
471 do {
472 if (cl->level > q->toplevel) {
473 cl = NULL;
474 break;
475 }
476 } while ((cl = cl->borrow) != NULL);
477
478 if (cl == NULL)
479 cl = this;
480 cbq_ovl_classic(cl);
481}
482
483/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
484
485static void cbq_ovl_delay(struct cbq_class *cl)
486{
487 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
488 psched_tdiff_t delay = cl->undertime - q->now;
489
490 if (test_bit(__QDISC_STATE_DEACTIVATED,
491 &qdisc_root_sleeping(cl->qdisc)->state))
492 return;
493
494 if (!cl->delayed) {
495 psched_time_t sched = q->now;
496 ktime_t expires;
497
498 delay += cl->offtime;
499 if (cl->avgidle < 0)
500 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
501 if (cl->avgidle < cl->minidle)
502 cl->avgidle = cl->minidle;
503 cl->undertime = q->now + delay;
504
505 if (delay > 0) {
506 sched += delay + cl->penalty;
507 cl->penalized = sched;
508 cl->cpriority = TC_CBQ_MAXPRIO;
509 q->pmask |= (1<<TC_CBQ_MAXPRIO);
510
511 expires = ktime_set(0, 0);
512 expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
513 if (hrtimer_try_to_cancel(&q->delay_timer) &&
514 ktime_to_ns(ktime_sub(
515 hrtimer_get_expires(&q->delay_timer),
516 expires)) > 0)
517 hrtimer_set_expires(&q->delay_timer, expires);
518 hrtimer_restart(&q->delay_timer);
519 cl->delayed = 1;
520 cl->xstats.overactions++;
521 return;
522 }
523 delay = 1;
524 }
525 if (q->wd_expires == 0 || q->wd_expires > delay)
526 q->wd_expires = delay;
527}
528
529/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
530
531static void cbq_ovl_lowprio(struct cbq_class *cl)
532{
533 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
534
535 cl->penalized = q->now + cl->penalty;
536
537 if (cl->cpriority != cl->priority2) {
538 cl->cpriority = cl->priority2;
539 q->pmask |= (1<<cl->cpriority);
540 cl->xstats.overactions++;
541 }
542 cbq_ovl_classic(cl);
543}
544
545/* TC_CBQ_OVL_DROP: penalize class by dropping */
546
547static void cbq_ovl_drop(struct cbq_class *cl)
548{
549 if (cl->q->ops->drop)
550 if (cl->q->ops->drop(cl->q))
551 cl->qdisc->q.qlen--;
552 cl->xstats.overactions++;
553 cbq_ovl_classic(cl);
554}
555
556static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
557 psched_time_t now)
558{
559 struct cbq_class *cl;
560 struct cbq_class *cl_prev = q->active[prio];
561 psched_time_t sched = now;
562
563 if (cl_prev == NULL)
564 return 0;
565
566 do {
567 cl = cl_prev->next_alive;
568 if (now - cl->penalized > 0) {
569 cl_prev->next_alive = cl->next_alive;
570 cl->next_alive = NULL;
571 cl->cpriority = cl->priority;
572 cl->delayed = 0;
573 cbq_activate_class(cl);
574
575 if (cl == q->active[prio]) {
576 q->active[prio] = cl_prev;
577 if (cl == q->active[prio]) {
578 q->active[prio] = NULL;
579 return 0;
580 }
581 }
582
583 cl = cl_prev->next_alive;
584 } else if (sched - cl->penalized > 0)
585 sched = cl->penalized;
586 } while ((cl_prev = cl) != q->active[prio]);
587
588 return sched - now;
589}
590
591static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
592{
593 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
594 delay_timer);
595 struct Qdisc *sch = q->watchdog.qdisc;
596 psched_time_t now;
597 psched_tdiff_t delay = 0;
598 unsigned int pmask;
599
600 now = psched_get_time();
601
602 pmask = q->pmask;
603 q->pmask = 0;
604
605 while (pmask) {
606 int prio = ffz(~pmask);
607 psched_tdiff_t tmp;
608
609 pmask &= ~(1<<prio);
610
611 tmp = cbq_undelay_prio(q, prio, now);
612 if (tmp > 0) {
613 q->pmask |= 1<<prio;
614 if (tmp < delay || delay == 0)
615 delay = tmp;
616 }
617 }
618
619 if (delay) {
620 ktime_t time;
621
622 time = ktime_set(0, 0);
623 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
624 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
625 }
626
627 qdisc_unthrottled(sch);
628 __netif_schedule(qdisc_root(sch));
629 return HRTIMER_NORESTART;
630}
631
632#ifdef CONFIG_NET_CLS_ACT
633static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
634{
635 struct Qdisc *sch = child->__parent;
636 struct cbq_sched_data *q = qdisc_priv(sch);
637 struct cbq_class *cl = q->rx_class;
638
639 q->rx_class = NULL;
640
641 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
642 int ret;
643
644 cbq_mark_toplevel(q, cl);
645
646 q->rx_class = cl;
647 cl->q->__parent = sch;
648
649 ret = qdisc_enqueue(skb, cl->q);
650 if (ret == NET_XMIT_SUCCESS) {
651 sch->q.qlen++;
652 if (!cl->next_alive)
653 cbq_activate_class(cl);
654 return 0;
655 }
656 if (net_xmit_drop_count(ret))
657 sch->qstats.drops++;
658 return 0;
659 }
660
661 sch->qstats.drops++;
662 return -1;
663}
664#endif
665
666/*
667 * This is a mission-critical procedure.
668 *
669 * We "regenerate" the toplevel cutoff if the transmitting class
670 * has backlog and is not regulated. This is not part of the
671 * original CBQ description, but it looks more reasonable.
672 * Probably it is wrong. This question needs further investigation.
673 */
674
675static inline void
676cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
677 struct cbq_class *borrowed)
678{
679 if (cl && q->toplevel >= borrowed->level) {
680 if (cl->q->q.qlen > 1) {
681 do {
682 if (borrowed->undertime == PSCHED_PASTPERFECT) {
683 q->toplevel = borrowed->level;
684 return;
685 }
686 } while ((borrowed = borrowed->borrow) != NULL);
687 }
688#if 0
689 /* It is not necessary now. Uncommenting it
690 will save CPU cycles, but decrease fairness.
691 */
692 q->toplevel = TC_CBQ_MAXLEVEL;
693#endif
694 }
695}
696
697static void
698cbq_update(struct cbq_sched_data *q)
699{
700 struct cbq_class *this = q->tx_class;
701 struct cbq_class *cl = this;
702 int len = q->tx_len;
703
704 q->tx_class = NULL;
705
706 for ( ; cl; cl = cl->share) {
707 long avgidle = cl->avgidle;
708 long idle;
709
710 cl->bstats.packets++;
711 cl->bstats.bytes += len;
712
713 /*
714 * (now - last) is total time between packet right edges.
715 * (last_pktlen/rate) is "virtual" busy time, so that
716 *
717 * idle = (now - last) - last_pktlen/rate
718 */
719
720 idle = q->now - cl->last;
721 if ((unsigned long)idle > 128*1024*1024) {
722 avgidle = cl->maxidle;
723 } else {
724 idle -= L2T(cl, len);
725
726 /* true_avgidle := (1-W)*true_avgidle + W*idle,
727 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
728 * cl->avgidle == true_avgidle/W,
729 * hence:
730 */
731 avgidle += idle - (avgidle>>cl->ewma_log);
732 }
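		/* Numeric illustration of the scaled update above (values
		 * are only an example): with ewma_log = 5 the weight is
		 * W = 1/32.  If the scaled avgidle is 320 (true_avgidle = 10)
		 * and the new idle sample is 64, the update yields
		 * 320 + 64 - (320 >> 5) = 374, which is exactly
		 * ((31/32)*10 + (1/32)*64) / W, i.e. the EWMA kept in
		 * scaled form without any division.
		 */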
733
734 if (avgidle <= 0) {
735 /* Overlimit or at-limit */
736
737 if (avgidle < cl->minidle)
738 avgidle = cl->minidle;
739
740 cl->avgidle = avgidle;
741
742 /* Calculate the expected time when this class
743 * will be allowed to send.
744 * It will occur when:
745 * (1-W)*true_avgidle + W*delay = 0, i.e.
746 * idle = (1/W - 1)*(-true_avgidle)
747 * or
748 * idle = (1 - W)*(-cl->avgidle);
749 */
750 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
751
752 /*
753 * That is not all.
754 * To maintain the rate allocated to the class,
755 * we add to undertime the virtual clock time
756 * necessary to complete the transmitted packet.
757 * (len/phys_bandwidth has already elapsed by
758 * the time cbq_update runs.)
759 */
760
761 idle -= L2T(&q->link, len);
762 idle += L2T(cl, len);
763
764 cl->undertime = q->now + idle;
765 } else {
766 /* Underlimit */
767
768 cl->undertime = PSCHED_PASTPERFECT;
769 if (avgidle > cl->maxidle)
770 cl->avgidle = cl->maxidle;
771 else
772 cl->avgidle = avgidle;
773 }
774 cl->last = q->now;
775 }
776
777 cbq_update_toplevel(q, this, q->tx_borrowed);
778}
779
780static inline struct cbq_class *
781cbq_under_limit(struct cbq_class *cl)
782{
783 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
784 struct cbq_class *this_cl = cl;
785
786 if (cl->tparent == NULL)
787 return cl;
788
789 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
790 cl->delayed = 0;
791 return cl;
792 }
793
794 do {
795 /* This is a very suspicious place. Currently the overlimit
796 * action is generated for non-bounded classes
797 * only if the link is completely congested.
798 * Though this agrees with the ancestor-only paradigm,
799 * it looks very stupid. In particular,
800 * it means that this chunk of code will either
801 * never be called or result in strong amplification
802 * of burstiness. Dangerous, silly, and yet
803 * no other solution exists.
804 */
805 cl = cl->borrow;
806 if (!cl) {
807 this_cl->qstats.overlimits++;
808 this_cl->overlimit(this_cl);
809 return NULL;
810 }
811 if (cl->level > q->toplevel)
812 return NULL;
813 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
814
815 cl->delayed = 0;
816 return cl;
817}
818
819static inline struct sk_buff *
820cbq_dequeue_prio(struct Qdisc *sch, int prio)
821{
822 struct cbq_sched_data *q = qdisc_priv(sch);
823 struct cbq_class *cl_tail, *cl_prev, *cl;
824 struct sk_buff *skb;
825 int deficit;
826
827 cl_tail = cl_prev = q->active[prio];
828 cl = cl_prev->next_alive;
829
830 do {
831 deficit = 0;
832
833 /* Start round */
834 do {
835 struct cbq_class *borrow = cl;
836
837 if (cl->q->q.qlen &&
838 (borrow = cbq_under_limit(cl)) == NULL)
839 goto skip_class;
840
841 if (cl->deficit <= 0) {
842 /* Class exhausted its allotment per
843 * this round. Switch to the next one.
844 */
845 deficit = 1;
846 cl->deficit += cl->quantum;
847 goto next_class;
848 }
849
850 skb = cl->q->dequeue(cl->q);
851
852 /* Class did not give us any skb :-(
853 * This can occur even if cl->q->q.qlen != 0,
854 * e.g. if cl->q == "tbf"
855 */
856 if (skb == NULL)
857 goto skip_class;
858
859 cl->deficit -= qdisc_pkt_len(skb);
860 q->tx_class = cl;
861 q->tx_borrowed = borrow;
862 if (borrow != cl) {
863#ifndef CBQ_XSTATS_BORROWS_BYTES
864 borrow->xstats.borrows++;
865 cl->xstats.borrows++;
866#else
867 borrow->xstats.borrows += qdisc_pkt_len(skb);
868 cl->xstats.borrows += qdisc_pkt_len(skb);
869#endif
870 }
871 q->tx_len = qdisc_pkt_len(skb);
872
873 if (cl->deficit <= 0) {
874 q->active[prio] = cl;
875 cl = cl->next_alive;
876 cl->deficit += cl->quantum;
877 }
878 return skb;
879
880skip_class:
881 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
882 /* Class is empty or penalized.
883 * Unlink it from active chain.
884 */
885 cl_prev->next_alive = cl->next_alive;
886 cl->next_alive = NULL;
887
888 /* Did cl_tail point to it? */
889 if (cl == cl_tail) {
890 /* Repair it! */
891 cl_tail = cl_prev;
892
893 /* Was it the last class in this band? */
894 if (cl == cl_tail) {
895 /* Kill the band! */
896 q->active[prio] = NULL;
897 q->activemask &= ~(1<<prio);
898 if (cl->q->q.qlen)
899 cbq_activate_class(cl);
900 return NULL;
901 }
902
903 q->active[prio] = cl_tail;
904 }
905 if (cl->q->q.qlen)
906 cbq_activate_class(cl);
907
908 cl = cl_prev;
909 }
910
911next_class:
912 cl_prev = cl;
913 cl = cl->next_alive;
914 } while (cl_prev != cl_tail);
915 } while (deficit);
916
917 q->active[prio] = cl_prev;
918
919 return NULL;
920}
921
922static inline struct sk_buff *
923cbq_dequeue_1(struct Qdisc *sch)
924{
925 struct cbq_sched_data *q = qdisc_priv(sch);
926 struct sk_buff *skb;
927 unsigned int activemask;
928
929 activemask = q->activemask & 0xFF;
930 while (activemask) {
931 int prio = ffz(~activemask);
932 activemask &= ~(1<<prio);
933 skb = cbq_dequeue_prio(sch, prio);
934 if (skb)
935 return skb;
936 }
937 return NULL;
938}
939
940static struct sk_buff *
941cbq_dequeue(struct Qdisc *sch)
942{
943 struct sk_buff *skb;
944 struct cbq_sched_data *q = qdisc_priv(sch);
945 psched_time_t now;
946 psched_tdiff_t incr;
947
948 now = psched_get_time();
949 incr = now - q->now_rt;
950
951 if (q->tx_class) {
952 psched_tdiff_t incr2;
953 /* Time integrator. We calculate EOS time
954 * by adding the expected packet transmission time.
955 * If real time is greater, we warp the artificial clock,
956 * so that:
957 *
958 * cbq_time = max(real_time, work);
959 */
960 incr2 = L2T(&q->link, q->tx_len);
961 q->now += incr2;
962 cbq_update(q);
963 if ((incr -= incr2) < 0)
964 incr = 0;
965 }
966 q->now += incr;
967 q->now_rt = now;
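	/* Illustration of the integrator above (numbers are only an
	 * example): if the previous packet needed incr2 = 120 us of link
	 * time but only incr = 80 us of wall time elapsed since the last
	 * dequeue, q->now is advanced by the full 120 us and the negative
	 * remainder is clamped to zero, so the artificial clock never
	 * lags behind the work already accounted for -- this is the
	 * "cbq_time = max(real_time, work)" rule stated above.
	 */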
968
969 for (;;) {
970 q->wd_expires = 0;
971
972 skb = cbq_dequeue_1(sch);
973 if (skb) {
974 qdisc_bstats_update(sch, skb);
975 sch->q.qlen--;
976 qdisc_unthrottled(sch);
977 return skb;
978 }
979
980 /* All the classes are overlimit.
981 *
982 * This is possible if:
983 *
984 * 1. Scheduler is empty.
985 * 2. Toplevel cutoff inhibited borrowing.
986 * 3. Root class is overlimit.
987 *
988 * Reset the 2nd and 3rd conditions and retry.
989 *
990 * Note that NS and cbq-2.0 are buggy: peeking
991 * an arbitrary class is appropriate for ancestor-only
992 * sharing, but not for the toplevel algorithm.
993 *
994 * Our version is better but slower, because it requires
995 * two passes; that is unavoidable with top-level sharing.
996 */
997
998 if (q->toplevel == TC_CBQ_MAXLEVEL &&
999 q->link.undertime == PSCHED_PASTPERFECT)
1000 break;
1001
1002 q->toplevel = TC_CBQ_MAXLEVEL;
1003 q->link.undertime = PSCHED_PASTPERFECT;
1004 }
1005
1006 /* No packets in the scheduler, or nobody wants to give them to us :-(
1007 * Sigh... start the watchdog timer in the latter case.
1008 */
1009
1010 if (sch->q.qlen) {
1011 sch->qstats.overlimits++;
1012 if (q->wd_expires)
1013 qdisc_watchdog_schedule(&q->watchdog,
1014 now + q->wd_expires);
1015 }
1016 return NULL;
1017}
1018
1019/* CBQ class maintenance routines */
1020
1021static void cbq_adjust_levels(struct cbq_class *this)
1022{
1023 if (this == NULL)
1024 return;
1025
1026 do {
1027 int level = 0;
1028 struct cbq_class *cl;
1029
1030 cl = this->children;
1031 if (cl) {
1032 do {
1033 if (cl->level > level)
1034 level = cl->level;
1035 } while ((cl = cl->sibling) != this->children);
1036 }
1037 this->level = level + 1;
1038 } while ((this = this->tparent) != NULL);
1039}
1040
1041static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1042{
1043 struct cbq_class *cl;
1044 struct hlist_node *n;
1045 unsigned int h;
1046
1047 if (q->quanta[prio] == 0)
1048 return;
1049
1050 for (h = 0; h < q->clhash.hashsize; h++) {
1051 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1052 /* BUGGGG... Beware! This expression suffers from
1053 * arithmetic overflow!
1054 */
1055 if (cl->priority == prio) {
1056 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1057 q->quanta[prio];
1058 }
1059 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1060 pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
1061 cl->common.classid, cl->quantum);
1062 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1063 }
1064 }
1065 }
1066}
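/* Example of the normalisation above (numbers are illustrative): two
 * classes in the same band with weights 2 and 6 and allot 1500 give
 * nclasses[prio] = 2 and quanta[prio] = 8, so their per-round quanta
 * become 2*1500*2/8 = 750 and 6*1500*2/8 = 2250 bytes -- the weights
 * are converted into byte quanta whose average stays near the allot.
 */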
1067
1068static void cbq_sync_defmap(struct cbq_class *cl)
1069{
1070 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1071 struct cbq_class *split = cl->split;
1072 unsigned int h;
1073 int i;
1074
1075 if (split == NULL)
1076 return;
1077
1078 for (i = 0; i <= TC_PRIO_MAX; i++) {
1079 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
1080 split->defaults[i] = NULL;
1081 }
1082
1083 for (i = 0; i <= TC_PRIO_MAX; i++) {
1084 int level = split->level;
1085
1086 if (split->defaults[i])
1087 continue;
1088
1089 for (h = 0; h < q->clhash.hashsize; h++) {
1090 struct hlist_node *n;
1091 struct cbq_class *c;
1092
1093 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1094 common.hnode) {
1095 if (c->split == split && c->level < level &&
1096 c->defmap & (1<<i)) {
1097 split->defaults[i] = c;
1098 level = c->level;
1099 }
1100 }
1101 }
1102 }
1103}
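/* Example (class IDs and mask are made up): if class 1:10 sits under
 * split node 1:0 with defmap 0x0a, the loop above installs 1:10 as the
 * default destination for TC_PRIO values 1 and 3 on that split node,
 * unless another class closer to the leaves (smaller ->level) also
 * claims those bits, in which case that deeper class wins.
 */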
1104
1105static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1106{
1107 struct cbq_class *split = NULL;
1108
1109 if (splitid == 0) {
1110 split = cl->split;
1111 if (!split)
1112 return;
1113 splitid = split->common.classid;
1114 }
1115
1116 if (split == NULL || split->common.classid != splitid) {
1117 for (split = cl->tparent; split; split = split->tparent)
1118 if (split->common.classid == splitid)
1119 break;
1120 }
1121
1122 if (split == NULL)
1123 return;
1124
1125 if (cl->split != split) {
1126 cl->defmap = 0;
1127 cbq_sync_defmap(cl);
1128 cl->split = split;
1129 cl->defmap = def & mask;
1130 } else
1131 cl->defmap = (cl->defmap & ~mask) | (def & mask);
1132
1133 cbq_sync_defmap(cl);
1134}
1135
1136static void cbq_unlink_class(struct cbq_class *this)
1137{
1138 struct cbq_class *cl, **clp;
1139 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1140
1141 qdisc_class_hash_remove(&q->clhash, &this->common);
1142
1143 if (this->tparent) {
1144 clp = &this->sibling;
1145 cl = *clp;
1146 do {
1147 if (cl == this) {
1148 *clp = cl->sibling;
1149 break;
1150 }
1151 clp = &cl->sibling;
1152 } while ((cl = *clp) != this->sibling);
1153
1154 if (this->tparent->children == this) {
1155 this->tparent->children = this->sibling;
1156 if (this->sibling == this)
1157 this->tparent->children = NULL;
1158 }
1159 } else {
1160 WARN_ON(this->sibling != this);
1161 }
1162}
1163
1164static void cbq_link_class(struct cbq_class *this)
1165{
1166 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1167 struct cbq_class *parent = this->tparent;
1168
1169 this->sibling = this;
1170 qdisc_class_hash_insert(&q->clhash, &this->common);
1171
1172 if (parent == NULL)
1173 return;
1174
1175 if (parent->children == NULL) {
1176 parent->children = this;
1177 } else {
1178 this->sibling = parent->children->sibling;
1179 parent->children->sibling = this;
1180 }
1181}
1182
1183static unsigned int cbq_drop(struct Qdisc *sch)
1184{
1185 struct cbq_sched_data *q = qdisc_priv(sch);
1186 struct cbq_class *cl, *cl_head;
1187 int prio;
1188 unsigned int len;
1189
1190 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1191 cl_head = q->active[prio];
1192 if (!cl_head)
1193 continue;
1194
1195 cl = cl_head;
1196 do {
1197 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1198 sch->q.qlen--;
1199 if (!cl->q->q.qlen)
1200 cbq_deactivate_class(cl);
1201 return len;
1202 }
1203 } while ((cl = cl->next_alive) != cl_head);
1204 }
1205 return 0;
1206}
1207
1208static void
1209cbq_reset(struct Qdisc *sch)
1210{
1211 struct cbq_sched_data *q = qdisc_priv(sch);
1212 struct cbq_class *cl;
1213 struct hlist_node *n;
1214 int prio;
1215 unsigned int h;
1216
1217 q->activemask = 0;
1218 q->pmask = 0;
1219 q->tx_class = NULL;
1220 q->tx_borrowed = NULL;
1221 qdisc_watchdog_cancel(&q->watchdog);
1222 hrtimer_cancel(&q->delay_timer);
1223 q->toplevel = TC_CBQ_MAXLEVEL;
1224 q->now = psched_get_time();
1225 q->now_rt = q->now;
1226
1227 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1228 q->active[prio] = NULL;
1229
1230 for (h = 0; h < q->clhash.hashsize; h++) {
1231 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1232 qdisc_reset(cl->q);
1233
1234 cl->next_alive = NULL;
1235 cl->undertime = PSCHED_PASTPERFECT;
1236 cl->avgidle = cl->maxidle;
1237 cl->deficit = cl->quantum;
1238 cl->cpriority = cl->priority;
1239 }
1240 }
1241 sch->q.qlen = 0;
1242}
1243
1244
1245static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1246{
1247 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1248 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1249 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1250 }
1251 if (lss->change & TCF_CBQ_LSS_EWMA)
1252 cl->ewma_log = lss->ewma_log;
1253 if (lss->change & TCF_CBQ_LSS_AVPKT)
1254 cl->avpkt = lss->avpkt;
1255 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1256 cl->minidle = -(long)lss->minidle;
1257 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1258 cl->maxidle = lss->maxidle;
1259 cl->avgidle = lss->maxidle;
1260 }
1261 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1262 cl->offtime = lss->offtime;
1263 return 0;
1264}
1265
1266static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1267{
1268 q->nclasses[cl->priority]--;
1269 q->quanta[cl->priority] -= cl->weight;
1270 cbq_normalize_quanta(q, cl->priority);
1271}
1272
1273static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1274{
1275 q->nclasses[cl->priority]++;
1276 q->quanta[cl->priority] += cl->weight;
1277 cbq_normalize_quanta(q, cl->priority);
1278}
1279
1280static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1281{
1282 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1283
1284 if (wrr->allot)
1285 cl->allot = wrr->allot;
1286 if (wrr->weight)
1287 cl->weight = wrr->weight;
1288 if (wrr->priority) {
1289 cl->priority = wrr->priority - 1;
1290 cl->cpriority = cl->priority;
1291 if (cl->priority >= cl->priority2)
1292 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1293 }
1294
1295 cbq_addprio(q, cl);
1296 return 0;
1297}
1298
1299static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1300{
1301 switch (ovl->strategy) {
1302 case TC_CBQ_OVL_CLASSIC:
1303 cl->overlimit = cbq_ovl_classic;
1304 break;
1305 case TC_CBQ_OVL_DELAY:
1306 cl->overlimit = cbq_ovl_delay;
1307 break;
1308 case TC_CBQ_OVL_LOWPRIO:
1309 if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
1310 ovl->priority2 - 1 <= cl->priority)
1311 return -EINVAL;
1312 cl->priority2 = ovl->priority2 - 1;
1313 cl->overlimit = cbq_ovl_lowprio;
1314 break;
1315 case TC_CBQ_OVL_DROP:
1316 cl->overlimit = cbq_ovl_drop;
1317 break;
1318 case TC_CBQ_OVL_RCLASSIC:
1319 cl->overlimit = cbq_ovl_rclassic;
1320 break;
1321 default:
1322 return -EINVAL;
1323 }
1324 cl->penalty = ovl->penalty;
1325 return 0;
1326}
1327
1328#ifdef CONFIG_NET_CLS_ACT
1329static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1330{
1331 cl->police = p->police;
1332
1333 if (cl->q->handle) {
1334 if (p->police == TC_POLICE_RECLASSIFY)
1335 cl->q->reshape_fail = cbq_reshape_fail;
1336 else
1337 cl->q->reshape_fail = NULL;
1338 }
1339 return 0;
1340}
1341#endif
1342
1343static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1344{
1345 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1346 return 0;
1347}
1348
1349static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1350 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1351 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1352 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1353 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1354 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1355 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1356 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1357};
1358
1359static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1360{
1361 struct cbq_sched_data *q = qdisc_priv(sch);
1362 struct nlattr *tb[TCA_CBQ_MAX + 1];
1363 struct tc_ratespec *r;
1364 int err;
1365
1366 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1367 if (err < 0)
1368 return err;
1369
1370 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1371 return -EINVAL;
1372
1373 r = nla_data(tb[TCA_CBQ_RATE]);
1374
1375 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1376 return -EINVAL;
1377
1378 err = qdisc_class_hash_init(&q->clhash);
1379 if (err < 0)
1380 goto put_rtab;
1381
1382 q->link.refcnt = 1;
1383 q->link.sibling = &q->link;
1384 q->link.common.classid = sch->handle;
1385 q->link.qdisc = sch;
1386 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1387 sch->handle);
1388 if (!q->link.q)
1389 q->link.q = &noop_qdisc;
1390
1391 q->link.priority = TC_CBQ_MAXPRIO - 1;
1392 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1393 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1394 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1395 q->link.overlimit = cbq_ovl_classic;
1396 q->link.allot = psched_mtu(qdisc_dev(sch));
1397 q->link.quantum = q->link.allot;
1398 q->link.weight = q->link.R_tab->rate.rate;
1399
1400 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1401 q->link.avpkt = q->link.allot/2;
1402 q->link.minidle = -0x7FFFFFFF;
1403
1404 qdisc_watchdog_init(&q->watchdog, sch);
1405 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1406 q->delay_timer.function = cbq_undelay;
1407 q->toplevel = TC_CBQ_MAXLEVEL;
1408 q->now = psched_get_time();
1409 q->now_rt = q->now;
1410
1411 cbq_link_class(&q->link);
1412
1413 if (tb[TCA_CBQ_LSSOPT])
1414 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1415
1416 cbq_addprio(q, &q->link);
1417 return 0;
1418
1419put_rtab:
1420 qdisc_put_rtab(q->link.R_tab);
1421 return err;
1422}
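/* For orientation, a minimal user-space configuration that exercises
 * cbq_init() and cbq_change_class() might look like the following
 * (iproute2 syntax sketched from memory -- see tc-cbq(8) for the
 * authoritative option list):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *		rate 10Mbit allot 1500 prio 5 avpkt 1000 bounded
 *
 * The rate/allot/prio/avpkt values map onto TCA_CBQ_RATE, the WRR
 * options and the LSS options parsed above and in cbq_change_class().
 */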
1423
1424static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1425{
1426 unsigned char *b = skb_tail_pointer(skb);
1427
1428 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1429 goto nla_put_failure;
1430 return skb->len;
1431
1432nla_put_failure:
1433 nlmsg_trim(skb, b);
1434 return -1;
1435}
1436
1437static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1438{
1439 unsigned char *b = skb_tail_pointer(skb);
1440 struct tc_cbq_lssopt opt;
1441
1442 opt.flags = 0;
1443 if (cl->borrow == NULL)
1444 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1445 if (cl->share == NULL)
1446 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1447 opt.ewma_log = cl->ewma_log;
1448 opt.level = cl->level;
1449 opt.avpkt = cl->avpkt;
1450 opt.maxidle = cl->maxidle;
1451 opt.minidle = (u32)(-cl->minidle);
1452 opt.offtime = cl->offtime;
1453 opt.change = ~0;
1454 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1455 goto nla_put_failure;
1456 return skb->len;
1457
1458nla_put_failure:
1459 nlmsg_trim(skb, b);
1460 return -1;
1461}
1462
1463static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1464{
1465 unsigned char *b = skb_tail_pointer(skb);
1466 struct tc_cbq_wrropt opt;
1467
1468 opt.flags = 0;
1469 opt.allot = cl->allot;
1470 opt.priority = cl->priority + 1;
1471 opt.cpriority = cl->cpriority + 1;
1472 opt.weight = cl->weight;
1473 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1474 goto nla_put_failure;
1475 return skb->len;
1476
1477nla_put_failure:
1478 nlmsg_trim(skb, b);
1479 return -1;
1480}
1481
1482static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1483{
1484 unsigned char *b = skb_tail_pointer(skb);
1485 struct tc_cbq_ovl opt;
1486
1487 opt.strategy = cl->ovl_strategy;
1488 opt.priority2 = cl->priority2 + 1;
1489 opt.pad = 0;
1490 opt.penalty = cl->penalty;
1491 if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
1492 goto nla_put_failure;
1493 return skb->len;
1494
1495nla_put_failure:
1496 nlmsg_trim(skb, b);
1497 return -1;
1498}
1499
1500static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1501{
1502 unsigned char *b = skb_tail_pointer(skb);
1503 struct tc_cbq_fopt opt;
1504
1505 if (cl->split || cl->defmap) {
1506 opt.split = cl->split ? cl->split->common.classid : 0;
1507 opt.defmap = cl->defmap;
1508 opt.defchange = ~0;
1509 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1510 goto nla_put_failure;
1511 }
1512 return skb->len;
1513
1514nla_put_failure:
1515 nlmsg_trim(skb, b);
1516 return -1;
1517}
1518
1519#ifdef CONFIG_NET_CLS_ACT
1520static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1521{
1522 unsigned char *b = skb_tail_pointer(skb);
1523 struct tc_cbq_police opt;
1524
1525 if (cl->police) {
1526 opt.police = cl->police;
1527 opt.__res1 = 0;
1528 opt.__res2 = 0;
1529 if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
1530 goto nla_put_failure;
1531 }
1532 return skb->len;
1533
1534nla_put_failure:
1535 nlmsg_trim(skb, b);
1536 return -1;
1537}
1538#endif
1539
1540static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1541{
1542 if (cbq_dump_lss(skb, cl) < 0 ||
1543 cbq_dump_rate(skb, cl) < 0 ||
1544 cbq_dump_wrr(skb, cl) < 0 ||
1545 cbq_dump_ovl(skb, cl) < 0 ||
1546#ifdef CONFIG_NET_CLS_ACT
1547 cbq_dump_police(skb, cl) < 0 ||
1548#endif
1549 cbq_dump_fopt(skb, cl) < 0)
1550 return -1;
1551 return 0;
1552}
1553
1554static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1555{
1556 struct cbq_sched_data *q = qdisc_priv(sch);
1557 struct nlattr *nest;
1558
1559 nest = nla_nest_start(skb, TCA_OPTIONS);
1560 if (nest == NULL)
1561 goto nla_put_failure;
1562 if (cbq_dump_attr(skb, &q->link) < 0)
1563 goto nla_put_failure;
1564 nla_nest_end(skb, nest);
1565 return skb->len;
1566
1567nla_put_failure:
1568 nla_nest_cancel(skb, nest);
1569 return -1;
1570}
1571
1572static int
1573cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1574{
1575 struct cbq_sched_data *q = qdisc_priv(sch);
1576
1577 q->link.xstats.avgidle = q->link.avgidle;
1578 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1579}
1580
1581static int
1582cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1583 struct sk_buff *skb, struct tcmsg *tcm)
1584{
1585 struct cbq_class *cl = (struct cbq_class *)arg;
1586 struct nlattr *nest;
1587
1588 if (cl->tparent)
1589 tcm->tcm_parent = cl->tparent->common.classid;
1590 else
1591 tcm->tcm_parent = TC_H_ROOT;
1592 tcm->tcm_handle = cl->common.classid;
1593 tcm->tcm_info = cl->q->handle;
1594
1595 nest = nla_nest_start(skb, TCA_OPTIONS);
1596 if (nest == NULL)
1597 goto nla_put_failure;
1598 if (cbq_dump_attr(skb, cl) < 0)
1599 goto nla_put_failure;
1600 nla_nest_end(skb, nest);
1601 return skb->len;
1602
1603nla_put_failure:
1604 nla_nest_cancel(skb, nest);
1605 return -1;
1606}
1607
1608static int
1609cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1610 struct gnet_dump *d)
1611{
1612 struct cbq_sched_data *q = qdisc_priv(sch);
1613 struct cbq_class *cl = (struct cbq_class *)arg;
1614
1615 cl->qstats.qlen = cl->q->q.qlen;
1616 cl->xstats.avgidle = cl->avgidle;
1617 cl->xstats.undertime = 0;
1618
1619 if (cl->undertime != PSCHED_PASTPERFECT)
1620 cl->xstats.undertime = cl->undertime - q->now;
1621
1622 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1623 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1624 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1625 return -1;
1626
1627 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1628}
1629
1630static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1631 struct Qdisc **old)
1632{
1633 struct cbq_class *cl = (struct cbq_class *)arg;
1634
1635 if (new == NULL) {
1636 new = qdisc_create_dflt(sch->dev_queue,
1637 &pfifo_qdisc_ops, cl->common.classid);
1638 if (new == NULL)
1639 return -ENOBUFS;
1640 } else {
1641#ifdef CONFIG_NET_CLS_ACT
1642 if (cl->police == TC_POLICE_RECLASSIFY)
1643 new->reshape_fail = cbq_reshape_fail;
1644#endif
1645 }
1646 sch_tree_lock(sch);
1647 *old = cl->q;
1648 cl->q = new;
1649 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1650 qdisc_reset(*old);
1651 sch_tree_unlock(sch);
1652
1653 return 0;
1654}
1655
1656static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1657{
1658 struct cbq_class *cl = (struct cbq_class *)arg;
1659
1660 return cl->q;
1661}
1662
1663static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1664{
1665 struct cbq_class *cl = (struct cbq_class *)arg;
1666
1667 if (cl->q->q.qlen == 0)
1668 cbq_deactivate_class(cl);
1669}
1670
1671static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1672{
1673 struct cbq_sched_data *q = qdisc_priv(sch);
1674 struct cbq_class *cl = cbq_class_lookup(q, classid);
1675
1676 if (cl) {
1677 cl->refcnt++;
1678 return (unsigned long)cl;
1679 }
1680 return 0;
1681}
1682
1683static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1684{
1685 struct cbq_sched_data *q = qdisc_priv(sch);
1686
1687 WARN_ON(cl->filters);
1688
1689 tcf_destroy_chain(&cl->filter_list);
1690 qdisc_destroy(cl->q);
1691 qdisc_put_rtab(cl->R_tab);
1692 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1693 if (cl != &q->link)
1694 kfree(cl);
1695}
1696
1697static void cbq_destroy(struct Qdisc *sch)
1698{
1699 struct cbq_sched_data *q = qdisc_priv(sch);
1700 struct hlist_node *n, *next;
1701 struct cbq_class *cl;
1702 unsigned int h;
1703
1704#ifdef CONFIG_NET_CLS_ACT
1705 q->rx_class = NULL;
1706#endif
1707 /*
1708 * Filters must be destroyed first because we don't destroy the
1709 * classes from root to leaves, which means that filters can still
1710 * be bound to classes which have been destroyed already. --TGR '04
1711 */
1712 for (h = 0; h < q->clhash.hashsize; h++) {
1713 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1714 tcf_destroy_chain(&cl->filter_list);
1715 }
1716 for (h = 0; h < q->clhash.hashsize; h++) {
1717 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1718 common.hnode)
1719 cbq_destroy_class(sch, cl);
1720 }
1721 qdisc_class_hash_destroy(&q->clhash);
1722}
1723
1724static void cbq_put(struct Qdisc *sch, unsigned long arg)
1725{
1726 struct cbq_class *cl = (struct cbq_class *)arg;
1727
1728 if (--cl->refcnt == 0) {
1729#ifdef CONFIG_NET_CLS_ACT
1730 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1731 struct cbq_sched_data *q = qdisc_priv(sch);
1732
1733 spin_lock_bh(root_lock);
1734 if (q->rx_class == cl)
1735 q->rx_class = NULL;
1736 spin_unlock_bh(root_lock);
1737#endif
1738
1739 cbq_destroy_class(sch, cl);
1740 }
1741}
1742
1743static int
1744cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1745 unsigned long *arg)
1746{
1747 int err;
1748 struct cbq_sched_data *q = qdisc_priv(sch);
1749 struct cbq_class *cl = (struct cbq_class *)*arg;
1750 struct nlattr *opt = tca[TCA_OPTIONS];
1751 struct nlattr *tb[TCA_CBQ_MAX + 1];
1752 struct cbq_class *parent;
1753 struct qdisc_rate_table *rtab = NULL;
1754
1755 if (opt == NULL)
1756 return -EINVAL;
1757
1758 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1759 if (err < 0)
1760 return err;
1761
1762 if (cl) {
1763 /* Check parent */
1764 if (parentid) {
1765 if (cl->tparent &&
1766 cl->tparent->common.classid != parentid)
1767 return -EINVAL;
1768 if (!cl->tparent && parentid != TC_H_ROOT)
1769 return -EINVAL;
1770 }
1771
1772 if (tb[TCA_CBQ_RATE]) {
1773 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1774 tb[TCA_CBQ_RTAB]);
1775 if (rtab == NULL)
1776 return -EINVAL;
1777 }
1778
1779 if (tca[TCA_RATE]) {
1780 err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1781 qdisc_root_sleeping_lock(sch),
1782 tca[TCA_RATE]);
1783 if (err) {
1784 if (rtab)
1785 qdisc_put_rtab(rtab);
1786 return err;
1787 }
1788 }
1789
1790 /* Change class parameters */
1791 sch_tree_lock(sch);
1792
1793 if (cl->next_alive != NULL)
1794 cbq_deactivate_class(cl);
1795
1796 if (rtab) {
1797 qdisc_put_rtab(cl->R_tab);
1798 cl->R_tab = rtab;
1799 }
1800
1801 if (tb[TCA_CBQ_LSSOPT])
1802 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1803
1804 if (tb[TCA_CBQ_WRROPT]) {
1805 cbq_rmprio(q, cl);
1806 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1807 }
1808
1809 if (tb[TCA_CBQ_OVL_STRATEGY])
1810 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1811
1812#ifdef CONFIG_NET_CLS_ACT
1813 if (tb[TCA_CBQ_POLICE])
1814 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1815#endif
1816
1817 if (tb[TCA_CBQ_FOPT])
1818 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1819
1820 if (cl->q->q.qlen)
1821 cbq_activate_class(cl);
1822
1823 sch_tree_unlock(sch);
1824
1825 return 0;
1826 }
1827
1828 if (parentid == TC_H_ROOT)
1829 return -EINVAL;
1830
1831 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1832 tb[TCA_CBQ_LSSOPT] == NULL)
1833 return -EINVAL;
1834
1835 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1836 if (rtab == NULL)
1837 return -EINVAL;
1838
1839 if (classid) {
1840 err = -EINVAL;
1841 if (TC_H_MAJ(classid ^ sch->handle) ||
1842 cbq_class_lookup(q, classid))
1843 goto failure;
1844 } else {
1845 int i;
1846 classid = TC_H_MAKE(sch->handle, 0x8000);
1847
1848 for (i = 0; i < 0x8000; i++) {
1849 if (++q->hgenerator >= 0x8000)
1850 q->hgenerator = 1;
1851 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1852 break;
1853 }
1854 err = -ENOSR;
1855 if (i >= 0x8000)
1856 goto failure;
1857 classid = classid|q->hgenerator;
1858 }
1859
1860 parent = &q->link;
1861 if (parentid) {
1862 parent = cbq_class_lookup(q, parentid);
1863 err = -EINVAL;
1864 if (parent == NULL)
1865 goto failure;
1866 }
1867
1868 err = -ENOBUFS;
1869 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1870 if (cl == NULL)
1871 goto failure;
1872
1873 if (tca[TCA_RATE]) {
1874 err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1875 qdisc_root_sleeping_lock(sch),
1876 tca[TCA_RATE]);
1877 if (err) {
1878 kfree(cl);
1879 goto failure;
1880 }
1881 }
1882
1883 cl->R_tab = rtab;
1884 rtab = NULL;
1885 cl->refcnt = 1;
1886 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1887 if (!cl->q)
1888 cl->q = &noop_qdisc;
1889 cl->common.classid = classid;
1890 cl->tparent = parent;
1891 cl->qdisc = sch;
1892 cl->allot = parent->allot;
1893 cl->quantum = cl->allot;
1894 cl->weight = cl->R_tab->rate.rate;
1895
1896 sch_tree_lock(sch);
1897 cbq_link_class(cl);
1898 cl->borrow = cl->tparent;
1899 if (cl->tparent != &q->link)
1900 cl->share = cl->tparent;
1901 cbq_adjust_levels(parent);
1902 cl->minidle = -0x7FFFFFFF;
1903 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1904 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1905 if (cl->ewma_log == 0)
1906 cl->ewma_log = q->link.ewma_log;
1907 if (cl->maxidle == 0)
1908 cl->maxidle = q->link.maxidle;
1909 if (cl->avpkt == 0)
1910 cl->avpkt = q->link.avpkt;
1911 cl->overlimit = cbq_ovl_classic;
1912 if (tb[TCA_CBQ_OVL_STRATEGY])
1913 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1914#ifdef CONFIG_NET_CLS_ACT
1915 if (tb[TCA_CBQ_POLICE])
1916 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1917#endif
1918 if (tb[TCA_CBQ_FOPT])
1919 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1920 sch_tree_unlock(sch);
1921
1922 qdisc_class_hash_grow(sch, &q->clhash);
1923
1924 *arg = (unsigned long)cl;
1925 return 0;
1926
1927failure:
1928 qdisc_put_rtab(rtab);
1929 return err;
1930}
1931
1932static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1933{
1934 struct cbq_sched_data *q = qdisc_priv(sch);
1935 struct cbq_class *cl = (struct cbq_class *)arg;
1936 unsigned int qlen;
1937
1938 if (cl->filters || cl->children || cl == &q->link)
1939 return -EBUSY;
1940
1941 sch_tree_lock(sch);
1942
1943 qlen = cl->q->q.qlen;
1944 qdisc_reset(cl->q);
1945 qdisc_tree_decrease_qlen(cl->q, qlen);
1946
1947 if (cl->next_alive)
1948 cbq_deactivate_class(cl);
1949
1950 if (q->tx_borrowed == cl)
1951 q->tx_borrowed = q->tx_class;
1952 if (q->tx_class == cl) {
1953 q->tx_class = NULL;
1954 q->tx_borrowed = NULL;
1955 }
1956#ifdef CONFIG_NET_CLS_ACT
1957 if (q->rx_class == cl)
1958 q->rx_class = NULL;
1959#endif
1960
1961 cbq_unlink_class(cl);
1962 cbq_adjust_levels(cl->tparent);
1963 cl->defmap = 0;
1964 cbq_sync_defmap(cl);
1965
1966 cbq_rmprio(q, cl);
1967 sch_tree_unlock(sch);
1968
1969 BUG_ON(--cl->refcnt == 0);
1970 /*
1971 * This shouldn't happen: we "hold" one cops->get() when called
1972 * from tc_ctl_tclass; the destroy method is done from cops->put().
1973 */
1974
1975 return 0;
1976}
1977
1978static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1979{
1980 struct cbq_sched_data *q = qdisc_priv(sch);
1981 struct cbq_class *cl = (struct cbq_class *)arg;
1982
1983 if (cl == NULL)
1984 cl = &q->link;
1985
1986 return &cl->filter_list;
1987}
1988
1989static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1990 u32 classid)
1991{
1992 struct cbq_sched_data *q = qdisc_priv(sch);
1993 struct cbq_class *p = (struct cbq_class *)parent;
1994 struct cbq_class *cl = cbq_class_lookup(q, classid);
1995
1996 if (cl) {
1997 if (p && p->level <= cl->level)
1998 return 0;
1999 cl->filters++;
2000 return (unsigned long)cl;
2001 }
2002 return 0;
2003}
2004
2005static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2006{
2007 struct cbq_class *cl = (struct cbq_class *)arg;
2008
2009 cl->filters--;
2010}
2011
2012static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2013{
2014 struct cbq_sched_data *q = qdisc_priv(sch);
2015 struct cbq_class *cl;
2016 struct hlist_node *n;
2017 unsigned int h;
2018
2019 if (arg->stop)
2020 return;
2021
2022 for (h = 0; h < q->clhash.hashsize; h++) {
2023 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2024 if (arg->count < arg->skip) {
2025 arg->count++;
2026 continue;
2027 }
2028 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2029 arg->stop = 1;
2030 return;
2031 }
2032 arg->count++;
2033 }
2034 }
2035}
2036
2037static const struct Qdisc_class_ops cbq_class_ops = {
2038 .graft = cbq_graft,
2039 .leaf = cbq_leaf,
2040 .qlen_notify = cbq_qlen_notify,
2041 .get = cbq_get,
2042 .put = cbq_put,
2043 .change = cbq_change_class,
2044 .delete = cbq_delete,
2045 .walk = cbq_walk,
2046 .tcf_chain = cbq_find_tcf,
2047 .bind_tcf = cbq_bind_filter,
2048 .unbind_tcf = cbq_unbind_filter,
2049 .dump = cbq_dump_class,
2050 .dump_stats = cbq_dump_class_stats,
2051};
2052
2053static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2054 .next = NULL,
2055 .cl_ops = &cbq_class_ops,
2056 .id = "cbq",
2057 .priv_size = sizeof(struct cbq_sched_data),
2058 .enqueue = cbq_enqueue,
2059 .dequeue = cbq_dequeue,
2060 .peek = qdisc_peek_dequeued,
2061 .drop = cbq_drop,
2062 .init = cbq_init,
2063 .reset = cbq_reset,
2064 .destroy = cbq_destroy,
2065 .change = NULL,
2066 .dump = cbq_dump,
2067 .dump_stats = cbq_dump_stats,
2068 .owner = THIS_MODULE,
2069};
2070
2071static int __init cbq_module_init(void)
2072{
2073 return register_qdisc(&cbq_qdisc_ops);
2074}
2075static void __exit cbq_module_exit(void)
2076{
2077 unregister_qdisc(&cbq_qdisc_ops);
2078}
2079module_init(cbq_module_init)
2080module_exit(cbq_module_exit)
2081MODULE_LICENSE("GPL");
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 *
5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6 */
7
8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/types.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/errno.h>
14#include <linux/skbuff.h>
15#include <net/netlink.h>
16#include <net/pkt_sched.h>
17#include <net/pkt_cls.h>
18
19
20/* Class-Based Queueing (CBQ) algorithm.
21 =======================================
22
23 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
24 Management Models for Packet Networks",
25 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
26
27 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
28
29 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
30 Parameters", 1996
31
32 [4] Sally Floyd and Michael Speer, "Experimental Results
33 for Class-Based Queueing", 1998, not published.
34
35 -----------------------------------------------------------------------
36
37 Algorithm skeleton was taken from NS simulator cbq.cc.
38 If someone wants to check this code against the LBL version,
39 he should take into account that ONLY the skeleton was borrowed,
40 the implementation is different. Particularly:
41
42 --- The WRR algorithm is different. Our version looks more
43 reasonable (I hope) and works when quanta are allowed to be
44 less than MTU, which is always the case when real time classes
45 have small rates. Note, that the statement of [3] is
46 incomplete, delay may actually be estimated even if class
47 per-round allotment is less than MTU. Namely, if per-round
48 allotment is W*r_i, and r_1+...+r_k = r < 1
49
50 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
51
52 In the worst case we have IntServ estimate with D = W*r+k*MTU
53 and C = MTU*r. The proof (if correct at all) is trivial.
54
55
56 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
57 interpret some places, which look like wrong translations
58 from NS. Anyone is advised to find these differences
59 and explain to me, why I am wrong 8).
60
61 --- Linux has no EOI event, so that we cannot estimate true class
62 idle time. Workaround is to consider the next dequeue event
63 as sign that previous packet is finished. This is wrong because of
64 internal device queueing, but on a permanently loaded link it is true.
65 Moreover, combined with clock integrator, this scheme looks
66 very close to an ideal solution. */
67
68struct cbq_sched_data;
69
70
71struct cbq_class {
72 struct Qdisc_class_common common;
73 struct cbq_class *next_alive; /* next class with backlog in this priority band */
74
75/* Parameters */
76 unsigned char priority; /* class priority */
77 unsigned char priority2; /* priority to be used after overlimit */
78 unsigned char ewma_log; /* time constant for idle time calculation */
79
80 u32 defmap;
81
82 /* Link-sharing scheduler parameters */
83 long maxidle; /* Class parameters: see below. */
84 long offtime;
85 long minidle;
86 u32 avpkt;
87 struct qdisc_rate_table *R_tab;
88
89 /* General scheduler (WRR) parameters */
90 long allot;
91 long quantum; /* Allotment per WRR round */
92 long weight; /* Relative allotment: see below */
93
94 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
95 struct cbq_class *split; /* Ptr to split node */
96 struct cbq_class *share; /* Ptr to LS parent in the class tree */
97 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
98 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
99 parent otherwise */
100 struct cbq_class *sibling; /* Sibling chain */
101 struct cbq_class *children; /* Pointer to children chain */
102
103 struct Qdisc *q; /* Elementary queueing discipline */
104
105
106/* Variables */
107 unsigned char cpriority; /* Effective priority */
108 unsigned char delayed;
109 unsigned char level; /* level of the class in hierarchy:
110 0 for leaf classes, and maximal
111 level of children + 1 for nodes.
112 */
113
114 psched_time_t last; /* Last end of service */
115 psched_time_t undertime;
116 long avgidle;
117 long deficit; /* Saved deficit for WRR */
118 psched_time_t penalized;
119 struct gnet_stats_basic_sync bstats;
120 struct gnet_stats_queue qstats;
121 struct net_rate_estimator __rcu *rate_est;
122 struct tc_cbq_xstats xstats;
123
124 struct tcf_proto __rcu *filter_list;
125 struct tcf_block *block;
126
127 int filters;
128
129 struct cbq_class *defaults[TC_PRIO_MAX + 1];
130};
131
132struct cbq_sched_data {
133 struct Qdisc_class_hash clhash; /* Hash table of all classes */
134 int nclasses[TC_CBQ_MAXPRIO + 1];
135 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
136
137 struct cbq_class link;
138
139 unsigned int activemask;
140 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
141 with backlog */
142
143#ifdef CONFIG_NET_CLS_ACT
144 struct cbq_class *rx_class;
145#endif
146 struct cbq_class *tx_class;
147 struct cbq_class *tx_borrowed;
148 int tx_len;
149 psched_time_t now; /* Cached timestamp */
150 unsigned int pmask;
151
152 struct qdisc_watchdog watchdog; /* Watchdog timer,
153 started when CBQ has
154 backlog, but cannot
155 transmit just now */
156 psched_tdiff_t wd_expires;
157 int toplevel;
158 u32 hgenerator;
159};
160
161
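/*
 * L2T ("length to time") converts a packet length into its transmission
 * time at this class's configured rate, using the rate table supplied
 * from user space (R_tab). Roughly speaking, and only as an illustration,
 * a 1500 byte packet on a 1 Mbit/s class maps to about 12 ms worth of
 * psched ticks.
 */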
162#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
163
164static inline struct cbq_class *
165cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
166{
167 struct Qdisc_class_common *clc;
168
169 clc = qdisc_class_find(&q->clhash, classid);
170 if (clc == NULL)
171 return NULL;
172 return container_of(clc, struct cbq_class, common);
173}
174
175#ifdef CONFIG_NET_CLS_ACT
176
177static struct cbq_class *
178cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
179{
180 struct cbq_class *cl;
181
182 for (cl = this->tparent; cl; cl = cl->tparent) {
183 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
184
185 if (new != NULL && new != this)
186 return new;
187 }
188 return NULL;
189}
190
191#endif
192
/* Classify a packet. The procedure is fairly involved, but it lets us
 * combine link sharing and priority scheduling transparently.
 *
 * Namely, you can put link-sharing rules (e.g. route based) at the root
 * of CBQ, so that they resolve to split nodes. Packets are then
 * classified by logical priority, or a more specific classifier may be
 * attached to the split node.
 */
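
/*
 * For illustration only (not from this file): with the user-space tc
 * tool, a defmap-based setup looks roughly like the sketch below; the
 * exact option syntax depends on the tc version in use.
 *
 *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *      rate 6Mbit allot 1514 prio 5 avpkt 1000 split 1: defmap c0/c0
 *
 * Here 1: acts as the split node, and the defmap bits route packets whose
 * logical priority matches to class 1:1 via cl->defaults[].
 */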
202
203static struct cbq_class *
204cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
205{
206 struct cbq_sched_data *q = qdisc_priv(sch);
207 struct cbq_class *head = &q->link;
208 struct cbq_class **defmap;
209 struct cbq_class *cl = NULL;
210 u32 prio = skb->priority;
211 struct tcf_proto *fl;
212 struct tcf_result res;
213
214 /*
215 * Step 1. If skb->priority points to one of our classes, use it.
216 */
217 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
218 (cl = cbq_class_lookup(q, prio)) != NULL)
219 return cl;
220
221 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
222 for (;;) {
223 int result = 0;
224 defmap = head->defaults;
225
226 fl = rcu_dereference_bh(head->filter_list);
227 /*
228 * Step 2+n. Apply classifier.
229 */
230 result = tcf_classify(skb, NULL, fl, &res, true);
231 if (!fl || result < 0)
232 goto fallback;
233 if (result == TC_ACT_SHOT)
234 return NULL;
235
236 cl = (void *)res.class;
237 if (!cl) {
238 if (TC_H_MAJ(res.classid))
239 cl = cbq_class_lookup(q, res.classid);
240 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
241 cl = defmap[TC_PRIO_BESTEFFORT];
242
243 if (cl == NULL)
244 goto fallback;
245 }
246 if (cl->level >= head->level)
247 goto fallback;
248#ifdef CONFIG_NET_CLS_ACT
249 switch (result) {
250 case TC_ACT_QUEUED:
251 case TC_ACT_STOLEN:
252 case TC_ACT_TRAP:
253 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
254 fallthrough;
255 case TC_ACT_RECLASSIFY:
256 return cbq_reclassify(skb, cl);
257 }
258#endif
259 if (cl->level == 0)
260 return cl;
261
		/*
		 * Step 3+n. If the classifier selected a link-sharing class,
		 * apply that class's own (agency-specific) classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
267 head = cl;
268 }
269
270fallback:
271 cl = head;
272
273 /*
274 * Step 4. No success...
275 */
276 if (TC_H_MAJ(prio) == 0 &&
277 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
278 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
279 return head;
280
281 return cl;
282}
283
/*
 * A packet has just been enqueued on a previously empty class.
 * cbq_activate_class adds the class to the tail of the active class
 * list of its priority band.
 */
289
290static inline void cbq_activate_class(struct cbq_class *cl)
291{
292 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
293 int prio = cl->cpriority;
294 struct cbq_class *cl_tail;
295
296 cl_tail = q->active[prio];
297 q->active[prio] = cl;
298
299 if (cl_tail != NULL) {
300 cl->next_alive = cl_tail->next_alive;
301 cl_tail->next_alive = cl;
302 } else {
303 cl->next_alive = cl;
304 q->activemask |= (1<<prio);
305 }
306}
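
/*
 * Invariant of the active list (derived from the code above): for each
 * priority band, q->active[prio] points at the TAIL of a circular,
 * singly linked list threaded through ->next_alive, so the head of the
 * band is q->active[prio]->next_alive and appending at the tail is O(1).
 */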
307
/*
 * Unlink a class from the active chain.
 * Note that the same unlinking is also done directly in cbq_dequeue*
 * during the round-robin procedure.
 */
313
314static void cbq_deactivate_class(struct cbq_class *this)
315{
316 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
317 int prio = this->cpriority;
318 struct cbq_class *cl;
319 struct cbq_class *cl_prev = q->active[prio];
320
321 do {
322 cl = cl_prev->next_alive;
323 if (cl == this) {
324 cl_prev->next_alive = cl->next_alive;
325 cl->next_alive = NULL;
326
327 if (cl == q->active[prio]) {
328 q->active[prio] = cl_prev;
329 if (cl == q->active[prio]) {
330 q->active[prio] = NULL;
331 q->activemask &= ~(1<<prio);
332 return;
333 }
334 }
335 return;
336 }
337 } while ((cl_prev = cl) != q->active[prio]);
338}
339
340static void
341cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
342{
343 int toplevel = q->toplevel;
344
345 if (toplevel > cl->level) {
346 psched_time_t now = psched_get_time();
347
348 do {
349 if (cl->undertime < now) {
350 q->toplevel = cl->level;
351 return;
352 }
353 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
354 }
355}
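
/*
 * q->toplevel implements the top-level link-sharing guideline: classes
 * may only borrow from ancestors at or below this level (see
 * cbq_under_limit()). Enqueueing a packet lowers it to the level of the
 * first class in the receiving class's borrow chain that is not
 * currently regulated (i.e. whose undertime lies in the past).
 */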
356
357static int
358cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
359 struct sk_buff **to_free)
360{
361 struct cbq_sched_data *q = qdisc_priv(sch);
362 int ret;
363 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
364
365#ifdef CONFIG_NET_CLS_ACT
366 q->rx_class = cl;
367#endif
368 if (cl == NULL) {
369 if (ret & __NET_XMIT_BYPASS)
370 qdisc_qstats_drop(sch);
371 __qdisc_drop(skb, to_free);
372 return ret;
373 }
374
375 ret = qdisc_enqueue(skb, cl->q, to_free);
376 if (ret == NET_XMIT_SUCCESS) {
377 sch->q.qlen++;
378 cbq_mark_toplevel(q, cl);
379 if (!cl->next_alive)
380 cbq_activate_class(cl);
381 return ret;
382 }
383
384 if (net_xmit_drop_count(ret)) {
385 qdisc_qstats_drop(sch);
386 cbq_mark_toplevel(q, cl);
387 cl->qstats.drops++;
388 }
389 return ret;
390}
391
392/* Overlimit action: penalize leaf class by adding offtime */
393static void cbq_overlimit(struct cbq_class *cl)
394{
395 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
396 psched_tdiff_t delay = cl->undertime - q->now;
397
398 if (!cl->delayed) {
399 delay += cl->offtime;
400
		/*
		 * The class goes to sleep, so it will have no chance to
		 * work off its avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a bug here: apparently they forgot to
		 * shift this by cl->ewma_log.
		 */
408 if (cl->avgidle < 0)
409 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
410 if (cl->avgidle < cl->minidle)
411 cl->avgidle = cl->minidle;
412 if (delay <= 0)
413 delay = 1;
414 cl->undertime = q->now + delay;
415
416 cl->xstats.overactions++;
417 cl->delayed = 1;
418 }
419 if (q->wd_expires == 0 || q->wd_expires > delay)
420 q->wd_expires = delay;
421
	/* Dirty work! We must schedule wakeups based on the real
	 * available rate, rather than the leaf rate, which may be
	 * tiny (even zero).
	 */
426 if (q->toplevel == TC_CBQ_MAXLEVEL) {
427 struct cbq_class *b;
428 psched_tdiff_t base_delay = q->wd_expires;
429
430 for (b = cl->borrow; b; b = b->borrow) {
431 delay = b->undertime - q->now;
432 if (delay < base_delay) {
433 if (delay <= 0)
434 delay = 1;
435 base_delay = delay;
436 }
437 }
438
439 q->wd_expires = base_delay;
440 }
441}
442
/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but it looks more reasonable.
 * It may well be wrong; the question needs further investigation.
 */
451
452static inline void
453cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
454 struct cbq_class *borrowed)
455{
456 if (cl && q->toplevel >= borrowed->level) {
457 if (cl->q->q.qlen > 1) {
458 do {
459 if (borrowed->undertime == PSCHED_PASTPERFECT) {
460 q->toplevel = borrowed->level;
461 return;
462 }
463 } while ((borrowed = borrowed->borrow) != NULL);
464 }
465#if 0
466 /* It is not necessary now. Uncommenting it
467 will save CPU cycles, but decrease fairness.
468 */
469 q->toplevel = TC_CBQ_MAXLEVEL;
470#endif
471 }
472}
473
474static void
475cbq_update(struct cbq_sched_data *q)
476{
477 struct cbq_class *this = q->tx_class;
478 struct cbq_class *cl = this;
479 int len = q->tx_len;
480 psched_time_t now;
481
482 q->tx_class = NULL;
	/* Time integrator. We calculate the end-of-service (EOS) time
	 * by adding the expected packet transmission time.
	 */
486 now = q->now + L2T(&q->link, len);
487
488 for ( ; cl; cl = cl->share) {
489 long avgidle = cl->avgidle;
490 long idle;
491
492 _bstats_update(&cl->bstats, len, 1);
493
494 /*
495 * (now - last) is total time between packet right edges.
496 * (last_pktlen/rate) is "virtual" busy time, so that
497 *
498 * idle = (now - last) - last_pktlen/rate
499 */
500
501 idle = now - cl->last;
502 if ((unsigned long)idle > 128*1024*1024) {
503 avgidle = cl->maxidle;
504 } else {
505 idle -= L2T(cl, len);
506
507 /* true_avgidle := (1-W)*true_avgidle + W*idle,
508 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
509 * cl->avgidle == true_avgidle/W,
510 * hence:
511 */
512 avgidle += idle - (avgidle>>cl->ewma_log);
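			/*
			 * Illustrative numbers (not from the original text):
			 * with ewma_log = 5, i.e. W = 1/32, an idle sample of
			 * +3200 moves a scaled avgidle of 0 to 3200, which
			 * corresponds to a true average idle of 3200/32 = 100.
			 */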
513 }
514
515 if (avgidle <= 0) {
516 /* Overlimit or at-limit */
517
518 if (avgidle < cl->minidle)
519 avgidle = cl->minidle;
520
521 cl->avgidle = avgidle;
522
			/* Calculate the expected time when this class
			 * will be allowed to send.
			 * That occurs when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
			 */
531 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
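			/* i.e. idle = (1 - W) * (-avgidle), matching the last
			 * line of the derivation above, because cl->avgidle is
			 * the true average idle scaled by 1/W.
			 */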
532
			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we also add to undertime the virtual clock time
			 * needed to complete the transmitted packet.
			 * (len/phys_bandwidth has already elapsed by the
			 * time cbq_update runs.)
			 */
541
542 idle -= L2T(&q->link, len);
543 idle += L2T(cl, len);
544
545 cl->undertime = now + idle;
546 } else {
547 /* Underlimit */
548
549 cl->undertime = PSCHED_PASTPERFECT;
550 if (avgidle > cl->maxidle)
551 cl->avgidle = cl->maxidle;
552 else
553 cl->avgidle = avgidle;
554 }
555 if ((s64)(now - cl->last) > 0)
556 cl->last = now;
557 }
558
559 cbq_update_toplevel(q, this, q->tx_borrowed);
560}
561
562static inline struct cbq_class *
563cbq_under_limit(struct cbq_class *cl)
564{
565 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
566 struct cbq_class *this_cl = cl;
567
568 if (cl->tparent == NULL)
569 return cl;
570
571 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
572 cl->delayed = 0;
573 return cl;
574 }
575
576 do {
		/* This is a very suspicious place. Currently the overlimit
		 * action is generated for non-bounded classes only if the
		 * link is completely congested.
		 * Though this agrees with the ancestor-only paradigm, it
		 * looks very stupid. In particular, it means that this
		 * chunk of code will either never be called or will
		 * strongly amplify burstiness. Dangerous, silly, but no
		 * better solution exists.
		 */
587 cl = cl->borrow;
588 if (!cl) {
589 this_cl->qstats.overlimits++;
590 cbq_overlimit(this_cl);
591 return NULL;
592 }
593 if (cl->level > q->toplevel)
594 return NULL;
595 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
596
597 cl->delayed = 0;
598 return cl;
599}
600
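/*
 * Weighted round-robin over one priority band (as implemented below):
 * each backlogged class is visited in turn; a class may send while its
 * deficit is positive, and once the deficit is exhausted it is topped up
 * by cl->quantum and the next class gets a chance. Classes that are over
 * limit and cannot borrow are skipped for this round.
 */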
601static inline struct sk_buff *
602cbq_dequeue_prio(struct Qdisc *sch, int prio)
603{
604 struct cbq_sched_data *q = qdisc_priv(sch);
605 struct cbq_class *cl_tail, *cl_prev, *cl;
606 struct sk_buff *skb;
607 int deficit;
608
609 cl_tail = cl_prev = q->active[prio];
610 cl = cl_prev->next_alive;
611
612 do {
613 deficit = 0;
614
615 /* Start round */
616 do {
617 struct cbq_class *borrow = cl;
618
619 if (cl->q->q.qlen &&
620 (borrow = cbq_under_limit(cl)) == NULL)
621 goto skip_class;
622
623 if (cl->deficit <= 0) {
624 /* Class exhausted its allotment per
625 * this round. Switch to the next one.
626 */
627 deficit = 1;
628 cl->deficit += cl->quantum;
629 goto next_class;
630 }
631
632 skb = cl->q->dequeue(cl->q);
633
			/* The class did not give us any skb :-(
			 * This can occur even if cl->q->q.qlen != 0,
			 * e.g. if cl->q == "tbf"
			 */
638 if (skb == NULL)
639 goto skip_class;
640
641 cl->deficit -= qdisc_pkt_len(skb);
642 q->tx_class = cl;
643 q->tx_borrowed = borrow;
644 if (borrow != cl) {
645#ifndef CBQ_XSTATS_BORROWS_BYTES
646 borrow->xstats.borrows++;
647 cl->xstats.borrows++;
648#else
649 borrow->xstats.borrows += qdisc_pkt_len(skb);
650 cl->xstats.borrows += qdisc_pkt_len(skb);
651#endif
652 }
653 q->tx_len = qdisc_pkt_len(skb);
654
655 if (cl->deficit <= 0) {
656 q->active[prio] = cl;
657 cl = cl->next_alive;
658 cl->deficit += cl->quantum;
659 }
660 return skb;
661
662skip_class:
663 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
664 /* Class is empty or penalized.
665 * Unlink it from active chain.
666 */
667 cl_prev->next_alive = cl->next_alive;
668 cl->next_alive = NULL;
669
670 /* Did cl_tail point to it? */
671 if (cl == cl_tail) {
672 /* Repair it! */
673 cl_tail = cl_prev;
674
675 /* Was it the last class in this band? */
676 if (cl == cl_tail) {
677 /* Kill the band! */
678 q->active[prio] = NULL;
679 q->activemask &= ~(1<<prio);
680 if (cl->q->q.qlen)
681 cbq_activate_class(cl);
682 return NULL;
683 }
684
685 q->active[prio] = cl_tail;
686 }
687 if (cl->q->q.qlen)
688 cbq_activate_class(cl);
689
690 cl = cl_prev;
691 }
692
693next_class:
694 cl_prev = cl;
695 cl = cl->next_alive;
696 } while (cl_prev != cl_tail);
697 } while (deficit);
698
699 q->active[prio] = cl_prev;
700
701 return NULL;
702}
703
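/*
 * One dequeue attempt across all priority bands: scan q->activemask and
 * try bands with lower numbers first, returning the first packet found.
 * ffz(~activemask) yields the index of the lowest set bit in activemask.
 */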
704static inline struct sk_buff *
705cbq_dequeue_1(struct Qdisc *sch)
706{
707 struct cbq_sched_data *q = qdisc_priv(sch);
708 struct sk_buff *skb;
709 unsigned int activemask;
710
711 activemask = q->activemask & 0xFF;
712 while (activemask) {
713 int prio = ffz(~activemask);
714 activemask &= ~(1<<prio);
715 skb = cbq_dequeue_prio(sch, prio);
716 if (skb)
717 return skb;
718 }
719 return NULL;
720}
721
722static struct sk_buff *
723cbq_dequeue(struct Qdisc *sch)
724{
725 struct sk_buff *skb;
726 struct cbq_sched_data *q = qdisc_priv(sch);
727 psched_time_t now;
728
729 now = psched_get_time();
730
731 if (q->tx_class)
732 cbq_update(q);
733
734 q->now = now;
735
736 for (;;) {
737 q->wd_expires = 0;
738
739 skb = cbq_dequeue_1(sch);
740 if (skb) {
741 qdisc_bstats_update(sch, skb);
742 sch->q.qlen--;
743 return skb;
744 }
745
		/* All the classes are overlimit.
		 *
		 * This is possible if:
		 *
		 * 1. The scheduler is empty.
		 * 2. The toplevel cutoff inhibited borrowing.
		 * 3. The root class is overlimit.
		 *
		 * Reset conditions 2 and 3 and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy here: peeking an
		 * arbitrary class is appropriate for ancestor-only
		 * sharing, but not for the toplevel algorithm.
		 *
		 * Our version is better but slower, because it requires
		 * two passes; that is unavoidable with top-level sharing.
		 */
763
764 if (q->toplevel == TC_CBQ_MAXLEVEL &&
765 q->link.undertime == PSCHED_PASTPERFECT)
766 break;
767
768 q->toplevel = TC_CBQ_MAXLEVEL;
769 q->link.undertime = PSCHED_PASTPERFECT;
770 }
771
	/* No packets in the scheduler, or nobody wants to give them to us :-(
	 * Sigh... start the watchdog timer in the latter case.
	 */
775
776 if (sch->q.qlen) {
777 qdisc_qstats_overlimit(sch);
778 if (q->wd_expires)
779 qdisc_watchdog_schedule(&q->watchdog,
780 now + q->wd_expires);
781 }
782 return NULL;
783}
784
785/* CBQ class maintenance routines */
786
787static void cbq_adjust_levels(struct cbq_class *this)
788{
789 if (this == NULL)
790 return;
791
792 do {
793 int level = 0;
794 struct cbq_class *cl;
795
796 cl = this->children;
797 if (cl) {
798 do {
799 if (cl->level > level)
800 level = cl->level;
801 } while ((cl = cl->sibling) != this->children);
802 }
803 this->level = level + 1;
804 } while ((this = this->tparent) != NULL);
805}
806
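/*
 * cbq_normalize_quanta() recomputes the per-round quantum of every class
 * in a priority band as weight * allot * nclasses / sum_of_weights.
 * Illustrative numbers (not from the original text): two classes with
 * allot 1514 and weights 1 and 3 get quanta of 1514*1*2/4 = 757 and
 * 1514*3*2/4 = 2271 bytes per round, subject to the sanity clamp below.
 */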
807static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
808{
809 struct cbq_class *cl;
810 unsigned int h;
811
812 if (q->quanta[prio] == 0)
813 return;
814
815 for (h = 0; h < q->clhash.hashsize; h++) {
816 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			 * arithmetic overflow!
			 */
820 if (cl->priority == prio) {
821 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
822 q->quanta[prio];
823 }
824 if (cl->quantum <= 0 ||
825 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
826 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
827 cl->common.classid, cl->quantum);
828 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
829 }
830 }
831 }
832}
833
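/*
 * cbq_sync_defmap() rebuilds the split node's defaults[] table: for every
 * priority value whose bit is set in some class's defmap, the lowest-level
 * class under that split wins and becomes the default target for packets
 * classified to that priority.
 */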
834static void cbq_sync_defmap(struct cbq_class *cl)
835{
836 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
837 struct cbq_class *split = cl->split;
838 unsigned int h;
839 int i;
840
841 if (split == NULL)
842 return;
843
844 for (i = 0; i <= TC_PRIO_MAX; i++) {
845 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
846 split->defaults[i] = NULL;
847 }
848
849 for (i = 0; i <= TC_PRIO_MAX; i++) {
850 int level = split->level;
851
852 if (split->defaults[i])
853 continue;
854
855 for (h = 0; h < q->clhash.hashsize; h++) {
856 struct cbq_class *c;
857
858 hlist_for_each_entry(c, &q->clhash.hash[h],
859 common.hnode) {
860 if (c->split == split && c->level < level &&
861 c->defmap & (1<<i)) {
862 split->defaults[i] = c;
863 level = c->level;
864 }
865 }
866 }
867 }
868}
869
870static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
871{
872 struct cbq_class *split = NULL;
873
874 if (splitid == 0) {
875 split = cl->split;
876 if (!split)
877 return;
878 splitid = split->common.classid;
879 }
880
881 if (split == NULL || split->common.classid != splitid) {
882 for (split = cl->tparent; split; split = split->tparent)
883 if (split->common.classid == splitid)
884 break;
885 }
886
887 if (split == NULL)
888 return;
889
890 if (cl->split != split) {
891 cl->defmap = 0;
892 cbq_sync_defmap(cl);
893 cl->split = split;
894 cl->defmap = def & mask;
895 } else
896 cl->defmap = (cl->defmap & ~mask) | (def & mask);
897
898 cbq_sync_defmap(cl);
899}
900
901static void cbq_unlink_class(struct cbq_class *this)
902{
903 struct cbq_class *cl, **clp;
904 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
905
906 qdisc_class_hash_remove(&q->clhash, &this->common);
907
908 if (this->tparent) {
909 clp = &this->sibling;
910 cl = *clp;
911 do {
912 if (cl == this) {
913 *clp = cl->sibling;
914 break;
915 }
916 clp = &cl->sibling;
917 } while ((cl = *clp) != this->sibling);
918
919 if (this->tparent->children == this) {
920 this->tparent->children = this->sibling;
921 if (this->sibling == this)
922 this->tparent->children = NULL;
923 }
924 } else {
925 WARN_ON(this->sibling != this);
926 }
927}
928
929static void cbq_link_class(struct cbq_class *this)
930{
931 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
932 struct cbq_class *parent = this->tparent;
933
934 this->sibling = this;
935 qdisc_class_hash_insert(&q->clhash, &this->common);
936
937 if (parent == NULL)
938 return;
939
940 if (parent->children == NULL) {
941 parent->children = this;
942 } else {
943 this->sibling = parent->children->sibling;
944 parent->children->sibling = this;
945 }
946}
947
948static void
949cbq_reset(struct Qdisc *sch)
950{
951 struct cbq_sched_data *q = qdisc_priv(sch);
952 struct cbq_class *cl;
953 int prio;
954 unsigned int h;
955
956 q->activemask = 0;
957 q->pmask = 0;
958 q->tx_class = NULL;
959 q->tx_borrowed = NULL;
960 qdisc_watchdog_cancel(&q->watchdog);
961 q->toplevel = TC_CBQ_MAXLEVEL;
962 q->now = psched_get_time();
963
964 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
965 q->active[prio] = NULL;
966
967 for (h = 0; h < q->clhash.hashsize; h++) {
968 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
969 qdisc_reset(cl->q);
970
971 cl->next_alive = NULL;
972 cl->undertime = PSCHED_PASTPERFECT;
973 cl->avgidle = cl->maxidle;
974 cl->deficit = cl->quantum;
975 cl->cpriority = cl->priority;
976 }
977 }
978}
979
980
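/*
 * The link-sharing options map onto the class pointers as follows: an
 * ISOLATED class does not lend bandwidth to its siblings (share = NULL),
 * and a BOUNDED class does not borrow from its parent (borrow = NULL);
 * otherwise both pointers refer to the tree parent.
 */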
981static void cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
982{
983 if (lss->change & TCF_CBQ_LSS_FLAGS) {
984 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
985 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
986 }
987 if (lss->change & TCF_CBQ_LSS_EWMA)
988 cl->ewma_log = lss->ewma_log;
989 if (lss->change & TCF_CBQ_LSS_AVPKT)
990 cl->avpkt = lss->avpkt;
991 if (lss->change & TCF_CBQ_LSS_MINIDLE)
992 cl->minidle = -(long)lss->minidle;
993 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
994 cl->maxidle = lss->maxidle;
995 cl->avgidle = lss->maxidle;
996 }
997 if (lss->change & TCF_CBQ_LSS_OFFTIME)
998 cl->offtime = lss->offtime;
999}
1000
1001static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1002{
1003 q->nclasses[cl->priority]--;
1004 q->quanta[cl->priority] -= cl->weight;
1005 cbq_normalize_quanta(q, cl->priority);
1006}
1007
1008static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1009{
1010 q->nclasses[cl->priority]++;
1011 q->quanta[cl->priority] += cl->weight;
1012 cbq_normalize_quanta(q, cl->priority);
1013}
1014
1015static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1016{
1017 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1018
1019 if (wrr->allot)
1020 cl->allot = wrr->allot;
1021 if (wrr->weight)
1022 cl->weight = wrr->weight;
1023 if (wrr->priority) {
1024 cl->priority = wrr->priority - 1;
1025 cl->cpriority = cl->priority;
1026 if (cl->priority >= cl->priority2)
1027 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1028 }
1029
1030 cbq_addprio(q, cl);
1031 return 0;
1032}
1033
1034static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1035{
1036 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1037 return 0;
1038}
1039
1040static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1041 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1042 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1043 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1044 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1045 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1046 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1047 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1048};
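
/*
 * Note: TCA_CBQ_OVL_STRATEGY and TCA_CBQ_POLICE are still listed in the
 * policy above so they validate, but cbq_change_class() rejects both
 * with -EOPNOTSUPP.
 */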
1049
1050static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1051 struct nlattr *opt,
1052 struct netlink_ext_ack *extack)
1053{
1054 int err;
1055
1056 if (!opt) {
1057 NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1058 return -EINVAL;
1059 }
1060
1061 err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1062 cbq_policy, extack);
1063 if (err < 0)
1064 return err;
1065
1066 if (tb[TCA_CBQ_WRROPT]) {
1067 const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1068
1069 if (wrr->priority > TC_CBQ_MAXPRIO) {
1070 NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1071 err = -EINVAL;
1072 }
1073 }
1074 return err;
1075}
1076
1077static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1078 struct netlink_ext_ack *extack)
1079{
1080 struct cbq_sched_data *q = qdisc_priv(sch);
1081 struct nlattr *tb[TCA_CBQ_MAX + 1];
1082 struct tc_ratespec *r;
1083 int err;
1084
1085 qdisc_watchdog_init(&q->watchdog, sch);
1086
1087 err = cbq_opt_parse(tb, opt, extack);
1088 if (err < 0)
1089 return err;
1090
1091 if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1092 NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1093 return -EINVAL;
1094 }
1095
1096 r = nla_data(tb[TCA_CBQ_RATE]);
1097
1098 q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1099 if (!q->link.R_tab)
1100 return -EINVAL;
1101
1102 err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1103 if (err)
1104 goto put_rtab;
1105
1106 err = qdisc_class_hash_init(&q->clhash);
1107 if (err < 0)
1108 goto put_block;
1109
1110 q->link.sibling = &q->link;
1111 q->link.common.classid = sch->handle;
1112 q->link.qdisc = sch;
1113 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1114 sch->handle, NULL);
1115 if (!q->link.q)
1116 q->link.q = &noop_qdisc;
1117 else
1118 qdisc_hash_add(q->link.q, true);
1119
1120 q->link.priority = TC_CBQ_MAXPRIO - 1;
1121 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1122 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1123 q->link.allot = psched_mtu(qdisc_dev(sch));
1124 q->link.quantum = q->link.allot;
1125 q->link.weight = q->link.R_tab->rate.rate;
1126
1127 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1128 q->link.avpkt = q->link.allot/2;
1129 q->link.minidle = -0x7FFFFFFF;
1130
1131 q->toplevel = TC_CBQ_MAXLEVEL;
1132 q->now = psched_get_time();
1133
1134 cbq_link_class(&q->link);
1135
1136 if (tb[TCA_CBQ_LSSOPT])
1137 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1138
1139 cbq_addprio(q, &q->link);
1140 return 0;
1141
1142put_block:
1143 tcf_block_put(q->link.block);
1144
1145put_rtab:
1146 qdisc_put_rtab(q->link.R_tab);
1147 return err;
1148}
1149
1150static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1151{
1152 unsigned char *b = skb_tail_pointer(skb);
1153
1154 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1155 goto nla_put_failure;
1156 return skb->len;
1157
1158nla_put_failure:
1159 nlmsg_trim(skb, b);
1160 return -1;
1161}
1162
1163static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1164{
1165 unsigned char *b = skb_tail_pointer(skb);
1166 struct tc_cbq_lssopt opt;
1167
1168 opt.flags = 0;
1169 if (cl->borrow == NULL)
1170 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1171 if (cl->share == NULL)
1172 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1173 opt.ewma_log = cl->ewma_log;
1174 opt.level = cl->level;
1175 opt.avpkt = cl->avpkt;
1176 opt.maxidle = cl->maxidle;
1177 opt.minidle = (u32)(-cl->minidle);
1178 opt.offtime = cl->offtime;
1179 opt.change = ~0;
1180 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1181 goto nla_put_failure;
1182 return skb->len;
1183
1184nla_put_failure:
1185 nlmsg_trim(skb, b);
1186 return -1;
1187}
1188
1189static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1190{
1191 unsigned char *b = skb_tail_pointer(skb);
1192 struct tc_cbq_wrropt opt;
1193
1194 memset(&opt, 0, sizeof(opt));
1195 opt.flags = 0;
1196 opt.allot = cl->allot;
1197 opt.priority = cl->priority + 1;
1198 opt.cpriority = cl->cpriority + 1;
1199 opt.weight = cl->weight;
1200 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1201 goto nla_put_failure;
1202 return skb->len;
1203
1204nla_put_failure:
1205 nlmsg_trim(skb, b);
1206 return -1;
1207}
1208
1209static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1210{
1211 unsigned char *b = skb_tail_pointer(skb);
1212 struct tc_cbq_fopt opt;
1213
1214 if (cl->split || cl->defmap) {
1215 opt.split = cl->split ? cl->split->common.classid : 0;
1216 opt.defmap = cl->defmap;
1217 opt.defchange = ~0;
1218 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1219 goto nla_put_failure;
1220 }
1221 return skb->len;
1222
1223nla_put_failure:
1224 nlmsg_trim(skb, b);
1225 return -1;
1226}
1227
1228static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1229{
1230 if (cbq_dump_lss(skb, cl) < 0 ||
1231 cbq_dump_rate(skb, cl) < 0 ||
1232 cbq_dump_wrr(skb, cl) < 0 ||
1233 cbq_dump_fopt(skb, cl) < 0)
1234 return -1;
1235 return 0;
1236}
1237
1238static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1239{
1240 struct cbq_sched_data *q = qdisc_priv(sch);
1241 struct nlattr *nest;
1242
1243 nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1244 if (nest == NULL)
1245 goto nla_put_failure;
1246 if (cbq_dump_attr(skb, &q->link) < 0)
1247 goto nla_put_failure;
1248 return nla_nest_end(skb, nest);
1249
1250nla_put_failure:
1251 nla_nest_cancel(skb, nest);
1252 return -1;
1253}
1254
1255static int
1256cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1257{
1258 struct cbq_sched_data *q = qdisc_priv(sch);
1259
1260 q->link.xstats.avgidle = q->link.avgidle;
1261 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1262}
1263
1264static int
1265cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1266 struct sk_buff *skb, struct tcmsg *tcm)
1267{
1268 struct cbq_class *cl = (struct cbq_class *)arg;
1269 struct nlattr *nest;
1270
1271 if (cl->tparent)
1272 tcm->tcm_parent = cl->tparent->common.classid;
1273 else
1274 tcm->tcm_parent = TC_H_ROOT;
1275 tcm->tcm_handle = cl->common.classid;
1276 tcm->tcm_info = cl->q->handle;
1277
1278 nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1279 if (nest == NULL)
1280 goto nla_put_failure;
1281 if (cbq_dump_attr(skb, cl) < 0)
1282 goto nla_put_failure;
1283 return nla_nest_end(skb, nest);
1284
1285nla_put_failure:
1286 nla_nest_cancel(skb, nest);
1287 return -1;
1288}
1289
1290static int
1291cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1292 struct gnet_dump *d)
1293{
1294 struct cbq_sched_data *q = qdisc_priv(sch);
1295 struct cbq_class *cl = (struct cbq_class *)arg;
1296 __u32 qlen;
1297
1298 cl->xstats.avgidle = cl->avgidle;
1299 cl->xstats.undertime = 0;
1300 qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1301
1302 if (cl->undertime != PSCHED_PASTPERFECT)
1303 cl->xstats.undertime = cl->undertime - q->now;
1304
1305 if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1306 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1307 gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1308 return -1;
1309
1310 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1311}
1312
1313static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1314 struct Qdisc **old, struct netlink_ext_ack *extack)
1315{
1316 struct cbq_class *cl = (struct cbq_class *)arg;
1317
1318 if (new == NULL) {
1319 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1320 cl->common.classid, extack);
1321 if (new == NULL)
1322 return -ENOBUFS;
1323 }
1324
1325 *old = qdisc_replace(sch, new, &cl->q);
1326 return 0;
1327}
1328
1329static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1330{
1331 struct cbq_class *cl = (struct cbq_class *)arg;
1332
1333 return cl->q;
1334}
1335
1336static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1337{
1338 struct cbq_class *cl = (struct cbq_class *)arg;
1339
1340 cbq_deactivate_class(cl);
1341}
1342
1343static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1344{
1345 struct cbq_sched_data *q = qdisc_priv(sch);
1346
1347 return (unsigned long)cbq_class_lookup(q, classid);
1348}
1349
1350static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1351{
1352 struct cbq_sched_data *q = qdisc_priv(sch);
1353
1354 WARN_ON(cl->filters);
1355
1356 tcf_block_put(cl->block);
1357 qdisc_put(cl->q);
1358 qdisc_put_rtab(cl->R_tab);
1359 gen_kill_estimator(&cl->rate_est);
1360 if (cl != &q->link)
1361 kfree(cl);
1362}
1363
1364static void cbq_destroy(struct Qdisc *sch)
1365{
1366 struct cbq_sched_data *q = qdisc_priv(sch);
1367 struct hlist_node *next;
1368 struct cbq_class *cl;
1369 unsigned int h;
1370
1371#ifdef CONFIG_NET_CLS_ACT
1372 q->rx_class = NULL;
1373#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
1379 for (h = 0; h < q->clhash.hashsize; h++) {
1380 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1381 tcf_block_put(cl->block);
1382 cl->block = NULL;
1383 }
1384 }
1385 for (h = 0; h < q->clhash.hashsize; h++) {
1386 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1387 common.hnode)
1388 cbq_destroy_class(sch, cl);
1389 }
1390 qdisc_class_hash_destroy(&q->clhash);
1391}
1392
1393static int
1394cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1395 unsigned long *arg, struct netlink_ext_ack *extack)
1396{
1397 int err;
1398 struct cbq_sched_data *q = qdisc_priv(sch);
1399 struct cbq_class *cl = (struct cbq_class *)*arg;
1400 struct nlattr *opt = tca[TCA_OPTIONS];
1401 struct nlattr *tb[TCA_CBQ_MAX + 1];
1402 struct cbq_class *parent;
1403 struct qdisc_rate_table *rtab = NULL;
1404
1405 err = cbq_opt_parse(tb, opt, extack);
1406 if (err < 0)
1407 return err;
1408
1409 if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1410 NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1411 return -EOPNOTSUPP;
1412 }
1413
1414 if (cl) {
1415 /* Check parent */
1416 if (parentid) {
1417 if (cl->tparent &&
1418 cl->tparent->common.classid != parentid) {
1419 NL_SET_ERR_MSG(extack, "Invalid parent id");
1420 return -EINVAL;
1421 }
1422 if (!cl->tparent && parentid != TC_H_ROOT) {
1423 NL_SET_ERR_MSG(extack, "Parent must be root");
1424 return -EINVAL;
1425 }
1426 }
1427
1428 if (tb[TCA_CBQ_RATE]) {
1429 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1430 tb[TCA_CBQ_RTAB], extack);
1431 if (rtab == NULL)
1432 return -EINVAL;
1433 }
1434
1435 if (tca[TCA_RATE]) {
1436 err = gen_replace_estimator(&cl->bstats, NULL,
1437 &cl->rate_est,
1438 NULL,
1439 true,
1440 tca[TCA_RATE]);
1441 if (err) {
1442 NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1443 qdisc_put_rtab(rtab);
1444 return err;
1445 }
1446 }
1447
1448 /* Change class parameters */
1449 sch_tree_lock(sch);
1450
1451 if (cl->next_alive != NULL)
1452 cbq_deactivate_class(cl);
1453
1454 if (rtab) {
1455 qdisc_put_rtab(cl->R_tab);
1456 cl->R_tab = rtab;
1457 }
1458
1459 if (tb[TCA_CBQ_LSSOPT])
1460 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1461
1462 if (tb[TCA_CBQ_WRROPT]) {
1463 cbq_rmprio(q, cl);
1464 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1465 }
1466
1467 if (tb[TCA_CBQ_FOPT])
1468 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1469
1470 if (cl->q->q.qlen)
1471 cbq_activate_class(cl);
1472
1473 sch_tree_unlock(sch);
1474
1475 return 0;
1476 }
1477
1478 if (parentid == TC_H_ROOT)
1479 return -EINVAL;
1480
1481 if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
		NL_SET_ERR_MSG(extack, "The following attributes MUST all be specified: WRR, rate and link sharing");
1483 return -EINVAL;
1484 }
1485
1486 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1487 extack);
1488 if (rtab == NULL)
1489 return -EINVAL;
1490
1491 if (classid) {
1492 err = -EINVAL;
1493 if (TC_H_MAJ(classid ^ sch->handle) ||
1494 cbq_class_lookup(q, classid)) {
			NL_SET_ERR_MSG(extack, "Specified class handle is invalid or already in use");
1496 goto failure;
1497 }
1498 } else {
1499 int i;
1500 classid = TC_H_MAKE(sch->handle, 0x8000);
1501
1502 for (i = 0; i < 0x8000; i++) {
1503 if (++q->hgenerator >= 0x8000)
1504 q->hgenerator = 1;
1505 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1506 break;
1507 }
1508 err = -ENOSR;
1509 if (i >= 0x8000) {
1510 NL_SET_ERR_MSG(extack, "Unable to generate classid");
1511 goto failure;
1512 }
1513 classid = classid|q->hgenerator;
1514 }
1515
1516 parent = &q->link;
1517 if (parentid) {
1518 parent = cbq_class_lookup(q, parentid);
1519 err = -EINVAL;
1520 if (!parent) {
1521 NL_SET_ERR_MSG(extack, "Failed to find parentid");
1522 goto failure;
1523 }
1524 }
1525
1526 err = -ENOBUFS;
1527 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1528 if (cl == NULL)
1529 goto failure;
1530
1531 gnet_stats_basic_sync_init(&cl->bstats);
1532 err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1533 if (err) {
1534 kfree(cl);
1535 goto failure;
1536 }
1537
1538 if (tca[TCA_RATE]) {
1539 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1540 NULL, true, tca[TCA_RATE]);
1541 if (err) {
1542 NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1543 tcf_block_put(cl->block);
1544 kfree(cl);
1545 goto failure;
1546 }
1547 }
1548
1549 cl->R_tab = rtab;
1550 rtab = NULL;
1551 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1552 NULL);
1553 if (!cl->q)
1554 cl->q = &noop_qdisc;
1555 else
1556 qdisc_hash_add(cl->q, true);
1557
1558 cl->common.classid = classid;
1559 cl->tparent = parent;
1560 cl->qdisc = sch;
1561 cl->allot = parent->allot;
1562 cl->quantum = cl->allot;
1563 cl->weight = cl->R_tab->rate.rate;
1564
1565 sch_tree_lock(sch);
1566 cbq_link_class(cl);
1567 cl->borrow = cl->tparent;
1568 if (cl->tparent != &q->link)
1569 cl->share = cl->tparent;
1570 cbq_adjust_levels(parent);
1571 cl->minidle = -0x7FFFFFFF;
1572 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1573 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1574 if (cl->ewma_log == 0)
1575 cl->ewma_log = q->link.ewma_log;
1576 if (cl->maxidle == 0)
1577 cl->maxidle = q->link.maxidle;
1578 if (cl->avpkt == 0)
1579 cl->avpkt = q->link.avpkt;
1580 if (tb[TCA_CBQ_FOPT])
1581 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1582 sch_tree_unlock(sch);
1583
1584 qdisc_class_hash_grow(sch, &q->clhash);
1585
1586 *arg = (unsigned long)cl;
1587 return 0;
1588
1589failure:
1590 qdisc_put_rtab(rtab);
1591 return err;
1592}
1593
1594static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1595 struct netlink_ext_ack *extack)
1596{
1597 struct cbq_sched_data *q = qdisc_priv(sch);
1598 struct cbq_class *cl = (struct cbq_class *)arg;
1599
1600 if (cl->filters || cl->children || cl == &q->link)
1601 return -EBUSY;
1602
1603 sch_tree_lock(sch);
1604
1605 qdisc_purge_queue(cl->q);
1606
1607 if (cl->next_alive)
1608 cbq_deactivate_class(cl);
1609
1610 if (q->tx_borrowed == cl)
1611 q->tx_borrowed = q->tx_class;
1612 if (q->tx_class == cl) {
1613 q->tx_class = NULL;
1614 q->tx_borrowed = NULL;
1615 }
1616#ifdef CONFIG_NET_CLS_ACT
1617 if (q->rx_class == cl)
1618 q->rx_class = NULL;
1619#endif
1620
1621 cbq_unlink_class(cl);
1622 cbq_adjust_levels(cl->tparent);
1623 cl->defmap = 0;
1624 cbq_sync_defmap(cl);
1625
1626 cbq_rmprio(q, cl);
1627 sch_tree_unlock(sch);
1628
1629 cbq_destroy_class(sch, cl);
1630 return 0;
1631}
1632
1633static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1634 struct netlink_ext_ack *extack)
1635{
1636 struct cbq_sched_data *q = qdisc_priv(sch);
1637 struct cbq_class *cl = (struct cbq_class *)arg;
1638
1639 if (cl == NULL)
1640 cl = &q->link;
1641
1642 return cl->block;
1643}
1644
1645static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1646 u32 classid)
1647{
1648 struct cbq_sched_data *q = qdisc_priv(sch);
1649 struct cbq_class *p = (struct cbq_class *)parent;
1650 struct cbq_class *cl = cbq_class_lookup(q, classid);
1651
1652 if (cl) {
1653 if (p && p->level <= cl->level)
1654 return 0;
1655 cl->filters++;
1656 return (unsigned long)cl;
1657 }
1658 return 0;
1659}
1660
1661static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1662{
1663 struct cbq_class *cl = (struct cbq_class *)arg;
1664
1665 cl->filters--;
1666}
1667
1668static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1669{
1670 struct cbq_sched_data *q = qdisc_priv(sch);
1671 struct cbq_class *cl;
1672 unsigned int h;
1673
1674 if (arg->stop)
1675 return;
1676
1677 for (h = 0; h < q->clhash.hashsize; h++) {
1678 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1679 if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
1680 return;
1681 }
1682 }
1683}
1684
1685static const struct Qdisc_class_ops cbq_class_ops = {
1686 .graft = cbq_graft,
1687 .leaf = cbq_leaf,
1688 .qlen_notify = cbq_qlen_notify,
1689 .find = cbq_find,
1690 .change = cbq_change_class,
1691 .delete = cbq_delete,
1692 .walk = cbq_walk,
1693 .tcf_block = cbq_tcf_block,
1694 .bind_tcf = cbq_bind_filter,
1695 .unbind_tcf = cbq_unbind_filter,
1696 .dump = cbq_dump_class,
1697 .dump_stats = cbq_dump_class_stats,
1698};
1699
1700static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1701 .next = NULL,
1702 .cl_ops = &cbq_class_ops,
1703 .id = "cbq",
1704 .priv_size = sizeof(struct cbq_sched_data),
1705 .enqueue = cbq_enqueue,
1706 .dequeue = cbq_dequeue,
1707 .peek = qdisc_peek_dequeued,
1708 .init = cbq_init,
1709 .reset = cbq_reset,
1710 .destroy = cbq_destroy,
1711 .change = NULL,
1712 .dump = cbq_dump,
1713 .dump_stats = cbq_dump_stats,
1714 .owner = THIS_MODULE,
1715};
1716
1717static int __init cbq_module_init(void)
1718{
1719 return register_qdisc(&cbq_qdisc_ops);
1720}
1721static void __exit cbq_module_exit(void)
1722{
1723 unregister_qdisc(&cbq_qdisc_ops);
1724}
1725module_init(cbq_module_init)
1726module_exit(cbq_module_exit)
1727MODULE_LICENSE("GPL");