   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
   4 *
   5 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6 */
   7
   8#include <linux/module.h>
   9#include <linux/slab.h>
  10#include <linux/types.h>
  11#include <linux/kernel.h>
  12#include <linux/string.h>
  13#include <linux/errno.h>
  14#include <linux/skbuff.h>
  15#include <net/netlink.h>
  16#include <net/pkt_sched.h>
  17#include <net/pkt_cls.h>
  18
  19
  20/*	Class-Based Queueing (CBQ) algorithm.
  21	=======================================
  22
  23	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  24		 Management Models for Packet Networks",
  25		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  26
  27		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  28
  29		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  30		 Parameters", 1996
  31
  32		 [4] Sally Floyd and Michael Speer, "Experimental Results
  33		 for Class-Based Queueing", 1998, not published.
  34
  35	-----------------------------------------------------------------------
  36
  37	The algorithm skeleton was taken from the NS simulator's cbq.cc.
  38	Anyone checking this code against the LBL version
  39	should take into account that ONLY the skeleton was borrowed;
  40	the implementation is different. Particularly:
  41
  42	--- The WRR algorithm is different. Our version looks more
  43	reasonable (I hope) and works when quanta are allowed to be
  44	less than the MTU, which is always the case when real-time classes
  45	have small rates. Note that the statement of [3] is
  46	incomplete, delay may actually be estimated even if class
  47	per-round allotment is less than MTU. Namely, if per-round
  48	allotment is W*r_i, and r_1+...+r_k = r < 1
  49
  50	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  51
  52	In the worst case we have IntServ estimate with D = W*r+k*MTU
  53	and C = MTU*r. The proof (if correct at all) is trivial.
  54
  55
  56	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
  57	interpret some places, which look like wrong translations
  58	from NS. Anyone is advised to find these differences
  59	and explain to me why I am wrong 8).
  60
  61	--- Linux has no EOI event, so we cannot estimate true class
  62	idle time. The workaround is to treat the next dequeue event
  63	as a sign that the previous packet has finished. This is wrong because of
  64	internal device queueing, but on a permanently loaded link it is true.
  65	Moreover, combined with the clock integrator, this scheme looks
  66	very close to an ideal solution.  */
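/* A standalone transcription of the bound above, as an illustrative
 * sketch (assuming [x] denotes rounding up; this helper is not part
 * of the original file):
 *
 *	#include <math.h>
 *
 *	static double wrr_delay_bound(double mtu, double W, double r_i,
 *				      double r, int k, double B)
 *	{
 *		return (ceil(mtu / (W * r_i)) * W * r + W * r + k * mtu) / B;
 *	}
 *
 * When W*r_i is much smaller than the MTU the first term dominates,
 * which is why classes with tiny per-round allotments still get a
 * finite delay estimate.
 */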
  67
  68struct cbq_sched_data;
  69
  70
  71struct cbq_class {
  72	struct Qdisc_class_common common;
  73	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
  74
  75/* Parameters */
  76	unsigned char		priority;	/* class priority */
  77	unsigned char		priority2;	/* priority to be used after overlimit */
  78	unsigned char		ewma_log;	/* time constant for idle time calculation */
  79
  80	u32			defmap;
  81
  82	/* Link-sharing scheduler parameters */
  83	long			maxidle;	/* Class parameters: see below. */
  84	long			offtime;
  85	long			minidle;
  86	u32			avpkt;
  87	struct qdisc_rate_table	*R_tab;
  88
  89	/* General scheduler (WRR) parameters */
  90	long			allot;
  91	long			quantum;	/* Allotment per WRR round */
  92	long			weight;		/* Relative allotment: see below */
  93
  94	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
  95	struct cbq_class	*split;		/* Ptr to split node */
  96	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
  97	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
  98	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
  99						   parent otherwise */
 100	struct cbq_class	*sibling;	/* Sibling chain */
 101	struct cbq_class	*children;	/* Pointer to children chain */
 102
 103	struct Qdisc		*q;		/* Elementary queueing discipline */
 104
 105
 106/* Variables */
 107	unsigned char		cpriority;	/* Effective priority */
 108	unsigned char		delayed;
 109	unsigned char		level;		/* level of the class in hierarchy:
 110						   0 for leaf classes, and maximal
 111						   level of children + 1 for nodes.
 112						 */
 113
 114	psched_time_t		last;		/* Last end of service */
 115	psched_time_t		undertime;
 116	long			avgidle;
 117	long			deficit;	/* Saved deficit for WRR */
 118	psched_time_t		penalized;
 119	struct gnet_stats_basic_packed bstats;
 120	struct gnet_stats_queue qstats;
 121	struct net_rate_estimator __rcu *rate_est;
 122	struct tc_cbq_xstats	xstats;
 123
 124	struct tcf_proto __rcu	*filter_list;
 125	struct tcf_block	*block;
 126
 
 127	int			filters;
 128
 129	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
 130};
 131
 132struct cbq_sched_data {
 133	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
 134	int			nclasses[TC_CBQ_MAXPRIO + 1];
 135	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
 136
 137	struct cbq_class	link;
 138
 139	unsigned int		activemask;
 140	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
 141								   with backlog */
 142
 143#ifdef CONFIG_NET_CLS_ACT
 144	struct cbq_class	*rx_class;
 145#endif
 146	struct cbq_class	*tx_class;
 147	struct cbq_class	*tx_borrowed;
 148	int			tx_len;
 149	psched_time_t		now;		/* Cached timestamp */
 150	unsigned int		pmask;
 151
 152	struct hrtimer		delay_timer;
 153	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
 154						   started when CBQ has
 155						   backlog, but cannot
 156						   transmit just now */
 157	psched_tdiff_t		wd_expires;
 158	int			toplevel;
 159	u32			hgenerator;
 160};
 161
 162
 163#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
 164
 165static inline struct cbq_class *
 166cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 167{
 168	struct Qdisc_class_common *clc;
 169
 170	clc = qdisc_class_find(&q->clhash, classid);
 171	if (clc == NULL)
 172		return NULL;
 173	return container_of(clc, struct cbq_class, common);
 174}
 175
 176#ifdef CONFIG_NET_CLS_ACT
 177
 178static struct cbq_class *
 179cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 180{
 181	struct cbq_class *cl;
 182
 183	for (cl = this->tparent; cl; cl = cl->tparent) {
 184		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 185
 186		if (new != NULL && new != this)
 187			return new;
 188	}
 189	return NULL;
 190}
 191
 192#endif
 193
 194/* Classify packet. The procedure is pretty complicated, but
 195 * it allows us to combine link sharing and priority scheduling
 196 * transparently.
 197 *
 198 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 199 * so that it resolves to split nodes. Then packets are classified
 200 * by logical priority, or a more specific classifier may be attached
 201 * to the split node.
 202 */
 203
 204static struct cbq_class *
 205cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 206{
 207	struct cbq_sched_data *q = qdisc_priv(sch);
 208	struct cbq_class *head = &q->link;
 209	struct cbq_class **defmap;
 210	struct cbq_class *cl = NULL;
 211	u32 prio = skb->priority;
 212	struct tcf_proto *fl;
 213	struct tcf_result res;
 214
 215	/*
 216	 *  Step 1. If skb->priority points to one of our classes, use it.
 217	 */
 218	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 219	    (cl = cbq_class_lookup(q, prio)) != NULL)
 220		return cl;
 221
 222	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 223	for (;;) {
 224		int result = 0;
 225		defmap = head->defaults;
 226
 227		fl = rcu_dereference_bh(head->filter_list);
 228		/*
 229		 * Step 2+n. Apply classifier.
 230		 */
 231		result = tcf_classify(skb, fl, &res, true);
 232		if (!fl || result < 0)
 233			goto fallback;
 234
 235		cl = (void *)res.class;
 236		if (!cl) {
 237			if (TC_H_MAJ(res.classid))
 238				cl = cbq_class_lookup(q, res.classid);
 239			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 240				cl = defmap[TC_PRIO_BESTEFFORT];
 241
 242			if (cl == NULL)
 243				goto fallback;
 244		}
 245		if (cl->level >= head->level)
 246			goto fallback;
 247#ifdef CONFIG_NET_CLS_ACT
 248		switch (result) {
 249		case TC_ACT_QUEUED:
 250		case TC_ACT_STOLEN:
 251		case TC_ACT_TRAP:
 252			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 253			/* fall through */
 254		case TC_ACT_SHOT:
 255			return NULL;
 256		case TC_ACT_RECLASSIFY:
 257			return cbq_reclassify(skb, cl);
 258		}
 259#endif
 260		if (cl->level == 0)
 261			return cl;
 262
 263		/*
 264		 * Step 3+n. If the classifier selected a link sharing class,
 265		 *	   apply the agency-specific classifier.
 266		 *	   Repeat this procedure until we hit a leaf node.
 267		 */
 268		head = cl;
 269	}
 270
 271fallback:
 272	cl = head;
 273
 274	/*
 275	 * Step 4. No success...
 276	 */
 277	if (TC_H_MAJ(prio) == 0 &&
 278	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 279	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 280		return head;
 281
 282	return cl;
 283}
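/* Step 1 above keys on the 16-bit major number shared by skb->priority
 * and the qdisc handle. A minimal sketch of the test with the values
 * written out (handles chosen only for illustration):
 *
 *	u32 handle = 0x00010000;	// tc handle "1:"
 *	u32 prio_a = 0x00010002;	// "1:2", same major
 *	u32 prio_b = 0x00020002;	// "2:2", different major
 *
 *	TC_H_MAJ(prio_a ^ handle);	// 0       -> lookup is attempted
 *	TC_H_MAJ(prio_b ^ handle);	// 0x30000 -> Step 1 is skipped
 */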
 284
 285/*
 286 * A packet has just been enqueued on an empty class.
 287 * cbq_activate_class adds it to the tail of the active class list
 288 * of its priority band.
 289 */
 290
 291static inline void cbq_activate_class(struct cbq_class *cl)
 292{
 293	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 294	int prio = cl->cpriority;
 295	struct cbq_class *cl_tail;
 296
 297	cl_tail = q->active[prio];
 298	q->active[prio] = cl;
 299
 300	if (cl_tail != NULL) {
 301		cl->next_alive = cl_tail->next_alive;
 302		cl_tail->next_alive = cl;
 303	} else {
 304		cl->next_alive = cl;
 305		q->activemask |= (1<<prio);
 306	}
 307}
 308
 309/*
 310 * Unlink class from active chain.
 311 * Note that this same procedure is done directly in cbq_dequeue*
 312 * during round-robin procedure.
 313 */
 314
 315static void cbq_deactivate_class(struct cbq_class *this)
 316{
 317	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 318	int prio = this->cpriority;
 319	struct cbq_class *cl;
 320	struct cbq_class *cl_prev = q->active[prio];
 321
 322	do {
 323		cl = cl_prev->next_alive;
 324		if (cl == this) {
 325			cl_prev->next_alive = cl->next_alive;
 326			cl->next_alive = NULL;
 327
 328			if (cl == q->active[prio]) {
 329				q->active[prio] = cl_prev;
 330				if (cl == q->active[prio]) {
 331					q->active[prio] = NULL;
 332					q->activemask &= ~(1<<prio);
 333					return;
 334				}
 335			}
 336			return;
 337		}
 338	} while ((cl_prev = cl) != q->active[prio]);
 339}
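/* In both helpers above the per-band run list is a circular, singly
 * linked list in which q->active[prio] points at the *tail*; the head
 * is tail->next_alive. A detached sketch of the insertion, reduced to
 * the link field alone (illustration, not part of the original file):
 *
 *	struct node { struct node *next; };
 *
 *	static void ring_add_tail(struct node **tail, struct node *n)
 *	{
 *		if (*tail) {
 *			n->next = (*tail)->next;	// old head
 *			(*tail)->next = n;
 *		} else {
 *			n->next = n;			// one-entry ring
 *		}
 *		*tail = n;
 *	}
 *
 * Removal (cbq_deactivate_class) walks the ring from the tail to find
 * the predecessor of the victim and splices it out, moving the tail
 * pointer back one step if the victim was the tail itself.
 */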
 340
 341static void
 342cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 343{
 344	int toplevel = q->toplevel;
 345
 346	if (toplevel > cl->level) {
 347		psched_time_t now = psched_get_time();
 348
 349		do {
 350			if (cl->undertime < now) {
 351				q->toplevel = cl->level;
 352				return;
 353			}
 354		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 355	}
 356}
 357
 358static int
 359cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 360	    struct sk_buff **to_free)
 361{
 362	struct cbq_sched_data *q = qdisc_priv(sch);
 363	int uninitialized_var(ret);
 364	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 365
 366#ifdef CONFIG_NET_CLS_ACT
 367	q->rx_class = cl;
 368#endif
 369	if (cl == NULL) {
 370		if (ret & __NET_XMIT_BYPASS)
 371			qdisc_qstats_drop(sch);
 372		__qdisc_drop(skb, to_free);
 373		return ret;
 374	}
 375
 376	ret = qdisc_enqueue(skb, cl->q, to_free);
 377	if (ret == NET_XMIT_SUCCESS) {
 378		sch->q.qlen++;
 379		cbq_mark_toplevel(q, cl);
 380		if (!cl->next_alive)
 381			cbq_activate_class(cl);
 382		return ret;
 383	}
 384
 385	if (net_xmit_drop_count(ret)) {
 386		qdisc_qstats_drop(sch);
 387		cbq_mark_toplevel(q, cl);
 388		cl->qstats.drops++;
 389	}
 390	return ret;
 391}
 392
 393/* Overlimit action: penalize leaf class by adding offtime */
 394static void cbq_overlimit(struct cbq_class *cl)
 395{
 396	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 397	psched_tdiff_t delay = cl->undertime - q->now;
 398
 399	if (!cl->delayed) {
 400		delay += cl->offtime;
 401
 402		/*
 403		 * The class goes to sleep, so it will have no
 404		 * chance to work off its avgidle. Let's forgive it 8)
 405		 *
 406		 * BTW cbq-2.0 has a bug in this
 407		 * place; apparently they forgot to shift it by cl->ewma_log.
 408		 */
 409		if (cl->avgidle < 0)
 410			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 411		if (cl->avgidle < cl->minidle)
 412			cl->avgidle = cl->minidle;
 413		if (delay <= 0)
 414			delay = 1;
 415		cl->undertime = q->now + delay;
 416
 417		cl->xstats.overactions++;
 418		cl->delayed = 1;
 419	}
 420	if (q->wd_expires == 0 || q->wd_expires > delay)
 421		q->wd_expires = delay;
 422
 423	/* Dirty work! We must schedule wakeups based on
 424	 * real available rate, rather than leaf rate,
 425	 * which may be tiny (even zero).
 426	 */
 427	if (q->toplevel == TC_CBQ_MAXLEVEL) {
 428		struct cbq_class *b;
 429		psched_tdiff_t base_delay = q->wd_expires;
 430
 431		for (b = cl->borrow; b; b = b->borrow) {
 432			delay = b->undertime - q->now;
 433			if (delay < base_delay) {
 434				if (delay <= 0)
 435					delay = 1;
 436				base_delay = delay;
 437			}
 438		}
 439
 440		q->wd_expires = base_delay;
 441	}
 442}
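/* A worked instance of the delay arithmetic above, with invented
 * numbers in psched ticks: undertime - now = 100, offtime = 500,
 * avgidle = -64, ewma_log = 5:
 *
 *	delay  = 100 + 500;		// 600
 *	delay -= 64 - (64 >> 5);	// 600 - 62 = 538
 *
 * i.e. the class sleeps slightly less than offtime alone would
 * dictate, because the negative (scaled) avgidle it would otherwise
 * have to work off is forgiven when it goes to sleep.
 */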
 443
 444static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 445				       psched_time_t now)
 446{
 447	struct cbq_class *cl;
 448	struct cbq_class *cl_prev = q->active[prio];
 449	psched_time_t sched = now;
 450
 451	if (cl_prev == NULL)
 452		return 0;
 453
 454	do {
 455		cl = cl_prev->next_alive;
 456		if (now - cl->penalized > 0) {
 457			cl_prev->next_alive = cl->next_alive;
 458			cl->next_alive = NULL;
 459			cl->cpriority = cl->priority;
 460			cl->delayed = 0;
 461			cbq_activate_class(cl);
 462
 463			if (cl == q->active[prio]) {
 464				q->active[prio] = cl_prev;
 465				if (cl == q->active[prio]) {
 466					q->active[prio] = NULL;
 467					return 0;
 468				}
 469			}
 470
 471			cl = cl_prev->next_alive;
 472		} else if (sched - cl->penalized > 0)
 473			sched = cl->penalized;
 474	} while ((cl_prev = cl) != q->active[prio]);
 475
 476	return sched - now;
 477}
 478
 479static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 480{
 481	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 482						delay_timer);
 483	struct Qdisc *sch = q->watchdog.qdisc;
 484	psched_time_t now;
 485	psched_tdiff_t delay = 0;
 486	unsigned int pmask;
 487
 488	now = psched_get_time();
 489
 490	pmask = q->pmask;
 491	q->pmask = 0;
 492
 493	while (pmask) {
 494		int prio = ffz(~pmask);
 495		psched_tdiff_t tmp;
 496
 497		pmask &= ~(1<<prio);
 498
 499		tmp = cbq_undelay_prio(q, prio, now);
 500		if (tmp > 0) {
 501			q->pmask |= 1<<prio;
 502			if (tmp < delay || delay == 0)
 503				delay = tmp;
 504		}
 505	}
 506
 507	if (delay) {
 508		ktime_t time;
 509
 510		time = 0;
 511		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 512		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 513	}
 514
 515	__netif_schedule(qdisc_root(sch));
 516	return HRTIMER_NORESTART;
 517}
 518
 519/*
 520 * This is a mission-critical procedure.
 521 *
 522 * We "regenerate" the toplevel cutoff if the transmitting class
 523 * has backlog and is not regulated. This is not part of the
 524 * original CBQ description, but looks more reasonable.
 525 * It is probably wrong; this question needs further investigation.
 526 */
 527
 528static inline void
 529cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 530		    struct cbq_class *borrowed)
 531{
 532	if (cl && q->toplevel >= borrowed->level) {
 533		if (cl->q->q.qlen > 1) {
 534			do {
 535				if (borrowed->undertime == PSCHED_PASTPERFECT) {
 536					q->toplevel = borrowed->level;
 537					return;
 538				}
 539			} while ((borrowed = borrowed->borrow) != NULL);
 540		}
 541#if 0
 542	/* It is not necessary now. Uncommenting it
 543	   would save CPU cycles but decrease fairness.
 544	 */
 545		q->toplevel = TC_CBQ_MAXLEVEL;
 546#endif
 547	}
 548}
 549
 550static void
 551cbq_update(struct cbq_sched_data *q)
 552{
 553	struct cbq_class *this = q->tx_class;
 554	struct cbq_class *cl = this;
 555	int len = q->tx_len;
 556	psched_time_t now;
 557
 558	q->tx_class = NULL;
 559	/* Time integrator. We calculate EOS time
 560	 * by adding expected packet transmission time.
 561	 */
 562	now = q->now + L2T(&q->link, len);
 563
 564	for ( ; cl; cl = cl->share) {
 565		long avgidle = cl->avgidle;
 566		long idle;
 567
 568		cl->bstats.packets++;
 569		cl->bstats.bytes += len;
 570
 571		/*
 572		 * (now - last) is total time between packet right edges.
 573		 * (last_pktlen/rate) is "virtual" busy time, so that
 574		 *
 575		 *	idle = (now - last) - last_pktlen/rate
 576		 */
 577
 578		idle = now - cl->last;
 579		if ((unsigned long)idle > 128*1024*1024) {
 580			avgidle = cl->maxidle;
 581		} else {
 582			idle -= L2T(cl, len);
 583
 584		/* true_avgidle := (1-W)*true_avgidle + W*idle,
 585		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 586		 * cl->avgidle == true_avgidle/W,
 587		 * hence:
 588		 */
 589			avgidle += idle - (avgidle>>cl->ewma_log);
 590		}
 591
 592		if (avgidle <= 0) {
 593			/* Overlimit or at-limit */
 594
 595			if (avgidle < cl->minidle)
 596				avgidle = cl->minidle;
 597
 598			cl->avgidle = avgidle;
 599
 600			/* Calculate the expected time when this class
 601			 * will be allowed to send.
 602			 * That occurs when:
 603			 * (1-W)*true_avgidle + W*delay = 0, i.e.
 604			 * idle = (1/W - 1)*(-true_avgidle)
 605			 * or
 606			 * idle = (1 - W)*(-cl->avgidle);
 607			 */
 608			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 609
 610			/*
 611			 * That is not all.
 612			 * To maintain the rate allocated to the class,
 613			 * we add to undertime the virtual clock time
 614			 * needed to complete the transmitted packet.
 615			 * (len/phys_bandwidth has already passed
 616			 * by the moment cbq_update is called.)
 617			 */
 618
 619			idle -= L2T(&q->link, len);
 620			idle += L2T(cl, len);
 621
 622			cl->undertime = now + idle;
 623		} else {
 624			/* Underlimit */
 625
 626			cl->undertime = PSCHED_PASTPERFECT;
 627			if (avgidle > cl->maxidle)
 628				cl->avgidle = cl->maxidle;
 629			else
 630				cl->avgidle = avgidle;
 631		}
 632		if ((s64)(now - cl->last) > 0)
 633			cl->last = now;
 634	}
 635
 636	cbq_update_toplevel(q, this, q->tx_borrowed);
 637}
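/* The scaled update "avgidle += idle - (avgidle >> ewma_log)" used
 * above is the fixed-point form of
 *
 *	true_avgidle = (1 - W)*true_avgidle + W*idle,  W = 2^-ewma_log,
 *
 * with cl->avgidle storing true_avgidle/W. A standalone sketch that
 * can be checked in isolation (names are illustrative):
 *
 *	static long ewma_scaled(long avgidle, long idle, int ewma_log)
 *	{
 *		return avgidle + idle - (avgidle >> ewma_log);
 *	}
 *
 * Multiplying the result by W gives (1 - W)*(W*avgidle) + W*idle,
 * i.e. the unscaled EWMA, up to the truncation of the right shift.
 */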
 638
 639static inline struct cbq_class *
 640cbq_under_limit(struct cbq_class *cl)
 641{
 642	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 643	struct cbq_class *this_cl = cl;
 644
 645	if (cl->tparent == NULL)
 646		return cl;
 647
 648	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 649		cl->delayed = 0;
 650		return cl;
 651	}
 652
 653	do {
 654		/* This is a very suspicious place. Now the overlimit
 655		 * action is generated for non-bounded classes
 656		 * only if the link is completely congested.
 657		 * Though it agrees with the ancestor-only paradigm,
 658		 * it looks very stupid. Particularly,
 659		 * it means that this chunk of code will either
 660		 * never be called or result in strong amplification
 661		 * of burstiness. Dangerous, silly, and yet
 662		 * no other solution exists.
 663		 */
 664		cl = cl->borrow;
 665		if (!cl) {
 666			this_cl->qstats.overlimits++;
 667			cbq_overlimit(this_cl);
 668			return NULL;
 669		}
 670		if (cl->level > q->toplevel)
 671			return NULL;
 672	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 673
 674	cl->delayed = 0;
 675	return cl;
 676}
 677
 678static inline struct sk_buff *
 679cbq_dequeue_prio(struct Qdisc *sch, int prio)
 680{
 681	struct cbq_sched_data *q = qdisc_priv(sch);
 682	struct cbq_class *cl_tail, *cl_prev, *cl;
 683	struct sk_buff *skb;
 684	int deficit;
 685
 686	cl_tail = cl_prev = q->active[prio];
 687	cl = cl_prev->next_alive;
 688
 689	do {
 690		deficit = 0;
 691
 692		/* Start round */
 693		do {
 694			struct cbq_class *borrow = cl;
 695
 696			if (cl->q->q.qlen &&
 697			    (borrow = cbq_under_limit(cl)) == NULL)
 698				goto skip_class;
 699
 700			if (cl->deficit <= 0) {
 701				/* Class exhausted its allotment per
 702				 * this round. Switch to the next one.
 703				 */
 704				deficit = 1;
 705				cl->deficit += cl->quantum;
 706				goto next_class;
 707			}
 708
 709			skb = cl->q->dequeue(cl->q);
 710
 711			/* Class did not give us any skb :-(
 712			 * This can occur even if cl->q->q.qlen != 0,
 713			 * e.g. if cl->q == "tbf"
 714			 */
 715			if (skb == NULL)
 716				goto skip_class;
 717
 718			cl->deficit -= qdisc_pkt_len(skb);
 719			q->tx_class = cl;
 720			q->tx_borrowed = borrow;
 721			if (borrow != cl) {
 722#ifndef CBQ_XSTATS_BORROWS_BYTES
 723				borrow->xstats.borrows++;
 724				cl->xstats.borrows++;
 725#else
 726				borrow->xstats.borrows += qdisc_pkt_len(skb);
 727				cl->xstats.borrows += qdisc_pkt_len(skb);
 728#endif
 729			}
 730			q->tx_len = qdisc_pkt_len(skb);
 731
 732			if (cl->deficit <= 0) {
 733				q->active[prio] = cl;
 734				cl = cl->next_alive;
 735				cl->deficit += cl->quantum;
 736			}
 737			return skb;
 738
 739skip_class:
 740			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 741				/* Class is empty or penalized.
 742				 * Unlink it from active chain.
 743				 */
 744				cl_prev->next_alive = cl->next_alive;
 745				cl->next_alive = NULL;
 746
 747				/* Did cl_tail point to it? */
 748				if (cl == cl_tail) {
 749					/* Repair it! */
 750					cl_tail = cl_prev;
 751
 752					/* Was it the last class in this band? */
 753					if (cl == cl_tail) {
 754						/* Kill the band! */
 755						q->active[prio] = NULL;
 756						q->activemask &= ~(1<<prio);
 757						if (cl->q->q.qlen)
 758							cbq_activate_class(cl);
 759						return NULL;
 760					}
 761
 762					q->active[prio] = cl_tail;
 763				}
 764				if (cl->q->q.qlen)
 765					cbq_activate_class(cl);
 766
 767				cl = cl_prev;
 768			}
 769
 770next_class:
 771			cl_prev = cl;
 772			cl = cl->next_alive;
 773		} while (cl_prev != cl_tail);
 774	} while (deficit);
 775
 776	q->active[prio] = cl_prev;
 777
 778	return NULL;
 779}
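/* Stripped of borrowing and the active-ring maintenance, the round
 * structure above is plain deficit round robin. A minimal sketch
 * (hypothetical types, for illustration only; the real code also
 * unlinks empty classes, which bounds the loop):
 *
 *	for (;;) {
 *		if (cl->deficit <= 0) {		// sat out this round
 *			cl->deficit += cl->quantum;
 *			cl = cl->next;
 *			continue;
 *		}
 *		skb = dequeue(cl);
 *		if (skb) {
 *			cl->deficit -= len(skb);
 *			return skb;
 *		}
 *		cl = cl->next;			// empty: skip it
 *	}
 *
 * Because a quantum may be smaller than one packet, a class can go
 * deeply negative and then has to accumulate quanta over several
 * rounds before sending again; this is what lets CBQ operate with
 * quanta below the MTU, as noted in the header comment.
 */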
 780
 781static inline struct sk_buff *
 782cbq_dequeue_1(struct Qdisc *sch)
 783{
 784	struct cbq_sched_data *q = qdisc_priv(sch);
 785	struct sk_buff *skb;
 786	unsigned int activemask;
 787
 788	activemask = q->activemask & 0xFF;
 789	while (activemask) {
 790		int prio = ffz(~activemask);
 791		activemask &= ~(1<<prio);
 792		skb = cbq_dequeue_prio(sch, prio);
 793		if (skb)
 794			return skb;
 795	}
 796	return NULL;
 797}
 798
 799static struct sk_buff *
 800cbq_dequeue(struct Qdisc *sch)
 801{
 802	struct sk_buff *skb;
 803	struct cbq_sched_data *q = qdisc_priv(sch);
 804	psched_time_t now;
 805
 806	now = psched_get_time();
 807
 808	if (q->tx_class)
 809		cbq_update(q);
 810
 811	q->now = now;
 812
 813	for (;;) {
 814		q->wd_expires = 0;
 815
 816		skb = cbq_dequeue_1(sch);
 817		if (skb) {
 818			qdisc_bstats_update(sch, skb);
 819			sch->q.qlen--;
 820			return skb;
 821		}
 822
 823		/* All the classes are overlimit.
 824		 *
 825		 * It is possible, if:
 826		 *
 827		 * 1. Scheduler is empty.
 828		 * 2. Toplevel cutoff inhibited borrowing.
 829		 * 3. Root class is overlimit.
 830		 *
 831		 * Reset the 2nd and 3rd conditions and retry.
 832		 *
 833		 * Note that NS and cbq-2.0 are buggy: picking
 834		 * an arbitrary class is appropriate for ancestor-only
 835		 * sharing, but not for the toplevel algorithm.
 836		 *
 837		 * Our version is better but slower, because it requires
 838		 * two passes; that is unavoidable with top-level sharing.
 839		 */
 840
 841		if (q->toplevel == TC_CBQ_MAXLEVEL &&
 842		    q->link.undertime == PSCHED_PASTPERFECT)
 843			break;
 844
 845		q->toplevel = TC_CBQ_MAXLEVEL;
 846		q->link.undertime = PSCHED_PASTPERFECT;
 847	}
 848
 849	/* No packets in the scheduler, or nobody wants to give them to us :-(
 850	 * Sigh... start the watchdog timer in the latter case.
 851	 */
 852
 853	if (sch->q.qlen) {
 854		qdisc_qstats_overlimit(sch);
 855		if (q->wd_expires)
 856			qdisc_watchdog_schedule(&q->watchdog,
 857						now + q->wd_expires);
 858	}
 859	return NULL;
 860}
 861
 862/* CBQ class maintenance routines */
 863
 864static void cbq_adjust_levels(struct cbq_class *this)
 865{
 866	if (this == NULL)
 867		return;
 868
 869	do {
 870		int level = 0;
 871		struct cbq_class *cl;
 872
 873		cl = this->children;
 874		if (cl) {
 875			do {
 876				if (cl->level > level)
 877					level = cl->level;
 878			} while ((cl = cl->sibling) != this->children);
 879		}
 880		this->level = level + 1;
 881	} while ((this = this->tparent) != NULL);
 882}
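/* Example of the invariant maintained above, for the tree
 *
 *	root
 *	 +- A            (leaf)
 *	 +- B
 *	     +- C        (leaf)
 *
 * leaves A and C have level 0, B gets max(0) + 1 = 1, and root gets
 * max(0, 1) + 1 = 2. cbq_adjust_levels() recomputes this bottom-up
 * along the tparent chain after every topology change.
 */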
 883
 884static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 885{
 886	struct cbq_class *cl;
 887	unsigned int h;
 888
 889	if (q->quanta[prio] == 0)
 890		return;
 891
 892	for (h = 0; h < q->clhash.hashsize; h++) {
 893		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
 894			/* BUGGGG... Beware! This expression suffers from
 895			 * arithmetic overflows!
 896			 */
 897			if (cl->priority == prio) {
 898				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 899					q->quanta[prio];
 900			}
 901			if (cl->quantum <= 0 ||
 902			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 903				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 904					cl->common.classid, cl->quantum);
 905				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 906			}
 907		}
 908	}
 909}
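/* A worked instance of the normalization above (numbers invented):
 * two classes at one priority, allot = 1514, weights 1 and 3, so
 * nclasses[prio] = 2 and quanta[prio] = 1 + 3 = 4:
 *
 *	quantum_1 = (1 * 1514 * 2) / 4;	//  757
 *	quantum_2 = (3 * 1514 * 2) / 4;	// 2271
 *
 * The quanta sum to nclasses*allot while splitting the band 1:3.
 * The overflow warning above is genuine: weight*allot*nclasses is
 * evaluated in plain long arithmetic before the division.
 */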
 910
 911static void cbq_sync_defmap(struct cbq_class *cl)
 912{
 913	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 914	struct cbq_class *split = cl->split;
 915	unsigned int h;
 916	int i;
 917
 918	if (split == NULL)
 919		return;
 920
 921	for (i = 0; i <= TC_PRIO_MAX; i++) {
 922		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 923			split->defaults[i] = NULL;
 924	}
 925
 926	for (i = 0; i <= TC_PRIO_MAX; i++) {
 927		int level = split->level;
 928
 929		if (split->defaults[i])
 930			continue;
 931
 932		for (h = 0; h < q->clhash.hashsize; h++) {
 933			struct cbq_class *c;
 934
 935			hlist_for_each_entry(c, &q->clhash.hash[h],
 936					     common.hnode) {
 937				if (c->split == split && c->level < level &&
 938				    c->defmap & (1<<i)) {
 939					split->defaults[i] = c;
 940					level = c->level;
 941				}
 942			}
 943		}
 944	}
 945}
 946
 947static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 948{
 949	struct cbq_class *split = NULL;
 950
 951	if (splitid == 0) {
 952		split = cl->split;
 953		if (!split)
 954			return;
 955		splitid = split->common.classid;
 956	}
 957
 958	if (split == NULL || split->common.classid != splitid) {
 959		for (split = cl->tparent; split; split = split->tparent)
 960			if (split->common.classid == splitid)
 961				break;
 962	}
 963
 964	if (split == NULL)
 965		return;
 966
 967	if (cl->split != split) {
 968		cl->defmap = 0;
 969		cbq_sync_defmap(cl);
 970		cl->split = split;
 971		cl->defmap = def & mask;
 972	} else
 973		cl->defmap = (cl->defmap & ~mask) | (def & mask);
 974
 975	cbq_sync_defmap(cl);
 976}
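/* The final assignment above is a masked splice: only the bits
 * selected by mask are replaced by the corresponding bits of def.
 * With defmap = 0b1100, def = 0b0011, mask = 0b0101:
 *
 *	(0b1100 & ~0b0101) | (0b0011 & 0b0101) == 0b1000 | 0b0001
 *					       == 0b1001
 *
 * so bit 2 is cleared, bit 0 is set, and the unmasked bits 1 and 3
 * keep their old values.
 */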
 977
 978static void cbq_unlink_class(struct cbq_class *this)
 979{
 980	struct cbq_class *cl, **clp;
 981	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 982
 983	qdisc_class_hash_remove(&q->clhash, &this->common);
 984
 985	if (this->tparent) {
 986		clp = &this->sibling;
 987		cl = *clp;
 988		do {
 989			if (cl == this) {
 990				*clp = cl->sibling;
 991				break;
 992			}
 993			clp = &cl->sibling;
 994		} while ((cl = *clp) != this->sibling);
 995
 996		if (this->tparent->children == this) {
 997			this->tparent->children = this->sibling;
 998			if (this->sibling == this)
 999				this->tparent->children = NULL;
1000		}
1001	} else {
1002		WARN_ON(this->sibling != this);
1003	}
1004}
1005
1006static void cbq_link_class(struct cbq_class *this)
1007{
1008	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009	struct cbq_class *parent = this->tparent;
1010
1011	this->sibling = this;
1012	qdisc_class_hash_insert(&q->clhash, &this->common);
1013
1014	if (parent == NULL)
1015		return;
1016
1017	if (parent->children == NULL) {
1018		parent->children = this;
1019	} else {
1020		this->sibling = parent->children->sibling;
1021		parent->children->sibling = this;
1022	}
1023}
1024
1025static void
1026cbq_reset(struct Qdisc *sch)
1027{
1028	struct cbq_sched_data *q = qdisc_priv(sch);
1029	struct cbq_class *cl;
1030	int prio;
1031	unsigned int h;
1032
1033	q->activemask = 0;
1034	q->pmask = 0;
1035	q->tx_class = NULL;
1036	q->tx_borrowed = NULL;
1037	qdisc_watchdog_cancel(&q->watchdog);
1038	hrtimer_cancel(&q->delay_timer);
1039	q->toplevel = TC_CBQ_MAXLEVEL;
1040	q->now = psched_get_time();
1041
1042	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043		q->active[prio] = NULL;
1044
1045	for (h = 0; h < q->clhash.hashsize; h++) {
1046		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047			qdisc_reset(cl->q);
1048
1049			cl->next_alive = NULL;
1050			cl->undertime = PSCHED_PASTPERFECT;
1051			cl->avgidle = cl->maxidle;
1052			cl->deficit = cl->quantum;
1053			cl->cpriority = cl->priority;
1054		}
1055	}
1056	sch->q.qlen = 0;
1057}
1058
1059
1060static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1061{
1062	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1063		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1064		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1065	}
1066	if (lss->change & TCF_CBQ_LSS_EWMA)
1067		cl->ewma_log = lss->ewma_log;
1068	if (lss->change & TCF_CBQ_LSS_AVPKT)
1069		cl->avpkt = lss->avpkt;
1070	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1071		cl->minidle = -(long)lss->minidle;
1072	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1073		cl->maxidle = lss->maxidle;
1074		cl->avgidle = lss->maxidle;
1075	}
1076	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1077		cl->offtime = lss->offtime;
1078	return 0;
1079}
1080
1081static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1082{
1083	q->nclasses[cl->priority]--;
1084	q->quanta[cl->priority] -= cl->weight;
1085	cbq_normalize_quanta(q, cl->priority);
1086}
1087
1088static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1089{
1090	q->nclasses[cl->priority]++;
1091	q->quanta[cl->priority] += cl->weight;
1092	cbq_normalize_quanta(q, cl->priority);
1093}
1094
1095static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1096{
1097	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1098
1099	if (wrr->allot)
1100		cl->allot = wrr->allot;
1101	if (wrr->weight)
1102		cl->weight = wrr->weight;
1103	if (wrr->priority) {
1104		cl->priority = wrr->priority - 1;
1105		cl->cpriority = cl->priority;
1106		if (cl->priority >= cl->priority2)
1107			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1108	}
1109
1110	cbq_addprio(q, cl);
1111	return 0;
1112}
1113
1114static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1115{
1116	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1117	return 0;
1118}
1119
1120static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1121	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1122	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1123	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1124	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1125	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1126	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1127	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1128};
1129
1130static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1131			 struct nlattr *opt,
1132			 struct netlink_ext_ack *extack)
1133{
1134	int err;
1135
1136	if (!opt) {
1137		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1138		return -EINVAL;
1139	}
1140
1141	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1142					  cbq_policy, extack);
1143	if (err < 0)
1144		return err;
1145
1146	if (tb[TCA_CBQ_WRROPT]) {
1147		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148
1149		if (wrr->priority > TC_CBQ_MAXPRIO) {
1150			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1151			err = -EINVAL;
1152		}
1153	}
1154	return err;
1155}
1156
1157static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1158		    struct netlink_ext_ack *extack)
1159{
1160	struct cbq_sched_data *q = qdisc_priv(sch);
1161	struct nlattr *tb[TCA_CBQ_MAX + 1];
1162	struct tc_ratespec *r;
1163	int err;
1164
1165	qdisc_watchdog_init(&q->watchdog, sch);
1166	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1167	q->delay_timer.function = cbq_undelay;
1168
1169	err = cbq_opt_parse(tb, opt, extack);
1170	if (err < 0)
1171		return err;
1172
1173	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1174		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1175		return -EINVAL;
1176	}
1177
1178	r = nla_data(tb[TCA_CBQ_RATE]);
1179
1180	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1181	if (!q->link.R_tab)
1182		return -EINVAL;
1183
1184	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1185	if (err)
1186		goto put_rtab;
1187
1188	err = qdisc_class_hash_init(&q->clhash);
1189	if (err < 0)
1190		goto put_block;
 1191
1192	q->link.sibling = &q->link;
1193	q->link.common.classid = sch->handle;
1194	q->link.qdisc = sch;
1195	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1196				      sch->handle, NULL);
1197	if (!q->link.q)
1198		q->link.q = &noop_qdisc;
1199	else
1200		qdisc_hash_add(q->link.q, true);
1201
1202	q->link.priority = TC_CBQ_MAXPRIO - 1;
1203	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1204	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1205	q->link.allot = psched_mtu(qdisc_dev(sch));
1206	q->link.quantum = q->link.allot;
1207	q->link.weight = q->link.R_tab->rate.rate;
1208
1209	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1210	q->link.avpkt = q->link.allot/2;
1211	q->link.minidle = -0x7FFFFFFF;
 1212
1213	q->toplevel = TC_CBQ_MAXLEVEL;
1214	q->now = psched_get_time();
1215
1216	cbq_link_class(&q->link);
1217
1218	if (tb[TCA_CBQ_LSSOPT])
1219		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1220
1221	cbq_addprio(q, &q->link);
1222	return 0;
1223
1224put_block:
1225	tcf_block_put(q->link.block);
1226
1227put_rtab:
1228	qdisc_put_rtab(q->link.R_tab);
1229	return err;
1230}
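/* With the defaults chosen above, a device whose psched_mtu() is 1514
 * gives the root (link) class allot = quantum = 1514, avpkt = 757 and
 * weight equal to the configured link rate; every other class is then
 * parameterized relative to it. The numbers are illustrative, simply
 * instantiating the assignments above:
 *
 *	allot   = 1514;			// psched_mtu(qdisc_dev(sch))
 *	quantum = allot;		// 1514
 *	avpkt   = allot / 2;		// 757
 */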
1231
1232static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1233{
1234	unsigned char *b = skb_tail_pointer(skb);
1235
1236	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1237		goto nla_put_failure;
1238	return skb->len;
1239
1240nla_put_failure:
1241	nlmsg_trim(skb, b);
1242	return -1;
1243}
1244
1245static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1246{
1247	unsigned char *b = skb_tail_pointer(skb);
1248	struct tc_cbq_lssopt opt;
1249
1250	opt.flags = 0;
1251	if (cl->borrow == NULL)
1252		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1253	if (cl->share == NULL)
1254		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1255	opt.ewma_log = cl->ewma_log;
1256	opt.level = cl->level;
1257	opt.avpkt = cl->avpkt;
1258	opt.maxidle = cl->maxidle;
1259	opt.minidle = (u32)(-cl->minidle);
1260	opt.offtime = cl->offtime;
1261	opt.change = ~0;
1262	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1263		goto nla_put_failure;
1264	return skb->len;
1265
1266nla_put_failure:
1267	nlmsg_trim(skb, b);
1268	return -1;
1269}
1270
1271static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1272{
1273	unsigned char *b = skb_tail_pointer(skb);
1274	struct tc_cbq_wrropt opt;
1275
1276	memset(&opt, 0, sizeof(opt));
1277	opt.flags = 0;
1278	opt.allot = cl->allot;
1279	opt.priority = cl->priority + 1;
1280	opt.cpriority = cl->cpriority + 1;
1281	opt.weight = cl->weight;
1282	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1283		goto nla_put_failure;
1284	return skb->len;
1285
1286nla_put_failure:
1287	nlmsg_trim(skb, b);
1288	return -1;
1289}
1290
1291static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1292{
1293	unsigned char *b = skb_tail_pointer(skb);
1294	struct tc_cbq_fopt opt;
1295
1296	if (cl->split || cl->defmap) {
1297		opt.split = cl->split ? cl->split->common.classid : 0;
1298		opt.defmap = cl->defmap;
1299		opt.defchange = ~0;
1300		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1301			goto nla_put_failure;
1302	}
1303	return skb->len;
1304
1305nla_put_failure:
1306	nlmsg_trim(skb, b);
1307	return -1;
1308}
1309
1310static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1311{
1312	if (cbq_dump_lss(skb, cl) < 0 ||
1313	    cbq_dump_rate(skb, cl) < 0 ||
1314	    cbq_dump_wrr(skb, cl) < 0 ||
1315	    cbq_dump_fopt(skb, cl) < 0)
1316		return -1;
1317	return 0;
1318}
1319
1320static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1321{
1322	struct cbq_sched_data *q = qdisc_priv(sch);
1323	struct nlattr *nest;
1324
1325	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1326	if (nest == NULL)
1327		goto nla_put_failure;
1328	if (cbq_dump_attr(skb, &q->link) < 0)
1329		goto nla_put_failure;
1330	return nla_nest_end(skb, nest);
1331
1332nla_put_failure:
1333	nla_nest_cancel(skb, nest);
1334	return -1;
1335}
1336
1337static int
1338cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1339{
1340	struct cbq_sched_data *q = qdisc_priv(sch);
1341
1342	q->link.xstats.avgidle = q->link.avgidle;
1343	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1344}
1345
1346static int
1347cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1348	       struct sk_buff *skb, struct tcmsg *tcm)
1349{
1350	struct cbq_class *cl = (struct cbq_class *)arg;
1351	struct nlattr *nest;
1352
1353	if (cl->tparent)
1354		tcm->tcm_parent = cl->tparent->common.classid;
1355	else
1356		tcm->tcm_parent = TC_H_ROOT;
1357	tcm->tcm_handle = cl->common.classid;
1358	tcm->tcm_info = cl->q->handle;
1359
1360	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1361	if (nest == NULL)
1362		goto nla_put_failure;
1363	if (cbq_dump_attr(skb, cl) < 0)
1364		goto nla_put_failure;
1365	return nla_nest_end(skb, nest);
1366
1367nla_put_failure:
1368	nla_nest_cancel(skb, nest);
1369	return -1;
1370}
1371
1372static int
1373cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1374	struct gnet_dump *d)
1375{
1376	struct cbq_sched_data *q = qdisc_priv(sch);
1377	struct cbq_class *cl = (struct cbq_class *)arg;
1378	__u32 qlen;
1379
1380	cl->xstats.avgidle = cl->avgidle;
1381	cl->xstats.undertime = 0;
1382	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1383
1384	if (cl->undertime != PSCHED_PASTPERFECT)
1385		cl->xstats.undertime = cl->undertime - q->now;
1386
1387	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1388				  d, NULL, &cl->bstats) < 0 ||
1389	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1390	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1391		return -1;
1392
1393	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1394}
1395
1396static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1397		     struct Qdisc **old, struct netlink_ext_ack *extack)
1398{
1399	struct cbq_class *cl = (struct cbq_class *)arg;
1400
1401	if (new == NULL) {
1402		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1403					cl->common.classid, extack);
1404		if (new == NULL)
1405			return -ENOBUFS;
1406	}
1407
1408	*old = qdisc_replace(sch, new, &cl->q);
1409	return 0;
1410}
1411
1412static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1413{
1414	struct cbq_class *cl = (struct cbq_class *)arg;
1415
1416	return cl->q;
1417}
1418
1419static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1420{
1421	struct cbq_class *cl = (struct cbq_class *)arg;
1422
 1423	cbq_deactivate_class(cl);
1424}
1425
1426static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1427{
 1428	struct cbq_sched_data *q = qdisc_priv(sch);
 1429
 1430	return (unsigned long)cbq_class_lookup(q, classid);
1431}
1432
1433static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1434{
1435	struct cbq_sched_data *q = qdisc_priv(sch);
1436
1437	WARN_ON(cl->filters);
1438
1439	tcf_block_put(cl->block);
1440	qdisc_put(cl->q);
1441	qdisc_put_rtab(cl->R_tab);
1442	gen_kill_estimator(&cl->rate_est);
1443	if (cl != &q->link)
1444		kfree(cl);
1445}
1446
1447static void cbq_destroy(struct Qdisc *sch)
1448{
1449	struct cbq_sched_data *q = qdisc_priv(sch);
1450	struct hlist_node *next;
1451	struct cbq_class *cl;
1452	unsigned int h;
1453
1454#ifdef CONFIG_NET_CLS_ACT
1455	q->rx_class = NULL;
1456#endif
1457	/*
1458	 * Filters must be destroyed first because we don't destroy the
 1459	 * classes from root to leaves, which means that filters can still
1460	 * be bound to classes which have been destroyed already. --TGR '04
1461	 */
1462	for (h = 0; h < q->clhash.hashsize; h++) {
1463		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1464			tcf_block_put(cl->block);
1465			cl->block = NULL;
1466		}
1467	}
1468	for (h = 0; h < q->clhash.hashsize; h++) {
1469		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1470					  common.hnode)
1471			cbq_destroy_class(sch, cl);
1472	}
1473	qdisc_class_hash_destroy(&q->clhash);
1474}
 1475
1476static int
1477cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1478		 unsigned long *arg, struct netlink_ext_ack *extack)
1479{
1480	int err;
1481	struct cbq_sched_data *q = qdisc_priv(sch);
1482	struct cbq_class *cl = (struct cbq_class *)*arg;
1483	struct nlattr *opt = tca[TCA_OPTIONS];
1484	struct nlattr *tb[TCA_CBQ_MAX + 1];
1485	struct cbq_class *parent;
1486	struct qdisc_rate_table *rtab = NULL;
1487
 1488	err = cbq_opt_parse(tb, opt, extack);
1489	if (err < 0)
1490		return err;
1491
1492	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1493		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1494		return -EOPNOTSUPP;
1495	}
1496
1497	if (cl) {
1498		/* Check parent */
1499		if (parentid) {
1500			if (cl->tparent &&
1501			    cl->tparent->common.classid != parentid) {
1502				NL_SET_ERR_MSG(extack, "Invalid parent id");
1503				return -EINVAL;
1504			}
1505			if (!cl->tparent && parentid != TC_H_ROOT) {
1506				NL_SET_ERR_MSG(extack, "Parent must be root");
1507				return -EINVAL;
1508			}
1509		}
1510
1511		if (tb[TCA_CBQ_RATE]) {
1512			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1513					      tb[TCA_CBQ_RTAB], extack);
1514			if (rtab == NULL)
1515				return -EINVAL;
1516		}
1517
1518		if (tca[TCA_RATE]) {
1519			err = gen_replace_estimator(&cl->bstats, NULL,
1520						    &cl->rate_est,
1521						    NULL,
1522						    qdisc_root_sleeping_running(sch),
1523						    tca[TCA_RATE]);
1524			if (err) {
1525				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1526				qdisc_put_rtab(rtab);
1527				return err;
1528			}
1529		}
1530
1531		/* Change class parameters */
1532		sch_tree_lock(sch);
1533
1534		if (cl->next_alive != NULL)
1535			cbq_deactivate_class(cl);
1536
1537		if (rtab) {
1538			qdisc_put_rtab(cl->R_tab);
1539			cl->R_tab = rtab;
1540		}
1541
1542		if (tb[TCA_CBQ_LSSOPT])
1543			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1544
1545		if (tb[TCA_CBQ_WRROPT]) {
1546			cbq_rmprio(q, cl);
1547			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1548		}
1549
1550		if (tb[TCA_CBQ_FOPT])
1551			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1552
1553		if (cl->q->q.qlen)
1554			cbq_activate_class(cl);
1555
1556		sch_tree_unlock(sch);
1557
1558		return 0;
1559	}
1560
1561	if (parentid == TC_H_ROOT)
1562		return -EINVAL;
1563
1564	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1565		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1566		return -EINVAL;
1567	}
1568
1569	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1570			      extack);
1571	if (rtab == NULL)
1572		return -EINVAL;
1573
1574	if (classid) {
1575		err = -EINVAL;
1576		if (TC_H_MAJ(classid ^ sch->handle) ||
1577		    cbq_class_lookup(q, classid)) {
1578			NL_SET_ERR_MSG(extack, "Specified class not found");
1579			goto failure;
1580		}
1581	} else {
1582		int i;
1583		classid = TC_H_MAKE(sch->handle, 0x8000);
1584
1585		for (i = 0; i < 0x8000; i++) {
1586			if (++q->hgenerator >= 0x8000)
1587				q->hgenerator = 1;
1588			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1589				break;
1590		}
1591		err = -ENOSR;
1592		if (i >= 0x8000) {
1593			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1594			goto failure;
1595		}
1596		classid = classid|q->hgenerator;
1597	}
1598
1599	parent = &q->link;
1600	if (parentid) {
1601		parent = cbq_class_lookup(q, parentid);
1602		err = -EINVAL;
1603		if (!parent) {
1604			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1605			goto failure;
1606		}
1607	}
1608
1609	err = -ENOBUFS;
1610	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1611	if (cl == NULL)
1612		goto failure;
1613
1614	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1615	if (err) {
1616		kfree(cl);
1617		return err;
1618	}
1619
1620	if (tca[TCA_RATE]) {
1621		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1622					NULL,
1623					qdisc_root_sleeping_running(sch),
1624					tca[TCA_RATE]);
1625		if (err) {
1626			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1627			tcf_block_put(cl->block);
1628			kfree(cl);
1629			goto failure;
1630		}
1631	}
1632
1633	cl->R_tab = rtab;
1634	rtab = NULL;
1635	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1636				  NULL);
1637	if (!cl->q)
1638		cl->q = &noop_qdisc;
1639	else
1640		qdisc_hash_add(cl->q, true);
1641
1642	cl->common.classid = classid;
1643	cl->tparent = parent;
1644	cl->qdisc = sch;
1645	cl->allot = parent->allot;
1646	cl->quantum = cl->allot;
1647	cl->weight = cl->R_tab->rate.rate;
1648
1649	sch_tree_lock(sch);
1650	cbq_link_class(cl);
1651	cl->borrow = cl->tparent;
1652	if (cl->tparent != &q->link)
1653		cl->share = cl->tparent;
1654	cbq_adjust_levels(parent);
1655	cl->minidle = -0x7FFFFFFF;
1656	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1657	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1658	if (cl->ewma_log == 0)
1659		cl->ewma_log = q->link.ewma_log;
1660	if (cl->maxidle == 0)
1661		cl->maxidle = q->link.maxidle;
1662	if (cl->avpkt == 0)
1663		cl->avpkt = q->link.avpkt;
1664	if (tb[TCA_CBQ_FOPT])
1665		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1666	sch_tree_unlock(sch);
1667
1668	qdisc_class_hash_grow(sch, &q->clhash);
1669
1670	*arg = (unsigned long)cl;
1671	return 0;
1672
1673failure:
1674	qdisc_put_rtab(rtab);
1675	return err;
1676}
1677
1678static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1679{
1680	struct cbq_sched_data *q = qdisc_priv(sch);
 1681	struct cbq_class *cl = (struct cbq_class *)arg;
1682
1683	if (cl->filters || cl->children || cl == &q->link)
1684		return -EBUSY;
1685
1686	sch_tree_lock(sch);
1687
 1688	qdisc_purge_queue(cl->q);
1689
1690	if (cl->next_alive)
1691		cbq_deactivate_class(cl);
1692
1693	if (q->tx_borrowed == cl)
1694		q->tx_borrowed = q->tx_class;
1695	if (q->tx_class == cl) {
1696		q->tx_class = NULL;
1697		q->tx_borrowed = NULL;
1698	}
1699#ifdef CONFIG_NET_CLS_ACT
1700	if (q->rx_class == cl)
1701		q->rx_class = NULL;
1702#endif
1703
1704	cbq_unlink_class(cl);
1705	cbq_adjust_levels(cl->tparent);
1706	cl->defmap = 0;
1707	cbq_sync_defmap(cl);
1708
1709	cbq_rmprio(q, cl);
1710	sch_tree_unlock(sch);
1711
 1712	cbq_destroy_class(sch, cl);
1713	return 0;
1714}
1715
1716static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1717				       struct netlink_ext_ack *extack)
1718{
1719	struct cbq_sched_data *q = qdisc_priv(sch);
1720	struct cbq_class *cl = (struct cbq_class *)arg;
1721
1722	if (cl == NULL)
1723		cl = &q->link;
1724
1725	return cl->block;
1726}
1727
1728static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1729				     u32 classid)
1730{
1731	struct cbq_sched_data *q = qdisc_priv(sch);
1732	struct cbq_class *p = (struct cbq_class *)parent;
1733	struct cbq_class *cl = cbq_class_lookup(q, classid);
1734
1735	if (cl) {
1736		if (p && p->level <= cl->level)
1737			return 0;
1738		cl->filters++;
1739		return (unsigned long)cl;
1740	}
1741	return 0;
1742}
1743
1744static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1745{
1746	struct cbq_class *cl = (struct cbq_class *)arg;
1747
1748	cl->filters--;
1749}
1750
1751static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1752{
1753	struct cbq_sched_data *q = qdisc_priv(sch);
1754	struct cbq_class *cl;
1755	unsigned int h;
1756
1757	if (arg->stop)
1758		return;
1759
1760	for (h = 0; h < q->clhash.hashsize; h++) {
1761		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1762			if (arg->count < arg->skip) {
1763				arg->count++;
1764				continue;
1765			}
1766			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1767				arg->stop = 1;
1768				return;
1769			}
1770			arg->count++;
1771		}
1772	}
1773}
1774
1775static const struct Qdisc_class_ops cbq_class_ops = {
1776	.graft		=	cbq_graft,
1777	.leaf		=	cbq_leaf,
1778	.qlen_notify	=	cbq_qlen_notify,
 1779	.find		=	cbq_find,
1780	.change		=	cbq_change_class,
1781	.delete		=	cbq_delete,
1782	.walk		=	cbq_walk,
1783	.tcf_block	=	cbq_tcf_block,
1784	.bind_tcf	=	cbq_bind_filter,
1785	.unbind_tcf	=	cbq_unbind_filter,
1786	.dump		=	cbq_dump_class,
1787	.dump_stats	=	cbq_dump_class_stats,
1788};
1789
1790static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1791	.next		=	NULL,
1792	.cl_ops		=	&cbq_class_ops,
1793	.id		=	"cbq",
1794	.priv_size	=	sizeof(struct cbq_sched_data),
1795	.enqueue	=	cbq_enqueue,
1796	.dequeue	=	cbq_dequeue,
1797	.peek		=	qdisc_peek_dequeued,
1798	.init		=	cbq_init,
1799	.reset		=	cbq_reset,
1800	.destroy	=	cbq_destroy,
1801	.change		=	NULL,
1802	.dump		=	cbq_dump,
1803	.dump_stats	=	cbq_dump_stats,
1804	.owner		=	THIS_MODULE,
1805};
1806
1807static int __init cbq_module_init(void)
1808{
1809	return register_qdisc(&cbq_qdisc_ops);
1810}
1811static void __exit cbq_module_exit(void)
1812{
1813	unregister_qdisc(&cbq_qdisc_ops);
1814}
1815module_init(cbq_module_init)
1816module_exit(cbq_module_exit)
1817MODULE_LICENSE("GPL");
v4.10.11
 
   1/*
   2 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/slab.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/string.h>
  18#include <linux/errno.h>
  19#include <linux/skbuff.h>
  20#include <net/netlink.h>
  21#include <net/pkt_sched.h>
 
  22
  23
  24/*	Class-Based Queueing (CBQ) algorithm.
  25	=======================================
  26
  27	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  28		 Management Models for Packet Networks",
  29		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  30
  31		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  32
  33		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  34		 Parameters", 1996
  35
  36		 [4] Sally Floyd and Michael Speer, "Experimental Results
  37		 for Class-Based Queueing", 1998, not published.
  38
  39	-----------------------------------------------------------------------
  40
  41	Algorithm skeleton was taken from NS simulator cbq.cc.
  42	If someone wants to check this code against the LBL version,
  43	he should take into account that ONLY the skeleton was borrowed,
  44	the implementation is different. Particularly:
  45
  46	--- The WRR algorithm is different. Our version looks more
  47	reasonable (I hope) and works when quanta are allowed to be
  48	less than MTU, which is always the case when real time classes
  49	have small rates. Note, that the statement of [3] is
  50	incomplete, delay may actually be estimated even if class
  51	per-round allotment is less than MTU. Namely, if per-round
  52	allotment is W*r_i, and r_1+...+r_k = r < 1
  53
  54	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  55
  56	In the worst case we have IntServ estimate with D = W*r+k*MTU
  57	and C = MTU*r. The proof (if correct at all) is trivial.
  58
  59
  60	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
  61	interpret some places, which look like wrong translations
  62	from NS. Anyone is advised to find these differences
  63	and explain to me, why I am wrong 8).
  64
  65	--- Linux has no EOI event, so that we cannot estimate true class
  66	idle time. Workaround is to consider the next dequeue event
  67	as sign that previous packet is finished. This is wrong because of
  68	internal device queueing, but on a permanently loaded link it is true.
  69	Moreover, combined with clock integrator, this scheme looks
  70	very close to an ideal solution.  */
  71
  72struct cbq_sched_data;
  73
  74
  75struct cbq_class {
  76	struct Qdisc_class_common common;
  77	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
  78
  79/* Parameters */
  80	unsigned char		priority;	/* class priority */
  81	unsigned char		priority2;	/* priority to be used after overlimit */
  82	unsigned char		ewma_log;	/* time constant for idle time calculation */
  83
  84	u32			defmap;
  85
  86	/* Link-sharing scheduler parameters */
  87	long			maxidle;	/* Class parameters: see below. */
  88	long			offtime;
  89	long			minidle;
  90	u32			avpkt;
  91	struct qdisc_rate_table	*R_tab;
  92
  93	/* General scheduler (WRR) parameters */
  94	long			allot;
  95	long			quantum;	/* Allotment per WRR round */
  96	long			weight;		/* Relative allotment: see below */
  97
  98	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
  99	struct cbq_class	*split;		/* Ptr to split node */
 100	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
 101	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
 102	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
 103						   parent otherwise */
 104	struct cbq_class	*sibling;	/* Sibling chain */
 105	struct cbq_class	*children;	/* Pointer to children chain */
 106
 107	struct Qdisc		*q;		/* Elementary queueing discipline */
 108
 109
 110/* Variables */
 111	unsigned char		cpriority;	/* Effective priority */
 112	unsigned char		delayed;
 113	unsigned char		level;		/* level of the class in hierarchy:
 114						   0 for leaf classes, and maximal
 115						   level of children + 1 for nodes.
 116						 */
 117
 118	psched_time_t		last;		/* Last end of service */
 119	psched_time_t		undertime;
 120	long			avgidle;
 121	long			deficit;	/* Saved deficit for WRR */
 122	psched_time_t		penalized;
 123	struct gnet_stats_basic_packed bstats;
 124	struct gnet_stats_queue qstats;
 125	struct net_rate_estimator __rcu *rate_est;
 126	struct tc_cbq_xstats	xstats;
 127
 128	struct tcf_proto __rcu	*filter_list;
 
 129
 130	int			refcnt;
 131	int			filters;
 132
 133	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
 134};
 135
 136struct cbq_sched_data {
 137	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
 138	int			nclasses[TC_CBQ_MAXPRIO + 1];
 139	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
 140
 141	struct cbq_class	link;
 142
 143	unsigned int		activemask;
 144	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
 145								   with backlog */
 146
 147#ifdef CONFIG_NET_CLS_ACT
 148	struct cbq_class	*rx_class;
 149#endif
 150	struct cbq_class	*tx_class;
 151	struct cbq_class	*tx_borrowed;
 152	int			tx_len;
 153	psched_time_t		now;		/* Cached timestamp */
 154	unsigned int		pmask;
 155
 156	struct hrtimer		delay_timer;
 157	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
 158						   started when CBQ has
 159						   backlog, but cannot
 160						   transmit just now */
 161	psched_tdiff_t		wd_expires;
 162	int			toplevel;
 163	u32			hgenerator;
 164};
 165
 166
 167#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
 168
 169static inline struct cbq_class *
 170cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 171{
 172	struct Qdisc_class_common *clc;
 173
 174	clc = qdisc_class_find(&q->clhash, classid);
 175	if (clc == NULL)
 176		return NULL;
 177	return container_of(clc, struct cbq_class, common);
 178}
 179
 180#ifdef CONFIG_NET_CLS_ACT
 181
 182static struct cbq_class *
 183cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 184{
 185	struct cbq_class *cl;
 186
 187	for (cl = this->tparent; cl; cl = cl->tparent) {
 188		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 189
 190		if (new != NULL && new != this)
 191			return new;
 192	}
 193	return NULL;
 194}
 195
 196#endif
 197
  198/* Classify packet. The procedure is pretty complicated, but
  199 * it allows us to combine link sharing and priority scheduling
  200 * transparently.
  201 *
  202 * Namely, you can put link-sharing rules (e.g. route-based) at the root of CBQ,
  203 * so that they resolve to split nodes. Then packets are classified
  204 * by logical priority, or a more specific classifier may be attached
  205 * to the split node.
  206 */
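
/*
 * Illustrative sketch (not part of this file): a classid packs a 16-bit
 * major (the owning qdisc's handle) and a 16-bit minor. Step 1 of
 * cbq_classify() accepts skb->priority only when its major part matches
 * the qdisc handle. The H_* macros below are toy stand-ins for the
 * kernel's TC_H_* helpers.
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>

#define H_MAJ(h)	 ((h) & 0xFFFF0000U)
#define H_MIN(h)	 ((h) & 0x0000FFFFU)
#define H_MAKE(maj, min) (H_MAJ(maj) | H_MIN(min))

int main(void)
{
	unsigned int handle = H_MAKE(0x10000U, 0);	/* qdisc 1:   */
	unsigned int prio   = H_MAKE(0x10000U, 0x20U);	/* class 1:20 */

	/* same test as Step 1: the majors are equal iff XOR clears them */
	printf("ours: %d\n", H_MAJ(prio ^ handle) == 0);	/* 1 */
	return 0;
}
#endif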
 207
 208static struct cbq_class *
 209cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 210{
 211	struct cbq_sched_data *q = qdisc_priv(sch);
 212	struct cbq_class *head = &q->link;
 213	struct cbq_class **defmap;
 214	struct cbq_class *cl = NULL;
 215	u32 prio = skb->priority;
 216	struct tcf_proto *fl;
 217	struct tcf_result res;
 218
 219	/*
 220	 *  Step 1. If skb->priority points to one of our classes, use it.
 221	 */
 222	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 223	    (cl = cbq_class_lookup(q, prio)) != NULL)
 224		return cl;
 225
 226	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 227	for (;;) {
 228		int result = 0;
 229		defmap = head->defaults;
 230
 231		fl = rcu_dereference_bh(head->filter_list);
 232		/*
 233		 * Step 2+n. Apply classifier.
 234		 */
 235		result = tc_classify(skb, fl, &res, true);
 236		if (!fl || result < 0)
 237			goto fallback;
 238
 239		cl = (void *)res.class;
 240		if (!cl) {
 241			if (TC_H_MAJ(res.classid))
 242				cl = cbq_class_lookup(q, res.classid);
 243			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 244				cl = defmap[TC_PRIO_BESTEFFORT];
 245
 246			if (cl == NULL)
 247				goto fallback;
 248		}
 249		if (cl->level >= head->level)
 250			goto fallback;
 251#ifdef CONFIG_NET_CLS_ACT
 252		switch (result) {
 253		case TC_ACT_QUEUED:
 254		case TC_ACT_STOLEN:
  255			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
 256		case TC_ACT_SHOT:
 257			return NULL;
 258		case TC_ACT_RECLASSIFY:
 259			return cbq_reclassify(skb, cl);
 260		}
 261#endif
 262		if (cl->level == 0)
 263			return cl;
 264
  265		/*
  266		 * Step 3+n. If the classifier selected a link-sharing class,
  267		 *	   apply the agency-specific classifier.
  268		 *	   Repeat this procedure until we hit a leaf node.
  269		 */
 270		head = cl;
 271	}
 272
 273fallback:
 274	cl = head;
 275
 276	/*
 277	 * Step 4. No success...
 278	 */
 279	if (TC_H_MAJ(prio) == 0 &&
 280	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 281	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 282		return head;
 283
 284	return cl;
 285}
 286
  287/*
  288 * A packet has just been enqueued on an empty class.
  289 * cbq_activate_class adds it to the tail of the active class list
  290 * of its priority band.
  291 */
 292
 293static inline void cbq_activate_class(struct cbq_class *cl)
 294{
 295	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 296	int prio = cl->cpriority;
 297	struct cbq_class *cl_tail;
 298
 299	cl_tail = q->active[prio];
 300	q->active[prio] = cl;
 301
 302	if (cl_tail != NULL) {
 303		cl->next_alive = cl_tail->next_alive;
 304		cl_tail->next_alive = cl;
 305	} else {
 306		cl->next_alive = cl;
 307		q->activemask |= (1<<prio);
 308	}
 309}
 310
  311/*
  312 * Unlink class from the active chain.
  313 * Note that the same unlinking is done directly in cbq_dequeue*
  314 * during the round-robin pass.
  315 */
 316
 317static void cbq_deactivate_class(struct cbq_class *this)
 318{
 319	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 320	int prio = this->cpriority;
 321	struct cbq_class *cl;
 322	struct cbq_class *cl_prev = q->active[prio];
 323
 324	do {
 325		cl = cl_prev->next_alive;
 326		if (cl == this) {
 327			cl_prev->next_alive = cl->next_alive;
 328			cl->next_alive = NULL;
 329
 330			if (cl == q->active[prio]) {
 331				q->active[prio] = cl_prev;
 332				if (cl == q->active[prio]) {
 333					q->active[prio] = NULL;
 334					q->activemask &= ~(1<<prio);
 335					return;
 336				}
 337			}
 338			return;
 339		}
 340	} while ((cl_prev = cl) != q->active[prio]);
 341}
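
/*
 * Illustrative sketch (not part of this file): each active list is a
 * circular singly-linked list threaded through next_alive, with
 * q->active[prio] pointing at the *tail*, so appending and reaching the
 * head (tail->next_alive) are both O(1). A toy model of the insertion
 * done by cbq_activate_class():
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>

struct node { int id; struct node *next; };

static struct node *tail;	/* analogue of q->active[prio] */

static void activate(struct node *n)
{
	if (tail) {		/* splice in after the current tail */
		n->next = tail->next;
		tail->next = n;
	} else {		/* first member points at itself */
		n->next = n;
	}
	tail = n;
}

int main(void)
{
	struct node a = { 1 }, b = { 2 }, c = { 3 };
	struct node *p;
	int i;

	activate(&a); activate(&b); activate(&c);
	for (i = 0, p = tail->next; i < 3; i++, p = p->next)
		printf("%d ", p->id);	/* head-to-tail: 1 2 3 */
	printf("\n");
	return 0;
}
#endif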
 342
 343static void
 344cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 345{
 346	int toplevel = q->toplevel;
 347
 348	if (toplevel > cl->level) {
 349		psched_time_t now = psched_get_time();
 350
 351		do {
 352			if (cl->undertime < now) {
 353				q->toplevel = cl->level;
 354				return;
 355			}
 356		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 357	}
 358}
 359
 360static int
 361cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 362	    struct sk_buff **to_free)
 363{
 364	struct cbq_sched_data *q = qdisc_priv(sch);
 365	int uninitialized_var(ret);
 366	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 367
 368#ifdef CONFIG_NET_CLS_ACT
 369	q->rx_class = cl;
 370#endif
 371	if (cl == NULL) {
 372		if (ret & __NET_XMIT_BYPASS)
 373			qdisc_qstats_drop(sch);
 374		__qdisc_drop(skb, to_free);
 375		return ret;
 376	}
 377
 378	ret = qdisc_enqueue(skb, cl->q, to_free);
 379	if (ret == NET_XMIT_SUCCESS) {
 380		sch->q.qlen++;
 381		cbq_mark_toplevel(q, cl);
 382		if (!cl->next_alive)
 383			cbq_activate_class(cl);
 384		return ret;
 385	}
 386
 387	if (net_xmit_drop_count(ret)) {
 388		qdisc_qstats_drop(sch);
 389		cbq_mark_toplevel(q, cl);
 390		cl->qstats.drops++;
 391	}
 392	return ret;
 393}
 394
 395/* Overlimit action: penalize leaf class by adding offtime */
 396static void cbq_overlimit(struct cbq_class *cl)
 397{
 398	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 399	psched_tdiff_t delay = cl->undertime - q->now;
 400
 401	if (!cl->delayed) {
 402		delay += cl->offtime;
 403
  404		/*
  405		 * The class goes to sleep, so that it will have no
  406		 * chance to work off avgidle. Let's forgive it 8)
  407		 *
  408		 * BTW cbq-2.0 has a bug in this
  409		 * place: apparently they forgot to shift by cl->ewma_log.
  410		 */
 411		if (cl->avgidle < 0)
 412			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 413		if (cl->avgidle < cl->minidle)
 414			cl->avgidle = cl->minidle;
 415		if (delay <= 0)
 416			delay = 1;
 417		cl->undertime = q->now + delay;
 418
 419		cl->xstats.overactions++;
 420		cl->delayed = 1;
 421	}
 422	if (q->wd_expires == 0 || q->wd_expires > delay)
 423		q->wd_expires = delay;
 424
  425	/* Dirty work! We must schedule wakeups based on the
  426	 * real available rate, rather than the leaf rate,
  427	 * which may be tiny (even zero).
  428	 */
 429	if (q->toplevel == TC_CBQ_MAXLEVEL) {
 430		struct cbq_class *b;
 431		psched_tdiff_t base_delay = q->wd_expires;
 432
 433		for (b = cl->borrow; b; b = b->borrow) {
 434			delay = b->undertime - q->now;
 435			if (delay < base_delay) {
 436				if (delay <= 0)
 437					delay = 1;
 438				base_delay = delay;
 439			}
 440		}
 441
 442		q->wd_expires = base_delay;
 443	}
 444}
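
/*
 * Illustrative numeric sketch (hypothetical values, not kernel code):
 * the penalty computed by cbq_overlimit() for a class that first goes
 * overlimit. With undertime - now = 200 ticks, offtime = 1000 and
 * avgidle = -300 at ewma_log = 5, the forgiveness term knocks
 * 300 - (300 >> 5) = 291 ticks off the delay.
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>

int main(void)
{
	long undertime_minus_now = 200;	/* hypothetical cl->undertime - q->now */
	long offtime = 1000, avgidle = -300;
	unsigned char ewma_log = 5;
	long delay = undertime_minus_now + offtime;

	if (avgidle < 0)	/* same forgiveness as cbq_overlimit() */
		delay -= (-avgidle) - ((-avgidle) >> ewma_log);

	printf("delay = %ld ticks\n", delay);	/* 1200 - 291 = 909 */
	return 0;
}
#endif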
 445
 446static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 447				       psched_time_t now)
 448{
 449	struct cbq_class *cl;
 450	struct cbq_class *cl_prev = q->active[prio];
 451	psched_time_t sched = now;
 452
 453	if (cl_prev == NULL)
 454		return 0;
 455
 456	do {
 457		cl = cl_prev->next_alive;
 458		if (now - cl->penalized > 0) {
 459			cl_prev->next_alive = cl->next_alive;
 460			cl->next_alive = NULL;
 461			cl->cpriority = cl->priority;
 462			cl->delayed = 0;
 463			cbq_activate_class(cl);
 464
 465			if (cl == q->active[prio]) {
 466				q->active[prio] = cl_prev;
 467				if (cl == q->active[prio]) {
 468					q->active[prio] = NULL;
 469					return 0;
 470				}
 471			}
 472
 473			cl = cl_prev->next_alive;
 474		} else if (sched - cl->penalized > 0)
 475			sched = cl->penalized;
 476	} while ((cl_prev = cl) != q->active[prio]);
 477
 478	return sched - now;
 479}
 480
 481static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 482{
 483	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 484						delay_timer);
 485	struct Qdisc *sch = q->watchdog.qdisc;
 486	psched_time_t now;
 487	psched_tdiff_t delay = 0;
 488	unsigned int pmask;
 489
 490	now = psched_get_time();
 491
 492	pmask = q->pmask;
 493	q->pmask = 0;
 494
 495	while (pmask) {
 496		int prio = ffz(~pmask);
 497		psched_tdiff_t tmp;
 498
 499		pmask &= ~(1<<prio);
 500
 501		tmp = cbq_undelay_prio(q, prio, now);
 502		if (tmp > 0) {
 503			q->pmask |= 1<<prio;
 504			if (tmp < delay || delay == 0)
 505				delay = tmp;
 506		}
 507	}
 508
 509	if (delay) {
 510		ktime_t time;
 511
 512		time = 0;
 513		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 514		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 515	}
 516
 517	__netif_schedule(qdisc_root(sch));
 518	return HRTIMER_NORESTART;
 519}
 520
  521/*
  522 * This is a mission-critical procedure.
  523 *
  524 * We "regenerate" the toplevel cutoff if the transmitting class
  525 * has backlog and is not regulated. This is not part of the
  526 * original CBQ description, but looks more reasonable.
  527 * Probably, it is wrong. This question needs further investigation.
  528 */
 529
 530static inline void
 531cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 532		    struct cbq_class *borrowed)
 533{
 534	if (cl && q->toplevel >= borrowed->level) {
 535		if (cl->q->q.qlen > 1) {
 536			do {
 537				if (borrowed->undertime == PSCHED_PASTPERFECT) {
 538					q->toplevel = borrowed->level;
 539					return;
 540				}
 541			} while ((borrowed = borrowed->borrow) != NULL);
 542		}
 543#if 0
 544	/* It is not necessary now. Uncommenting it
 545	   will save CPU cycles, but decrease fairness.
 546	 */
 547		q->toplevel = TC_CBQ_MAXLEVEL;
 548#endif
 549	}
 550}
 551
 552static void
 553cbq_update(struct cbq_sched_data *q)
 554{
 555	struct cbq_class *this = q->tx_class;
 556	struct cbq_class *cl = this;
 557	int len = q->tx_len;
 558	psched_time_t now;
 559
 560	q->tx_class = NULL;
 561	/* Time integrator. We calculate EOS time
 562	 * by adding expected packet transmission time.
 563	 */
 564	now = q->now + L2T(&q->link, len);
 565
 566	for ( ; cl; cl = cl->share) {
 567		long avgidle = cl->avgidle;
 568		long idle;
 569
 570		cl->bstats.packets++;
 571		cl->bstats.bytes += len;
 572
 573		/*
 574		 * (now - last) is total time between packet right edges.
 575		 * (last_pktlen/rate) is "virtual" busy time, so that
 576		 *
 577		 *	idle = (now - last) - last_pktlen/rate
 578		 */
 579
 580		idle = now - cl->last;
 581		if ((unsigned long)idle > 128*1024*1024) {
 582			avgidle = cl->maxidle;
 583		} else {
 584			idle -= L2T(cl, len);
 585
 586		/* true_avgidle := (1-W)*true_avgidle + W*idle,
 587		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 588		 * cl->avgidle == true_avgidle/W,
 589		 * hence:
 590		 */
 591			avgidle += idle - (avgidle>>cl->ewma_log);
 592		}
 593
 594		if (avgidle <= 0) {
 595			/* Overlimit or at-limit */
 596
 597			if (avgidle < cl->minidle)
 598				avgidle = cl->minidle;
 599
 600			cl->avgidle = avgidle;
 601
  602			/* Calculate the expected time when this class
  603			 * will be allowed to send.
  604			 * It occurs when:
  605			 * (1-W)*true_avgidle + W*delay = 0, i.e.
  606			 * idle = (1/W - 1)*(-true_avgidle)
  607			 * or, in scaled terms,
  608			 * idle = (1 - W)*(-cl->avgidle);
  609			 */
 610			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 611
  612			/*
  613			 * That is not all.
  614			 * To maintain the rate allocated to the class,
  615			 * we add to undertime the virtual clock time
  616			 * necessary to complete the transmitted packet.
  617			 * (len/phys_bandwidth has already elapsed
  618			 * by the moment cbq_update is called.)
  619			 */
 620
 621			idle -= L2T(&q->link, len);
 622			idle += L2T(cl, len);
 623
 624			cl->undertime = now + idle;
 625		} else {
 626			/* Underlimit */
 627
 628			cl->undertime = PSCHED_PASTPERFECT;
 629			if (avgidle > cl->maxidle)
 630				cl->avgidle = cl->maxidle;
 631			else
 632				cl->avgidle = avgidle;
 633		}
 634		if ((s64)(now - cl->last) > 0)
 635			cl->last = now;
 636	}
 637
 638	cbq_update_toplevel(q, this, q->tx_borrowed);
 639}
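
/*
 * Illustrative sketch (not kernel code): the scaled EWMA at the heart of
 * cbq_update(). With W = 2^-ewma_log the true average obeys
 * avg := (1-W)*avg + W*idle; storing avg/W instead lets each update be a
 * single shift: scaled += idle - (scaled >> ewma_log). The sample values
 * below are made up.
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>

int main(void)
{
	long scaled = 0;		/* plays the role of cl->avgidle */
	unsigned char ewma_log = 3;	/* W = 1/8 */
	long samples[] = { 80, 40, -16, 24 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		scaled += samples[i] - (scaled >> ewma_log);
		printf("idle=%4ld scaled=%5ld true_avg=%7.2f\n",
		       samples[i], scaled, scaled / 8.0);
	}
	return 0;
}
#endif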
 640
 641static inline struct cbq_class *
 642cbq_under_limit(struct cbq_class *cl)
 643{
 644	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 645	struct cbq_class *this_cl = cl;
 646
 647	if (cl->tparent == NULL)
 648		return cl;
 649
 650	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 651		cl->delayed = 0;
 652		return cl;
 653	}
 654
 655	do {
  656		/* This is a very suspicious place. Currently, the overlimit
  657		 * action is generated for non-bounded classes
  658		 * only if the link is completely congested.
  659		 * Though it agrees with the ancestor-only paradigm,
  660		 * it looks very stupid. In particular,
  661		 * it means that this chunk of code will either
  662		 * never be called or result in strong amplification
  663		 * of burstiness. Dangerous, silly, and, alas,
  664		 * no other solution exists.
  665		 */
 666		cl = cl->borrow;
 667		if (!cl) {
 668			this_cl->qstats.overlimits++;
 669			cbq_overlimit(this_cl);
 670			return NULL;
 671		}
 672		if (cl->level > q->toplevel)
 673			return NULL;
 674	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 675
 676	cl->delayed = 0;
 677	return cl;
 678}
 679
 680static inline struct sk_buff *
 681cbq_dequeue_prio(struct Qdisc *sch, int prio)
 682{
 683	struct cbq_sched_data *q = qdisc_priv(sch);
 684	struct cbq_class *cl_tail, *cl_prev, *cl;
 685	struct sk_buff *skb;
 686	int deficit;
 687
 688	cl_tail = cl_prev = q->active[prio];
 689	cl = cl_prev->next_alive;
 690
 691	do {
 692		deficit = 0;
 693
 694		/* Start round */
 695		do {
 696			struct cbq_class *borrow = cl;
 697
 698			if (cl->q->q.qlen &&
 699			    (borrow = cbq_under_limit(cl)) == NULL)
 700				goto skip_class;
 701
 702			if (cl->deficit <= 0) {
 703				/* Class exhausted its allotment per
 704				 * this round. Switch to the next one.
 705				 */
 706				deficit = 1;
 707				cl->deficit += cl->quantum;
 708				goto next_class;
 709			}
 710
 711			skb = cl->q->dequeue(cl->q);
 712
  713			/* Class did not give us any skb :-(
  714			 * It can occur even if cl->q->q.qlen != 0,
  715			 * e.g. if cl->q == "tbf"
  716			 */
 717			if (skb == NULL)
 718				goto skip_class;
 719
 720			cl->deficit -= qdisc_pkt_len(skb);
 721			q->tx_class = cl;
 722			q->tx_borrowed = borrow;
 723			if (borrow != cl) {
 724#ifndef CBQ_XSTATS_BORROWS_BYTES
 725				borrow->xstats.borrows++;
 726				cl->xstats.borrows++;
 727#else
 728				borrow->xstats.borrows += qdisc_pkt_len(skb);
 729				cl->xstats.borrows += qdisc_pkt_len(skb);
 730#endif
 731			}
 732			q->tx_len = qdisc_pkt_len(skb);
 733
 734			if (cl->deficit <= 0) {
 735				q->active[prio] = cl;
 736				cl = cl->next_alive;
 737				cl->deficit += cl->quantum;
 738			}
 739			return skb;
 740
 741skip_class:
 742			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 743				/* Class is empty or penalized.
 744				 * Unlink it from active chain.
 745				 */
 746				cl_prev->next_alive = cl->next_alive;
 747				cl->next_alive = NULL;
 748
 749				/* Did cl_tail point to it? */
 750				if (cl == cl_tail) {
 751					/* Repair it! */
 752					cl_tail = cl_prev;
 753
 754					/* Was it the last class in this band? */
 755					if (cl == cl_tail) {
 756						/* Kill the band! */
 757						q->active[prio] = NULL;
 758						q->activemask &= ~(1<<prio);
 759						if (cl->q->q.qlen)
 760							cbq_activate_class(cl);
 761						return NULL;
 762					}
 763
 764					q->active[prio] = cl_tail;
 765				}
 766				if (cl->q->q.qlen)
 767					cbq_activate_class(cl);
 768
 769				cl = cl_prev;
 770			}
 771
 772next_class:
 773			cl_prev = cl;
 774			cl = cl->next_alive;
 775		} while (cl_prev != cl_tail);
 776	} while (deficit);
 777
 778	q->active[prio] = cl_prev;
 779
 780	return NULL;
 781}
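
/*
 * Illustrative sketch (not kernel code): the deficit round-robin core of
 * cbq_dequeue_prio() in miniature. A class may transmit while its
 * deficit is positive; once it goes non-positive, the class is recharged
 * by its quantum and the turn passes on, yielding throughput
 * proportional to the quanta (3:1 here).
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>

int main(void)
{
	long deficit[2] = { 0, 0 };
	long quantum[2] = { 1500, 500 };	/* 3:1 weighting */
	long pktlen = 500;			/* fixed-size toy packets */
	int cl = 0, sent;

	for (sent = 0; sent < 8; ) {
		if (deficit[cl] <= 0) {		/* allotment exhausted */
			deficit[cl] += quantum[cl];
			cl ^= 1;		/* next class in the band */
			continue;
		}
		deficit[cl] -= pktlen;
		printf("class %d sends\n", cl);	/* 0,0,0,1, 0,0,0,1 */
		sent++;
	}
	return 0;
}
#endif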
 782
 783static inline struct sk_buff *
 784cbq_dequeue_1(struct Qdisc *sch)
 785{
 786	struct cbq_sched_data *q = qdisc_priv(sch);
 787	struct sk_buff *skb;
 788	unsigned int activemask;
 789
 790	activemask = q->activemask & 0xFF;
 791	while (activemask) {
 792		int prio = ffz(~activemask);
 793		activemask &= ~(1<<prio);
 794		skb = cbq_dequeue_prio(sch, prio);
 795		if (skb)
 796			return skb;
 797	}
 798	return NULL;
 799}
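
/*
 * Illustrative sketch (not kernel code): ffz(~activemask) above yields
 * the lowest set bit of activemask, i.e. the best (numerically smallest)
 * backlogged priority band. A userspace analogue using the GCC builtin:
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>

int main(void)
{
	unsigned int activemask = 0x34;	/* bands 2, 4 and 5 backlogged */

	while (activemask) {
		int prio = __builtin_ctz(activemask);	/* ffz(~m) analogue */

		activemask &= ~(1U << prio);
		printf("serve band %d\n", prio);	/* 2, then 4, then 5 */
	}
	return 0;
}
#endif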
 800
 801static struct sk_buff *
 802cbq_dequeue(struct Qdisc *sch)
 803{
 804	struct sk_buff *skb;
 805	struct cbq_sched_data *q = qdisc_priv(sch);
 806	psched_time_t now;
 807
 808	now = psched_get_time();
 809
 810	if (q->tx_class)
 811		cbq_update(q);
 812
 813	q->now = now;
 814
 815	for (;;) {
 816		q->wd_expires = 0;
 817
 818		skb = cbq_dequeue_1(sch);
 819		if (skb) {
 820			qdisc_bstats_update(sch, skb);
 821			sch->q.qlen--;
 822			return skb;
 823		}
 824
 825		/* All the classes are overlimit.
 826		 *
  827		 * This is possible if:
  828		 *
  829		 * 1. The scheduler is empty.
  830		 * 2. The toplevel cutoff inhibited borrowing.
  831		 * 3. The root class is overlimit.
  832		 *
  833		 * Reset the 2nd and 3rd conditions and retry.
  834		 *
  835		 * Note that NS and cbq-2.0 are buggy here: peeking
  836		 * an arbitrary class is appropriate for ancestor-only
  837		 * sharing, but not for the toplevel algorithm.
  838		 *
  839		 * Our version is better but slower, because it requires
  840		 * two passes; that is unavoidable with top-level sharing.
  841		 */
 842
 843		if (q->toplevel == TC_CBQ_MAXLEVEL &&
 844		    q->link.undertime == PSCHED_PASTPERFECT)
 845			break;
 846
 847		q->toplevel = TC_CBQ_MAXLEVEL;
 848		q->link.undertime = PSCHED_PASTPERFECT;
 849	}
 850
  851	/* No packets in the scheduler, or nobody wants to give them to us :-(
  852	 * Sigh... start the watchdog timer in the latter case.
  853	 */
 854
 855	if (sch->q.qlen) {
 856		qdisc_qstats_overlimit(sch);
 857		if (q->wd_expires)
 858			qdisc_watchdog_schedule(&q->watchdog,
 859						now + q->wd_expires);
 860	}
 861	return NULL;
 862}
 863
  864/* CBQ class maintenance routines */
 865
 866static void cbq_adjust_levels(struct cbq_class *this)
 867{
 868	if (this == NULL)
 869		return;
 870
 871	do {
 872		int level = 0;
 873		struct cbq_class *cl;
 874
 875		cl = this->children;
 876		if (cl) {
 877			do {
 878				if (cl->level > level)
 879					level = cl->level;
 880			} while ((cl = cl->sibling) != this->children);
 881		}
 882		this->level = level + 1;
 883	} while ((this = this->tparent) != NULL);
 884}
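
/*
 * Illustrative example (not from the original source): the level
 * invariant maintained by cbq_adjust_levels(). Leaves are level 0 and
 * every interior class is one more than its deepest child, e.g.:
 *
 *	root (level 2)
 *	+-- A  (level 1)
 *	|   +-- A1 (level 0)
 *	|   +-- A2 (level 0)
 *	+-- B  (level 0)
 */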
 885
 886static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 887{
 888	struct cbq_class *cl;
 889	unsigned int h;
 890
 891	if (q->quanta[prio] == 0)
 892		return;
 893
 894	for (h = 0; h < q->clhash.hashsize; h++) {
 895		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  896			/* BUGGGG... Beware! This expression suffers from
  897			 * arithmetic overflow!
  898			 */
 899			if (cl->priority == prio) {
 900				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 901					q->quanta[prio];
 902			}
 903			if (cl->quantum <= 0 ||
 904			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 905				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 906					cl->common.classid, cl->quantum);
 907				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 908			}
 909		}
 910	}
 911}
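
/*
 * Illustrative sketch (an assumption, not the kernel's code): the
 * quantum formula above multiplies weight*allot*nclasses in a plain
 * long, which the comment warns can overflow. Widening the intermediate
 * to 64 bits would avoid that; the values below are made up.
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	long weight = 100000, allot = 65535, nclasses = 8, quanta = 400000;
	/* risky: weight*allot*nclasses ~ 5.2e10 overflows a 32-bit long */
	long risky = (weight * allot * nclasses) / quanta;
	long safe  = (long)(((int64_t)weight * allot * nclasses) / quanta);

	printf("risky=%ld safe=%ld\n", risky, safe);	/* safe = 131070 */
	return 0;
}
#endif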
 912
 913static void cbq_sync_defmap(struct cbq_class *cl)
 914{
 915	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 916	struct cbq_class *split = cl->split;
 917	unsigned int h;
 918	int i;
 919
 920	if (split == NULL)
 921		return;
 922
 923	for (i = 0; i <= TC_PRIO_MAX; i++) {
 924		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 925			split->defaults[i] = NULL;
 926	}
 927
 928	for (i = 0; i <= TC_PRIO_MAX; i++) {
 929		int level = split->level;
 930
 931		if (split->defaults[i])
 932			continue;
 933
 934		for (h = 0; h < q->clhash.hashsize; h++) {
 935			struct cbq_class *c;
 936
 937			hlist_for_each_entry(c, &q->clhash.hash[h],
 938					     common.hnode) {
 939				if (c->split == split && c->level < level &&
 940				    c->defmap & (1<<i)) {
 941					split->defaults[i] = c;
 942					level = c->level;
 943				}
 944			}
 945		}
 946	}
 947}
 948
 949static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 950{
 951	struct cbq_class *split = NULL;
 952
 953	if (splitid == 0) {
 954		split = cl->split;
 955		if (!split)
 956			return;
 957		splitid = split->common.classid;
 958	}
 959
 960	if (split == NULL || split->common.classid != splitid) {
 961		for (split = cl->tparent; split; split = split->tparent)
 962			if (split->common.classid == splitid)
 963				break;
 964	}
 965
 966	if (split == NULL)
 967		return;
 968
 969	if (cl->split != split) {
 970		cl->defmap = 0;
 971		cbq_sync_defmap(cl);
 972		cl->split = split;
 973		cl->defmap = def & mask;
 974	} else
 975		cl->defmap = (cl->defmap & ~mask) | (def & mask);
 976
 977	cbq_sync_defmap(cl);
 978}
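
/*
 * Illustrative sketch (not kernel code): the read-modify-write idiom in
 * cbq_change_defmap() replaces only the defmap bits selected by 'mask'
 * and preserves the rest:
 */
#if 0	/* standalone userspace demonstration */
#include <stdio.h>

int main(void)
{
	unsigned int defmap = 0xF0, def = 0x0A, mask = 0x0F;
	unsigned int merged = (defmap & ~mask) | (def & mask);

	printf("%#x\n", merged);	/* 0xfa: high nibble kept, low replaced */
	return 0;
}
#endif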
 979
 980static void cbq_unlink_class(struct cbq_class *this)
 981{
 982	struct cbq_class *cl, **clp;
 983	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 984
 985	qdisc_class_hash_remove(&q->clhash, &this->common);
 986
 987	if (this->tparent) {
 988		clp = &this->sibling;
 989		cl = *clp;
 990		do {
 991			if (cl == this) {
 992				*clp = cl->sibling;
 993				break;
 994			}
 995			clp = &cl->sibling;
 996		} while ((cl = *clp) != this->sibling);
 997
 998		if (this->tparent->children == this) {
 999			this->tparent->children = this->sibling;
1000			if (this->sibling == this)
1001				this->tparent->children = NULL;
1002		}
1003	} else {
1004		WARN_ON(this->sibling != this);
1005	}
1006}
1007
1008static void cbq_link_class(struct cbq_class *this)
1009{
1010	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1011	struct cbq_class *parent = this->tparent;
1012
1013	this->sibling = this;
1014	qdisc_class_hash_insert(&q->clhash, &this->common);
1015
1016	if (parent == NULL)
1017		return;
1018
1019	if (parent->children == NULL) {
1020		parent->children = this;
1021	} else {
1022		this->sibling = parent->children->sibling;
1023		parent->children->sibling = this;
1024	}
1025}
1026
1027static void
1028cbq_reset(struct Qdisc *sch)
1029{
1030	struct cbq_sched_data *q = qdisc_priv(sch);
1031	struct cbq_class *cl;
1032	int prio;
1033	unsigned int h;
1034
1035	q->activemask = 0;
1036	q->pmask = 0;
1037	q->tx_class = NULL;
1038	q->tx_borrowed = NULL;
1039	qdisc_watchdog_cancel(&q->watchdog);
1040	hrtimer_cancel(&q->delay_timer);
1041	q->toplevel = TC_CBQ_MAXLEVEL;
1042	q->now = psched_get_time();
1043
1044	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1045		q->active[prio] = NULL;
1046
1047	for (h = 0; h < q->clhash.hashsize; h++) {
1048		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1049			qdisc_reset(cl->q);
1050
1051			cl->next_alive = NULL;
1052			cl->undertime = PSCHED_PASTPERFECT;
1053			cl->avgidle = cl->maxidle;
1054			cl->deficit = cl->quantum;
1055			cl->cpriority = cl->priority;
1056		}
1057	}
1058	sch->q.qlen = 0;
1059}
1060
1061
1062static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1063{
1064	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1065		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1066		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1067	}
1068	if (lss->change & TCF_CBQ_LSS_EWMA)
1069		cl->ewma_log = lss->ewma_log;
1070	if (lss->change & TCF_CBQ_LSS_AVPKT)
1071		cl->avpkt = lss->avpkt;
1072	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1073		cl->minidle = -(long)lss->minidle;
1074	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1075		cl->maxidle = lss->maxidle;
1076		cl->avgidle = lss->maxidle;
1077	}
1078	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1079		cl->offtime = lss->offtime;
1080	return 0;
1081}
1082
1083static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1084{
1085	q->nclasses[cl->priority]--;
1086	q->quanta[cl->priority] -= cl->weight;
1087	cbq_normalize_quanta(q, cl->priority);
1088}
1089
1090static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1091{
1092	q->nclasses[cl->priority]++;
1093	q->quanta[cl->priority] += cl->weight;
1094	cbq_normalize_quanta(q, cl->priority);
1095}
1096
1097static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1098{
1099	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1100
1101	if (wrr->allot)
1102		cl->allot = wrr->allot;
1103	if (wrr->weight)
1104		cl->weight = wrr->weight;
1105	if (wrr->priority) {
1106		cl->priority = wrr->priority - 1;
1107		cl->cpriority = cl->priority;
1108		if (cl->priority >= cl->priority2)
1109			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1110	}
1111
1112	cbq_addprio(q, cl);
1113	return 0;
1114}
1115
1116static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1117{
1118	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1119	return 0;
1120}
1121
1122static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1123	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1124	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1125	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1126	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1127	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1128	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1129	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1130};
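
/*
 * Note (illustrative, from netlink policy semantics): for entries with
 * an unspecified type, .len is the minimum payload nla_parse_nested()
 * will accept for that attribute; for NLA_BINARY it is the maximum
 * (TC_RTAB_SIZE for TCA_CBQ_RTAB above).
 */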
1131
1132static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1133{
1134	struct cbq_sched_data *q = qdisc_priv(sch);
1135	struct nlattr *tb[TCA_CBQ_MAX + 1];
1136	struct tc_ratespec *r;
1137	int err;
1138
	/* opt may be NULL when no options are supplied; guard before
	 * parsing (a defensive check, mirroring other qdisc init handlers).
	 */
	if (!opt)
		return -EINVAL;

 1139	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1140	if (err < 0)
1141		return err;
1142
1143	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1144		return -EINVAL;
1145
1146	r = nla_data(tb[TCA_CBQ_RATE]);
1147
1148	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1149		return -EINVAL;
1150
1151	err = qdisc_class_hash_init(&q->clhash);
1152	if (err < 0)
1153		goto put_rtab;
1154
1155	q->link.refcnt = 1;
1156	q->link.sibling = &q->link;
1157	q->link.common.classid = sch->handle;
1158	q->link.qdisc = sch;
1159	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1160				      sch->handle);
1161	if (!q->link.q)
1162		q->link.q = &noop_qdisc;
1163
1164	q->link.priority = TC_CBQ_MAXPRIO - 1;
1165	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1166	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1167	q->link.allot = psched_mtu(qdisc_dev(sch));
1168	q->link.quantum = q->link.allot;
1169	q->link.weight = q->link.R_tab->rate.rate;
1170
1171	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1172	q->link.avpkt = q->link.allot/2;
1173	q->link.minidle = -0x7FFFFFFF;
1174
1175	qdisc_watchdog_init(&q->watchdog, sch);
1176	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1177	q->delay_timer.function = cbq_undelay;
1178	q->toplevel = TC_CBQ_MAXLEVEL;
1179	q->now = psched_get_time();
1180
1181	cbq_link_class(&q->link);
1182
1183	if (tb[TCA_CBQ_LSSOPT])
1184		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1185
1186	cbq_addprio(q, &q->link);
1187	return 0;
1188
1189put_rtab:
1190	qdisc_put_rtab(q->link.R_tab);
1191	return err;
1192}
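
/*
 * For illustration (userspace, not part of this file): a typical setup
 * driving cbq_init() and cbq_change_class() via the tc utility; the
 * device name and numbers are examples only.
 *
 *	tc qdisc add dev eth0 root handle 1: cbq \
 *		bandwidth 10Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq \
 *		bandwidth 10Mbit rate 1Mbit allot 1514 \
 *		prio 5 avpkt 1000 bounded
 */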
1193
1194static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1195{
1196	unsigned char *b = skb_tail_pointer(skb);
1197
1198	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1199		goto nla_put_failure;
1200	return skb->len;
1201
1202nla_put_failure:
1203	nlmsg_trim(skb, b);
1204	return -1;
1205}
1206
1207static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1208{
1209	unsigned char *b = skb_tail_pointer(skb);
1210	struct tc_cbq_lssopt opt;
1211
1212	opt.flags = 0;
1213	if (cl->borrow == NULL)
1214		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1215	if (cl->share == NULL)
1216		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1217	opt.ewma_log = cl->ewma_log;
1218	opt.level = cl->level;
1219	opt.avpkt = cl->avpkt;
1220	opt.maxidle = cl->maxidle;
1221	opt.minidle = (u32)(-cl->minidle);
1222	opt.offtime = cl->offtime;
1223	opt.change = ~0;
1224	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1225		goto nla_put_failure;
1226	return skb->len;
1227
1228nla_put_failure:
1229	nlmsg_trim(skb, b);
1230	return -1;
1231}
1232
1233static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1234{
1235	unsigned char *b = skb_tail_pointer(skb);
1236	struct tc_cbq_wrropt opt;
1237
1238	memset(&opt, 0, sizeof(opt));
1239	opt.flags = 0;
1240	opt.allot = cl->allot;
1241	opt.priority = cl->priority + 1;
1242	opt.cpriority = cl->cpriority + 1;
1243	opt.weight = cl->weight;
1244	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1245		goto nla_put_failure;
1246	return skb->len;
1247
1248nla_put_failure:
1249	nlmsg_trim(skb, b);
1250	return -1;
1251}
1252
1253static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1254{
1255	unsigned char *b = skb_tail_pointer(skb);
1256	struct tc_cbq_fopt opt;
1257
1258	if (cl->split || cl->defmap) {
1259		opt.split = cl->split ? cl->split->common.classid : 0;
1260		opt.defmap = cl->defmap;
1261		opt.defchange = ~0;
1262		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1263			goto nla_put_failure;
1264	}
1265	return skb->len;
1266
1267nla_put_failure:
1268	nlmsg_trim(skb, b);
1269	return -1;
1270}
1271
1272static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1273{
1274	if (cbq_dump_lss(skb, cl) < 0 ||
1275	    cbq_dump_rate(skb, cl) < 0 ||
1276	    cbq_dump_wrr(skb, cl) < 0 ||
1277	    cbq_dump_fopt(skb, cl) < 0)
1278		return -1;
1279	return 0;
1280}
1281
1282static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1283{
1284	struct cbq_sched_data *q = qdisc_priv(sch);
1285	struct nlattr *nest;
1286
1287	nest = nla_nest_start(skb, TCA_OPTIONS);
1288	if (nest == NULL)
1289		goto nla_put_failure;
1290	if (cbq_dump_attr(skb, &q->link) < 0)
1291		goto nla_put_failure;
1292	return nla_nest_end(skb, nest);
1293
1294nla_put_failure:
1295	nla_nest_cancel(skb, nest);
1296	return -1;
1297}
1298
1299static int
1300cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1301{
1302	struct cbq_sched_data *q = qdisc_priv(sch);
1303
1304	q->link.xstats.avgidle = q->link.avgidle;
1305	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1306}
1307
1308static int
1309cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1310	       struct sk_buff *skb, struct tcmsg *tcm)
1311{
1312	struct cbq_class *cl = (struct cbq_class *)arg;
1313	struct nlattr *nest;
1314
1315	if (cl->tparent)
1316		tcm->tcm_parent = cl->tparent->common.classid;
1317	else
1318		tcm->tcm_parent = TC_H_ROOT;
1319	tcm->tcm_handle = cl->common.classid;
1320	tcm->tcm_info = cl->q->handle;
1321
1322	nest = nla_nest_start(skb, TCA_OPTIONS);
1323	if (nest == NULL)
1324		goto nla_put_failure;
1325	if (cbq_dump_attr(skb, cl) < 0)
1326		goto nla_put_failure;
1327	return nla_nest_end(skb, nest);
1328
1329nla_put_failure:
1330	nla_nest_cancel(skb, nest);
1331	return -1;
1332}
1333
1334static int
1335cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1336	struct gnet_dump *d)
1337{
1338	struct cbq_sched_data *q = qdisc_priv(sch);
1339	struct cbq_class *cl = (struct cbq_class *)arg;
1340
1341	cl->xstats.avgidle = cl->avgidle;
1342	cl->xstats.undertime = 0;
1343
1344	if (cl->undertime != PSCHED_PASTPERFECT)
1345		cl->xstats.undertime = cl->undertime - q->now;
1346
1347	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1348				  d, NULL, &cl->bstats) < 0 ||
1349	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1350	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1351		return -1;
1352
1353	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1354}
1355
1356static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1357		     struct Qdisc **old)
1358{
1359	struct cbq_class *cl = (struct cbq_class *)arg;
1360
1361	if (new == NULL) {
1362		new = qdisc_create_dflt(sch->dev_queue,
1363					&pfifo_qdisc_ops, cl->common.classid);
1364		if (new == NULL)
1365			return -ENOBUFS;
1366	}
1367
1368	*old = qdisc_replace(sch, new, &cl->q);
1369	return 0;
1370}
1371
1372static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1373{
1374	struct cbq_class *cl = (struct cbq_class *)arg;
1375
1376	return cl->q;
1377}
1378
1379static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1380{
1381	struct cbq_class *cl = (struct cbq_class *)arg;
1382
1383	if (cl->q->q.qlen == 0)
1384		cbq_deactivate_class(cl);
1385}
1386
1387static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1388{
1389	struct cbq_sched_data *q = qdisc_priv(sch);
1390	struct cbq_class *cl = cbq_class_lookup(q, classid);
1391
1392	if (cl) {
1393		cl->refcnt++;
1394		return (unsigned long)cl;
1395	}
1396	return 0;
1397}
1398
1399static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1400{
1401	struct cbq_sched_data *q = qdisc_priv(sch);
1402
1403	WARN_ON(cl->filters);
1404
1405	tcf_destroy_chain(&cl->filter_list);
1406	qdisc_destroy(cl->q);
1407	qdisc_put_rtab(cl->R_tab);
1408	gen_kill_estimator(&cl->rate_est);
1409	if (cl != &q->link)
1410		kfree(cl);
1411}
1412
1413static void cbq_destroy(struct Qdisc *sch)
1414{
1415	struct cbq_sched_data *q = qdisc_priv(sch);
1416	struct hlist_node *next;
1417	struct cbq_class *cl;
1418	unsigned int h;
1419
1420#ifdef CONFIG_NET_CLS_ACT
1421	q->rx_class = NULL;
1422#endif
 1423	/*
 1424	 * Filters must be destroyed first because we don't destroy the
 1425	 * classes from root to leaves, which means that filters can still
 1426	 * be bound to classes which have been destroyed already. --TGR '04
 1427	 */
1428	for (h = 0; h < q->clhash.hashsize; h++) {
1429		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1430			tcf_destroy_chain(&cl->filter_list);
1431	}
1432	for (h = 0; h < q->clhash.hashsize; h++) {
1433		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1434					  common.hnode)
1435			cbq_destroy_class(sch, cl);
1436	}
1437	qdisc_class_hash_destroy(&q->clhash);
1438}
1439
1440static void cbq_put(struct Qdisc *sch, unsigned long arg)
1441{
1442	struct cbq_class *cl = (struct cbq_class *)arg;
1443
1444	if (--cl->refcnt == 0) {
1445#ifdef CONFIG_NET_CLS_ACT
1446		spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1447		struct cbq_sched_data *q = qdisc_priv(sch);
1448
1449		spin_lock_bh(root_lock);
1450		if (q->rx_class == cl)
1451			q->rx_class = NULL;
1452		spin_unlock_bh(root_lock);
1453#endif
1454
1455		cbq_destroy_class(sch, cl);
1456	}
1457}
1458
1459static int
1460cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1461		 unsigned long *arg)
1462{
1463	int err;
1464	struct cbq_sched_data *q = qdisc_priv(sch);
1465	struct cbq_class *cl = (struct cbq_class *)*arg;
1466	struct nlattr *opt = tca[TCA_OPTIONS];
1467	struct nlattr *tb[TCA_CBQ_MAX + 1];
1468	struct cbq_class *parent;
1469	struct qdisc_rate_table *rtab = NULL;
1470
1471	if (opt == NULL)
1472		return -EINVAL;
1473
1474	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1475	if (err < 0)
1476		return err;
1477
1478	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1479		return -EOPNOTSUPP;
1480
1481	if (cl) {
1482		/* Check parent */
1483		if (parentid) {
1484			if (cl->tparent &&
1485			    cl->tparent->common.classid != parentid)
1486				return -EINVAL;
1487			if (!cl->tparent && parentid != TC_H_ROOT)
1488				return -EINVAL;
1489		}
1490
1491		if (tb[TCA_CBQ_RATE]) {
1492			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1493					      tb[TCA_CBQ_RTAB]);
1494			if (rtab == NULL)
1495				return -EINVAL;
1496		}
1497
1498		if (tca[TCA_RATE]) {
1499			err = gen_replace_estimator(&cl->bstats, NULL,
1500						    &cl->rate_est,
1501						    NULL,
1502						    qdisc_root_sleeping_running(sch),
1503						    tca[TCA_RATE]);
1504			if (err) {
1505				qdisc_put_rtab(rtab);
1506				return err;
1507			}
1508		}
1509
1510		/* Change class parameters */
1511		sch_tree_lock(sch);
1512
1513		if (cl->next_alive != NULL)
1514			cbq_deactivate_class(cl);
1515
1516		if (rtab) {
1517			qdisc_put_rtab(cl->R_tab);
1518			cl->R_tab = rtab;
1519		}
1520
1521		if (tb[TCA_CBQ_LSSOPT])
1522			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1523
1524		if (tb[TCA_CBQ_WRROPT]) {
1525			cbq_rmprio(q, cl);
1526			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1527		}
1528
1529		if (tb[TCA_CBQ_FOPT])
1530			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1531
1532		if (cl->q->q.qlen)
1533			cbq_activate_class(cl);
1534
1535		sch_tree_unlock(sch);
1536
1537		return 0;
1538	}
1539
1540	if (parentid == TC_H_ROOT)
1541		return -EINVAL;
1542
1543	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1544	    tb[TCA_CBQ_LSSOPT] == NULL)
1545		return -EINVAL;
1546
1547	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1548	if (rtab == NULL)
1549		return -EINVAL;
1550
1551	if (classid) {
1552		err = -EINVAL;
1553		if (TC_H_MAJ(classid ^ sch->handle) ||
1554		    cbq_class_lookup(q, classid))
1555			goto failure;
1556	} else {
1557		int i;
1558		classid = TC_H_MAKE(sch->handle, 0x8000);
1559
1560		for (i = 0; i < 0x8000; i++) {
1561			if (++q->hgenerator >= 0x8000)
1562				q->hgenerator = 1;
1563			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1564				break;
1565		}
1566		err = -ENOSR;
1567		if (i >= 0x8000)
1568			goto failure;
1569		classid = classid|q->hgenerator;
1570	}
1571
1572	parent = &q->link;
1573	if (parentid) {
1574		parent = cbq_class_lookup(q, parentid);
1575		err = -EINVAL;
1576		if (parent == NULL)
1577			goto failure;
1578	}
1579
1580	err = -ENOBUFS;
1581	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1582	if (cl == NULL)
1583		goto failure;
1584
1585	if (tca[TCA_RATE]) {
1586		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1587					NULL,
1588					qdisc_root_sleeping_running(sch),
1589					tca[TCA_RATE]);
1590		if (err) {
1591			kfree(cl);
1592			goto failure;
1593		}
1594	}
1595
1596	cl->R_tab = rtab;
1597	rtab = NULL;
1598	cl->refcnt = 1;
1599	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1600	if (!cl->q)
1601		cl->q = &noop_qdisc;
1602	cl->common.classid = classid;
1603	cl->tparent = parent;
1604	cl->qdisc = sch;
1605	cl->allot = parent->allot;
1606	cl->quantum = cl->allot;
1607	cl->weight = cl->R_tab->rate.rate;
1608
1609	sch_tree_lock(sch);
1610	cbq_link_class(cl);
1611	cl->borrow = cl->tparent;
1612	if (cl->tparent != &q->link)
1613		cl->share = cl->tparent;
1614	cbq_adjust_levels(parent);
1615	cl->minidle = -0x7FFFFFFF;
1616	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1617	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1618	if (cl->ewma_log == 0)
1619		cl->ewma_log = q->link.ewma_log;
1620	if (cl->maxidle == 0)
1621		cl->maxidle = q->link.maxidle;
1622	if (cl->avpkt == 0)
1623		cl->avpkt = q->link.avpkt;
1624	if (tb[TCA_CBQ_FOPT])
1625		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1626	sch_tree_unlock(sch);
1627
1628	qdisc_class_hash_grow(sch, &q->clhash);
1629
1630	*arg = (unsigned long)cl;
1631	return 0;
1632
1633failure:
1634	qdisc_put_rtab(rtab);
1635	return err;
1636}
1637
1638static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1639{
1640	struct cbq_sched_data *q = qdisc_priv(sch);
1641	struct cbq_class *cl = (struct cbq_class *)arg;
1642	unsigned int qlen, backlog;
1643
1644	if (cl->filters || cl->children || cl == &q->link)
1645		return -EBUSY;
1646
1647	sch_tree_lock(sch);
1648
1649	qlen = cl->q->q.qlen;
1650	backlog = cl->q->qstats.backlog;
1651	qdisc_reset(cl->q);
1652	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1653
1654	if (cl->next_alive)
1655		cbq_deactivate_class(cl);
1656
1657	if (q->tx_borrowed == cl)
1658		q->tx_borrowed = q->tx_class;
1659	if (q->tx_class == cl) {
1660		q->tx_class = NULL;
1661		q->tx_borrowed = NULL;
1662	}
1663#ifdef CONFIG_NET_CLS_ACT
1664	if (q->rx_class == cl)
1665		q->rx_class = NULL;
1666#endif
1667
1668	cbq_unlink_class(cl);
1669	cbq_adjust_levels(cl->tparent);
1670	cl->defmap = 0;
1671	cbq_sync_defmap(cl);
1672
1673	cbq_rmprio(q, cl);
1674	sch_tree_unlock(sch);
1675
1676	BUG_ON(--cl->refcnt == 0);
1677	/*
1678	 * This shouldn't happen: we "hold" one cops->get() when called
1679	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1680	 */
1681
1682	return 0;
1683}
1684
1685static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1686					     unsigned long arg)
1687{
1688	struct cbq_sched_data *q = qdisc_priv(sch);
1689	struct cbq_class *cl = (struct cbq_class *)arg;
1690
1691	if (cl == NULL)
1692		cl = &q->link;
1693
1694	return &cl->filter_list;
1695}
1696
1697static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1698				     u32 classid)
1699{
1700	struct cbq_sched_data *q = qdisc_priv(sch);
1701	struct cbq_class *p = (struct cbq_class *)parent;
1702	struct cbq_class *cl = cbq_class_lookup(q, classid);
1703
1704	if (cl) {
1705		if (p && p->level <= cl->level)
1706			return 0;
1707		cl->filters++;
1708		return (unsigned long)cl;
1709	}
1710	return 0;
1711}
1712
1713static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1714{
1715	struct cbq_class *cl = (struct cbq_class *)arg;
1716
1717	cl->filters--;
1718}
1719
1720static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1721{
1722	struct cbq_sched_data *q = qdisc_priv(sch);
1723	struct cbq_class *cl;
1724	unsigned int h;
1725
1726	if (arg->stop)
1727		return;
1728
1729	for (h = 0; h < q->clhash.hashsize; h++) {
1730		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1731			if (arg->count < arg->skip) {
1732				arg->count++;
1733				continue;
1734			}
1735			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1736				arg->stop = 1;
1737				return;
1738			}
1739			arg->count++;
1740		}
1741	}
1742}
1743
1744static const struct Qdisc_class_ops cbq_class_ops = {
1745	.graft		=	cbq_graft,
1746	.leaf		=	cbq_leaf,
1747	.qlen_notify	=	cbq_qlen_notify,
1748	.get		=	cbq_get,
1749	.put		=	cbq_put,
1750	.change		=	cbq_change_class,
1751	.delete		=	cbq_delete,
1752	.walk		=	cbq_walk,
1753	.tcf_chain	=	cbq_find_tcf,
1754	.bind_tcf	=	cbq_bind_filter,
1755	.unbind_tcf	=	cbq_unbind_filter,
1756	.dump		=	cbq_dump_class,
1757	.dump_stats	=	cbq_dump_class_stats,
1758};
1759
1760static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1761	.next		=	NULL,
1762	.cl_ops		=	&cbq_class_ops,
1763	.id		=	"cbq",
1764	.priv_size	=	sizeof(struct cbq_sched_data),
1765	.enqueue	=	cbq_enqueue,
1766	.dequeue	=	cbq_dequeue,
1767	.peek		=	qdisc_peek_dequeued,
1768	.init		=	cbq_init,
1769	.reset		=	cbq_reset,
1770	.destroy	=	cbq_destroy,
1771	.change		=	NULL,
1772	.dump		=	cbq_dump,
1773	.dump_stats	=	cbq_dump_stats,
1774	.owner		=	THIS_MODULE,
1775};
1776
1777static int __init cbq_module_init(void)
1778{
1779	return register_qdisc(&cbq_qdisc_ops);
1780}
1781static void __exit cbq_module_exit(void)
1782{
1783	unregister_qdisc(&cbq_qdisc_ops);
1784}
1785module_init(cbq_module_init)
1786module_exit(cbq_module_exit)
1787MODULE_LICENSE("GPL");