   1/*
   2 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/slab.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/string.h>
  18#include <linux/errno.h>
  19#include <linux/skbuff.h>
  20#include <net/netlink.h>
  21#include <net/pkt_sched.h>
  22#include <net/pkt_cls.h>
  23
  24
  25/*	Class-Based Queueing (CBQ) algorithm.
  26	=======================================
  27
  28	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  29		 Management Models for Packet Networks",
  30		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  31
  32		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  33
  34		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  35		 Parameters", 1996
  36
  37		 [4] Sally Floyd and Michael Speer, "Experimental Results
  38		 for Class-Based Queueing", 1998, not published.
  39
  40	-----------------------------------------------------------------------
  41
  42	Algorithm skeleton was taken from NS simulator cbq.cc.
  43	If someone wants to check this code against the LBL version,
  44	they should take into account that ONLY the skeleton was borrowed;
  45	the implementation is different. Particularly:
  46
  47	--- The WRR algorithm is different. Our version looks more
  48	reasonable (I hope) and works when quanta are allowed to be
  49	less than MTU, which is always the case when real-time classes
  50	have small rates. Note that the statement of [3] is
  51	incomplete: delay may actually be estimated even if the class
  52	per-round allotment is less than MTU. Namely, if per-round
  53	allotment is W*r_i, and r_1+...+r_k = r < 1
  54
  55	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  56
  57	In the worst case we have IntServ estimate with D = W*r+k*MTU
  58	and C = MTU*r. The proof (if correct at all) is trivial.
  59
  60
  61	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
  62	interpret some places that look like wrong translations
  63	from NS. Anyone is advised to find these differences
  64	and explain to me why I am wrong 8).
  65
  66	--- Linux has no EOI event, so we cannot estimate true class
  67	idle time. The workaround is to treat the next dequeue event
  68	as a sign that the previous packet has finished. This is wrong because of
  69	internal device queueing, but on a permanently loaded link it holds.
  70	Moreover, combined with the clock integrator, this scheme looks
  71	very close to an ideal solution.  */
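/*	A back-of-the-envelope instance of the delay bound above (illustrative
	numbers only, reading [x] as rounding up): with MTU = 1500 bytes,
	W = 3000 bytes, r_i = 0.1, r = 0.5 and k = 2,

	delay_i <= ([1500/300]*1500 + 1500 + 2*1500)/B = 12000/B,

	i.e. roughly 9.6 ms on a 10 Mbit/s link (B = 1.25e6 bytes/sec).  */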
  72
  73struct cbq_sched_data;
  74
  75
  76struct cbq_class {
  77	struct Qdisc_class_common common;
  78	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
  79
  80/* Parameters */
  81	unsigned char		priority;	/* class priority */
  82	unsigned char		priority2;	/* priority to be used after overlimit */
  83	unsigned char		ewma_log;	/* time constant for idle time calculation */
  84
  85	u32			defmap;
  86
  87	/* Link-sharing scheduler parameters */
  88	long			maxidle;	/* Class parameters: see below. */
  89	long			offtime;
  90	long			minidle;
  91	u32			avpkt;
  92	struct qdisc_rate_table	*R_tab;
  93
  94	/* General scheduler (WRR) parameters */
  95	long			allot;
  96	long			quantum;	/* Allotment per WRR round */
  97	long			weight;		/* Relative allotment: see below */
  98
  99	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
 100	struct cbq_class	*split;		/* Ptr to split node */
 101	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
 102	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
 103	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
 104						   parent otherwise */
 105	struct cbq_class	*sibling;	/* Sibling chain */
 106	struct cbq_class	*children;	/* Pointer to children chain */
 107
 108	struct Qdisc		*q;		/* Elementary queueing discipline */
 109
 110
 111/* Variables */
 112	unsigned char		cpriority;	/* Effective priority */
 113	unsigned char		delayed;
 114	unsigned char		level;		/* level of the class in hierarchy:
 115						   0 for leaf classes, and maximal
 116						   level of children + 1 for nodes.
 117						 */
 118
 119	psched_time_t		last;		/* Last end of service */
 120	psched_time_t		undertime;
 121	long			avgidle;
 122	long			deficit;	/* Saved deficit for WRR */
 123	psched_time_t		penalized;
 124	struct gnet_stats_basic_packed bstats;
 125	struct gnet_stats_queue qstats;
 126	struct net_rate_estimator __rcu *rate_est;
 127	struct tc_cbq_xstats	xstats;
 128
 129	struct tcf_proto __rcu	*filter_list;
 130	struct tcf_block	*block;
 131
 132	int			filters;
 133
 134	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
 135};
 136
 137struct cbq_sched_data {
 138	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
 139	int			nclasses[TC_CBQ_MAXPRIO + 1];
 140	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
 141
 142	struct cbq_class	link;
 143
 144	unsigned int		activemask;
 145	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
 146								   with backlog */
 147
 148#ifdef CONFIG_NET_CLS_ACT
 149	struct cbq_class	*rx_class;
 150#endif
 151	struct cbq_class	*tx_class;
 152	struct cbq_class	*tx_borrowed;
 153	int			tx_len;
 154	psched_time_t		now;		/* Cached timestamp */
 155	unsigned int		pmask;
 156
 157	struct hrtimer		delay_timer;
 158	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
 159						   started when CBQ has
 160						   backlog, but cannot
 161						   transmit just now */
 162	psched_tdiff_t		wd_expires;
 163	int			toplevel;
 164	u32			hgenerator;
 165};
 166
 167
 168#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
 169
 170static inline struct cbq_class *
 171cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 172{
 173	struct Qdisc_class_common *clc;
 174
 175	clc = qdisc_class_find(&q->clhash, classid);
 176	if (clc == NULL)
 177		return NULL;
 178	return container_of(clc, struct cbq_class, common);
 179}
 180
 181#ifdef CONFIG_NET_CLS_ACT
 182
 183static struct cbq_class *
 184cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 185{
 186	struct cbq_class *cl;
 187
 188	for (cl = this->tparent; cl; cl = cl->tparent) {
 189		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 190
 191		if (new != NULL && new != this)
 192			return new;
 193	}
 194	return NULL;
 195}
 196
 197#endif
 198
 199/* Classify packet. The procedure is pretty complicated, but
 200 * it allows us to combine link sharing and priority scheduling
 201 * transparently.
 202 *
 203 * Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
 204 * so that they resolve to split nodes. Then packets are classified
 205 * by logical priority, or a more specific classifier may be attached
 206 * to the split node.
 207 */
 208
 209static struct cbq_class *
 210cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 211{
 212	struct cbq_sched_data *q = qdisc_priv(sch);
 213	struct cbq_class *head = &q->link;
 214	struct cbq_class **defmap;
 215	struct cbq_class *cl = NULL;
 216	u32 prio = skb->priority;
 217	struct tcf_proto *fl;
 218	struct tcf_result res;
 219
 220	/*
 221	 *  Step 1. If skb->priority points to one of our classes, use it.
 222	 */
 223	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 224	    (cl = cbq_class_lookup(q, prio)) != NULL)
 225		return cl;
 226
 227	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 228	for (;;) {
 229		int result = 0;
 230		defmap = head->defaults;
 231
 232		fl = rcu_dereference_bh(head->filter_list);
 233		/*
 234		 * Step 2+n. Apply classifier.
 235		 */
 236		result = tcf_classify(skb, fl, &res, true);
 237		if (!fl || result < 0)
 238			goto fallback;
 239
 240		cl = (void *)res.class;
 241		if (!cl) {
 242			if (TC_H_MAJ(res.classid))
 243				cl = cbq_class_lookup(q, res.classid);
 244			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 245				cl = defmap[TC_PRIO_BESTEFFORT];
 246
 247			if (cl == NULL)
 248				goto fallback;
 249		}
 250		if (cl->level >= head->level)
 251			goto fallback;
 252#ifdef CONFIG_NET_CLS_ACT
 253		switch (result) {
 254		case TC_ACT_QUEUED:
 255		case TC_ACT_STOLEN:
 256		case TC_ACT_TRAP:
 257			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 258			/* fall through */
 259		case TC_ACT_SHOT:
 260			return NULL;
 261		case TC_ACT_RECLASSIFY:
 262			return cbq_reclassify(skb, cl);
 263		}
 264#endif
 265		if (cl->level == 0)
 266			return cl;
 267
 268		/*
 269		 * Step 3+n. If the classifier selected a link-sharing class,
 270		 *	   apply the agency-specific classifier.
 271		 *	   Repeat this procedure until we hit a leaf node.
 272		 */
 273		head = cl;
 274	}
 275
 276fallback:
 277	cl = head;
 278
 279	/*
 280	 * Step 4. No success...
 281	 */
 282	if (TC_H_MAJ(prio) == 0 &&
 283	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 284	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 285		return head;
 286
 287	return cl;
 288}
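/* A minimal sketch of how Step 1 above resolves skb->priority directly
 * (hypothetical handles, assuming the usual TC_H_* encoding): with this CBQ
 * qdisc installed as 1:0 (sch->handle == 0x00010000) and a class 1:10
 * (classid 0x00010010), setting skb->priority = 0x00010010 makes
 * TC_H_MAJ(prio ^ sch->handle) == 0, so cbq_class_lookup() returns the
 * class before any filter is consulted.
 */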
 289
 290/*
 291 * A packet has just been enqueued on an empty class.
 292 * cbq_activate_class adds the class to the tail of the active class list
 293 * of its priority band.
 294 */
 295
 296static inline void cbq_activate_class(struct cbq_class *cl)
 297{
 298	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 299	int prio = cl->cpriority;
 300	struct cbq_class *cl_tail;
 301
 302	cl_tail = q->active[prio];
 303	q->active[prio] = cl;
 304
 305	if (cl_tail != NULL) {
 306		cl->next_alive = cl_tail->next_alive;
 307		cl_tail->next_alive = cl;
 308	} else {
 309		cl->next_alive = cl;
 310		q->activemask |= (1<<prio);
 311	}
 312}
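/* Note (descriptive sketch): q->active[prio] is the tail of a circular,
 * singly linked list threaded through next_alive, so its next_alive pointer
 * is the head. With classes A and B already active, activating C gives the
 * ring A -> B -> C -> A with q->active[prio] == C.
 */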
 313
 314/*
 315 * Unlink class from active chain.
 316 * Note that this same procedure is done directly in cbq_dequeue*
 317 * during the round-robin procedure.
 318 */
 319
 320static void cbq_deactivate_class(struct cbq_class *this)
 321{
 322	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 323	int prio = this->cpriority;
 324	struct cbq_class *cl;
 325	struct cbq_class *cl_prev = q->active[prio];
 326
 327	do {
 328		cl = cl_prev->next_alive;
 329		if (cl == this) {
 330			cl_prev->next_alive = cl->next_alive;
 331			cl->next_alive = NULL;
 332
 333			if (cl == q->active[prio]) {
 334				q->active[prio] = cl_prev;
 335				if (cl == q->active[prio]) {
 336					q->active[prio] = NULL;
 337					q->activemask &= ~(1<<prio);
 338					return;
 339				}
 340			}
 341			return;
 342		}
 343	} while ((cl_prev = cl) != q->active[prio]);
 344}
 345
 346static void
 347cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 348{
 349	int toplevel = q->toplevel;
 350
 351	if (toplevel > cl->level) {
 352		psched_time_t now = psched_get_time();
 353
 354		do {
 355			if (cl->undertime < now) {
 356				q->toplevel = cl->level;
 357				return;
 358			}
 359		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 360	}
 361}
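/* Roughly speaking, q->toplevel caps how far up the tree classes may borrow
 * (cbq_under_limit below refuses to borrow from classes above it).
 * cbq_mark_toplevel lowers it to the level of the first class on the
 * enqueued class's borrow chain whose undertime has already passed, in the
 * spirit of the top-level link-sharing guideline of [1].
 */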
 362
 363static int
 364cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 365	    struct sk_buff **to_free)
 366{
 367	struct cbq_sched_data *q = qdisc_priv(sch);
 368	int uninitialized_var(ret);
 369	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 370
 371#ifdef CONFIG_NET_CLS_ACT
 372	q->rx_class = cl;
 373#endif
 374	if (cl == NULL) {
 375		if (ret & __NET_XMIT_BYPASS)
 376			qdisc_qstats_drop(sch);
 377		__qdisc_drop(skb, to_free);
 378		return ret;
 379	}
 380
 381	ret = qdisc_enqueue(skb, cl->q, to_free);
 382	if (ret == NET_XMIT_SUCCESS) {
 383		sch->q.qlen++;
 384		cbq_mark_toplevel(q, cl);
 385		if (!cl->next_alive)
 386			cbq_activate_class(cl);
 387		return ret;
 388	}
 389
 390	if (net_xmit_drop_count(ret)) {
 391		qdisc_qstats_drop(sch);
 392		cbq_mark_toplevel(q, cl);
 393		cl->qstats.drops++;
 394	}
 395	return ret;
 396}
 397
 398/* Overlimit action: penalize leaf class by adding offtime */
 399static void cbq_overlimit(struct cbq_class *cl)
 400{
 401	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 402	psched_tdiff_t delay = cl->undertime - q->now;
 403
 404	if (!cl->delayed) {
 405		delay += cl->offtime;
 406
 407		/*
 408		 * The class goes to sleep, so it will have no
 409		 * chance to work off its avgidle. Let's forgive it 8)
 410		 *
 411		 * BTW cbq-2.0 has a bug in this
 412		 * place; apparently they forgot to shift it by cl->ewma_log.
 413		 */
 414		if (cl->avgidle < 0)
 415			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 416		if (cl->avgidle < cl->minidle)
 417			cl->avgidle = cl->minidle;
 418		if (delay <= 0)
 419			delay = 1;
 420		cl->undertime = q->now + delay;
 421
 422		cl->xstats.overactions++;
 423		cl->delayed = 1;
 424	}
 425	if (q->wd_expires == 0 || q->wd_expires > delay)
 426		q->wd_expires = delay;
 427
 428	/* Dirty work! We must schedule wakeups based on
 429	 * real available rate, rather than leaf rate,
 430	 * which may be tiny (even zero).
 431	 */
 432	if (q->toplevel == TC_CBQ_MAXLEVEL) {
 433		struct cbq_class *b;
 434		psched_tdiff_t base_delay = q->wd_expires;
 435
 436		for (b = cl->borrow; b; b = b->borrow) {
 437			delay = b->undertime - q->now;
 438			if (delay < base_delay) {
 439				if (delay <= 0)
 440					delay = 1;
 441				base_delay = delay;
 442			}
 443		}
 444
 445		q->wd_expires = base_delay;
 446	}
 447}
 448
 449static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 450				       psched_time_t now)
 451{
 452	struct cbq_class *cl;
 453	struct cbq_class *cl_prev = q->active[prio];
 454	psched_time_t sched = now;
 455
 456	if (cl_prev == NULL)
 457		return 0;
 458
 459	do {
 460		cl = cl_prev->next_alive;
 461		if (now - cl->penalized > 0) {
 462			cl_prev->next_alive = cl->next_alive;
 463			cl->next_alive = NULL;
 464			cl->cpriority = cl->priority;
 465			cl->delayed = 0;
 466			cbq_activate_class(cl);
 467
 468			if (cl == q->active[prio]) {
 469				q->active[prio] = cl_prev;
 470				if (cl == q->active[prio]) {
 471					q->active[prio] = NULL;
 472					return 0;
 473				}
 474			}
 475
 476			cl = cl_prev->next_alive;
 477		} else if (sched - cl->penalized > 0)
 478			sched = cl->penalized;
 479	} while ((cl_prev = cl) != q->active[prio]);
 480
 481	return sched - now;
 482}
 483
 484static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 485{
 486	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 487						delay_timer);
 488	struct Qdisc *sch = q->watchdog.qdisc;
 489	psched_time_t now;
 490	psched_tdiff_t delay = 0;
 491	unsigned int pmask;
 492
 493	now = psched_get_time();
 494
 495	pmask = q->pmask;
 496	q->pmask = 0;
 497
 498	while (pmask) {
 499		int prio = ffz(~pmask);
 500		psched_tdiff_t tmp;
 501
 502		pmask &= ~(1<<prio);
 503
 504		tmp = cbq_undelay_prio(q, prio, now);
 505		if (tmp > 0) {
 506			q->pmask |= 1<<prio;
 507			if (tmp < delay || delay == 0)
 508				delay = tmp;
 509		}
 510	}
 511
 512	if (delay) {
 513		ktime_t time;
 514
 515		time = 0;
 516		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 517		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 518	}
 519
 520	__netif_schedule(qdisc_root(sch));
 521	return HRTIMER_NORESTART;
 522}
 523
 524/*
 525 * This is a mission-critical procedure.
 526 *
 527 * We "regenerate" the toplevel cutoff if the transmitting class
 528 * has backlog and is not regulated. This is not part of the
 529 * original CBQ description, but it looks more reasonable.
 530 * Probably it is wrong; this question needs further investigation.
 531 */
 532
 533static inline void
 534cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 535		    struct cbq_class *borrowed)
 536{
 537	if (cl && q->toplevel >= borrowed->level) {
 538		if (cl->q->q.qlen > 1) {
 539			do {
 540				if (borrowed->undertime == PSCHED_PASTPERFECT) {
 541					q->toplevel = borrowed->level;
 542					return;
 543				}
 544			} while ((borrowed = borrowed->borrow) != NULL);
 545		}
 546#if 0
 547	/* It is not necessary now. Uncommenting it
 548	   will save CPU cycles, but decrease fairness.
 549	 */
 550		q->toplevel = TC_CBQ_MAXLEVEL;
 551#endif
 552	}
 553}
 554
 555static void
 556cbq_update(struct cbq_sched_data *q)
 557{
 558	struct cbq_class *this = q->tx_class;
 559	struct cbq_class *cl = this;
 560	int len = q->tx_len;
 561	psched_time_t now;
 562
 563	q->tx_class = NULL;
 564	/* Time integrator. We calculate EOS time
 565	 * by adding expected packet transmission time.
 566	 */
 567	now = q->now + L2T(&q->link, len);
 568
 569	for ( ; cl; cl = cl->share) {
 570		long avgidle = cl->avgidle;
 571		long idle;
 572
 573		cl->bstats.packets++;
 574		cl->bstats.bytes += len;
 575
 576		/*
 577		 * (now - last) is total time between packet right edges.
 578		 * (last_pktlen/rate) is "virtual" busy time, so that
 579		 *
 580		 *	idle = (now - last) - last_pktlen/rate
 581		 */
 582
 583		idle = now - cl->last;
 584		if ((unsigned long)idle > 128*1024*1024) {
 585			avgidle = cl->maxidle;
 586		} else {
 587			idle -= L2T(cl, len);
 588
 589		/* true_avgidle := (1-W)*true_avgidle + W*idle,
 590		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 591		 * cl->avgidle == true_avgidle/W,
 592		 * hence:
 593		 */
 594			avgidle += idle - (avgidle>>cl->ewma_log);
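			/* Worked instance (illustrative numbers only): with
			 * ewma_log = 5 (W = 1/32), scaled avgidle = 3200 and
			 * idle = 64, this gives 3200 + 64 - (3200 >> 5) = 3164,
			 * i.e. the scaled form of (31/32)*true_avgidle + (1/32)*idle.
			 */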
 595		}
 596
 597		if (avgidle <= 0) {
 598			/* Overlimit or at-limit */
 599
 600			if (avgidle < cl->minidle)
 601				avgidle = cl->minidle;
 602
 603			cl->avgidle = avgidle;
 604
 605			/* Calculate expected time, when this class
 606			 * will be allowed to send.
 607			 * It will occur, when:
 608			 * (1-W)*true_avgidle + W*delay = 0, i.e.
 609			 * idle = (1/W - 1)*(-true_avgidle)
 610			 * or
 611			 * idle = (1 - W)*(-cl->avgidle);
 612			 */
 613			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 614
 615			/*
 616			 * That is not all.
 617			 * To maintain the rate allocated to the class,
 618			 * we add to undertime virtual clock,
 619			 * necessary to complete transmitted packet.
 620			 * (len/phys_bandwidth has been already passed
 621			 * to the moment of cbq_update)
 622			 */
 623
 624			idle -= L2T(&q->link, len);
 625			idle += L2T(cl, len);
 626
 627			cl->undertime = now + idle;
 628		} else {
 629			/* Underlimit */
 630
 631			cl->undertime = PSCHED_PASTPERFECT;
 632			if (avgidle > cl->maxidle)
 633				cl->avgidle = cl->maxidle;
 634			else
 635				cl->avgidle = avgidle;
 636		}
 637		if ((s64)(now - cl->last) > 0)
 638			cl->last = now;
 639	}
 640
 641	cbq_update_toplevel(q, this, q->tx_borrowed);
 642}
 643
 644static inline struct cbq_class *
 645cbq_under_limit(struct cbq_class *cl)
 646{
 647	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 648	struct cbq_class *this_cl = cl;
 649
 650	if (cl->tparent == NULL)
 651		return cl;
 652
 653	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 654		cl->delayed = 0;
 655		return cl;
 656	}
 657
 658	do {
 659		/* This is a very suspicious place. Currently the overlimit
 660		 * action is generated for non-bounded classes
 661		 * only if the link is completely congested.
 662		 * Though this agrees with the ancestor-only paradigm,
 663		 * it looks very stupid. In particular,
 664		 * it means that this chunk of code will either
 665		 * never be called or will strongly amplify
 666		 * burstiness. Dangerous and silly, but
 667		 * no other solution exists.
 668		 */
 669		cl = cl->borrow;
 670		if (!cl) {
 671			this_cl->qstats.overlimits++;
 672			cbq_overlimit(this_cl);
 673			return NULL;
 674		}
 675		if (cl->level > q->toplevel)
 676			return NULL;
 677	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 678
 679	cl->delayed = 0;
 680	return cl;
 681}
 682
 683static inline struct sk_buff *
 684cbq_dequeue_prio(struct Qdisc *sch, int prio)
 685{
 686	struct cbq_sched_data *q = qdisc_priv(sch);
 687	struct cbq_class *cl_tail, *cl_prev, *cl;
 688	struct sk_buff *skb;
 689	int deficit;
 690
 691	cl_tail = cl_prev = q->active[prio];
 692	cl = cl_prev->next_alive;
 693
 694	do {
 695		deficit = 0;
 696
 697		/* Start round */
 698		do {
 699			struct cbq_class *borrow = cl;
 700
 701			if (cl->q->q.qlen &&
 702			    (borrow = cbq_under_limit(cl)) == NULL)
 703				goto skip_class;
 704
 705			if (cl->deficit <= 0) {
 706				/* Class exhausted its allotment per
 707				 * this round. Switch to the next one.
 708				 */
 709				deficit = 1;
 710				cl->deficit += cl->quantum;
 711				goto next_class;
 712			}
 713
 714			skb = cl->q->dequeue(cl->q);
 715
 716			/* Class did not give us any skb :-(
 717			 * This can occur even if cl->q->q.qlen != 0,
 718			 * e.g. if cl->q == "tbf"
 719			 */
 720			if (skb == NULL)
 721				goto skip_class;
 722
 723			cl->deficit -= qdisc_pkt_len(skb);
 724			q->tx_class = cl;
 725			q->tx_borrowed = borrow;
 726			if (borrow != cl) {
 727#ifndef CBQ_XSTATS_BORROWS_BYTES
 728				borrow->xstats.borrows++;
 729				cl->xstats.borrows++;
 730#else
 731				borrow->xstats.borrows += qdisc_pkt_len(skb);
 732				cl->xstats.borrows += qdisc_pkt_len(skb);
 733#endif
 734			}
 735			q->tx_len = qdisc_pkt_len(skb);
 736
 737			if (cl->deficit <= 0) {
 738				q->active[prio] = cl;
 739				cl = cl->next_alive;
 740				cl->deficit += cl->quantum;
 741			}
 742			return skb;
 743
 744skip_class:
 745			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 746				/* Class is empty or penalized.
 747				 * Unlink it from active chain.
 748				 */
 749				cl_prev->next_alive = cl->next_alive;
 750				cl->next_alive = NULL;
 751
 752				/* Did cl_tail point to it? */
 753				if (cl == cl_tail) {
 754					/* Repair it! */
 755					cl_tail = cl_prev;
 756
 757					/* Was it the last class in this band? */
 758					if (cl == cl_tail) {
 759						/* Kill the band! */
 760						q->active[prio] = NULL;
 761						q->activemask &= ~(1<<prio);
 762						if (cl->q->q.qlen)
 763							cbq_activate_class(cl);
 764						return NULL;
 765					}
 766
 767					q->active[prio] = cl_tail;
 768				}
 769				if (cl->q->q.qlen)
 770					cbq_activate_class(cl);
 771
 772				cl = cl_prev;
 773			}
 774
 775next_class:
 776			cl_prev = cl;
 777			cl = cl->next_alive;
 778		} while (cl_prev != cl_tail);
 779	} while (deficit);
 780
 781	q->active[prio] = cl_prev;
 782
 783	return NULL;
 784}
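/* Rough sketch of the deficit accounting above (illustrative numbers only):
 * with quantum = 1500, a class that has sent a 1000-byte and then a
 * 1200-byte packet is left with deficit -700; at its next visit it is
 * passed over, its deficit is replenished by its quantum (to 800), and the
 * band is scanned for one more round.
 */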
 785
 786static inline struct sk_buff *
 787cbq_dequeue_1(struct Qdisc *sch)
 788{
 789	struct cbq_sched_data *q = qdisc_priv(sch);
 790	struct sk_buff *skb;
 791	unsigned int activemask;
 792
 793	activemask = q->activemask & 0xFF;
 794	while (activemask) {
 795		int prio = ffz(~activemask);
 796		activemask &= ~(1<<prio);
 797		skb = cbq_dequeue_prio(sch, prio);
 798		if (skb)
 799			return skb;
 800	}
 801	return NULL;
 802}
 803
 804static struct sk_buff *
 805cbq_dequeue(struct Qdisc *sch)
 806{
 807	struct sk_buff *skb;
 808	struct cbq_sched_data *q = qdisc_priv(sch);
 809	psched_time_t now;
 810
 811	now = psched_get_time();
 812
 813	if (q->tx_class)
 814		cbq_update(q);
 815
 816	q->now = now;
 817
 818	for (;;) {
 819		q->wd_expires = 0;
 820
 821		skb = cbq_dequeue_1(sch);
 822		if (skb) {
 823			qdisc_bstats_update(sch, skb);
 824			sch->q.qlen--;
 825			return skb;
 826		}
 827
 828		/* All the classes are overlimit.
 829		 *
 830		 * This is possible if:
 831		 *
 832		 * 1. The scheduler is empty.
 833		 * 2. The toplevel cutoff inhibited borrowing.
 834		 * 3. The root class is overlimit.
 835		 *
 836		 * Reset the 2nd and 3rd conditions and retry.
 837		 *
 838		 * Note that NS and cbq-2.0 are buggy: peeking
 839		 * an arbitrary class is appropriate for ancestor-only
 840		 * sharing, but not for the toplevel algorithm.
 841		 *
 842		 * Our version is better but slower, because it requires
 843		 * two passes; this is unavoidable with top-level sharing.
 844		 */
 845
 846		if (q->toplevel == TC_CBQ_MAXLEVEL &&
 847		    q->link.undertime == PSCHED_PASTPERFECT)
 848			break;
 849
 850		q->toplevel = TC_CBQ_MAXLEVEL;
 851		q->link.undertime = PSCHED_PASTPERFECT;
 852	}
 853
 854	/* No packets in the scheduler, or nobody wants to give them to us :-(
 855	 * Sigh... start the watchdog timer in the latter case.
 856	 */
 857
 858	if (sch->q.qlen) {
 859		qdisc_qstats_overlimit(sch);
 860		if (q->wd_expires)
 861			qdisc_watchdog_schedule(&q->watchdog,
 862						now + q->wd_expires);
 863	}
 864	return NULL;
 865}
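/* Roughly: if packets remain but every class is overlimit, the watchdog
 * armed above fires at now + q->wd_expires (the earliest expected
 * undertime) and reschedules the qdisc so that dequeue is retried.
 */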
 866
 867/* CBQ class maintenance routines */
 868
 869static void cbq_adjust_levels(struct cbq_class *this)
 870{
 871	if (this == NULL)
 872		return;
 873
 874	do {
 875		int level = 0;
 876		struct cbq_class *cl;
 877
 878		cl = this->children;
 879		if (cl) {
 880			do {
 881				if (cl->level > level)
 882					level = cl->level;
 883			} while ((cl = cl->sibling) != this->children);
 884		}
 885		this->level = level + 1;
 886	} while ((this = this->tparent) != NULL);
 887}
 888
 889static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 890{
 891	struct cbq_class *cl;
 892	unsigned int h;
 893
 894	if (q->quanta[prio] == 0)
 895		return;
 896
 897	for (h = 0; h < q->clhash.hashsize; h++) {
 898		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
 899			/* BUGGGG... Beware! This expression suffers from
 900			 * arithmetic overflows!
 901			 */
 902			if (cl->priority == prio) {
 903				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 904					q->quanta[prio];
 905			}
 906			if (cl->quantum <= 0 ||
 907			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 908				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 909					cl->common.classid, cl->quantum);
 910				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 911			}
 912		}
 913	}
 914}
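/* Worked instance of the quantum formula above (illustrative numbers only):
 * two classes at the same prio with weights 100 and 300 and allot 1500 each
 * give q->nclasses[prio] = 2 and q->quanta[prio] = 400, so their quanta
 * become 100*1500*2/400 = 750 and 300*1500*2/400 = 2250 respectively.
 */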
 915
 916static void cbq_sync_defmap(struct cbq_class *cl)
 917{
 918	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 919	struct cbq_class *split = cl->split;
 920	unsigned int h;
 921	int i;
 922
 923	if (split == NULL)
 924		return;
 925
 926	for (i = 0; i <= TC_PRIO_MAX; i++) {
 927		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 928			split->defaults[i] = NULL;
 929	}
 930
 931	for (i = 0; i <= TC_PRIO_MAX; i++) {
 932		int level = split->level;
 933
 934		if (split->defaults[i])
 935			continue;
 936
 937		for (h = 0; h < q->clhash.hashsize; h++) {
 938			struct cbq_class *c;
 939
 940			hlist_for_each_entry(c, &q->clhash.hash[h],
 941					     common.hnode) {
 942				if (c->split == split && c->level < level &&
 943				    c->defmap & (1<<i)) {
 944					split->defaults[i] = c;
 945					level = c->level;
 946				}
 947			}
 948		}
 949	}
 950}
 951
 952static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 953{
 954	struct cbq_class *split = NULL;
 955
 956	if (splitid == 0) {
 957		split = cl->split;
 958		if (!split)
 959			return;
 960		splitid = split->common.classid;
 961	}
 962
 963	if (split == NULL || split->common.classid != splitid) {
 964		for (split = cl->tparent; split; split = split->tparent)
 965			if (split->common.classid == splitid)
 966				break;
 967	}
 968
 969	if (split == NULL)
 970		return;
 971
 972	if (cl->split != split) {
 973		cl->defmap = 0;
 974		cbq_sync_defmap(cl);
 975		cl->split = split;
 976		cl->defmap = def & mask;
 977	} else
 978		cl->defmap = (cl->defmap & ~mask) | (def & mask);
 979
 980	cbq_sync_defmap(cl);
 981}
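/* Minimal usage sketch (hypothetical values): cbq_change_defmap(cl, splitid,
 * 0x0e, 0x0e) sets bits 1-3 of cl->defmap, after which cbq_sync_defmap()
 * may install cl as the split node's default for TC_PRIO values 1, 2 and 3
 * (an unassigned slot goes to the lowest-level claimant under the split).
 */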
 982
 983static void cbq_unlink_class(struct cbq_class *this)
 984{
 985	struct cbq_class *cl, **clp;
 986	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 987
 988	qdisc_class_hash_remove(&q->clhash, &this->common);
 989
 990	if (this->tparent) {
 991		clp = &this->sibling;
 992		cl = *clp;
 993		do {
 994			if (cl == this) {
 995				*clp = cl->sibling;
 996				break;
 997			}
 998			clp = &cl->sibling;
 999		} while ((cl = *clp) != this->sibling);
1000
1001		if (this->tparent->children == this) {
1002			this->tparent->children = this->sibling;
1003			if (this->sibling == this)
1004				this->tparent->children = NULL;
1005		}
1006	} else {
1007		WARN_ON(this->sibling != this);
1008	}
1009}
1010
1011static void cbq_link_class(struct cbq_class *this)
1012{
1013	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1014	struct cbq_class *parent = this->tparent;
1015
1016	this->sibling = this;
1017	qdisc_class_hash_insert(&q->clhash, &this->common);
1018
1019	if (parent == NULL)
1020		return;
1021
1022	if (parent->children == NULL) {
1023		parent->children = this;
1024	} else {
1025		this->sibling = parent->children->sibling;
1026		parent->children->sibling = this;
1027	}
1028}
1029
1030static void
1031cbq_reset(struct Qdisc *sch)
1032{
1033	struct cbq_sched_data *q = qdisc_priv(sch);
1034	struct cbq_class *cl;
1035	int prio;
1036	unsigned int h;
1037
1038	q->activemask = 0;
1039	q->pmask = 0;
1040	q->tx_class = NULL;
1041	q->tx_borrowed = NULL;
1042	qdisc_watchdog_cancel(&q->watchdog);
1043	hrtimer_cancel(&q->delay_timer);
1044	q->toplevel = TC_CBQ_MAXLEVEL;
1045	q->now = psched_get_time();
1046
1047	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1048		q->active[prio] = NULL;
1049
1050	for (h = 0; h < q->clhash.hashsize; h++) {
1051		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1052			qdisc_reset(cl->q);
1053
1054			cl->next_alive = NULL;
1055			cl->undertime = PSCHED_PASTPERFECT;
1056			cl->avgidle = cl->maxidle;
1057			cl->deficit = cl->quantum;
1058			cl->cpriority = cl->priority;
1059		}
1060	}
1061	sch->q.qlen = 0;
1062}
1063
1064
1065static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1066{
1067	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1068		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1069		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1070	}
1071	if (lss->change & TCF_CBQ_LSS_EWMA)
1072		cl->ewma_log = lss->ewma_log;
1073	if (lss->change & TCF_CBQ_LSS_AVPKT)
1074		cl->avpkt = lss->avpkt;
1075	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1076		cl->minidle = -(long)lss->minidle;
1077	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1078		cl->maxidle = lss->maxidle;
1079		cl->avgidle = lss->maxidle;
1080	}
1081	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1082		cl->offtime = lss->offtime;
1083	return 0;
1084}
1085
1086static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1087{
1088	q->nclasses[cl->priority]--;
1089	q->quanta[cl->priority] -= cl->weight;
1090	cbq_normalize_quanta(q, cl->priority);
1091}
1092
1093static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1094{
1095	q->nclasses[cl->priority]++;
1096	q->quanta[cl->priority] += cl->weight;
1097	cbq_normalize_quanta(q, cl->priority);
1098}
1099
1100static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1101{
1102	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1103
1104	if (wrr->allot)
1105		cl->allot = wrr->allot;
1106	if (wrr->weight)
1107		cl->weight = wrr->weight;
1108	if (wrr->priority) {
1109		cl->priority = wrr->priority - 1;
1110		cl->cpriority = cl->priority;
1111		if (cl->priority >= cl->priority2)
1112			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1113	}
1114
1115	cbq_addprio(q, cl);
1116	return 0;
1117}
1118
1119static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1120{
1121	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1122	return 0;
1123}
1124
1125static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1126	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1127	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1128	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1129	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1130	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1131	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1132	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1133};
1134
1135static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1136		    struct netlink_ext_ack *extack)
1137{
1138	struct cbq_sched_data *q = qdisc_priv(sch);
1139	struct nlattr *tb[TCA_CBQ_MAX + 1];
1140	struct tc_ratespec *r;
1141	int err;
1142
1143	qdisc_watchdog_init(&q->watchdog, sch);
1144	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1145	q->delay_timer.function = cbq_undelay;
1146
1147	if (!opt) {
1148		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1149		return -EINVAL;
1150	}
1151
1152	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
1153	if (err < 0)
1154		return err;
1155
1156	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1157		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1158		return -EINVAL;
1159	}
1160
1161	r = nla_data(tb[TCA_CBQ_RATE]);
1162
1163	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1164	if (!q->link.R_tab)
1165		return -EINVAL;
1166
1167	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1168	if (err)
1169		goto put_rtab;
1170
1171	err = qdisc_class_hash_init(&q->clhash);
1172	if (err < 0)
1173		goto put_block;
1174
1175	q->link.sibling = &q->link;
1176	q->link.common.classid = sch->handle;
1177	q->link.qdisc = sch;
1178	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1179				      sch->handle, NULL);
1180	if (!q->link.q)
1181		q->link.q = &noop_qdisc;
1182	else
1183		qdisc_hash_add(q->link.q, true);
1184
1185	q->link.priority = TC_CBQ_MAXPRIO - 1;
1186	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1187	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1188	q->link.allot = psched_mtu(qdisc_dev(sch));
1189	q->link.quantum = q->link.allot;
1190	q->link.weight = q->link.R_tab->rate.rate;
1191
1192	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1193	q->link.avpkt = q->link.allot/2;
1194	q->link.minidle = -0x7FFFFFFF;
1195
1196	q->toplevel = TC_CBQ_MAXLEVEL;
1197	q->now = psched_get_time();
1198
1199	cbq_link_class(&q->link);
1200
1201	if (tb[TCA_CBQ_LSSOPT])
1202		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1203
1204	cbq_addprio(q, &q->link);
1205	return 0;
1206
1207put_block:
1208	tcf_block_put(q->link.block);
1209
1210put_rtab:
1211	qdisc_put_rtab(q->link.R_tab);
1212	return err;
1213}
1214
1215static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1216{
1217	unsigned char *b = skb_tail_pointer(skb);
1218
1219	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1220		goto nla_put_failure;
1221	return skb->len;
1222
1223nla_put_failure:
1224	nlmsg_trim(skb, b);
1225	return -1;
1226}
1227
1228static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1229{
1230	unsigned char *b = skb_tail_pointer(skb);
1231	struct tc_cbq_lssopt opt;
1232
1233	opt.flags = 0;
1234	if (cl->borrow == NULL)
1235		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1236	if (cl->share == NULL)
1237		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1238	opt.ewma_log = cl->ewma_log;
1239	opt.level = cl->level;
1240	opt.avpkt = cl->avpkt;
1241	opt.maxidle = cl->maxidle;
1242	opt.minidle = (u32)(-cl->minidle);
1243	opt.offtime = cl->offtime;
1244	opt.change = ~0;
1245	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1246		goto nla_put_failure;
1247	return skb->len;
1248
1249nla_put_failure:
1250	nlmsg_trim(skb, b);
1251	return -1;
1252}
1253
1254static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1255{
1256	unsigned char *b = skb_tail_pointer(skb);
1257	struct tc_cbq_wrropt opt;
1258
1259	memset(&opt, 0, sizeof(opt));
1260	opt.flags = 0;
1261	opt.allot = cl->allot;
1262	opt.priority = cl->priority + 1;
1263	opt.cpriority = cl->cpriority + 1;
1264	opt.weight = cl->weight;
1265	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1266		goto nla_put_failure;
1267	return skb->len;
1268
1269nla_put_failure:
1270	nlmsg_trim(skb, b);
1271	return -1;
1272}
1273
1274static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1275{
1276	unsigned char *b = skb_tail_pointer(skb);
1277	struct tc_cbq_fopt opt;
1278
1279	if (cl->split || cl->defmap) {
1280		opt.split = cl->split ? cl->split->common.classid : 0;
1281		opt.defmap = cl->defmap;
1282		opt.defchange = ~0;
1283		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1284			goto nla_put_failure;
1285	}
1286	return skb->len;
1287
1288nla_put_failure:
1289	nlmsg_trim(skb, b);
1290	return -1;
1291}
1292
1293static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1294{
1295	if (cbq_dump_lss(skb, cl) < 0 ||
1296	    cbq_dump_rate(skb, cl) < 0 ||
1297	    cbq_dump_wrr(skb, cl) < 0 ||
1298	    cbq_dump_fopt(skb, cl) < 0)
1299		return -1;
1300	return 0;
1301}
1302
1303static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1304{
1305	struct cbq_sched_data *q = qdisc_priv(sch);
1306	struct nlattr *nest;
1307
1308	nest = nla_nest_start(skb, TCA_OPTIONS);
1309	if (nest == NULL)
1310		goto nla_put_failure;
1311	if (cbq_dump_attr(skb, &q->link) < 0)
1312		goto nla_put_failure;
1313	return nla_nest_end(skb, nest);
1314
1315nla_put_failure:
1316	nla_nest_cancel(skb, nest);
1317	return -1;
1318}
1319
1320static int
1321cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1322{
1323	struct cbq_sched_data *q = qdisc_priv(sch);
1324
1325	q->link.xstats.avgidle = q->link.avgidle;
1326	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1327}
1328
1329static int
1330cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1331	       struct sk_buff *skb, struct tcmsg *tcm)
1332{
1333	struct cbq_class *cl = (struct cbq_class *)arg;
1334	struct nlattr *nest;
1335
1336	if (cl->tparent)
1337		tcm->tcm_parent = cl->tparent->common.classid;
1338	else
1339		tcm->tcm_parent = TC_H_ROOT;
1340	tcm->tcm_handle = cl->common.classid;
1341	tcm->tcm_info = cl->q->handle;
1342
1343	nest = nla_nest_start(skb, TCA_OPTIONS);
1344	if (nest == NULL)
1345		goto nla_put_failure;
1346	if (cbq_dump_attr(skb, cl) < 0)
1347		goto nla_put_failure;
1348	return nla_nest_end(skb, nest);
1349
1350nla_put_failure:
1351	nla_nest_cancel(skb, nest);
1352	return -1;
1353}
1354
1355static int
1356cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1357	struct gnet_dump *d)
1358{
1359	struct cbq_sched_data *q = qdisc_priv(sch);
1360	struct cbq_class *cl = (struct cbq_class *)arg;
1361
1362	cl->xstats.avgidle = cl->avgidle;
1363	cl->xstats.undertime = 0;
1364
1365	if (cl->undertime != PSCHED_PASTPERFECT)
1366		cl->xstats.undertime = cl->undertime - q->now;
1367
1368	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1369				  d, NULL, &cl->bstats) < 0 ||
1370	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1371	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1372		return -1;
1373
1374	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1375}
1376
1377static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1378		     struct Qdisc **old, struct netlink_ext_ack *extack)
1379{
1380	struct cbq_class *cl = (struct cbq_class *)arg;
1381
1382	if (new == NULL) {
1383		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1384					cl->common.classid, extack);
1385		if (new == NULL)
1386			return -ENOBUFS;
1387	}
1388
1389	*old = qdisc_replace(sch, new, &cl->q);
1390	return 0;
1391}
1392
1393static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1394{
1395	struct cbq_class *cl = (struct cbq_class *)arg;
1396
1397	return cl->q;
1398}
1399
1400static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1401{
1402	struct cbq_class *cl = (struct cbq_class *)arg;
1403
1404	cbq_deactivate_class(cl);
1405}
1406
1407static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1408{
1409	struct cbq_sched_data *q = qdisc_priv(sch);
1410
1411	return (unsigned long)cbq_class_lookup(q, classid);
1412}
1413
1414static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1415{
1416	struct cbq_sched_data *q = qdisc_priv(sch);
1417
1418	WARN_ON(cl->filters);
1419
1420	tcf_block_put(cl->block);
1421	qdisc_destroy(cl->q);
1422	qdisc_put_rtab(cl->R_tab);
1423	gen_kill_estimator(&cl->rate_est);
1424	if (cl != &q->link)
1425		kfree(cl);
1426}
1427
1428static void cbq_destroy(struct Qdisc *sch)
1429{
1430	struct cbq_sched_data *q = qdisc_priv(sch);
1431	struct hlist_node *next;
1432	struct cbq_class *cl;
1433	unsigned int h;
1434
1435#ifdef CONFIG_NET_CLS_ACT
1436	q->rx_class = NULL;
1437#endif
1438	/*
1439	 * Filters must be destroyed first because we don't destroy the
1440	 * classes from root to leaves, which means that filters can still
1441	 * be bound to classes which have been destroyed already. --TGR '04
1442	 */
1443	for (h = 0; h < q->clhash.hashsize; h++) {
1444		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1445			tcf_block_put(cl->block);
1446			cl->block = NULL;
1447		}
1448	}
1449	for (h = 0; h < q->clhash.hashsize; h++) {
1450		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1451					  common.hnode)
1452			cbq_destroy_class(sch, cl);
1453	}
1454	qdisc_class_hash_destroy(&q->clhash);
1455}
1456
1457static int
1458cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1459		 unsigned long *arg, struct netlink_ext_ack *extack)
1460{
1461	int err;
1462	struct cbq_sched_data *q = qdisc_priv(sch);
1463	struct cbq_class *cl = (struct cbq_class *)*arg;
1464	struct nlattr *opt = tca[TCA_OPTIONS];
1465	struct nlattr *tb[TCA_CBQ_MAX + 1];
1466	struct cbq_class *parent;
1467	struct qdisc_rate_table *rtab = NULL;
1468
1469	if (!opt) {
1470		NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
1471		return -EINVAL;
1472	}
1473
1474	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
1475	if (err < 0)
1476		return err;
1477
1478	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1479		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1480		return -EOPNOTSUPP;
1481	}
1482
1483	if (cl) {
1484		/* Check parent */
1485		if (parentid) {
1486			if (cl->tparent &&
1487			    cl->tparent->common.classid != parentid) {
1488				NL_SET_ERR_MSG(extack, "Invalid parent id");
1489				return -EINVAL;
1490			}
1491			if (!cl->tparent && parentid != TC_H_ROOT) {
1492				NL_SET_ERR_MSG(extack, "Parent must be root");
1493				return -EINVAL;
1494			}
1495		}
1496
1497		if (tb[TCA_CBQ_RATE]) {
1498			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1499					      tb[TCA_CBQ_RTAB], extack);
1500			if (rtab == NULL)
1501				return -EINVAL;
1502		}
1503
1504		if (tca[TCA_RATE]) {
1505			err = gen_replace_estimator(&cl->bstats, NULL,
1506						    &cl->rate_est,
1507						    NULL,
1508						    qdisc_root_sleeping_running(sch),
1509						    tca[TCA_RATE]);
1510			if (err) {
1511				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1512				qdisc_put_rtab(rtab);
1513				return err;
1514			}
1515		}
1516
1517		/* Change class parameters */
1518		sch_tree_lock(sch);
1519
1520		if (cl->next_alive != NULL)
1521			cbq_deactivate_class(cl);
1522
1523		if (rtab) {
1524			qdisc_put_rtab(cl->R_tab);
1525			cl->R_tab = rtab;
1526		}
1527
1528		if (tb[TCA_CBQ_LSSOPT])
1529			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1530
1531		if (tb[TCA_CBQ_WRROPT]) {
1532			cbq_rmprio(q, cl);
1533			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1534		}
1535
1536		if (tb[TCA_CBQ_FOPT])
1537			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1538
1539		if (cl->q->q.qlen)
1540			cbq_activate_class(cl);
1541
1542		sch_tree_unlock(sch);
1543
1544		return 0;
1545	}
1546
1547	if (parentid == TC_H_ROOT)
1548		return -EINVAL;
1549
1550	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1551		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1552		return -EINVAL;
1553	}
1554
1555	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1556			      extack);
1557	if (rtab == NULL)
1558		return -EINVAL;
1559
1560	if (classid) {
1561		err = -EINVAL;
1562		if (TC_H_MAJ(classid ^ sch->handle) ||
1563		    cbq_class_lookup(q, classid)) {
1564			NL_SET_ERR_MSG(extack, "Specified class not found");
1565			goto failure;
1566		}
1567	} else {
1568		int i;
1569		classid = TC_H_MAKE(sch->handle, 0x8000);
1570
1571		for (i = 0; i < 0x8000; i++) {
1572			if (++q->hgenerator >= 0x8000)
1573				q->hgenerator = 1;
1574			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1575				break;
1576		}
1577		err = -ENOSR;
1578		if (i >= 0x8000) {
1579			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1580			goto failure;
1581		}
1582		classid = classid|q->hgenerator;
1583	}
1584
1585	parent = &q->link;
1586	if (parentid) {
1587		parent = cbq_class_lookup(q, parentid);
1588		err = -EINVAL;
1589		if (!parent) {
1590			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1591			goto failure;
1592		}
1593	}
1594
1595	err = -ENOBUFS;
1596	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1597	if (cl == NULL)
1598		goto failure;
1599
1600	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1601	if (err) {
1602		kfree(cl);
1603		return err;
1604	}
1605
1606	if (tca[TCA_RATE]) {
1607		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1608					NULL,
1609					qdisc_root_sleeping_running(sch),
1610					tca[TCA_RATE]);
1611		if (err) {
1612			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1613			tcf_block_put(cl->block);
1614			kfree(cl);
1615			goto failure;
1616		}
1617	}
1618
1619	cl->R_tab = rtab;
1620	rtab = NULL;
1621	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1622				  NULL);
1623	if (!cl->q)
1624		cl->q = &noop_qdisc;
1625	else
1626		qdisc_hash_add(cl->q, true);
1627
1628	cl->common.classid = classid;
1629	cl->tparent = parent;
1630	cl->qdisc = sch;
1631	cl->allot = parent->allot;
1632	cl->quantum = cl->allot;
1633	cl->weight = cl->R_tab->rate.rate;
1634
1635	sch_tree_lock(sch);
1636	cbq_link_class(cl);
1637	cl->borrow = cl->tparent;
1638	if (cl->tparent != &q->link)
1639		cl->share = cl->tparent;
1640	cbq_adjust_levels(parent);
1641	cl->minidle = -0x7FFFFFFF;
1642	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1643	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1644	if (cl->ewma_log == 0)
1645		cl->ewma_log = q->link.ewma_log;
1646	if (cl->maxidle == 0)
1647		cl->maxidle = q->link.maxidle;
1648	if (cl->avpkt == 0)
1649		cl->avpkt = q->link.avpkt;
1650	if (tb[TCA_CBQ_FOPT])
1651		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1652	sch_tree_unlock(sch);
1653
1654	qdisc_class_hash_grow(sch, &q->clhash);
1655
1656	*arg = (unsigned long)cl;
1657	return 0;
1658
1659failure:
1660	qdisc_put_rtab(rtab);
1661	return err;
1662}
1663
1664static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1665{
1666	struct cbq_sched_data *q = qdisc_priv(sch);
1667	struct cbq_class *cl = (struct cbq_class *)arg;
1668	unsigned int qlen, backlog;
1669
1670	if (cl->filters || cl->children || cl == &q->link)
1671		return -EBUSY;
1672
1673	sch_tree_lock(sch);
1674
1675	qlen = cl->q->q.qlen;
1676	backlog = cl->q->qstats.backlog;
1677	qdisc_reset(cl->q);
1678	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1679
1680	if (cl->next_alive)
1681		cbq_deactivate_class(cl);
1682
1683	if (q->tx_borrowed == cl)
1684		q->tx_borrowed = q->tx_class;
1685	if (q->tx_class == cl) {
1686		q->tx_class = NULL;
1687		q->tx_borrowed = NULL;
1688	}
1689#ifdef CONFIG_NET_CLS_ACT
1690	if (q->rx_class == cl)
1691		q->rx_class = NULL;
1692#endif
1693
1694	cbq_unlink_class(cl);
1695	cbq_adjust_levels(cl->tparent);
1696	cl->defmap = 0;
1697	cbq_sync_defmap(cl);
1698
1699	cbq_rmprio(q, cl);
1700	sch_tree_unlock(sch);
1701
1702	cbq_destroy_class(sch, cl);
1703	return 0;
1704}
1705
1706static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1707				       struct netlink_ext_ack *extack)
1708{
1709	struct cbq_sched_data *q = qdisc_priv(sch);
1710	struct cbq_class *cl = (struct cbq_class *)arg;
1711
1712	if (cl == NULL)
1713		cl = &q->link;
1714
1715	return cl->block;
1716}
1717
1718static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1719				     u32 classid)
1720{
1721	struct cbq_sched_data *q = qdisc_priv(sch);
1722	struct cbq_class *p = (struct cbq_class *)parent;
1723	struct cbq_class *cl = cbq_class_lookup(q, classid);
1724
1725	if (cl) {
1726		if (p && p->level <= cl->level)
1727			return 0;
1728		cl->filters++;
1729		return (unsigned long)cl;
1730	}
1731	return 0;
1732}
1733
1734static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1735{
1736	struct cbq_class *cl = (struct cbq_class *)arg;
1737
1738	cl->filters--;
1739}
1740
1741static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1742{
1743	struct cbq_sched_data *q = qdisc_priv(sch);
1744	struct cbq_class *cl;
1745	unsigned int h;
1746
1747	if (arg->stop)
1748		return;
1749
1750	for (h = 0; h < q->clhash.hashsize; h++) {
1751		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1752			if (arg->count < arg->skip) {
1753				arg->count++;
1754				continue;
1755			}
1756			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1757				arg->stop = 1;
1758				return;
1759			}
1760			arg->count++;
1761		}
1762	}
1763}
1764
1765static const struct Qdisc_class_ops cbq_class_ops = {
1766	.graft		=	cbq_graft,
1767	.leaf		=	cbq_leaf,
1768	.qlen_notify	=	cbq_qlen_notify,
1769	.find		=	cbq_find,
1770	.change		=	cbq_change_class,
1771	.delete		=	cbq_delete,
1772	.walk		=	cbq_walk,
1773	.tcf_block	=	cbq_tcf_block,
1774	.bind_tcf	=	cbq_bind_filter,
1775	.unbind_tcf	=	cbq_unbind_filter,
1776	.dump		=	cbq_dump_class,
1777	.dump_stats	=	cbq_dump_class_stats,
1778};
1779
1780static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1781	.next		=	NULL,
1782	.cl_ops		=	&cbq_class_ops,
1783	.id		=	"cbq",
1784	.priv_size	=	sizeof(struct cbq_sched_data),
1785	.enqueue	=	cbq_enqueue,
1786	.dequeue	=	cbq_dequeue,
1787	.peek		=	qdisc_peek_dequeued,
1788	.init		=	cbq_init,
1789	.reset		=	cbq_reset,
1790	.destroy	=	cbq_destroy,
1791	.change		=	NULL,
1792	.dump		=	cbq_dump,
1793	.dump_stats	=	cbq_dump_stats,
1794	.owner		=	THIS_MODULE,
1795};
1796
1797static int __init cbq_module_init(void)
1798{
1799	return register_qdisc(&cbq_qdisc_ops);
1800}
1801static void __exit cbq_module_exit(void)
1802{
1803	unregister_qdisc(&cbq_qdisc_ops);
1804}
1805module_init(cbq_module_init)
1806module_exit(cbq_module_exit)
1807MODULE_LICENSE("GPL");