net/sched/sch_cbq.c (Linux v5.4)
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
   4 *
   5 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6 */
   7
   8#include <linux/module.h>
   9#include <linux/slab.h>
  10#include <linux/types.h>
  11#include <linux/kernel.h>
  12#include <linux/string.h>
  13#include <linux/errno.h>
  14#include <linux/skbuff.h>
  15#include <net/netlink.h>
  16#include <net/pkt_sched.h>
  17#include <net/pkt_cls.h>
  18
  19
  20/*	Class-Based Queueing (CBQ) algorithm.
  21	=======================================
  22
  23	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  24		 Management Models for Packet Networks",
  25		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  26
  27		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  28
  29		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  30		 Parameters", 1996
  31
  32		 [4] Sally Floyd and Michael Speer, "Experimental Results
  33		 for Class-Based Queueing", 1998, not published.
  34
  35	-----------------------------------------------------------------------
  36
  37	Algorithm skeleton was taken from NS simulator cbq.cc.
  38	If someone wants to check this code against the LBL version,
  39	they should take into account that ONLY the skeleton was borrowed;
  40	the implementation is different. In particular:
  41
  42	--- The WRR algorithm is different. Our version looks more
  43	reasonable (I hope) and works when quanta are allowed to be
  44	less than MTU, which is always the case when real time classes
  45	have small rates. Note that the statement of [3] is
  46	incomplete; delay may actually be estimated even if the class
  47	per-round allotment is less than MTU. Namely, if the per-round
  48	allotment is W*r_i, and r_1+...+r_k = r < 1
  49
  50	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  51
  52	In the worst case we have IntServ estimate with D = W*r+k*MTU
  53	and C = MTU*r. The proof (if correct at all) is trivial.
  54
  55
  56	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
  57	interpret some places, which look like wrong translations
  58	from NS. Anyone is advised to find these differences
  59	and explain to me why I am wrong 8).
  60
  61	--- Linux has no EOI event, so we cannot estimate true class
  62	idle time. The workaround is to consider the next dequeue event
  63	as a sign that the previous packet is finished. This is wrong because of
  64	internal device queueing, but on a permanently loaded link it holds.
  65	Moreover, combined with the clock integrator, this scheme looks
  66	very close to an ideal solution.  */
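     /* Worked example of the bound above (illustrative numbers, not from
      * the original text): take MTU = 1500 bytes, k = 2 real time classes
      * with r_1 = r_2 = 0.25 (so r = 0.5), and W chosen so that the
      * per-round allotment W*r_i = 500 bytes, i.e. less than MTU.  Then
      * [MTU/(W*r_i)] = 1500/500 = 3 and W*r = 1000 bytes, so
      *
      *	delay_i <= (3*1000 + 1000 + 2*1500)/B = 7000 bytes / B,
      *
      * i.e. about 5.6 ms on a 10 Mbit/s link (B = 1.25e6 bytes/s).
      */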
  67
  68struct cbq_sched_data;
  69
  70
  71struct cbq_class {
  72	struct Qdisc_class_common common;
  73	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
  74
  75/* Parameters */
  76	unsigned char		priority;	/* class priority */
  77	unsigned char		priority2;	/* priority to be used after overlimit */
  78	unsigned char		ewma_log;	/* time constant for idle time calculation */
  79
  80	u32			defmap;
  81
  82	/* Link-sharing scheduler parameters */
  83	long			maxidle;	/* Class parameters: see below. */
  84	long			offtime;
  85	long			minidle;
  86	u32			avpkt;
  87	struct qdisc_rate_table	*R_tab;
  88
  89	/* General scheduler (WRR) parameters */
  90	long			allot;
  91	long			quantum;	/* Allotment per WRR round */
  92	long			weight;		/* Relative allotment: see below */
  93
  94	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
  95	struct cbq_class	*split;		/* Ptr to split node */
  96	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
  97	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
  98	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
  99						   parent otherwise */
 100	struct cbq_class	*sibling;	/* Sibling chain */
 101	struct cbq_class	*children;	/* Pointer to children chain */
 102
 103	struct Qdisc		*q;		/* Elementary queueing discipline */
 104
 105
 106/* Variables */
 107	unsigned char		cpriority;	/* Effective priority */
 108	unsigned char		delayed;
 109	unsigned char		level;		/* level of the class in hierarchy:
 110						   0 for leaf classes, and maximal
 111						   level of children + 1 for nodes.
 112						 */
 113
 114	psched_time_t		last;		/* Last end of service */
 115	psched_time_t		undertime;
 116	long			avgidle;
 117	long			deficit;	/* Saved deficit for WRR */
 118	psched_time_t		penalized;
 119	struct gnet_stats_basic_packed bstats;
 120	struct gnet_stats_queue qstats;
 121	struct net_rate_estimator __rcu *rate_est;
 122	struct tc_cbq_xstats	xstats;
 123
 124	struct tcf_proto __rcu	*filter_list;
 125	struct tcf_block	*block;
 126
 127	int			filters;
 128
 129	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
 130};
 131
 132struct cbq_sched_data {
 133	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
 134	int			nclasses[TC_CBQ_MAXPRIO + 1];
 135	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
 136
 137	struct cbq_class	link;
 138
 139	unsigned int		activemask;
 140	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
 141								   with backlog */
 142
 143#ifdef CONFIG_NET_CLS_ACT
 144	struct cbq_class	*rx_class;
 145#endif
 146	struct cbq_class	*tx_class;
 147	struct cbq_class	*tx_borrowed;
 148	int			tx_len;
 149	psched_time_t		now;		/* Cached timestamp */
 150	unsigned int		pmask;
 151
 152	struct hrtimer		delay_timer;
 153	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
 154						   started when CBQ has
 155						   backlog, but cannot
 156						   transmit just now */
 157	psched_tdiff_t		wd_expires;
 158	int			toplevel;
 159	u32			hgenerator;
 160};
 161
 162
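     /* L2T() ("length to time") converts a packet length into the time it
      * takes to transmit it at the class rate, by indexing the rate table
      * (R_tab) supplied from user space.  Rough illustration, assuming a
      * table built for a 1 Mbit/s class: a 1000 byte packet maps to about
      * 8 ms worth of psched ticks; the exact value depends on the cell
      * granularity of the table.
      */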
 163#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
 164
 165static inline struct cbq_class *
 166cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 167{
 168	struct Qdisc_class_common *clc;
 169
 170	clc = qdisc_class_find(&q->clhash, classid);
 171	if (clc == NULL)
 172		return NULL;
 173	return container_of(clc, struct cbq_class, common);
 174}
 175
 176#ifdef CONFIG_NET_CLS_ACT
 177
 178static struct cbq_class *
 179cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 180{
 181	struct cbq_class *cl;
 182
 183	for (cl = this->tparent; cl; cl = cl->tparent) {
 184		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 185
 186		if (new != NULL && new != this)
 187			return new;
 188	}
 189	return NULL;
 190}
 191
 192#endif
 193
 194/* Classify packet. The procedure is pretty complicated, but
 195 * it allows us to combine link sharing and priority scheduling
 196 * transparently.
 197 *
 198 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 199 * so that classification resolves to split nodes. Then packets are classified
 200 * by logical priority, or a more specific classifier may be attached
 201 * to the split node.
 202 */
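     /* Note on Step 1 below (illustrative example, not from the original
      * text): TC_H_MAJ(prio ^ sch->handle) == 0 means the major part of
      * skb->priority equals this qdisc's handle.  E.g. with
      * sch->handle = 0x00010000 ("1:") and skb->priority = 0x00010005
      * ("1:5"), the XOR leaves only minor bits, so the packet maps straight
      * to class 1:5 if such a class exists.
      */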
 203
 204static struct cbq_class *
 205cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 206{
 207	struct cbq_sched_data *q = qdisc_priv(sch);
 208	struct cbq_class *head = &q->link;
 209	struct cbq_class **defmap;
 210	struct cbq_class *cl = NULL;
 211	u32 prio = skb->priority;
 212	struct tcf_proto *fl;
 213	struct tcf_result res;
 214
 215	/*
 216	 *  Step 1. If skb->priority points to one of our classes, use it.
 217	 */
 218	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 219	    (cl = cbq_class_lookup(q, prio)) != NULL)
 220		return cl;
 221
 222	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 223	for (;;) {
 224		int result = 0;
 225		defmap = head->defaults;
 226
 227		fl = rcu_dereference_bh(head->filter_list);
 228		/*
 229		 * Step 2+n. Apply classifier.
 230		 */
 231		result = tcf_classify(skb, fl, &res, true);
 232		if (!fl || result < 0)
 233			goto fallback;
 234
 235		cl = (void *)res.class;
 236		if (!cl) {
 237			if (TC_H_MAJ(res.classid))
 238				cl = cbq_class_lookup(q, res.classid);
 239			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 240				cl = defmap[TC_PRIO_BESTEFFORT];
 241
 242			if (cl == NULL)
 243				goto fallback;
 244		}
 245		if (cl->level >= head->level)
 246			goto fallback;
 247#ifdef CONFIG_NET_CLS_ACT
 248		switch (result) {
 249		case TC_ACT_QUEUED:
 250		case TC_ACT_STOLEN:
 251		case TC_ACT_TRAP:
 252			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 253			/* fall through */
 254		case TC_ACT_SHOT:
 255			return NULL;
 256		case TC_ACT_RECLASSIFY:
 257			return cbq_reclassify(skb, cl);
 258		}
 259#endif
 260		if (cl->level == 0)
 261			return cl;
 262
 263		/*
 264		 * Step 3+n. If classifier selected a link sharing class,
 265		 *	   apply agency specific classifier.
 266		 *	   Repeat this procedure until we hit a leaf node.
 267		 */
 268		head = cl;
 269	}
 270
 271fallback:
 272	cl = head;
 273
 274	/*
 275	 * Step 4. No success...
 276	 */
 277	if (TC_H_MAJ(prio) == 0 &&
 278	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 279	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 280		return head;
 281
 282	return cl;
 283}
 284
 285/*
 286 * A packet has just been enqueued on an empty class.
 287 * cbq_activate_class adds it to the tail of the active class list
 288 * of its priority band.
 289 */
 290
 291static inline void cbq_activate_class(struct cbq_class *cl)
 292{
 293	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 294	int prio = cl->cpriority;
 295	struct cbq_class *cl_tail;
 296
 297	cl_tail = q->active[prio];
 298	q->active[prio] = cl;
 299
 300	if (cl_tail != NULL) {
 301		cl->next_alive = cl_tail->next_alive;
 302		cl_tail->next_alive = cl;
 303	} else {
 304		cl->next_alive = cl;
 305		q->activemask |= (1<<prio);
 306	}
 307}
 308
 309/*
 310 * Unlink class from active chain.
 311 * Note that this same procedure is done directly in cbq_dequeue*
 312 * during round-robin procedure.
 313 */
 314
 315static void cbq_deactivate_class(struct cbq_class *this)
 316{
 317	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 318	int prio = this->cpriority;
 319	struct cbq_class *cl;
 320	struct cbq_class *cl_prev = q->active[prio];
 321
 322	do {
 323		cl = cl_prev->next_alive;
 324		if (cl == this) {
 325			cl_prev->next_alive = cl->next_alive;
 326			cl->next_alive = NULL;
 327
 328			if (cl == q->active[prio]) {
 329				q->active[prio] = cl_prev;
 330				if (cl == q->active[prio]) {
 331					q->active[prio] = NULL;
 332					q->activemask &= ~(1<<prio);
 333					return;
 334				}
 335			}
 336			return;
 337		}
 338	} while ((cl_prev = cl) != q->active[prio]);
 339}
 340
 341static void
 342cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 343{
 344	int toplevel = q->toplevel;
 345
 346	if (toplevel > cl->level) {
 347		psched_time_t now = psched_get_time();
 348
 349		do {
 350			if (cl->undertime < now) {
 351				q->toplevel = cl->level;
 352				return;
 353			}
 354		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 355	}
 356}
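     /* Summary of the helper above: cbq_mark_toplevel() walks the borrow
      * chain of the class that just received a packet and lowers
      * q->toplevel to the level of the first underlimit class it finds.
      * q->toplevel is the cutoff used by cbq_under_limit(): ancestors
      * above it are not allowed to lend bandwidth.
      */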
 357
 358static int
 359cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 360	    struct sk_buff **to_free)
 361{
 362	struct cbq_sched_data *q = qdisc_priv(sch);
 363	int uninitialized_var(ret);
 364	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 365
 366#ifdef CONFIG_NET_CLS_ACT
 367	q->rx_class = cl;
 368#endif
 369	if (cl == NULL) {
 370		if (ret & __NET_XMIT_BYPASS)
 371			qdisc_qstats_drop(sch);
 372		__qdisc_drop(skb, to_free);
 373		return ret;
 374	}
 375
 376	ret = qdisc_enqueue(skb, cl->q, to_free);
 377	if (ret == NET_XMIT_SUCCESS) {
 378		sch->q.qlen++;
 379		cbq_mark_toplevel(q, cl);
 380		if (!cl->next_alive)
 381			cbq_activate_class(cl);
 382		return ret;
 383	}
 384
 385	if (net_xmit_drop_count(ret)) {
 386		qdisc_qstats_drop(sch);
 387		cbq_mark_toplevel(q, cl);
 388		cl->qstats.drops++;
 389	}
 390	return ret;
 391}
 392
 393/* Overlimit action: penalize leaf class by adding offtime */
 394static void cbq_overlimit(struct cbq_class *cl)
 395{
 396	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 397	psched_tdiff_t delay = cl->undertime - q->now;
 398
 399	if (!cl->delayed) {
 400		delay += cl->offtime;
 401
 402		/*
 403		 * Class goes to sleep, so that it will have no
 404		 * chance to work off its avgidle. Let's forgive it 8)
 405		 *
 406		 * BTW cbq-2.0 has a bug in this
 407		 * place: apparently they forgot to shift it by cl->ewma_log.
 408		 */
 409		if (cl->avgidle < 0)
 410			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 411		if (cl->avgidle < cl->minidle)
 412			cl->avgidle = cl->minidle;
 413		if (delay <= 0)
 414			delay = 1;
 415		cl->undertime = q->now + delay;
 416
 417		cl->xstats.overactions++;
 418		cl->delayed = 1;
 419	}
 420	if (q->wd_expires == 0 || q->wd_expires > delay)
 421		q->wd_expires = delay;
 422
 423	/* Dirty work! We must schedule wakeups based on
 424	 * real available rate, rather than leaf rate,
 425	 * which may be tiny (even zero).
 426	 */
 427	if (q->toplevel == TC_CBQ_MAXLEVEL) {
 428		struct cbq_class *b;
 429		psched_tdiff_t base_delay = q->wd_expires;
 430
 431		for (b = cl->borrow; b; b = b->borrow) {
 432			delay = b->undertime - q->now;
 433			if (delay < base_delay) {
 434				if (delay <= 0)
 435					delay = 1;
 436				base_delay = delay;
 437			}
 438		}
 439
 440		q->wd_expires = base_delay;
 441	}
 442}
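     /* In short, cbq_overlimit() puts the class to sleep until roughly
      * undertime + offtime on its first overlimit event, partly forgives
      * its negative avgidle (clamped at minidle), and arms the watchdog
      * with the nearest pending wakeup, possibly taken from a borrowable
      * ancestor, so that dequeue is retried in time.
      */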
 443
 444static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 445				       psched_time_t now)
 446{
 447	struct cbq_class *cl;
 448	struct cbq_class *cl_prev = q->active[prio];
 449	psched_time_t sched = now;
 450
 451	if (cl_prev == NULL)
 452		return 0;
 453
 454	do {
 455		cl = cl_prev->next_alive;
 456		if (now - cl->penalized > 0) {
 457			cl_prev->next_alive = cl->next_alive;
 458			cl->next_alive = NULL;
 459			cl->cpriority = cl->priority;
 460			cl->delayed = 0;
 461			cbq_activate_class(cl);
 462
 463			if (cl == q->active[prio]) {
 464				q->active[prio] = cl_prev;
 465				if (cl == q->active[prio]) {
 466					q->active[prio] = NULL;
 467					return 0;
 468				}
 469			}
 470
 471			cl = cl_prev->next_alive;
 472		} else if (sched - cl->penalized > 0)
 473			sched = cl->penalized;
 474	} while ((cl_prev = cl) != q->active[prio]);
 475
 476	return sched - now;
 477}
 478
 479static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 480{
 481	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 482						delay_timer);
 483	struct Qdisc *sch = q->watchdog.qdisc;
 484	psched_time_t now;
 485	psched_tdiff_t delay = 0;
 486	unsigned int pmask;
 487
 488	now = psched_get_time();
 489
 490	pmask = q->pmask;
 491	q->pmask = 0;
 492
 493	while (pmask) {
 494		int prio = ffz(~pmask);
 495		psched_tdiff_t tmp;
 496
 497		pmask &= ~(1<<prio);
 498
 499		tmp = cbq_undelay_prio(q, prio, now);
 500		if (tmp > 0) {
 501			q->pmask |= 1<<prio;
 502			if (tmp < delay || delay == 0)
 503				delay = tmp;
 504		}
 505	}
 506
 507	if (delay) {
 508		ktime_t time;
 509
 510		time = 0;
 511		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 512		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 513	}
 514
 515	__netif_schedule(qdisc_root(sch));
 516	return HRTIMER_NORESTART;
 517}
 518
 519/*
 520 * This is a mission-critical procedure.
 521 *
 522 * We "regenerate" the toplevel cutoff if the transmitting class
 523 * has backlog and is not regulated. This is not part of the
 524 * original CBQ description, but it looks more reasonable.
 525 * Probably it is wrong. This question needs further investigation.
 526 */
 527
 528static inline void
 529cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 530		    struct cbq_class *borrowed)
 531{
 532	if (cl && q->toplevel >= borrowed->level) {
 533		if (cl->q->q.qlen > 1) {
 534			do {
 535				if (borrowed->undertime == PSCHED_PASTPERFECT) {
 536					q->toplevel = borrowed->level;
 537					return;
 538				}
 539			} while ((borrowed = borrowed->borrow) != NULL);
 540		}
 541#if 0
 542	/* It is not necessary now. Uncommenting it
 543	   will save CPU cycles, but decrease fairness.
 544	 */
 545		q->toplevel = TC_CBQ_MAXLEVEL;
 546#endif
 547	}
 548}
 549
 550static void
 551cbq_update(struct cbq_sched_data *q)
 552{
 553	struct cbq_class *this = q->tx_class;
 554	struct cbq_class *cl = this;
 555	int len = q->tx_len;
 556	psched_time_t now;
 557
 558	q->tx_class = NULL;
 559	/* Time integrator. We calculate EOS time
 560	 * by adding expected packet transmission time.
 561	 */
 562	now = q->now + L2T(&q->link, len);
 563
 564	for ( ; cl; cl = cl->share) {
 565		long avgidle = cl->avgidle;
 566		long idle;
 567
 568		cl->bstats.packets++;
 569		cl->bstats.bytes += len;
 570
 571		/*
 572		 * (now - last) is total time between packet right edges.
 573		 * (last_pktlen/rate) is "virtual" busy time, so that
 574		 *
 575		 *	idle = (now - last) - last_pktlen/rate
 576		 */
 577
 578		idle = now - cl->last;
 579		if ((unsigned long)idle > 128*1024*1024) {
 580			avgidle = cl->maxidle;
 581		} else {
 582			idle -= L2T(cl, len);
 583
 584		/* true_avgidle := (1-W)*true_avgidle + W*idle,
 585		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 586		 * cl->avgidle == true_avgidle/W,
 587		 * hence:
 588		 */
 589			avgidle += idle - (avgidle>>cl->ewma_log);
 590		}
 591
 592		if (avgidle <= 0) {
 593			/* Overlimit or at-limit */
 594
 595			if (avgidle < cl->minidle)
 596				avgidle = cl->minidle;
 597
 598			cl->avgidle = avgidle;
 599
 600			/* Calculate the expected time when this class
 601			 * will be allowed to send.
 602			 * It will occur when:
 603			 * (1-W)*true_avgidle + W*delay = 0, i.e.
 604			 * idle = (1/W - 1)*(-true_avgidle)
 605			 * or
 606			 * idle = (1 - W)*(-cl->avgidle);
 607			 */
 608			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 609
 610			/*
 611			 * That is not all.
 612			 * To maintain the rate allocated to the class,
 613			 * we add to undertime the virtual clock time
 614			 * necessary to complete the transmitted packet.
 615			 * (len/phys_bandwidth has already passed by
 616			 * the moment cbq_update is called.)
 617			 */
 618
 619			idle -= L2T(&q->link, len);
 620			idle += L2T(cl, len);
 621
 622			cl->undertime = now + idle;
 623		} else {
 624			/* Underlimit */
 625
 626			cl->undertime = PSCHED_PASTPERFECT;
 627			if (avgidle > cl->maxidle)
 628				cl->avgidle = cl->maxidle;
 629			else
 630				cl->avgidle = avgidle;
 631		}
 632		if ((s64)(now - cl->last) > 0)
 633			cl->last = now;
 634	}
 635
 636	cbq_update_toplevel(q, this, q->tx_borrowed);
 637}
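     /* Worked example of the scaled EWMA used in cbq_update() (illustrative
      * numbers): with ewma_log = 5, W = 2^-5 = 1/32 and cl->avgidle storing
      * true_avgidle/W.  If the stored avgidle is 6400 (true value 200) and
      * the measured idle for this packet is 100, then
      *
      *	avgidle += 100 - (6400 >> 5) = 6400 + 100 - 200 = 6300,
      *
      * i.e. a true value of 6300/32 = 196.875, which equals
      * (31/32)*200 + (1/32)*100 as the formula above requires.
      */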
 638
 639static inline struct cbq_class *
 640cbq_under_limit(struct cbq_class *cl)
 641{
 642	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 643	struct cbq_class *this_cl = cl;
 644
 645	if (cl->tparent == NULL)
 646		return cl;
 647
 648	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 649		cl->delayed = 0;
 650		return cl;
 651	}
 652
 653	do {
 654		/* This is a very suspicious place. Currently the overlimit
 655		 * action is generated for unbounded classes
 656		 * only if the link is completely congested.
 657		 * Though this agrees with the ancestor-only paradigm,
 658		 * it looks very stupid. In particular,
 659		 * it means that this chunk of code will either
 660		 * never be called or result in strong amplification
 661		 * of burstiness. Dangerous, silly, and yet
 662		 * no other solution exists.
 663		 */
 664		cl = cl->borrow;
 665		if (!cl) {
 666			this_cl->qstats.overlimits++;
 667			cbq_overlimit(this_cl);
 668			return NULL;
 669		}
 670		if (cl->level > q->toplevel)
 671			return NULL;
 672	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 673
 674	cl->delayed = 0;
 675	return cl;
 676}
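     /* Summary of the function above: it returns the class itself when it
      * is underlimit, otherwise the nearest borrowable ancestor that is
      * underlimit and not above the toplevel cutoff.  NULL means the class
      * may not send now; if there was no borrowable ancestor at all,
      * cbq_overlimit() has already scheduled a wakeup for the original
      * class.
      */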
 677
 678static inline struct sk_buff *
 679cbq_dequeue_prio(struct Qdisc *sch, int prio)
 680{
 681	struct cbq_sched_data *q = qdisc_priv(sch);
 682	struct cbq_class *cl_tail, *cl_prev, *cl;
 683	struct sk_buff *skb;
 684	int deficit;
 685
 686	cl_tail = cl_prev = q->active[prio];
 687	cl = cl_prev->next_alive;
 688
 689	do {
 690		deficit = 0;
 691
 692		/* Start round */
 693		do {
 694			struct cbq_class *borrow = cl;
 695
 696			if (cl->q->q.qlen &&
 697			    (borrow = cbq_under_limit(cl)) == NULL)
 698				goto skip_class;
 699
 700			if (cl->deficit <= 0) {
 701				/* Class exhausted its allotment per
 702				 * this round. Switch to the next one.
 703				 */
 704				deficit = 1;
 705				cl->deficit += cl->quantum;
 706				goto next_class;
 707			}
 708
 709			skb = cl->q->dequeue(cl->q);
 710
 711			/* Class did not give us any skb :-(
 712			 * It can occur even if cl->q->q.qlen != 0,
 713			 * e.g. if cl->q == "tbf"
 714			 */
 715			if (skb == NULL)
 716				goto skip_class;
 717
 718			cl->deficit -= qdisc_pkt_len(skb);
 719			q->tx_class = cl;
 720			q->tx_borrowed = borrow;
 721			if (borrow != cl) {
 722#ifndef CBQ_XSTATS_BORROWS_BYTES
 723				borrow->xstats.borrows++;
 724				cl->xstats.borrows++;
 725#else
 726				borrow->xstats.borrows += qdisc_pkt_len(skb);
 727				cl->xstats.borrows += qdisc_pkt_len(skb);
 728#endif
 729			}
 730			q->tx_len = qdisc_pkt_len(skb);
 731
 732			if (cl->deficit <= 0) {
 733				q->active[prio] = cl;
 734				cl = cl->next_alive;
 735				cl->deficit += cl->quantum;
 736			}
 737			return skb;
 738
 739skip_class:
 740			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 741				/* Class is empty or penalized.
 742				 * Unlink it from active chain.
 743				 */
 744				cl_prev->next_alive = cl->next_alive;
 745				cl->next_alive = NULL;
 746
 747				/* Did cl_tail point to it? */
 748				if (cl == cl_tail) {
 749					/* Repair it! */
 750					cl_tail = cl_prev;
 751
 752					/* Was it the last class in this band? */
 753					if (cl == cl_tail) {
 754						/* Kill the band! */
 755						q->active[prio] = NULL;
 756						q->activemask &= ~(1<<prio);
 757						if (cl->q->q.qlen)
 758							cbq_activate_class(cl);
 759						return NULL;
 760					}
 761
 762					q->active[prio] = cl_tail;
 763				}
 764				if (cl->q->q.qlen)
 765					cbq_activate_class(cl);
 766
 767				cl = cl_prev;
 768			}
 769
 770next_class:
 771			cl_prev = cl;
 772			cl = cl->next_alive;
 773		} while (cl_prev != cl_tail);
 774	} while (deficit);
 775
 776	q->active[prio] = cl_prev;
 777
 778	return NULL;
 779}
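     /* The function above implements the WRR pass for one priority band
      * using a deficit counter: a class may transmit while its deficit is
      * positive, each packet costs qdisc_pkt_len(skb), and quanta are added
      * back as classes are found exhausted, so over time each class gets a
      * share of the band roughly proportional to its quantum.  Illustrative
      * numbers: with quantum = 1514 a class sends about one full-size
      * Ethernet frame per round before being topped up again.
      */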
 780
 781static inline struct sk_buff *
 782cbq_dequeue_1(struct Qdisc *sch)
 783{
 784	struct cbq_sched_data *q = qdisc_priv(sch);
 785	struct sk_buff *skb;
 786	unsigned int activemask;
 787
 788	activemask = q->activemask & 0xFF;
 789	while (activemask) {
 790		int prio = ffz(~activemask);
 791		activemask &= ~(1<<prio);
 792		skb = cbq_dequeue_prio(sch, prio);
 793		if (skb)
 794			return skb;
 795	}
 796	return NULL;
 797}
 798
 799static struct sk_buff *
 800cbq_dequeue(struct Qdisc *sch)
 801{
 802	struct sk_buff *skb;
 803	struct cbq_sched_data *q = qdisc_priv(sch);
 804	psched_time_t now;
 805
 806	now = psched_get_time();
 807
 808	if (q->tx_class)
 809		cbq_update(q);
 810
 811	q->now = now;
 812
 813	for (;;) {
 814		q->wd_expires = 0;
 815
 816		skb = cbq_dequeue_1(sch);
 817		if (skb) {
 818			qdisc_bstats_update(sch, skb);
 819			sch->q.qlen--;
 820			return skb;
 821		}
 822
 823		/* All the classes are overlimit.
 824		 *
 825		 * It is possible, if:
 826		 *
 827		 * 1. Scheduler is empty.
 828		 * 2. Toplevel cutoff inhibited borrowing.
 829		 * 3. Root class is overlimit.
 830		 *
 831		 * Reset the 2nd and 3rd conditions and retry.
 832		 *
 833		 * Note that NS and cbq-2.0 are buggy: picking
 834		 * an arbitrary class is appropriate for ancestor-only
 835		 * sharing, but not for the toplevel algorithm.
 836		 *
 837		 * Our version is better but slower, because it requires
 838		 * two passes; this is unavoidable with top-level sharing.
 839		 */
 840
 841		if (q->toplevel == TC_CBQ_MAXLEVEL &&
 842		    q->link.undertime == PSCHED_PASTPERFECT)
 843			break;
 844
 845		q->toplevel = TC_CBQ_MAXLEVEL;
 846		q->link.undertime = PSCHED_PASTPERFECT;
 847	}
 848
 849	/* No packets in the scheduler, or nobody wants to give them to us :-(
 850	 * Sigh... start the watchdog timer in the latter case.
 851	 */
 852
 853	if (sch->q.qlen) {
 854		qdisc_qstats_overlimit(sch);
 855		if (q->wd_expires)
 856			qdisc_watchdog_schedule(&q->watchdog,
 857						now + q->wd_expires);
 858	}
 859	return NULL;
 860}
 861
 862/* CBQ class maintenance routines */
 863
 864static void cbq_adjust_levels(struct cbq_class *this)
 865{
 866	if (this == NULL)
 867		return;
 868
 869	do {
 870		int level = 0;
 871		struct cbq_class *cl;
 872
 873		cl = this->children;
 874		if (cl) {
 875			do {
 876				if (cl->level > level)
 877					level = cl->level;
 878			} while ((cl = cl->sibling) != this->children);
 879		}
 880		this->level = level + 1;
 881	} while ((this = this->tparent) != NULL);
 882}
 883
 884static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 885{
 886	struct cbq_class *cl;
 887	unsigned int h;
 888
 889	if (q->quanta[prio] == 0)
 890		return;
 891
 892	for (h = 0; h < q->clhash.hashsize; h++) {
 893		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
 894			/* BUGGGG... Beware! This expression suffers from
 895			 * arithmetic overflows!
 896			 */
 897			if (cl->priority == prio) {
 898				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 899					q->quanta[prio];
 900			}
 901			if (cl->quantum <= 0 ||
 902			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 903				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 904					cl->common.classid, cl->quantum);
 905				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 906			}
 907		}
 908	}
 909}
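     /* Illustrative numbers for the normalization above (assuming all three
      * classes use allot = 1514): three classes at the same priority with
      * weights 1, 2 and 1 give q->quanta[prio] = 4 and q->nclasses[prio] = 3,
      * so the quanta become 1*1514*3/4 = 1135, 2*1514*3/4 = 2271 and 1135
      * bytes, i.e. WRR shares proportional to weight while the total stays
      * close to nclasses * allot.
      */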
 910
 911static void cbq_sync_defmap(struct cbq_class *cl)
 912{
 913	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 914	struct cbq_class *split = cl->split;
 915	unsigned int h;
 916	int i;
 917
 918	if (split == NULL)
 919		return;
 920
 921	for (i = 0; i <= TC_PRIO_MAX; i++) {
 922		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 923			split->defaults[i] = NULL;
 924	}
 925
 926	for (i = 0; i <= TC_PRIO_MAX; i++) {
 927		int level = split->level;
 928
 929		if (split->defaults[i])
 930			continue;
 931
 932		for (h = 0; h < q->clhash.hashsize; h++) {
 933			struct cbq_class *c;
 934
 935			hlist_for_each_entry(c, &q->clhash.hash[h],
 936					     common.hnode) {
 937				if (c->split == split && c->level < level &&
 938				    c->defmap & (1<<i)) {
 939					split->defaults[i] = c;
 940					level = c->level;
 941				}
 942			}
 943		}
 944	}
 945}
 946
 947static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 948{
 949	struct cbq_class *split = NULL;
 950
 951	if (splitid == 0) {
 952		split = cl->split;
 953		if (!split)
 954			return;
 955		splitid = split->common.classid;
 956	}
 957
 958	if (split == NULL || split->common.classid != splitid) {
 959		for (split = cl->tparent; split; split = split->tparent)
 960			if (split->common.classid == splitid)
 961				break;
 962	}
 963
 964	if (split == NULL)
 965		return;
 966
 967	if (cl->split != split) {
 968		cl->defmap = 0;
 969		cbq_sync_defmap(cl);
 970		cl->split = split;
 971		cl->defmap = def & mask;
 972	} else
 973		cl->defmap = (cl->defmap & ~mask) | (def & mask);
 974
 975	cbq_sync_defmap(cl);
 976}
 977
 978static void cbq_unlink_class(struct cbq_class *this)
 979{
 980	struct cbq_class *cl, **clp;
 981	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 982
 983	qdisc_class_hash_remove(&q->clhash, &this->common);
 984
 985	if (this->tparent) {
 986		clp = &this->sibling;
 987		cl = *clp;
 988		do {
 989			if (cl == this) {
 990				*clp = cl->sibling;
 991				break;
 992			}
 993			clp = &cl->sibling;
 994		} while ((cl = *clp) != this->sibling);
 995
 996		if (this->tparent->children == this) {
 997			this->tparent->children = this->sibling;
 998			if (this->sibling == this)
 999				this->tparent->children = NULL;
1000		}
1001	} else {
1002		WARN_ON(this->sibling != this);
1003	}
1004}
1005
1006static void cbq_link_class(struct cbq_class *this)
1007{
1008	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009	struct cbq_class *parent = this->tparent;
1010
1011	this->sibling = this;
1012	qdisc_class_hash_insert(&q->clhash, &this->common);
1013
1014	if (parent == NULL)
1015		return;
1016
1017	if (parent->children == NULL) {
1018		parent->children = this;
1019	} else {
1020		this->sibling = parent->children->sibling;
1021		parent->children->sibling = this;
1022	}
1023}
1024
1025static void
1026cbq_reset(struct Qdisc *sch)
1027{
1028	struct cbq_sched_data *q = qdisc_priv(sch);
1029	struct cbq_class *cl;
1030	int prio;
1031	unsigned int h;
1032
1033	q->activemask = 0;
1034	q->pmask = 0;
1035	q->tx_class = NULL;
1036	q->tx_borrowed = NULL;
1037	qdisc_watchdog_cancel(&q->watchdog);
1038	hrtimer_cancel(&q->delay_timer);
1039	q->toplevel = TC_CBQ_MAXLEVEL;
1040	q->now = psched_get_time();
1041
1042	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043		q->active[prio] = NULL;
1044
1045	for (h = 0; h < q->clhash.hashsize; h++) {
1046		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047			qdisc_reset(cl->q);
1048
1049			cl->next_alive = NULL;
1050			cl->undertime = PSCHED_PASTPERFECT;
1051			cl->avgidle = cl->maxidle;
1052			cl->deficit = cl->quantum;
1053			cl->cpriority = cl->priority;
1054		}
1055	}
1056	sch->q.qlen = 0;
1057}
1058
1059
1060static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1061{
1062	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1063		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1064		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1065	}
1066	if (lss->change & TCF_CBQ_LSS_EWMA)
1067		cl->ewma_log = lss->ewma_log;
1068	if (lss->change & TCF_CBQ_LSS_AVPKT)
1069		cl->avpkt = lss->avpkt;
1070	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1071		cl->minidle = -(long)lss->minidle;
1072	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1073		cl->maxidle = lss->maxidle;
1074		cl->avgidle = lss->maxidle;
1075	}
1076	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1077		cl->offtime = lss->offtime;
1078	return 0;
1079}
1080
1081static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1082{
1083	q->nclasses[cl->priority]--;
1084	q->quanta[cl->priority] -= cl->weight;
1085	cbq_normalize_quanta(q, cl->priority);
1086}
1087
1088static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1089{
1090	q->nclasses[cl->priority]++;
1091	q->quanta[cl->priority] += cl->weight;
1092	cbq_normalize_quanta(q, cl->priority);
1093}
1094
1095static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1096{
1097	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1098
1099	if (wrr->allot)
1100		cl->allot = wrr->allot;
1101	if (wrr->weight)
1102		cl->weight = wrr->weight;
1103	if (wrr->priority) {
1104		cl->priority = wrr->priority - 1;
1105		cl->cpriority = cl->priority;
1106		if (cl->priority >= cl->priority2)
1107			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1108	}
1109
1110	cbq_addprio(q, cl);
1111	return 0;
1112}
1113
1114static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1115{
1116	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1117	return 0;
1118}
1119
1120static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1121	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1122	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1123	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1124	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1125	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1126	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1127	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1128};
1129
1130static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1131			 struct nlattr *opt,
1132			 struct netlink_ext_ack *extack)
1133{
1134	int err;
1135
1136	if (!opt) {
1137		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1138		return -EINVAL;
1139	}
1140
1141	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1142					  cbq_policy, extack);
1143	if (err < 0)
1144		return err;
1145
1146	if (tb[TCA_CBQ_WRROPT]) {
1147		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148
1149		if (wrr->priority > TC_CBQ_MAXPRIO) {
1150			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1151			err = -EINVAL;
1152		}
1153	}
1154	return err;
1155}
1156
1157static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1158		    struct netlink_ext_ack *extack)
1159{
1160	struct cbq_sched_data *q = qdisc_priv(sch);
1161	struct nlattr *tb[TCA_CBQ_MAX + 1];
1162	struct tc_ratespec *r;
1163	int err;
1164
1165	qdisc_watchdog_init(&q->watchdog, sch);
1166	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1167	q->delay_timer.function = cbq_undelay;
1168
1169	err = cbq_opt_parse(tb, opt, extack);
1170	if (err < 0)
1171		return err;
1172
1173	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1174		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1175		return -EINVAL;
1176	}
1177
1178	r = nla_data(tb[TCA_CBQ_RATE]);
1179
1180	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1181	if (!q->link.R_tab)
1182		return -EINVAL;
1183
1184	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1185	if (err)
1186		goto put_rtab;
1187
1188	err = qdisc_class_hash_init(&q->clhash);
1189	if (err < 0)
1190		goto put_block;
1191
1192	q->link.sibling = &q->link;
1193	q->link.common.classid = sch->handle;
1194	q->link.qdisc = sch;
1195	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1196				      sch->handle, NULL);
1197	if (!q->link.q)
1198		q->link.q = &noop_qdisc;
1199	else
1200		qdisc_hash_add(q->link.q, true);
1201
1202	q->link.priority = TC_CBQ_MAXPRIO - 1;
1203	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1204	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1205	q->link.allot = psched_mtu(qdisc_dev(sch));
1206	q->link.quantum = q->link.allot;
1207	q->link.weight = q->link.R_tab->rate.rate;
1208
1209	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1210	q->link.avpkt = q->link.allot/2;
1211	q->link.minidle = -0x7FFFFFFF;
1212
1213	q->toplevel = TC_CBQ_MAXLEVEL;
1214	q->now = psched_get_time();
1215
1216	cbq_link_class(&q->link);
1217
1218	if (tb[TCA_CBQ_LSSOPT])
1219		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1220
1221	cbq_addprio(q, &q->link);
1222	return 0;
1223
1224put_block:
1225	tcf_block_put(q->link.block);
1226
1227put_rtab:
1228	qdisc_put_rtab(q->link.R_tab);
1229	return err;
1230}
1231
1232static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1233{
1234	unsigned char *b = skb_tail_pointer(skb);
1235
1236	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1237		goto nla_put_failure;
1238	return skb->len;
1239
1240nla_put_failure:
1241	nlmsg_trim(skb, b);
1242	return -1;
1243}
1244
1245static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1246{
1247	unsigned char *b = skb_tail_pointer(skb);
1248	struct tc_cbq_lssopt opt;
1249
1250	opt.flags = 0;
1251	if (cl->borrow == NULL)
1252		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1253	if (cl->share == NULL)
1254		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1255	opt.ewma_log = cl->ewma_log;
1256	opt.level = cl->level;
1257	opt.avpkt = cl->avpkt;
1258	opt.maxidle = cl->maxidle;
1259	opt.minidle = (u32)(-cl->minidle);
1260	opt.offtime = cl->offtime;
1261	opt.change = ~0;
1262	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1263		goto nla_put_failure;
1264	return skb->len;
1265
1266nla_put_failure:
1267	nlmsg_trim(skb, b);
1268	return -1;
1269}
1270
1271static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1272{
1273	unsigned char *b = skb_tail_pointer(skb);
1274	struct tc_cbq_wrropt opt;
1275
1276	memset(&opt, 0, sizeof(opt));
1277	opt.flags = 0;
1278	opt.allot = cl->allot;
1279	opt.priority = cl->priority + 1;
1280	opt.cpriority = cl->cpriority + 1;
1281	opt.weight = cl->weight;
1282	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1283		goto nla_put_failure;
1284	return skb->len;
1285
1286nla_put_failure:
1287	nlmsg_trim(skb, b);
1288	return -1;
1289}
1290
1291static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1292{
1293	unsigned char *b = skb_tail_pointer(skb);
1294	struct tc_cbq_fopt opt;
1295
1296	if (cl->split || cl->defmap) {
1297		opt.split = cl->split ? cl->split->common.classid : 0;
1298		opt.defmap = cl->defmap;
1299		opt.defchange = ~0;
1300		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1301			goto nla_put_failure;
1302	}
1303	return skb->len;
1304
1305nla_put_failure:
1306	nlmsg_trim(skb, b);
1307	return -1;
1308}
1309
1310static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1311{
1312	if (cbq_dump_lss(skb, cl) < 0 ||
1313	    cbq_dump_rate(skb, cl) < 0 ||
1314	    cbq_dump_wrr(skb, cl) < 0 ||
1315	    cbq_dump_fopt(skb, cl) < 0)
1316		return -1;
1317	return 0;
1318}
1319
1320static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1321{
1322	struct cbq_sched_data *q = qdisc_priv(sch);
1323	struct nlattr *nest;
1324
1325	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1326	if (nest == NULL)
1327		goto nla_put_failure;
1328	if (cbq_dump_attr(skb, &q->link) < 0)
1329		goto nla_put_failure;
1330	return nla_nest_end(skb, nest);
1331
1332nla_put_failure:
1333	nla_nest_cancel(skb, nest);
1334	return -1;
1335}
1336
1337static int
1338cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1339{
1340	struct cbq_sched_data *q = qdisc_priv(sch);
1341
1342	q->link.xstats.avgidle = q->link.avgidle;
1343	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1344}
1345
1346static int
1347cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1348	       struct sk_buff *skb, struct tcmsg *tcm)
1349{
1350	struct cbq_class *cl = (struct cbq_class *)arg;
1351	struct nlattr *nest;
1352
1353	if (cl->tparent)
1354		tcm->tcm_parent = cl->tparent->common.classid;
1355	else
1356		tcm->tcm_parent = TC_H_ROOT;
1357	tcm->tcm_handle = cl->common.classid;
1358	tcm->tcm_info = cl->q->handle;
1359
1360	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1361	if (nest == NULL)
1362		goto nla_put_failure;
1363	if (cbq_dump_attr(skb, cl) < 0)
1364		goto nla_put_failure;
1365	return nla_nest_end(skb, nest);
1366
1367nla_put_failure:
1368	nla_nest_cancel(skb, nest);
1369	return -1;
1370}
1371
1372static int
1373cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1374	struct gnet_dump *d)
1375{
1376	struct cbq_sched_data *q = qdisc_priv(sch);
1377	struct cbq_class *cl = (struct cbq_class *)arg;
1378	__u32 qlen;
1379
1380	cl->xstats.avgidle = cl->avgidle;
1381	cl->xstats.undertime = 0;
1382	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1383
1384	if (cl->undertime != PSCHED_PASTPERFECT)
1385		cl->xstats.undertime = cl->undertime - q->now;
1386
1387	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1388				  d, NULL, &cl->bstats) < 0 ||
1389	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1390	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1391		return -1;
1392
1393	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1394}
1395
1396static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1397		     struct Qdisc **old, struct netlink_ext_ack *extack)
1398{
1399	struct cbq_class *cl = (struct cbq_class *)arg;
1400
1401	if (new == NULL) {
1402		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1403					cl->common.classid, extack);
1404		if (new == NULL)
1405			return -ENOBUFS;
1406	}
1407
1408	*old = qdisc_replace(sch, new, &cl->q);
1409	return 0;
1410}
1411
1412static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1413{
1414	struct cbq_class *cl = (struct cbq_class *)arg;
1415
1416	return cl->q;
1417}
1418
1419static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1420{
1421	struct cbq_class *cl = (struct cbq_class *)arg;
1422
1423	cbq_deactivate_class(cl);
1424}
1425
1426static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1427{
1428	struct cbq_sched_data *q = qdisc_priv(sch);
1429
1430	return (unsigned long)cbq_class_lookup(q, classid);
1431}
1432
1433static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1434{
1435	struct cbq_sched_data *q = qdisc_priv(sch);
1436
1437	WARN_ON(cl->filters);
1438
1439	tcf_block_put(cl->block);
1440	qdisc_put(cl->q);
1441	qdisc_put_rtab(cl->R_tab);
1442	gen_kill_estimator(&cl->rate_est);
1443	if (cl != &q->link)
1444		kfree(cl);
1445}
1446
1447static void cbq_destroy(struct Qdisc *sch)
1448{
1449	struct cbq_sched_data *q = qdisc_priv(sch);
1450	struct hlist_node *next;
1451	struct cbq_class *cl;
1452	unsigned int h;
1453
1454#ifdef CONFIG_NET_CLS_ACT
1455	q->rx_class = NULL;
1456#endif
1457	/*
1458	 * Filters must be destroyed first because we don't destroy the
1459	 * classes from root to leaves, which means that filters can still
1460	 * be bound to classes which have been destroyed already. --TGR '04
1461	 */
1462	for (h = 0; h < q->clhash.hashsize; h++) {
1463		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1464			tcf_block_put(cl->block);
1465			cl->block = NULL;
1466		}
1467	}
1468	for (h = 0; h < q->clhash.hashsize; h++) {
1469		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1470					  common.hnode)
1471			cbq_destroy_class(sch, cl);
1472	}
1473	qdisc_class_hash_destroy(&q->clhash);
1474}
1475
1476static int
1477cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1478		 unsigned long *arg, struct netlink_ext_ack *extack)
1479{
1480	int err;
1481	struct cbq_sched_data *q = qdisc_priv(sch);
1482	struct cbq_class *cl = (struct cbq_class *)*arg;
1483	struct nlattr *opt = tca[TCA_OPTIONS];
1484	struct nlattr *tb[TCA_CBQ_MAX + 1];
1485	struct cbq_class *parent;
1486	struct qdisc_rate_table *rtab = NULL;
1487
1488	err = cbq_opt_parse(tb, opt, extack);
1489	if (err < 0)
1490		return err;
1491
1492	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1493		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1494		return -EOPNOTSUPP;
1495	}
1496
1497	if (cl) {
1498		/* Check parent */
1499		if (parentid) {
1500			if (cl->tparent &&
1501			    cl->tparent->common.classid != parentid) {
1502				NL_SET_ERR_MSG(extack, "Invalid parent id");
1503				return -EINVAL;
1504			}
1505			if (!cl->tparent && parentid != TC_H_ROOT) {
1506				NL_SET_ERR_MSG(extack, "Parent must be root");
1507				return -EINVAL;
1508			}
1509		}
1510
1511		if (tb[TCA_CBQ_RATE]) {
1512			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1513					      tb[TCA_CBQ_RTAB], extack);
1514			if (rtab == NULL)
1515				return -EINVAL;
1516		}
1517
1518		if (tca[TCA_RATE]) {
1519			err = gen_replace_estimator(&cl->bstats, NULL,
1520						    &cl->rate_est,
1521						    NULL,
1522						    qdisc_root_sleeping_running(sch),
1523						    tca[TCA_RATE]);
1524			if (err) {
1525				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1526				qdisc_put_rtab(rtab);
1527				return err;
1528			}
1529		}
1530
1531		/* Change class parameters */
1532		sch_tree_lock(sch);
1533
1534		if (cl->next_alive != NULL)
1535			cbq_deactivate_class(cl);
1536
1537		if (rtab) {
1538			qdisc_put_rtab(cl->R_tab);
1539			cl->R_tab = rtab;
1540		}
1541
1542		if (tb[TCA_CBQ_LSSOPT])
1543			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1544
1545		if (tb[TCA_CBQ_WRROPT]) {
1546			cbq_rmprio(q, cl);
1547			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1548		}
1549
1550		if (tb[TCA_CBQ_FOPT])
1551			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1552
1553		if (cl->q->q.qlen)
1554			cbq_activate_class(cl);
1555
1556		sch_tree_unlock(sch);
1557
1558		return 0;
1559	}
1560
1561	if (parentid == TC_H_ROOT)
1562		return -EINVAL;
1563
1564	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1565		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1566		return -EINVAL;
1567	}
1568
1569	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1570			      extack);
1571	if (rtab == NULL)
1572		return -EINVAL;
1573
1574	if (classid) {
1575		err = -EINVAL;
1576		if (TC_H_MAJ(classid ^ sch->handle) ||
1577		    cbq_class_lookup(q, classid)) {
1578			NL_SET_ERR_MSG(extack, "Specified class not found");
1579			goto failure;
1580		}
1581	} else {
1582		int i;
1583		classid = TC_H_MAKE(sch->handle, 0x8000);
1584
1585		for (i = 0; i < 0x8000; i++) {
1586			if (++q->hgenerator >= 0x8000)
1587				q->hgenerator = 1;
1588			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1589				break;
1590		}
1591		err = -ENOSR;
1592		if (i >= 0x8000) {
1593			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1594			goto failure;
1595		}
1596		classid = classid|q->hgenerator;
1597	}
1598
1599	parent = &q->link;
1600	if (parentid) {
1601		parent = cbq_class_lookup(q, parentid);
1602		err = -EINVAL;
1603		if (!parent) {
1604			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1605			goto failure;
1606		}
1607	}
1608
1609	err = -ENOBUFS;
1610	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1611	if (cl == NULL)
1612		goto failure;
1613
1614	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1615	if (err) {
1616		kfree(cl);
1617		return err;
1618	}
1619
1620	if (tca[TCA_RATE]) {
1621		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1622					NULL,
1623					qdisc_root_sleeping_running(sch),
1624					tca[TCA_RATE]);
1625		if (err) {
1626			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1627			tcf_block_put(cl->block);
1628			kfree(cl);
1629			goto failure;
1630		}
1631	}
1632
1633	cl->R_tab = rtab;
1634	rtab = NULL;
1635	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1636				  NULL);
1637	if (!cl->q)
1638		cl->q = &noop_qdisc;
1639	else
1640		qdisc_hash_add(cl->q, true);
1641
1642	cl->common.classid = classid;
1643	cl->tparent = parent;
1644	cl->qdisc = sch;
1645	cl->allot = parent->allot;
1646	cl->quantum = cl->allot;
1647	cl->weight = cl->R_tab->rate.rate;
1648
1649	sch_tree_lock(sch);
1650	cbq_link_class(cl);
1651	cl->borrow = cl->tparent;
1652	if (cl->tparent != &q->link)
1653		cl->share = cl->tparent;
1654	cbq_adjust_levels(parent);
1655	cl->minidle = -0x7FFFFFFF;
1656	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1657	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1658	if (cl->ewma_log == 0)
1659		cl->ewma_log = q->link.ewma_log;
1660	if (cl->maxidle == 0)
1661		cl->maxidle = q->link.maxidle;
1662	if (cl->avpkt == 0)
1663		cl->avpkt = q->link.avpkt;
1664	if (tb[TCA_CBQ_FOPT])
1665		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1666	sch_tree_unlock(sch);
1667
1668	qdisc_class_hash_grow(sch, &q->clhash);
1669
1670	*arg = (unsigned long)cl;
1671	return 0;
1672
1673failure:
1674	qdisc_put_rtab(rtab);
1675	return err;
1676}
1677
1678static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1679{
1680	struct cbq_sched_data *q = qdisc_priv(sch);
1681	struct cbq_class *cl = (struct cbq_class *)arg;
1682
1683	if (cl->filters || cl->children || cl == &q->link)
1684		return -EBUSY;
1685
1686	sch_tree_lock(sch);
1687
1688	qdisc_purge_queue(cl->q);
1689
1690	if (cl->next_alive)
1691		cbq_deactivate_class(cl);
1692
1693	if (q->tx_borrowed == cl)
1694		q->tx_borrowed = q->tx_class;
1695	if (q->tx_class == cl) {
1696		q->tx_class = NULL;
1697		q->tx_borrowed = NULL;
1698	}
1699#ifdef CONFIG_NET_CLS_ACT
1700	if (q->rx_class == cl)
1701		q->rx_class = NULL;
1702#endif
1703
1704	cbq_unlink_class(cl);
1705	cbq_adjust_levels(cl->tparent);
1706	cl->defmap = 0;
1707	cbq_sync_defmap(cl);
1708
1709	cbq_rmprio(q, cl);
1710	sch_tree_unlock(sch);
1711
1712	cbq_destroy_class(sch, cl);
1713	return 0;
1714}
1715
1716static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1717				       struct netlink_ext_ack *extack)
1718{
1719	struct cbq_sched_data *q = qdisc_priv(sch);
1720	struct cbq_class *cl = (struct cbq_class *)arg;
1721
1722	if (cl == NULL)
1723		cl = &q->link;
1724
1725	return cl->block;
1726}
1727
1728static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1729				     u32 classid)
1730{
1731	struct cbq_sched_data *q = qdisc_priv(sch);
1732	struct cbq_class *p = (struct cbq_class *)parent;
1733	struct cbq_class *cl = cbq_class_lookup(q, classid);
1734
1735	if (cl) {
1736		if (p && p->level <= cl->level)
1737			return 0;
1738		cl->filters++;
1739		return (unsigned long)cl;
1740	}
1741	return 0;
1742}
1743
1744static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1745{
1746	struct cbq_class *cl = (struct cbq_class *)arg;
1747
1748	cl->filters--;
1749}
1750
1751static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1752{
1753	struct cbq_sched_data *q = qdisc_priv(sch);
1754	struct cbq_class *cl;
1755	unsigned int h;
1756
1757	if (arg->stop)
1758		return;
1759
1760	for (h = 0; h < q->clhash.hashsize; h++) {
1761		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1762			if (arg->count < arg->skip) {
1763				arg->count++;
1764				continue;
1765			}
1766			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1767				arg->stop = 1;
1768				return;
1769			}
1770			arg->count++;
1771		}
1772	}
1773}
1774
1775static const struct Qdisc_class_ops cbq_class_ops = {
1776	.graft		=	cbq_graft,
1777	.leaf		=	cbq_leaf,
1778	.qlen_notify	=	cbq_qlen_notify,
1779	.find		=	cbq_find,
1780	.change		=	cbq_change_class,
1781	.delete		=	cbq_delete,
1782	.walk		=	cbq_walk,
1783	.tcf_block	=	cbq_tcf_block,
1784	.bind_tcf	=	cbq_bind_filter,
1785	.unbind_tcf	=	cbq_unbind_filter,
1786	.dump		=	cbq_dump_class,
1787	.dump_stats	=	cbq_dump_class_stats,
1788};
1789
1790static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1791	.next		=	NULL,
1792	.cl_ops		=	&cbq_class_ops,
1793	.id		=	"cbq",
1794	.priv_size	=	sizeof(struct cbq_sched_data),
1795	.enqueue	=	cbq_enqueue,
1796	.dequeue	=	cbq_dequeue,
1797	.peek		=	qdisc_peek_dequeued,
1798	.init		=	cbq_init,
1799	.reset		=	cbq_reset,
1800	.destroy	=	cbq_destroy,
1801	.change		=	NULL,
1802	.dump		=	cbq_dump,
1803	.dump_stats	=	cbq_dump_stats,
1804	.owner		=	THIS_MODULE,
1805};
1806
1807static int __init cbq_module_init(void)
1808{
1809	return register_qdisc(&cbq_qdisc_ops);
1810}
1811static void __exit cbq_module_exit(void)
1812{
1813	unregister_qdisc(&cbq_qdisc_ops);
1814}
1815module_init(cbq_module_init)
1816module_exit(cbq_module_exit)
1817MODULE_LICENSE("GPL");
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
   4 *
   5 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6 */
   7
   8#include <linux/module.h>
   9#include <linux/slab.h>
  10#include <linux/types.h>
  11#include <linux/kernel.h>
  12#include <linux/string.h>
  13#include <linux/errno.h>
  14#include <linux/skbuff.h>
  15#include <net/netlink.h>
  16#include <net/pkt_sched.h>
  17#include <net/pkt_cls.h>
  18
  19
  20/*	Class-Based Queueing (CBQ) algorithm.
  21	=======================================
  22
  23	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  24		 Management Models for Packet Networks",
  25		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  26
  27		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  28
  29		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  30		 Parameters", 1996
  31
  32		 [4] Sally Floyd and Michael Speer, "Experimental Results
  33		 for Class-Based Queueing", 1998, not published.
  34
  35	-----------------------------------------------------------------------
  36
  37	Algorithm skeleton was taken from NS simulator cbq.cc.
  38	If someone wants to check this code against the LBL version,
  39	he should take into account that ONLY the skeleton was borrowed,
  40	the implementation is different. Particularly:
  41
  42	--- The WRR algorithm is different. Our version looks more
  43	reasonable (I hope) and works when quanta are allowed to be
  44	less than MTU, which is always the case when real time classes
  45	have small rates. Note, that the statement of [3] is
  46	incomplete, delay may actually be estimated even if class
  47	per-round allotment is less than MTU. Namely, if per-round
  48	allotment is W*r_i, and r_1+...+r_k = r < 1
  49
  50	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  51
  52	In the worst case we have IntServ estimate with D = W*r+k*MTU
  53	and C = MTU*r. The proof (if correct at all) is trivial.
  54
  55
  56	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
  57	interpret some places, which look like wrong translations
  58	from NS. Anyone is advised to find these differences
  59	and explain to me, why I am wrong 8).
  60
  61	--- Linux has no EOI event, so that we cannot estimate true class
  62	idle time. Workaround is to consider the next dequeue event
  63	as sign that previous packet is finished. This is wrong because of
  64	internal device queueing, but on a permanently loaded link it is true.
  65	Moreover, combined with clock integrator, this scheme looks
  66	very close to an ideal solution.  */
  67
  68struct cbq_sched_data;
  69
  70
  71struct cbq_class {
  72	struct Qdisc_class_common common;
  73	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
  74
  75/* Parameters */
  76	unsigned char		priority;	/* class priority */
  77	unsigned char		priority2;	/* priority to be used after overlimit */
  78	unsigned char		ewma_log;	/* time constant for idle time calculation */
  79
  80	u32			defmap;
  81
  82	/* Link-sharing scheduler parameters */
  83	long			maxidle;	/* Class parameters: see below. */
  84	long			offtime;
  85	long			minidle;
  86	u32			avpkt;
  87	struct qdisc_rate_table	*R_tab;
  88
  89	/* General scheduler (WRR) parameters */
  90	long			allot;
  91	long			quantum;	/* Allotment per WRR round */
  92	long			weight;		/* Relative allotment: see below */
  93
  94	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
  95	struct cbq_class	*split;		/* Ptr to split node */
  96	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
  97	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
  98	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
  99						   parent otherwise */
 100	struct cbq_class	*sibling;	/* Sibling chain */
 101	struct cbq_class	*children;	/* Pointer to children chain */
 102
 103	struct Qdisc		*q;		/* Elementary queueing discipline */
 104
 105
 106/* Variables */
 107	unsigned char		cpriority;	/* Effective priority */
 108	unsigned char		delayed;
 109	unsigned char		level;		/* level of the class in hierarchy:
 110						   0 for leaf classes, and maximal
 111						   level of children + 1 for nodes.
 112						 */
 113
 114	psched_time_t		last;		/* Last end of service */
 115	psched_time_t		undertime;
 116	long			avgidle;
 117	long			deficit;	/* Saved deficit for WRR */
 118	psched_time_t		penalized;
 119	struct gnet_stats_basic_packed bstats;
 120	struct gnet_stats_queue qstats;
 121	struct net_rate_estimator __rcu *rate_est;
 122	struct tc_cbq_xstats	xstats;
 123
 124	struct tcf_proto __rcu	*filter_list;
 125	struct tcf_block	*block;
 126
 127	int			filters;
 128
 129	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
 130};
 131
 132struct cbq_sched_data {
 133	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
 134	int			nclasses[TC_CBQ_MAXPRIO + 1];
 135	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
 136
 137	struct cbq_class	link;
 138
 139	unsigned int		activemask;
 140	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
 141								   with backlog */
 142
 143#ifdef CONFIG_NET_CLS_ACT
 144	struct cbq_class	*rx_class;
 145#endif
 146	struct cbq_class	*tx_class;
 147	struct cbq_class	*tx_borrowed;
 148	int			tx_len;
 149	psched_time_t		now;		/* Cached timestamp */
 150	unsigned int		pmask;
 151
 152	struct hrtimer		delay_timer;
 153	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
 154						   started when CBQ has
 155						   backlog, but cannot
 156						   transmit just now */
 157	psched_tdiff_t		wd_expires;
 158	int			toplevel;
 159	u32			hgenerator;
 160};
 161
 162
 163#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
 164
 165static inline struct cbq_class *
 166cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 167{
 168	struct Qdisc_class_common *clc;
 169
 170	clc = qdisc_class_find(&q->clhash, classid);
 171	if (clc == NULL)
 172		return NULL;
 173	return container_of(clc, struct cbq_class, common);
 174}
 175
 176#ifdef CONFIG_NET_CLS_ACT
 177
 178static struct cbq_class *
 179cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 180{
 181	struct cbq_class *cl;
 182
 183	for (cl = this->tparent; cl; cl = cl->tparent) {
 184		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 185
 186		if (new != NULL && new != this)
 187			return new;
 188	}
 189	return NULL;
 190}
 191
 192#endif
 193
  194	/* Classify a packet. The procedure is fairly involved, but
  195	 * it lets us combine link sharing and priority scheduling
  196	 * transparently.
  197	 *
  198	 * Namely, link-sharing rules (e.g. route based) can be put at the root of
  199	 * CBQ, so that classification resolves to split nodes. Packets are then
  200	 * classified by logical priority, or a more specific classifier may be
  201	 * attached to the split node.
  202	 */
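/* Rough sketch of the lookup order: a packet whose skb->priority already
 * names one of our classids is taken directly; otherwise the filters
 * attached to the current head class are run, a purely logical priority
 * result is mapped through the head's defaults[] table, and the walk
 * repeats from the selected class until a leaf (level 0) class is reached
 * or we fall back to the defaults of the last head visited.
 */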
 203
 204static struct cbq_class *
 205cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 206{
 207	struct cbq_sched_data *q = qdisc_priv(sch);
 208	struct cbq_class *head = &q->link;
 209	struct cbq_class **defmap;
 210	struct cbq_class *cl = NULL;
 211	u32 prio = skb->priority;
 212	struct tcf_proto *fl;
 213	struct tcf_result res;
 214
 215	/*
 216	 *  Step 1. If skb->priority points to one of our classes, use it.
 217	 */
 218	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 219	    (cl = cbq_class_lookup(q, prio)) != NULL)
 220		return cl;
 221
 222	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 223	for (;;) {
 224		int result = 0;
 225		defmap = head->defaults;
 226
 227		fl = rcu_dereference_bh(head->filter_list);
 228		/*
 229		 * Step 2+n. Apply classifier.
 230		 */
 231		result = tcf_classify(skb, fl, &res, true);
 232		if (!fl || result < 0)
 233			goto fallback;
 234
 235		cl = (void *)res.class;
 236		if (!cl) {
 237			if (TC_H_MAJ(res.classid))
 238				cl = cbq_class_lookup(q, res.classid);
 239			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 240				cl = defmap[TC_PRIO_BESTEFFORT];
 241
 242			if (cl == NULL)
 243				goto fallback;
 244		}
 245		if (cl->level >= head->level)
 246			goto fallback;
 247#ifdef CONFIG_NET_CLS_ACT
 248		switch (result) {
 249		case TC_ACT_QUEUED:
 250		case TC_ACT_STOLEN:
 251		case TC_ACT_TRAP:
 252			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 253			fallthrough;
 254		case TC_ACT_SHOT:
 255			return NULL;
 256		case TC_ACT_RECLASSIFY:
 257			return cbq_reclassify(skb, cl);
 258		}
 259#endif
 260		if (cl->level == 0)
 261			return cl;
 262
 263		/*
  264		 * Step 3+n. If the classifier selected a link-sharing class,
  265		 *	   apply the agency-specific classifier.
  266		 *	   Repeat this procedure until we hit a leaf node.
 267		 */
 268		head = cl;
 269	}
 270
 271fallback:
 272	cl = head;
 273
 274	/*
 275	 * Step 4. No success...
 276	 */
 277	if (TC_H_MAJ(prio) == 0 &&
 278	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 279	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 280		return head;
 281
 282	return cl;
 283}
 284
  285	/*
  286	 * A packet has just been enqueued into an empty class.
  287	 * cbq_activate_class adds it to the tail of the active class list
  288	 * of its priority band.
  289	 */
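/* Note: q->active[prio] points at the tail of a circular, singly linked
 * list threaded through cl->next_alive, so tail->next_alive is the head
 * and appending a newly active class is O(1).
 */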
 290
 291static inline void cbq_activate_class(struct cbq_class *cl)
 292{
 293	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 294	int prio = cl->cpriority;
 295	struct cbq_class *cl_tail;
 296
 297	cl_tail = q->active[prio];
 298	q->active[prio] = cl;
 299
 300	if (cl_tail != NULL) {
 301		cl->next_alive = cl_tail->next_alive;
 302		cl_tail->next_alive = cl;
 303	} else {
 304		cl->next_alive = cl;
 305		q->activemask |= (1<<prio);
 306	}
 307}
 308
  309	/*
  310	 * Unlink a class from the active chain.
  311	 * Note that the same unlinking is also done directly in cbq_dequeue*
  312	 * during the round-robin procedure.
  313	 */
 314
 315static void cbq_deactivate_class(struct cbq_class *this)
 316{
 317	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 318	int prio = this->cpriority;
 319	struct cbq_class *cl;
 320	struct cbq_class *cl_prev = q->active[prio];
 321
 322	do {
 323		cl = cl_prev->next_alive;
 324		if (cl == this) {
 325			cl_prev->next_alive = cl->next_alive;
 326			cl->next_alive = NULL;
 327
 328			if (cl == q->active[prio]) {
 329				q->active[prio] = cl_prev;
 330				if (cl == q->active[prio]) {
 331					q->active[prio] = NULL;
 332					q->activemask &= ~(1<<prio);
 333					return;
 334				}
 335			}
 336			return;
 337		}
 338	} while ((cl_prev = cl) != q->active[prio]);
 339}
 340
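/* cbq_mark_toplevel: on enqueue, walk the borrow chain of the class and, at
 * the first class found to be under limit (its undertime already passed),
 * lower q->toplevel to that class's level.  Roughly, this keeps the
 * top-level link-sharing cutoff from blocking a borrow on the next dequeue.
 */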
 341static void
 342cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 343{
 344	int toplevel = q->toplevel;
 345
 346	if (toplevel > cl->level) {
 347		psched_time_t now = psched_get_time();
 348
 349		do {
 350			if (cl->undertime < now) {
 351				q->toplevel = cl->level;
 352				return;
 353			}
 354		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 355	}
 356}
 357
 358static int
 359cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 360	    struct sk_buff **to_free)
 361{
 362	struct cbq_sched_data *q = qdisc_priv(sch);
 363	int ret;
 364	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 365
 366#ifdef CONFIG_NET_CLS_ACT
 367	q->rx_class = cl;
 368#endif
 369	if (cl == NULL) {
 370		if (ret & __NET_XMIT_BYPASS)
 371			qdisc_qstats_drop(sch);
 372		__qdisc_drop(skb, to_free);
 373		return ret;
 374	}
 375
 376	ret = qdisc_enqueue(skb, cl->q, to_free);
 377	if (ret == NET_XMIT_SUCCESS) {
 378		sch->q.qlen++;
 379		cbq_mark_toplevel(q, cl);
 380		if (!cl->next_alive)
 381			cbq_activate_class(cl);
 382		return ret;
 383	}
 384
 385	if (net_xmit_drop_count(ret)) {
 386		qdisc_qstats_drop(sch);
 387		cbq_mark_toplevel(q, cl);
 388		cl->qstats.drops++;
 389	}
 390	return ret;
 391}
 392
 393/* Overlimit action: penalize leaf class by adding offtime */
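/* Sketch of the penalty applied below: the class is marked delayed and its
 * undertime is pushed out to roughly
 *
 *	now + (undertime - now) + offtime - (1 - 2^-ewma_log) * (-avgidle)
 *
 * (the last term only while avgidle is negative); q->wd_expires is updated
 * so that cbq_dequeue() can arm the watchdog and retry once some class may
 * be under limit again.
 */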
 394static void cbq_overlimit(struct cbq_class *cl)
 395{
 396	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 397	psched_tdiff_t delay = cl->undertime - q->now;
 398
 399	if (!cl->delayed) {
 400		delay += cl->offtime;
 401
  402		/*
  403		 * The class goes to sleep, so it will have no
  404		 * chance to work off its avgidle debt. Let's forgive it 8)
  405		 *
  406		 * BTW cbq-2.0 has a bug in this place:
  407		 * apparently it forgot to shift by cl->ewma_log.
  408		 */
 409		if (cl->avgidle < 0)
 410			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 411		if (cl->avgidle < cl->minidle)
 412			cl->avgidle = cl->minidle;
 413		if (delay <= 0)
 414			delay = 1;
 415		cl->undertime = q->now + delay;
 416
 417		cl->xstats.overactions++;
 418		cl->delayed = 1;
 419	}
 420	if (q->wd_expires == 0 || q->wd_expires > delay)
 421		q->wd_expires = delay;
 422
 423	/* Dirty work! We must schedule wakeups based on
 424	 * real available rate, rather than leaf rate,
 425	 * which may be tiny (even zero).
 426	 */
 427	if (q->toplevel == TC_CBQ_MAXLEVEL) {
 428		struct cbq_class *b;
 429		psched_tdiff_t base_delay = q->wd_expires;
 430
 431		for (b = cl->borrow; b; b = b->borrow) {
 432			delay = b->undertime - q->now;
 433			if (delay < base_delay) {
 434				if (delay <= 0)
 435					delay = 1;
 436				base_delay = delay;
 437			}
 438		}
 439
 440		q->wd_expires = base_delay;
 441	}
 442}
 443
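/* Penalty-timer handling: cbq_undelay_prio() scans the circular active list
 * of one priority band, moves classes whose penalty interval has expired
 * back to their normal priority (re-activating them), and returns roughly
 * how long, in psched ticks, until the earliest remaining penalty expires
 * (0 if nothing is pending).  cbq_undelay() below drives this from the
 * delay hrtimer.
 */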
 444static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 445				       psched_time_t now)
 446{
 447	struct cbq_class *cl;
 448	struct cbq_class *cl_prev = q->active[prio];
 449	psched_time_t sched = now;
 450
 451	if (cl_prev == NULL)
 452		return 0;
 453
 454	do {
 455		cl = cl_prev->next_alive;
 456		if (now - cl->penalized > 0) {
 457			cl_prev->next_alive = cl->next_alive;
 458			cl->next_alive = NULL;
 459			cl->cpriority = cl->priority;
 460			cl->delayed = 0;
 461			cbq_activate_class(cl);
 462
 463			if (cl == q->active[prio]) {
 464				q->active[prio] = cl_prev;
 465				if (cl == q->active[prio]) {
 466					q->active[prio] = NULL;
 467					return 0;
 468				}
 469			}
 470
 471			cl = cl_prev->next_alive;
 472		} else if (sched - cl->penalized > 0)
 473			sched = cl->penalized;
 474	} while ((cl_prev = cl) != q->active[prio]);
 475
 476	return sched - now;
 477}
 478
 479static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 480{
 481	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 482						delay_timer);
 483	struct Qdisc *sch = q->watchdog.qdisc;
 484	psched_time_t now;
 485	psched_tdiff_t delay = 0;
 486	unsigned int pmask;
 487
 488	now = psched_get_time();
 489
 490	pmask = q->pmask;
 491	q->pmask = 0;
 492
 493	while (pmask) {
 494		int prio = ffz(~pmask);
 495		psched_tdiff_t tmp;
 496
 497		pmask &= ~(1<<prio);
 498
 499		tmp = cbq_undelay_prio(q, prio, now);
 500		if (tmp > 0) {
 501			q->pmask |= 1<<prio;
 502			if (tmp < delay || delay == 0)
 503				delay = tmp;
 504		}
 505	}
 506
 507	if (delay) {
 508		ktime_t time;
 509
 510		time = 0;
 511		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 512		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 513	}
 514
 515	__netif_schedule(qdisc_root(sch));
 516	return HRTIMER_NORESTART;
 517}
 518
  519	/*
  520	 * This is a mission-critical procedure.
  521	 *
  522	 * We "regenerate" the toplevel cutoff if the transmitting class
  523	 * has backlog and is not being regulated. This is not part of the
  524	 * original CBQ description, but it looks more reasonable.
  525	 * It may still be wrong; the question needs further investigation.
  526	 */
 527
 528static inline void
 529cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 530		    struct cbq_class *borrowed)
 531{
 532	if (cl && q->toplevel >= borrowed->level) {
 533		if (cl->q->q.qlen > 1) {
 534			do {
 535				if (borrowed->undertime == PSCHED_PASTPERFECT) {
 536					q->toplevel = borrowed->level;
 537					return;
 538				}
 539			} while ((borrowed = borrowed->borrow) != NULL);
 540		}
 541#if 0
  542	/* This is not necessary now. Uncommenting it
  543	   would save CPU cycles, but decrease fairness.
  544	 */
 545		q->toplevel = TC_CBQ_MAXLEVEL;
 546#endif
 547	}
 548}
 549
 550static void
 551cbq_update(struct cbq_sched_data *q)
 552{
 553	struct cbq_class *this = q->tx_class;
 554	struct cbq_class *cl = this;
 555	int len = q->tx_len;
 556	psched_time_t now;
 557
 558	q->tx_class = NULL;
  559	/* Time integrator. We calculate the end-of-service (EOS) time
  560	 * by adding the expected packet transmission time.
  561	 */
 562	now = q->now + L2T(&q->link, len);
 563
 564	for ( ; cl; cl = cl->share) {
 565		long avgidle = cl->avgidle;
 566		long idle;
 567
 568		cl->bstats.packets++;
 569		cl->bstats.bytes += len;
 570
  571		/*
  572		 * (now - last) is the total time between packet right edges.
  573		 * (last_pktlen/rate) is the "virtual" busy time, so that
  574		 *
  575		 *	idle = (now - last) - last_pktlen/rate
  576		 */
 577
 578		idle = now - cl->last;
 579		if ((unsigned long)idle > 128*1024*1024) {
 580			avgidle = cl->maxidle;
 581		} else {
 582			idle -= L2T(cl, len);
 583
 584		/* true_avgidle := (1-W)*true_avgidle + W*idle,
 585		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 586		 * cl->avgidle == true_avgidle/W,
 587		 * hence:
 588		 */
 589			avgidle += idle - (avgidle>>cl->ewma_log);
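		/* For illustration (arbitrary example values): with
		 * ewma_log = 3 (W = 1/8), a stored avgidle of 800
		 * (i.e. true_avgidle = 100) and a new idle sample of 160,
		 * this update gives
		 *	avgidle = 800 + 160 - (800 >> 3) = 860,
		 * which matches true_avgidle/W, since
		 *	(1 - 1/8)*100 + (1/8)*160 = 107.5 = 860 * (1/8).
		 */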
 590		}
 591
 592		if (avgidle <= 0) {
 593			/* Overlimit or at-limit */
 594
 595			if (avgidle < cl->minidle)
 596				avgidle = cl->minidle;
 597
 598			cl->avgidle = avgidle;
 599
  600			/* Calculate the expected time when this class
  601			 * will be allowed to send again.
  602			 * That happens when:
  603			 * (1-W)*true_avgidle + W*delay = 0, i.e.
  604			 * idle = (1/W - 1)*(-true_avgidle)
  605			 * or
  606			 * idle = (1 - W)*(-cl->avgidle);
  607			 */
 608			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 609
  610			/*
  611			 * That is not all.
  612			 * To maintain the rate allocated to the class,
  613			 * we add to undertime the virtual clock time
  614			 * needed to complete the transmitted packet.
  615			 * (len/phys_bandwidth has already elapsed by
  616			 * the time cbq_update runs.)
  617			 */
 618
 619			idle -= L2T(&q->link, len);
 620			idle += L2T(cl, len);
 621
 622			cl->undertime = now + idle;
 623		} else {
 624			/* Underlimit */
 625
 626			cl->undertime = PSCHED_PASTPERFECT;
 627			if (avgidle > cl->maxidle)
 628				cl->avgidle = cl->maxidle;
 629			else
 630				cl->avgidle = avgidle;
 631		}
 632		if ((s64)(now - cl->last) > 0)
 633			cl->last = now;
 634	}
 635
 636	cbq_update_toplevel(q, this, q->tx_borrowed);
 637}
 638
 639static inline struct cbq_class *
 640cbq_under_limit(struct cbq_class *cl)
 641{
 642	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 643	struct cbq_class *this_cl = cl;
 644
 645	if (cl->tparent == NULL)
 646		return cl;
 647
 648	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 649		cl->delayed = 0;
 650		return cl;
 651	}
 652
 653	do {
  654		/* This is a very suspicious place. Currently the overlimit
  655		 * action is generated for non-bounded classes
  656		 * only when the link is completely congested.
  657		 * Although this agrees with the ancestor-only paradigm,
  658		 * it looks quite dubious. In particular,
  659		 * it means that this chunk of code will either
  660		 * never be called or will strongly amplify
  661		 * burstiness. Dangerous and silly, but
  662		 * no other solution exists.
  663		 */
 664		cl = cl->borrow;
 665		if (!cl) {
 666			this_cl->qstats.overlimits++;
 667			cbq_overlimit(this_cl);
 668			return NULL;
 669		}
 670		if (cl->level > q->toplevel)
 671			return NULL;
 672	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 673
 674	cl->delayed = 0;
 675	return cl;
 676}
 677
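/* cbq_dequeue_prio: weighted round robin within one priority band.  Roughly,
 * each active class spends its per-round deficit (replenished by cl->quantum)
 * on packet bytes; a class that is over limit either borrows bandwidth via
 * cbq_under_limit() or is skipped, and empty or penalized classes are
 * unlinked from the circular active list as the walk proceeds.
 */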
 678static inline struct sk_buff *
 679cbq_dequeue_prio(struct Qdisc *sch, int prio)
 680{
 681	struct cbq_sched_data *q = qdisc_priv(sch);
 682	struct cbq_class *cl_tail, *cl_prev, *cl;
 683	struct sk_buff *skb;
 684	int deficit;
 685
 686	cl_tail = cl_prev = q->active[prio];
 687	cl = cl_prev->next_alive;
 688
 689	do {
 690		deficit = 0;
 691
 692		/* Start round */
 693		do {
 694			struct cbq_class *borrow = cl;
 695
 696			if (cl->q->q.qlen &&
 697			    (borrow = cbq_under_limit(cl)) == NULL)
 698				goto skip_class;
 699
 700			if (cl->deficit <= 0) {
  701				/* The class exhausted its allotment for
  702				 * this round. Switch to the next one.
 703				 */
 704				deficit = 1;
 705				cl->deficit += cl->quantum;
 706				goto next_class;
 707			}
 708
 709			skb = cl->q->dequeue(cl->q);
 710
  711			/* The class did not give us any skb :-(
  712			 * This can happen even if cl->q->q.qlen != 0,
  713			 * e.g. if cl->q is a "tbf" qdisc.
  714			 */
 715			if (skb == NULL)
 716				goto skip_class;
 717
 718			cl->deficit -= qdisc_pkt_len(skb);
 719			q->tx_class = cl;
 720			q->tx_borrowed = borrow;
 721			if (borrow != cl) {
 722#ifndef CBQ_XSTATS_BORROWS_BYTES
 723				borrow->xstats.borrows++;
 724				cl->xstats.borrows++;
 725#else
 726				borrow->xstats.borrows += qdisc_pkt_len(skb);
 727				cl->xstats.borrows += qdisc_pkt_len(skb);
 728#endif
 729			}
 730			q->tx_len = qdisc_pkt_len(skb);
 731
 732			if (cl->deficit <= 0) {
 733				q->active[prio] = cl;
 734				cl = cl->next_alive;
 735				cl->deficit += cl->quantum;
 736			}
 737			return skb;
 738
 739skip_class:
 740			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
  741				/* The class is empty or penalized.
  742				 * Unlink it from the active chain.
 743				 */
 744				cl_prev->next_alive = cl->next_alive;
 745				cl->next_alive = NULL;
 746
 747				/* Did cl_tail point to it? */
 748				if (cl == cl_tail) {
 749					/* Repair it! */
 750					cl_tail = cl_prev;
 751
 752					/* Was it the last class in this band? */
 753					if (cl == cl_tail) {
 754						/* Kill the band! */
 755						q->active[prio] = NULL;
 756						q->activemask &= ~(1<<prio);
 757						if (cl->q->q.qlen)
 758							cbq_activate_class(cl);
 759						return NULL;
 760					}
 761
 762					q->active[prio] = cl_tail;
 763				}
 764				if (cl->q->q.qlen)
 765					cbq_activate_class(cl);
 766
 767				cl = cl_prev;
 768			}
 769
 770next_class:
 771			cl_prev = cl;
 772			cl = cl->next_alive;
 773		} while (cl_prev != cl_tail);
 774	} while (deficit);
 775
 776	q->active[prio] = cl_prev;
 777
 778	return NULL;
 779}
 780
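/* cbq_dequeue_1: try the priority bands with backlog, highest priority
 * first.  ffz(~activemask) yields the lowest set bit of activemask, i.e.
 * the numerically smallest priority index, which CBQ treats as the most
 * urgent band.
 */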
 781static inline struct sk_buff *
 782cbq_dequeue_1(struct Qdisc *sch)
 783{
 784	struct cbq_sched_data *q = qdisc_priv(sch);
 785	struct sk_buff *skb;
 786	unsigned int activemask;
 787
 788	activemask = q->activemask & 0xFF;
 789	while (activemask) {
 790		int prio = ffz(~activemask);
 791		activemask &= ~(1<<prio);
 792		skb = cbq_dequeue_prio(sch, prio);
 793		if (skb)
 794			return skb;
 795	}
 796	return NULL;
 797}
 798
 799static struct sk_buff *
 800cbq_dequeue(struct Qdisc *sch)
 801{
 802	struct sk_buff *skb;
 803	struct cbq_sched_data *q = qdisc_priv(sch);
 804	psched_time_t now;
 805
 806	now = psched_get_time();
 807
 808	if (q->tx_class)
 809		cbq_update(q);
 810
 811	q->now = now;
 812
 813	for (;;) {
 814		q->wd_expires = 0;
 815
 816		skb = cbq_dequeue_1(sch);
 817		if (skb) {
 818			qdisc_bstats_update(sch, skb);
 819			sch->q.qlen--;
 820			return skb;
 821		}
 822
  823		/* All the classes are overlimit.
  824		 *
  825		 * This can happen if:
  826		 *
  827		 * 1. The scheduler is empty.
  828		 * 2. The toplevel cutoff inhibited borrowing.
  829		 * 3. The root class is overlimit.
  830		 *
  831		 * Reset the 2nd and 3rd conditions and retry.
  832		 *
  833		 * Note that NS and cbq-2.0 are buggy here: peeking
  834		 * an arbitrary class is appropriate for ancestor-only
  835		 * sharing, but not for the toplevel algorithm.
  836		 *
  837		 * Our version is better but slower, because it requires
  838		 * two passes; that is unavoidable with top-level sharing.
  839		 */
 840
 841		if (q->toplevel == TC_CBQ_MAXLEVEL &&
 842		    q->link.undertime == PSCHED_PASTPERFECT)
 843			break;
 844
 845		q->toplevel = TC_CBQ_MAXLEVEL;
 846		q->link.undertime = PSCHED_PASTPERFECT;
 847	}
 848
  849	/* No packets in the scheduler, or nobody wants to give them to us :-(
  850	 * Sigh... start the watchdog timer in the latter case.
  851	 */
 852
 853	if (sch->q.qlen) {
 854		qdisc_qstats_overlimit(sch);
 855		if (q->wd_expires)
 856			qdisc_watchdog_schedule(&q->watchdog,
 857						now + q->wd_expires);
 858	}
 859	return NULL;
 860}
 861
 862/* CBQ class maintenance routines */
 863
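/* cbq_adjust_levels: walk from the given class up to the root, recomputing
 * each node's level as (maximum level among its children) + 1, so the level
 * fields stay consistent after a class is added or removed.
 */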
 864static void cbq_adjust_levels(struct cbq_class *this)
 865{
 866	if (this == NULL)
 867		return;
 868
 869	do {
 870		int level = 0;
 871		struct cbq_class *cl;
 872
 873		cl = this->children;
 874		if (cl) {
 875			do {
 876				if (cl->level > level)
 877					level = cl->level;
 878			} while ((cl = cl->sibling) != this->children);
 879		}
 880		this->level = level + 1;
 881	} while ((this = this->tparent) != NULL);
 882}
 883
 884static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 885{
 886	struct cbq_class *cl;
 887	unsigned int h;
 888
 889	if (q->quanta[prio] == 0)
 890		return;
 891
 892	for (h = 0; h < q->clhash.hashsize; h++) {
 893		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  894			/* BUG: beware! This expression suffers from
  895			 * arithmetic overflow!
  896			 */
 897			if (cl->priority == prio) {
 898				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 899					q->quanta[prio];
 900			}
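			/* For illustration (made-up numbers): with two classes
			 * at this priority, allot = 1500 and weights 1 and 3,
			 * q->quanta[prio] = 4 and q->nclasses[prio] = 2, so the
			 * quanta become 1*1500*2/4 = 750 and 3*1500*2/4 = 2250.
			 */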
 901			if (cl->quantum <= 0 ||
 902			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 903				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 904					cl->common.classid, cl->quantum);
 905				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 906			}
 907		}
 908	}
 909}
 910
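/* Default-map maintenance.  cl->defmap is a bitmap indexed by TC_PRIO_*
 * values; roughly, bit i set means cl volunteers to be the default class for
 * logical priority i at its split node.  cbq_sync_defmap() rebuilds
 * split->defaults[] by picking, for each priority, the lowest-level class
 * under the split that has the corresponding bit set.
 */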
 911static void cbq_sync_defmap(struct cbq_class *cl)
 912{
 913	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 914	struct cbq_class *split = cl->split;
 915	unsigned int h;
 916	int i;
 917
 918	if (split == NULL)
 919		return;
 920
 921	for (i = 0; i <= TC_PRIO_MAX; i++) {
 922		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 923			split->defaults[i] = NULL;
 924	}
 925
 926	for (i = 0; i <= TC_PRIO_MAX; i++) {
 927		int level = split->level;
 928
 929		if (split->defaults[i])
 930			continue;
 931
 932		for (h = 0; h < q->clhash.hashsize; h++) {
 933			struct cbq_class *c;
 934
 935			hlist_for_each_entry(c, &q->clhash.hash[h],
 936					     common.hnode) {
 937				if (c->split == split && c->level < level &&
 938				    c->defmap & (1<<i)) {
 939					split->defaults[i] = c;
 940					level = c->level;
 941				}
 942			}
 943		}
 944	}
 945}
 946
 947static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 948{
 949	struct cbq_class *split = NULL;
 950
 951	if (splitid == 0) {
 952		split = cl->split;
 953		if (!split)
 954			return;
 955		splitid = split->common.classid;
 956	}
 957
 958	if (split == NULL || split->common.classid != splitid) {
 959		for (split = cl->tparent; split; split = split->tparent)
 960			if (split->common.classid == splitid)
 961				break;
 962	}
 963
 964	if (split == NULL)
 965		return;
 966
 967	if (cl->split != split) {
 968		cl->defmap = 0;
 969		cbq_sync_defmap(cl);
 970		cl->split = split;
 971		cl->defmap = def & mask;
 972	} else
 973		cl->defmap = (cl->defmap & ~mask) | (def & mask);
 974
 975	cbq_sync_defmap(cl);
 976}
 977
 978static void cbq_unlink_class(struct cbq_class *this)
 979{
 980	struct cbq_class *cl, **clp;
 981	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 982
 983	qdisc_class_hash_remove(&q->clhash, &this->common);
 984
 985	if (this->tparent) {
 986		clp = &this->sibling;
 987		cl = *clp;
 988		do {
 989			if (cl == this) {
 990				*clp = cl->sibling;
 991				break;
 992			}
 993			clp = &cl->sibling;
 994		} while ((cl = *clp) != this->sibling);
 995
 996		if (this->tparent->children == this) {
 997			this->tparent->children = this->sibling;
 998			if (this->sibling == this)
 999				this->tparent->children = NULL;
1000		}
1001	} else {
1002		WARN_ON(this->sibling != this);
1003	}
1004}
1005
1006static void cbq_link_class(struct cbq_class *this)
1007{
1008	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009	struct cbq_class *parent = this->tparent;
1010
1011	this->sibling = this;
1012	qdisc_class_hash_insert(&q->clhash, &this->common);
1013
1014	if (parent == NULL)
1015		return;
1016
1017	if (parent->children == NULL) {
1018		parent->children = this;
1019	} else {
1020		this->sibling = parent->children->sibling;
1021		parent->children->sibling = this;
1022	}
1023}
1024
1025static void
1026cbq_reset(struct Qdisc *sch)
1027{
1028	struct cbq_sched_data *q = qdisc_priv(sch);
1029	struct cbq_class *cl;
1030	int prio;
1031	unsigned int h;
1032
1033	q->activemask = 0;
1034	q->pmask = 0;
1035	q->tx_class = NULL;
1036	q->tx_borrowed = NULL;
1037	qdisc_watchdog_cancel(&q->watchdog);
1038	hrtimer_cancel(&q->delay_timer);
1039	q->toplevel = TC_CBQ_MAXLEVEL;
1040	q->now = psched_get_time();
1041
1042	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043		q->active[prio] = NULL;
1044
1045	for (h = 0; h < q->clhash.hashsize; h++) {
1046		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047			qdisc_reset(cl->q);
1048
1049			cl->next_alive = NULL;
1050			cl->undertime = PSCHED_PASTPERFECT;
1051			cl->avgidle = cl->maxidle;
1052			cl->deficit = cl->quantum;
1053			cl->cpriority = cl->priority;
1054		}
1055	}
1056	sch->q.qlen = 0;
1057}
1058
1059
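/* Link-sharing (LSS) parameters, as interpreted by this implementation:
 * TCF_CBQ_LSS_ISOLATED detaches the class from link sharing (cl->share is
 * cleared) and TCF_CBQ_LSS_BOUNDED forbids borrowing from the parent
 * (cl->borrow is cleared); ewma_log, avpkt, minidle, maxidle and offtime
 * feed the avgidle/undertime machinery in cbq_update() and cbq_overlimit().
 */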
1060static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1061{
1062	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1063		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1064		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1065	}
1066	if (lss->change & TCF_CBQ_LSS_EWMA)
1067		cl->ewma_log = lss->ewma_log;
1068	if (lss->change & TCF_CBQ_LSS_AVPKT)
1069		cl->avpkt = lss->avpkt;
1070	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1071		cl->minidle = -(long)lss->minidle;
1072	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1073		cl->maxidle = lss->maxidle;
1074		cl->avgidle = lss->maxidle;
1075	}
1076	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1077		cl->offtime = lss->offtime;
1078	return 0;
1079}
1080
1081static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1082{
1083	q->nclasses[cl->priority]--;
1084	q->quanta[cl->priority] -= cl->weight;
1085	cbq_normalize_quanta(q, cl->priority);
1086}
1087
1088static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1089{
1090	q->nclasses[cl->priority]++;
1091	q->quanta[cl->priority] += cl->weight;
1092	cbq_normalize_quanta(q, cl->priority);
1093}
1094
1095static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1096{
1097	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1098
1099	if (wrr->allot)
1100		cl->allot = wrr->allot;
1101	if (wrr->weight)
1102		cl->weight = wrr->weight;
1103	if (wrr->priority) {
1104		cl->priority = wrr->priority - 1;
1105		cl->cpriority = cl->priority;
1106		if (cl->priority >= cl->priority2)
1107			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1108	}
1109
1110	cbq_addprio(q, cl);
1111	return 0;
1112}
1113
1114static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1115{
1116	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1117	return 0;
1118}
1119
1120static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1121	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1122	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1123	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1124	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1125	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1126	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1127	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1128};
1129
1130static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1131			 struct nlattr *opt,
1132			 struct netlink_ext_ack *extack)
1133{
1134	int err;
1135
1136	if (!opt) {
1137		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1138		return -EINVAL;
1139	}
1140
1141	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1142					  cbq_policy, extack);
1143	if (err < 0)
1144		return err;
1145
1146	if (tb[TCA_CBQ_WRROPT]) {
1147		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148
1149		if (wrr->priority > TC_CBQ_MAXPRIO) {
1150			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1151			err = -EINVAL;
1152		}
1153	}
1154	return err;
1155}
1156
1157static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1158		    struct netlink_ext_ack *extack)
1159{
1160	struct cbq_sched_data *q = qdisc_priv(sch);
1161	struct nlattr *tb[TCA_CBQ_MAX + 1];
1162	struct tc_ratespec *r;
1163	int err;
1164
1165	qdisc_watchdog_init(&q->watchdog, sch);
1166	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1167	q->delay_timer.function = cbq_undelay;
1168
1169	err = cbq_opt_parse(tb, opt, extack);
1170	if (err < 0)
1171		return err;
1172
1173	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1174		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1175		return -EINVAL;
1176	}
1177
1178	r = nla_data(tb[TCA_CBQ_RATE]);
1179
1180	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1181	if (!q->link.R_tab)
1182		return -EINVAL;
1183
1184	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1185	if (err)
1186		goto put_rtab;
1187
1188	err = qdisc_class_hash_init(&q->clhash);
1189	if (err < 0)
1190		goto put_block;
1191
1192	q->link.sibling = &q->link;
1193	q->link.common.classid = sch->handle;
1194	q->link.qdisc = sch;
1195	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1196				      sch->handle, NULL);
1197	if (!q->link.q)
1198		q->link.q = &noop_qdisc;
1199	else
1200		qdisc_hash_add(q->link.q, true);
1201
1202	q->link.priority = TC_CBQ_MAXPRIO - 1;
1203	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1204	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1205	q->link.allot = psched_mtu(qdisc_dev(sch));
1206	q->link.quantum = q->link.allot;
1207	q->link.weight = q->link.R_tab->rate.rate;
1208
1209	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1210	q->link.avpkt = q->link.allot/2;
1211	q->link.minidle = -0x7FFFFFFF;
1212
1213	q->toplevel = TC_CBQ_MAXLEVEL;
1214	q->now = psched_get_time();
1215
1216	cbq_link_class(&q->link);
1217
1218	if (tb[TCA_CBQ_LSSOPT])
1219		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1220
1221	cbq_addprio(q, &q->link);
1222	return 0;
1223
1224put_block:
1225	tcf_block_put(q->link.block);
1226
1227put_rtab:
1228	qdisc_put_rtab(q->link.R_tab);
1229	return err;
1230}
1231
1232static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1233{
1234	unsigned char *b = skb_tail_pointer(skb);
1235
1236	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1237		goto nla_put_failure;
1238	return skb->len;
1239
1240nla_put_failure:
1241	nlmsg_trim(skb, b);
1242	return -1;
1243}
1244
1245static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1246{
1247	unsigned char *b = skb_tail_pointer(skb);
1248	struct tc_cbq_lssopt opt;
1249
1250	opt.flags = 0;
1251	if (cl->borrow == NULL)
1252		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1253	if (cl->share == NULL)
1254		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1255	opt.ewma_log = cl->ewma_log;
1256	opt.level = cl->level;
1257	opt.avpkt = cl->avpkt;
1258	opt.maxidle = cl->maxidle;
1259	opt.minidle = (u32)(-cl->minidle);
1260	opt.offtime = cl->offtime;
1261	opt.change = ~0;
1262	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1263		goto nla_put_failure;
1264	return skb->len;
1265
1266nla_put_failure:
1267	nlmsg_trim(skb, b);
1268	return -1;
1269}
1270
1271static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1272{
1273	unsigned char *b = skb_tail_pointer(skb);
1274	struct tc_cbq_wrropt opt;
1275
1276	memset(&opt, 0, sizeof(opt));
1277	opt.flags = 0;
1278	opt.allot = cl->allot;
1279	opt.priority = cl->priority + 1;
1280	opt.cpriority = cl->cpriority + 1;
1281	opt.weight = cl->weight;
1282	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1283		goto nla_put_failure;
1284	return skb->len;
1285
1286nla_put_failure:
1287	nlmsg_trim(skb, b);
1288	return -1;
1289}
1290
1291static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1292{
1293	unsigned char *b = skb_tail_pointer(skb);
1294	struct tc_cbq_fopt opt;
1295
1296	if (cl->split || cl->defmap) {
1297		opt.split = cl->split ? cl->split->common.classid : 0;
1298		opt.defmap = cl->defmap;
1299		opt.defchange = ~0;
1300		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1301			goto nla_put_failure;
1302	}
1303	return skb->len;
1304
1305nla_put_failure:
1306	nlmsg_trim(skb, b);
1307	return -1;
1308}
1309
1310static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1311{
1312	if (cbq_dump_lss(skb, cl) < 0 ||
1313	    cbq_dump_rate(skb, cl) < 0 ||
1314	    cbq_dump_wrr(skb, cl) < 0 ||
1315	    cbq_dump_fopt(skb, cl) < 0)
1316		return -1;
1317	return 0;
1318}
1319
1320static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1321{
1322	struct cbq_sched_data *q = qdisc_priv(sch);
1323	struct nlattr *nest;
1324
1325	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1326	if (nest == NULL)
1327		goto nla_put_failure;
1328	if (cbq_dump_attr(skb, &q->link) < 0)
1329		goto nla_put_failure;
1330	return nla_nest_end(skb, nest);
1331
1332nla_put_failure:
1333	nla_nest_cancel(skb, nest);
1334	return -1;
1335}
1336
1337static int
1338cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1339{
1340	struct cbq_sched_data *q = qdisc_priv(sch);
1341
1342	q->link.xstats.avgidle = q->link.avgidle;
1343	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1344}
1345
1346static int
1347cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1348	       struct sk_buff *skb, struct tcmsg *tcm)
1349{
1350	struct cbq_class *cl = (struct cbq_class *)arg;
1351	struct nlattr *nest;
1352
1353	if (cl->tparent)
1354		tcm->tcm_parent = cl->tparent->common.classid;
1355	else
1356		tcm->tcm_parent = TC_H_ROOT;
1357	tcm->tcm_handle = cl->common.classid;
1358	tcm->tcm_info = cl->q->handle;
1359
1360	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1361	if (nest == NULL)
1362		goto nla_put_failure;
1363	if (cbq_dump_attr(skb, cl) < 0)
1364		goto nla_put_failure;
1365	return nla_nest_end(skb, nest);
1366
1367nla_put_failure:
1368	nla_nest_cancel(skb, nest);
1369	return -1;
1370}
1371
1372static int
1373cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1374	struct gnet_dump *d)
1375{
1376	struct cbq_sched_data *q = qdisc_priv(sch);
1377	struct cbq_class *cl = (struct cbq_class *)arg;
1378	__u32 qlen;
1379
1380	cl->xstats.avgidle = cl->avgidle;
1381	cl->xstats.undertime = 0;
1382	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1383
1384	if (cl->undertime != PSCHED_PASTPERFECT)
1385		cl->xstats.undertime = cl->undertime - q->now;
1386
1387	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1388				  d, NULL, &cl->bstats) < 0 ||
1389	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1390	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1391		return -1;
1392
1393	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1394}
1395
1396static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1397		     struct Qdisc **old, struct netlink_ext_ack *extack)
1398{
1399	struct cbq_class *cl = (struct cbq_class *)arg;
1400
1401	if (new == NULL) {
1402		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1403					cl->common.classid, extack);
1404		if (new == NULL)
1405			return -ENOBUFS;
1406	}
1407
1408	*old = qdisc_replace(sch, new, &cl->q);
1409	return 0;
1410}
1411
1412static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1413{
1414	struct cbq_class *cl = (struct cbq_class *)arg;
1415
1416	return cl->q;
1417}
1418
1419static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1420{
1421	struct cbq_class *cl = (struct cbq_class *)arg;
1422
1423	cbq_deactivate_class(cl);
1424}
1425
1426static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1427{
1428	struct cbq_sched_data *q = qdisc_priv(sch);
1429
1430	return (unsigned long)cbq_class_lookup(q, classid);
1431}
1432
1433static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1434{
1435	struct cbq_sched_data *q = qdisc_priv(sch);
1436
1437	WARN_ON(cl->filters);
1438
1439	tcf_block_put(cl->block);
1440	qdisc_put(cl->q);
1441	qdisc_put_rtab(cl->R_tab);
1442	gen_kill_estimator(&cl->rate_est);
1443	if (cl != &q->link)
1444		kfree(cl);
1445}
1446
1447static void cbq_destroy(struct Qdisc *sch)
1448{
1449	struct cbq_sched_data *q = qdisc_priv(sch);
1450	struct hlist_node *next;
1451	struct cbq_class *cl;
1452	unsigned int h;
1453
1454#ifdef CONFIG_NET_CLS_ACT
1455	q->rx_class = NULL;
1456#endif
 1457	/*
 1458	 * Filters must be destroyed first because we don't destroy the
 1459	 * classes from root to leaves, which means that filters could still
 1460	 * be bound to classes that have already been destroyed. --TGR '04
 1461	 */
1462	for (h = 0; h < q->clhash.hashsize; h++) {
1463		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1464			tcf_block_put(cl->block);
1465			cl->block = NULL;
1466		}
1467	}
1468	for (h = 0; h < q->clhash.hashsize; h++) {
1469		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1470					  common.hnode)
1471			cbq_destroy_class(sch, cl);
1472	}
1473	qdisc_class_hash_destroy(&q->clhash);
1474}
1475
1476static int
1477cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1478		 unsigned long *arg, struct netlink_ext_ack *extack)
1479{
1480	int err;
1481	struct cbq_sched_data *q = qdisc_priv(sch);
1482	struct cbq_class *cl = (struct cbq_class *)*arg;
1483	struct nlattr *opt = tca[TCA_OPTIONS];
1484	struct nlattr *tb[TCA_CBQ_MAX + 1];
1485	struct cbq_class *parent;
1486	struct qdisc_rate_table *rtab = NULL;
1487
1488	err = cbq_opt_parse(tb, opt, extack);
1489	if (err < 0)
1490		return err;
1491
1492	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1493		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1494		return -EOPNOTSUPP;
1495	}
1496
1497	if (cl) {
1498		/* Check parent */
1499		if (parentid) {
1500			if (cl->tparent &&
1501			    cl->tparent->common.classid != parentid) {
1502				NL_SET_ERR_MSG(extack, "Invalid parent id");
1503				return -EINVAL;
1504			}
1505			if (!cl->tparent && parentid != TC_H_ROOT) {
1506				NL_SET_ERR_MSG(extack, "Parent must be root");
1507				return -EINVAL;
1508			}
1509		}
1510
1511		if (tb[TCA_CBQ_RATE]) {
1512			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1513					      tb[TCA_CBQ_RTAB], extack);
1514			if (rtab == NULL)
1515				return -EINVAL;
1516		}
1517
1518		if (tca[TCA_RATE]) {
1519			err = gen_replace_estimator(&cl->bstats, NULL,
1520						    &cl->rate_est,
1521						    NULL,
1522						    qdisc_root_sleeping_running(sch),
1523						    tca[TCA_RATE]);
1524			if (err) {
1525				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1526				qdisc_put_rtab(rtab);
1527				return err;
1528			}
1529		}
1530
1531		/* Change class parameters */
1532		sch_tree_lock(sch);
1533
1534		if (cl->next_alive != NULL)
1535			cbq_deactivate_class(cl);
1536
1537		if (rtab) {
1538			qdisc_put_rtab(cl->R_tab);
1539			cl->R_tab = rtab;
1540		}
1541
1542		if (tb[TCA_CBQ_LSSOPT])
1543			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1544
1545		if (tb[TCA_CBQ_WRROPT]) {
1546			cbq_rmprio(q, cl);
1547			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1548		}
1549
1550		if (tb[TCA_CBQ_FOPT])
1551			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1552
1553		if (cl->q->q.qlen)
1554			cbq_activate_class(cl);
1555
1556		sch_tree_unlock(sch);
1557
1558		return 0;
1559	}
1560
1561	if (parentid == TC_H_ROOT)
1562		return -EINVAL;
1563
1564	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
 1565		NL_SET_ERR_MSG(extack, "The WRR, rate and link-sharing attributes MUST all be specified");
1566		return -EINVAL;
1567	}
1568
1569	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1570			      extack);
1571	if (rtab == NULL)
1572		return -EINVAL;
1573
1574	if (classid) {
1575		err = -EINVAL;
1576		if (TC_H_MAJ(classid ^ sch->handle) ||
1577		    cbq_class_lookup(q, classid)) {
1578			NL_SET_ERR_MSG(extack, "Specified class not found");
1579			goto failure;
1580		}
1581	} else {
1582		int i;
1583		classid = TC_H_MAKE(sch->handle, 0x8000);
1584
1585		for (i = 0; i < 0x8000; i++) {
1586			if (++q->hgenerator >= 0x8000)
1587				q->hgenerator = 1;
1588			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1589				break;
1590		}
1591		err = -ENOSR;
1592		if (i >= 0x8000) {
1593			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1594			goto failure;
1595		}
1596		classid = classid|q->hgenerator;
1597	}
1598
1599	parent = &q->link;
1600	if (parentid) {
1601		parent = cbq_class_lookup(q, parentid);
1602		err = -EINVAL;
1603		if (!parent) {
1604			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1605			goto failure;
1606		}
1607	}
1608
1609	err = -ENOBUFS;
1610	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1611	if (cl == NULL)
1612		goto failure;
1613
1614	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1615	if (err) {
1616		kfree(cl);
1617		goto failure;
1618	}
1619
1620	if (tca[TCA_RATE]) {
1621		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1622					NULL,
1623					qdisc_root_sleeping_running(sch),
1624					tca[TCA_RATE]);
1625		if (err) {
1626			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1627			tcf_block_put(cl->block);
1628			kfree(cl);
1629			goto failure;
1630		}
1631	}
1632
1633	cl->R_tab = rtab;
1634	rtab = NULL;
1635	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1636				  NULL);
1637	if (!cl->q)
1638		cl->q = &noop_qdisc;
1639	else
1640		qdisc_hash_add(cl->q, true);
1641
1642	cl->common.classid = classid;
1643	cl->tparent = parent;
1644	cl->qdisc = sch;
1645	cl->allot = parent->allot;
1646	cl->quantum = cl->allot;
1647	cl->weight = cl->R_tab->rate.rate;
1648
1649	sch_tree_lock(sch);
1650	cbq_link_class(cl);
1651	cl->borrow = cl->tparent;
1652	if (cl->tparent != &q->link)
1653		cl->share = cl->tparent;
1654	cbq_adjust_levels(parent);
1655	cl->minidle = -0x7FFFFFFF;
1656	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1657	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1658	if (cl->ewma_log == 0)
1659		cl->ewma_log = q->link.ewma_log;
1660	if (cl->maxidle == 0)
1661		cl->maxidle = q->link.maxidle;
1662	if (cl->avpkt == 0)
1663		cl->avpkt = q->link.avpkt;
1664	if (tb[TCA_CBQ_FOPT])
1665		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1666	sch_tree_unlock(sch);
1667
1668	qdisc_class_hash_grow(sch, &q->clhash);
1669
1670	*arg = (unsigned long)cl;
1671	return 0;
1672
1673failure:
1674	qdisc_put_rtab(rtab);
1675	return err;
1676}
1677
1678static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1679		      struct netlink_ext_ack *extack)
1680{
1681	struct cbq_sched_data *q = qdisc_priv(sch);
1682	struct cbq_class *cl = (struct cbq_class *)arg;
1683
1684	if (cl->filters || cl->children || cl == &q->link)
1685		return -EBUSY;
1686
1687	sch_tree_lock(sch);
1688
1689	qdisc_purge_queue(cl->q);
1690
1691	if (cl->next_alive)
1692		cbq_deactivate_class(cl);
1693
1694	if (q->tx_borrowed == cl)
1695		q->tx_borrowed = q->tx_class;
1696	if (q->tx_class == cl) {
1697		q->tx_class = NULL;
1698		q->tx_borrowed = NULL;
1699	}
1700#ifdef CONFIG_NET_CLS_ACT
1701	if (q->rx_class == cl)
1702		q->rx_class = NULL;
1703#endif
1704
1705	cbq_unlink_class(cl);
1706	cbq_adjust_levels(cl->tparent);
1707	cl->defmap = 0;
1708	cbq_sync_defmap(cl);
1709
1710	cbq_rmprio(q, cl);
1711	sch_tree_unlock(sch);
1712
1713	cbq_destroy_class(sch, cl);
1714	return 0;
1715}
1716
1717static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1718				       struct netlink_ext_ack *extack)
1719{
1720	struct cbq_sched_data *q = qdisc_priv(sch);
1721	struct cbq_class *cl = (struct cbq_class *)arg;
1722
1723	if (cl == NULL)
1724		cl = &q->link;
1725
1726	return cl->block;
1727}
1728
1729static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1730				     u32 classid)
1731{
1732	struct cbq_sched_data *q = qdisc_priv(sch);
1733	struct cbq_class *p = (struct cbq_class *)parent;
1734	struct cbq_class *cl = cbq_class_lookup(q, classid);
1735
1736	if (cl) {
1737		if (p && p->level <= cl->level)
1738			return 0;
1739		cl->filters++;
1740		return (unsigned long)cl;
1741	}
1742	return 0;
1743}
1744
1745static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1746{
1747	struct cbq_class *cl = (struct cbq_class *)arg;
1748
1749	cl->filters--;
1750}
1751
1752static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1753{
1754	struct cbq_sched_data *q = qdisc_priv(sch);
1755	struct cbq_class *cl;
1756	unsigned int h;
1757
1758	if (arg->stop)
1759		return;
1760
1761	for (h = 0; h < q->clhash.hashsize; h++) {
1762		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1763			if (arg->count < arg->skip) {
1764				arg->count++;
1765				continue;
1766			}
1767			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1768				arg->stop = 1;
1769				return;
1770			}
1771			arg->count++;
1772		}
1773	}
1774}
1775
1776static const struct Qdisc_class_ops cbq_class_ops = {
1777	.graft		=	cbq_graft,
1778	.leaf		=	cbq_leaf,
1779	.qlen_notify	=	cbq_qlen_notify,
1780	.find		=	cbq_find,
1781	.change		=	cbq_change_class,
1782	.delete		=	cbq_delete,
1783	.walk		=	cbq_walk,
1784	.tcf_block	=	cbq_tcf_block,
1785	.bind_tcf	=	cbq_bind_filter,
1786	.unbind_tcf	=	cbq_unbind_filter,
1787	.dump		=	cbq_dump_class,
1788	.dump_stats	=	cbq_dump_class_stats,
1789};
1790
1791static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1792	.next		=	NULL,
1793	.cl_ops		=	&cbq_class_ops,
1794	.id		=	"cbq",
1795	.priv_size	=	sizeof(struct cbq_sched_data),
1796	.enqueue	=	cbq_enqueue,
1797	.dequeue	=	cbq_dequeue,
1798	.peek		=	qdisc_peek_dequeued,
1799	.init		=	cbq_init,
1800	.reset		=	cbq_reset,
1801	.destroy	=	cbq_destroy,
1802	.change		=	NULL,
1803	.dump		=	cbq_dump,
1804	.dump_stats	=	cbq_dump_stats,
1805	.owner		=	THIS_MODULE,
1806};
1807
1808static int __init cbq_module_init(void)
1809{
1810	return register_qdisc(&cbq_qdisc_ops);
1811}
1812static void __exit cbq_module_exit(void)
1813{
1814	unregister_qdisc(&cbq_qdisc_ops);
1815}
1816module_init(cbq_module_init)
1817module_exit(cbq_module_exit)
1818MODULE_LICENSE("GPL");