sch_qfq.c - net/sched/sch_qfq.c - Linux diff v3.15 - Bootlin Elixir Cross Referencer

 
   1/*
   2 * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
   3 *
   4 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
   5 * Copyright (c) 2012 Paolo Valente.
   6 *
   7 * This program is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU General Public License
   9 * version 2 as published by the Free Software Foundation.
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/init.h>
  14#include <linux/bitops.h>
  15#include <linux/errno.h>
  16#include <linux/netdevice.h>
  17#include <linux/pkt_sched.h>
  18#include <net/sch_generic.h>
  19#include <net/pkt_sched.h>
  20#include <net/pkt_cls.h>
  21
  22
  23/*  Quick Fair Queueing Plus
  24    ========================
  25
  26    Sources:
  27
  28    [1] Paolo Valente,
  29    "Reducing the Execution Time of Fair-Queueing Schedulers."
  30    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
  31
  32    Sources for QFQ:
  33
  34    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
  35    Packet Scheduling with Tight Bandwidth Distribution Guarantees."
  36
  37    See also:
  38    http://retis.sssup.it/~fabio/linux/qfq/
  39 */
  40
  41/*
  42
  43  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
  44  classes. Each aggregate is timestamped with a virtual start time S
  45  and a virtual finish time F, and scheduled according to its
  46  timestamps. S and F are computed as a function of a system virtual
  47  time function V. The classes within each aggregate are instead
  48  scheduled with DRR.
  49
  50  To speed up operations, QFQ+ divides also aggregates into a limited
  51  number of groups. Which group a class belongs to depends on the
  52  ratio between the maximum packet length for the class and the weight
  53  of the class. Groups have their own S and F. In the end, QFQ+
  54  schedules groups, then aggregates within groups, then classes within
  55  aggregates. See [1] and [2] for a full description.
  56
  57  Virtual time computations.
  58
  59  S, F and V are all computed in fixed point arithmetic with
  60  FRAC_BITS decimal bits.
  61
  62  QFQ_MAX_INDEX is the maximum index allowed for a group. We need
  63	one bit per index.
  64  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
  65
  66  The layout of the bits is as below:
  67
  68                   [ MTU_SHIFT ][      FRAC_BITS    ]
  69                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
  70				 ^.__grp->index = 0
  71				 *.__grp->slot_shift
  72
  73  where MIN_SLOT_SHIFT is derived by difference from the others.
  74
  75  The max group index corresponds to Lmax/w_min, where
  76  Lmax=1<<MTU_SHIFT, w_min = 1 .
  77  From this, and knowing how many groups (MAX_INDEX) we want,
  78  we can derive the shift corresponding to each group.
  79
  80  Because we often need to compute
  81	F = S + len/w_i  and V = V + len/wsum
  82  instead of storing w_i store the value
  83	inv_w = (1<<FRAC_BITS)/w_i
  84  so we can do F = S + len * inv_w * wsum.
  85  We use W_TOT in the formulas so we can easily move between
  86  static and adaptive weight sum.
  87
  88  The per-scheduler-instance data contain all the data structures
  89  for the scheduler: bitmaps and bucket lists.
  90
  91 */
  92
  93/*
  94 * Maximum number of consecutive slots occupied by backlogged classes
  95 * inside a group.
  96 */
  97#define QFQ_MAX_SLOTS	32
  98
  99/*
 100 * Shifts used for aggregate<->group mapping.  We allow class weights that are
 101 * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
 102 * group with the smallest index that can support the L_i / r_i configured
 103 * for the classes in the aggregate.
 104 *
 105 * grp->index is the index of the group; and grp->slot_shift
 106 * is the shift for the corresponding (scaled) sigma_i.
 107 */
 108#define QFQ_MAX_INDEX		24
 109#define QFQ_MAX_WSHIFT		10
 110
 111#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
 112#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)
 113
 114#define FRAC_BITS		30	/* fixed point arithmetic */
 115#define ONE_FP			(1UL << FRAC_BITS)
 116
 117#define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
 118#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
 119
 120#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
 121
 122/*
 123 * Possible group states.  These values are used as indexes for the bitmaps
 124 * array of struct qfq_queue.
 125 */
 126enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
 127
 128struct qfq_group;
 129
 130struct qfq_aggregate;
 131
 132struct qfq_class {
 133	struct Qdisc_class_common common;
 134
 135	unsigned int refcnt;
 136	unsigned int filter_cnt;
 137
 138	struct gnet_stats_basic_packed bstats;
 139	struct gnet_stats_queue qstats;
 140	struct gnet_stats_rate_est64 rate_est;
 141	struct Qdisc *qdisc;
 142	struct list_head alist;		/* Link for active-classes list. */
 143	struct qfq_aggregate *agg;	/* Parent aggregate. */
 144	int deficit;			/* DRR deficit counter. */
 145};
 146
 147struct qfq_aggregate {
 148	struct hlist_node next;	/* Link for the slot list. */
 149	u64 S, F;		/* flow timestamps (exact) */
 150
 151	/* group we belong to. In principle we would need the index,
 152	 * which is log_2(lmax/weight), but we never reference it
 153	 * directly, only the group.
 154	 */
 155	struct qfq_group *grp;
 156
 157	/* these are copied from the flowset. */
 158	u32	class_weight; /* Weight of each class in this aggregate. */
 159	/* Max pkt size for the classes in this aggregate, DRR quantum. */
 160	int	lmax;
 161
 162	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
 163	u32	budgetmax;  /* Max budget for this aggregate. */
 164	u32	initial_budget, budget;     /* Initial and current budget. */
 165
 166	int		  num_classes;	/* Number of classes in this aggr. */
 167	struct list_head  active;	/* DRR queue of active classes. */
 168
 169	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
 170};
 171
 172struct qfq_group {
 173	u64 S, F;			/* group timestamps (approx). */
 174	unsigned int slot_shift;	/* Slot shift. */
 175	unsigned int index;		/* Group index. */
 176	unsigned int front;		/* Index of the front slot. */
 177	unsigned long full_slots;	/* non-empty slots */
 178
 179	/* Array of RR lists of active aggregates. */
 180	struct hlist_head slots[QFQ_MAX_SLOTS];
 181};
 182
 183struct qfq_sched {
 184	struct tcf_proto *filter_list;
 
 185	struct Qdisc_class_hash clhash;
 186
 187	u64			oldV, V;	/* Precise virtual times. */
 188	struct qfq_aggregate	*in_serv_agg;   /* Aggregate being served. */
 189	u32			num_active_agg; /* Num. of active aggregates */
 190	u32			wsum;		/* weight sum */
 191	u32			iwsum;		/* inverse weight sum */
 192
 193	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
 194	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
 195	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
 196
 197	u32 max_agg_classes;		/* Max number of classes per aggr. */
 198	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
 199};
 200
 201/*
 202 * Possible reasons why the timestamps of an aggregate are updated
 203 * enqueue: the aggregate switches from idle to active and must scheduled
 204 *	    for service
 205 * requeue: the aggregate finishes its budget, so it stops being served and
 206 *	    must be rescheduled for service
 207 */
 208enum update_reason {enqueue, requeue};
 209
 210static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 211{
 212	struct qfq_sched *q = qdisc_priv(sch);
 213	struct Qdisc_class_common *clc;
 214
 215	clc = qdisc_class_find(&q->clhash, classid);
 216	if (clc == NULL)
 217		return NULL;
 218	return container_of(clc, struct qfq_class, common);
 219}
 220
 221static void qfq_purge_queue(struct qfq_class *cl)
 222{
 223	unsigned int len = cl->qdisc->q.qlen;
 224
 225	qdisc_reset(cl->qdisc);
 226	qdisc_tree_decrease_qlen(cl->qdisc, len);
 227}
 228
 229static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
 230	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
 231	[TCA_QFQ_LMAX] = { .type = NLA_U32 },
 232};
 233
 234/*
 235 * Calculate a flow index, given its weight and maximum packet length.
 236 * index = log_2(maxlen/weight) but we need to apply the scaling.
 237 * This is used only once at flow creation.
 238 */
 239static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
 240{
 241	u64 slot_size = (u64)maxlen * inv_w;
 242	unsigned long size_map;
 243	int index = 0;
 244
 245	size_map = slot_size >> min_slot_shift;
 246	if (!size_map)
 247		goto out;
 248
 249	index = __fls(size_map) + 1;	/* basically a log_2 */
 250	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
 251
 252	if (index < 0)
 253		index = 0;
 254out:
 255	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
 256		 (unsigned long) ONE_FP/inv_w, maxlen, index);
 257
 258	return index;
 259}
 260
 261static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
 262static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
 263			     enum update_reason);
 264
 265static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
 266			 u32 lmax, u32 weight)
 267{
 268	INIT_LIST_HEAD(&agg->active);
 269	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
 270
 271	agg->lmax = lmax;
 272	agg->class_weight = weight;
 273}
 274
 275static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
 276					  u32 lmax, u32 weight)
 277{
 278	struct qfq_aggregate *agg;
 279
 280	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
 281		if (agg->lmax == lmax && agg->class_weight == weight)
 282			return agg;
 283
 284	return NULL;
 285}
 286
 287
 288/* Update aggregate as a function of the new number of classes. */
 289static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
 290			   int new_num_classes)
 291{
 292	u32 new_agg_weight;
 293
 294	if (new_num_classes == q->max_agg_classes)
 295		hlist_del_init(&agg->nonfull_next);
 296
 297	if (agg->num_classes > new_num_classes &&
 298	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
 299		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
 300
 301	/* The next assignment may let
 302	 * agg->initial_budget > agg->budgetmax
 303	 * hold, we will take it into account in charge_actual_service().
 304	 */
 305	agg->budgetmax = new_num_classes * agg->lmax;
 306	new_agg_weight = agg->class_weight * new_num_classes;
 307	agg->inv_w = ONE_FP/new_agg_weight;
 308
 309	if (agg->grp == NULL) {
 310		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
 311				       q->min_slot_shift);
 312		agg->grp = &q->groups[i];
 313	}
 314
 315	q->wsum +=
 316		(int) agg->class_weight * (new_num_classes - agg->num_classes);
 317	q->iwsum = ONE_FP / q->wsum;
 318
 319	agg->num_classes = new_num_classes;
 320}
 321
 322/* Add class to aggregate. */
 323static void qfq_add_to_agg(struct qfq_sched *q,
 324			   struct qfq_aggregate *agg,
 325			   struct qfq_class *cl)
 326{
 327	cl->agg = agg;
 328
 329	qfq_update_agg(q, agg, agg->num_classes+1);
 330	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
 331		list_add_tail(&cl->alist, &agg->active);
 332		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
 333		    cl && q->in_serv_agg != agg) /* agg was inactive */
 334			qfq_activate_agg(q, agg, enqueue); /* schedule agg */
 335	}
 336}
 337
 338static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
 339
 340static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
 341{
 342	if (!hlist_unhashed(&agg->nonfull_next))
 343		hlist_del_init(&agg->nonfull_next);
 344	q->wsum -= agg->class_weight;
 345	if (q->wsum != 0)
 346		q->iwsum = ONE_FP / q->wsum;
 347
 348	if (q->in_serv_agg == agg)
 349		q->in_serv_agg = qfq_choose_next_agg(q);
 350	kfree(agg);
 351}
 352
 353/* Deschedule class from within its parent aggregate. */
 354static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
 355{
 356	struct qfq_aggregate *agg = cl->agg;
 357
 358
 359	list_del(&cl->alist); /* remove from RR queue of the aggregate */
 360	if (list_empty(&agg->active)) /* agg is now inactive */
 361		qfq_deactivate_agg(q, agg);
 362}
 363
 364/* Remove class from its parent aggregate. */
 365static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
 366{
 367	struct qfq_aggregate *agg = cl->agg;
 368
 369	cl->agg = NULL;
 370	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
 371		qfq_destroy_agg(q, agg);
 372		return;
 373	}
 374	qfq_update_agg(q, agg, agg->num_classes-1);
 375}
 376
 377/* Deschedule class and remove it from its parent aggregate. */
 378static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
 379{
 380	if (cl->qdisc->q.qlen > 0) /* class is active */
 381		qfq_deactivate_class(q, cl);
 382
 383	qfq_rm_from_agg(q, cl);
 384}
 385
 386/* Move class to a new aggregate, matching the new class weight and/or lmax */
 387static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
 388			   u32 lmax)
 389{
 390	struct qfq_sched *q = qdisc_priv(sch);
 391	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
 392
 393	if (new_agg == NULL) { /* create new aggregate */
 394		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
 395		if (new_agg == NULL)
 396			return -ENOBUFS;
 397		qfq_init_agg(q, new_agg, lmax, weight);
 398	}
 399	qfq_deact_rm_from_agg(q, cl);
 400	qfq_add_to_agg(q, new_agg, cl);
 401
 402	return 0;
 403}
 404
 405static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 406			    struct nlattr **tca, unsigned long *arg)
 
 407{
 408	struct qfq_sched *q = qdisc_priv(sch);
 409	struct qfq_class *cl = (struct qfq_class *)*arg;
 410	bool existing = false;
 411	struct nlattr *tb[TCA_QFQ_MAX + 1];
 412	struct qfq_aggregate *new_agg = NULL;
 413	u32 weight, lmax, inv_w;
 414	int err;
 415	int delta_w;
 416
 417	if (tca[TCA_OPTIONS] == NULL) {
 418		pr_notice("qfq: no options\n");
 419		return -EINVAL;
 420	}
 421
 422	err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
 
 423	if (err < 0)
 424		return err;
 425
 426	if (tb[TCA_QFQ_WEIGHT]) {
 427		weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
 428		if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
 429			pr_notice("qfq: invalid weight %u\n", weight);
 430			return -EINVAL;
 431		}
 432	} else
 433		weight = 1;
 434
 435	if (tb[TCA_QFQ_LMAX]) {
 436		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
 437		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
 438			pr_notice("qfq: invalid max length %u\n", lmax);
 439			return -EINVAL;
 440		}
 441	} else
 442		lmax = psched_mtu(qdisc_dev(sch));
 443
 444	inv_w = ONE_FP / weight;
 445	weight = ONE_FP / inv_w;
 446
 447	if (cl != NULL &&
 448	    lmax == cl->agg->lmax &&
 449	    weight == cl->agg->class_weight)
 450		return 0; /* nothing to change */
 451
 452	delta_w = weight - (cl ? cl->agg->class_weight : 0);
 453
 454	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
 455		pr_notice("qfq: total weight out of range (%d + %u)\n",
 456			  delta_w, q->wsum);
 457		return -EINVAL;
 458	}
 459
 460	if (cl != NULL) { /* modify existing class */
 461		if (tca[TCA_RATE]) {
 462			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
 463						    qdisc_root_sleeping_lock(sch),
 
 
 464						    tca[TCA_RATE]);
 465			if (err)
 466				return err;
 467		}
 468		existing = true;
 469		goto set_change_agg;
 470	}
 471
 472	/* create and init new class */
 473	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
 474	if (cl == NULL)
 475		return -ENOBUFS;
 476
 477	cl->refcnt = 1;
 478	cl->common.classid = classid;
 479	cl->deficit = lmax;
 480
 481	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
 482				      &pfifo_qdisc_ops, classid);
 483	if (cl->qdisc == NULL)
 484		cl->qdisc = &noop_qdisc;
 485
 486	if (tca[TCA_RATE]) {
 487		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
 488					qdisc_root_sleeping_lock(sch),
 
 
 489					tca[TCA_RATE]);
 490		if (err)
 491			goto destroy_class;
 492	}
 493
 494	sch_tree_lock(sch);
 495	qdisc_class_hash_insert(&q->clhash, &cl->common);
 496	sch_tree_unlock(sch);
 497
 498	qdisc_class_hash_grow(sch, &q->clhash);
 499
 500set_change_agg:
 501	sch_tree_lock(sch);
 502	new_agg = qfq_find_agg(q, lmax, weight);
 503	if (new_agg == NULL) { /* create new aggregate */
 504		sch_tree_unlock(sch);
 505		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
 506		if (new_agg == NULL) {
 507			err = -ENOBUFS;
 508			gen_kill_estimator(&cl->bstats, &cl->rate_est);
 509			goto destroy_class;
 510		}
 511		sch_tree_lock(sch);
 512		qfq_init_agg(q, new_agg, lmax, weight);
 513	}
 514	if (existing)
 515		qfq_deact_rm_from_agg(q, cl);
 
 
 516	qfq_add_to_agg(q, new_agg, cl);
 517	sch_tree_unlock(sch);
 
 518
 519	*arg = (unsigned long)cl;
 520	return 0;
 521
 522destroy_class:
 523	qdisc_destroy(cl->qdisc);
 524	kfree(cl);
 525	return err;
 526}
 527
 528static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 529{
 530	struct qfq_sched *q = qdisc_priv(sch);
 531
 532	qfq_rm_from_agg(q, cl);
 533	gen_kill_estimator(&cl->bstats, &cl->rate_est);
 534	qdisc_destroy(cl->qdisc);
 535	kfree(cl);
 536}
 537
 538static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
 
 539{
 540	struct qfq_sched *q = qdisc_priv(sch);
 541	struct qfq_class *cl = (struct qfq_class *)arg;
 542
 543	if (cl->filter_cnt > 0)
 544		return -EBUSY;
 545
 546	sch_tree_lock(sch);
 547
 548	qfq_purge_queue(cl);
 549	qdisc_class_hash_remove(&q->clhash, &cl->common);
 550
 551	BUG_ON(--cl->refcnt == 0);
 552	/*
 553	 * This shouldn't happen: we "hold" one cops->get() when called
 554	 * from tc_ctl_tclass; the destroy method is done from cops->put().
 555	 */
 556
 557	sch_tree_unlock(sch);
 558	return 0;
 559}
 560
 561static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
 562{
 563	struct qfq_class *cl = qfq_find_class(sch, classid);
 564
 565	if (cl != NULL)
 566		cl->refcnt++;
 567
 568	return (unsigned long)cl;
 569}
 570
 571static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
 572{
 573	struct qfq_class *cl = (struct qfq_class *)arg;
 574
 575	if (--cl->refcnt == 0)
 576		qfq_destroy_class(sch, cl);
 577}
 578
 579static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl)
 
 580{
 581	struct qfq_sched *q = qdisc_priv(sch);
 582
 583	if (cl)
 584		return NULL;
 585
 586	return &q->filter_list;
 587}
 588
 589static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
 590				  u32 classid)
 591{
 592	struct qfq_class *cl = qfq_find_class(sch, classid);
 593
 594	if (cl != NULL)
 595		cl->filter_cnt++;
 596
 597	return (unsigned long)cl;
 598}
 599
 600static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
 601{
 602	struct qfq_class *cl = (struct qfq_class *)arg;
 603
 604	cl->filter_cnt--;
 605}
 606
 607static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
 608			   struct Qdisc *new, struct Qdisc **old)
 
 609{
 610	struct qfq_class *cl = (struct qfq_class *)arg;
 611
 612	if (new == NULL) {
 613		new = qdisc_create_dflt(sch->dev_queue,
 614					&pfifo_qdisc_ops, cl->common.classid);
 615		if (new == NULL)
 616			new = &noop_qdisc;
 617	}
 618
 619	sch_tree_lock(sch);
 620	qfq_purge_queue(cl);
 621	*old = cl->qdisc;
 622	cl->qdisc = new;
 623	sch_tree_unlock(sch);
 624	return 0;
 625}
 626
 627static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
 628{
 629	struct qfq_class *cl = (struct qfq_class *)arg;
 630
 631	return cl->qdisc;
 632}
 633
 634static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
 635			  struct sk_buff *skb, struct tcmsg *tcm)
 636{
 637	struct qfq_class *cl = (struct qfq_class *)arg;
 638	struct nlattr *nest;
 639
 640	tcm->tcm_parent	= TC_H_ROOT;
 641	tcm->tcm_handle	= cl->common.classid;
 642	tcm->tcm_info	= cl->qdisc->handle;
 643
 644	nest = nla_nest_start(skb, TCA_OPTIONS);
 645	if (nest == NULL)
 646		goto nla_put_failure;
 647	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
 648	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
 649		goto nla_put_failure;
 650	return nla_nest_end(skb, nest);
 651
 652nla_put_failure:
 653	nla_nest_cancel(skb, nest);
 654	return -EMSGSIZE;
 655}
 656
 657static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 658				struct gnet_dump *d)
 659{
 660	struct qfq_class *cl = (struct qfq_class *)arg;
 661	struct tc_qfq_stats xstats;
 662
 663	memset(&xstats, 0, sizeof(xstats));
 664	cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
 665
 666	xstats.weight = cl->agg->class_weight;
 667	xstats.lmax = cl->agg->lmax;
 668
 669	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
 670	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
 671	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
 672		return -1;
 673
 674	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 675}
 676
 677static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 678{
 679	struct qfq_sched *q = qdisc_priv(sch);
 680	struct qfq_class *cl;
 681	unsigned int i;
 682
 683	if (arg->stop)
 684		return;
 685
 686	for (i = 0; i < q->clhash.hashsize; i++) {
 687		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 688			if (arg->count < arg->skip) {
 689				arg->count++;
 690				continue;
 691			}
 692			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
 693				arg->stop = 1;
 694				return;
 695			}
 696			arg->count++;
 697		}
 698	}
 699}
 700
 701static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 702				      int *qerr)
 703{
 704	struct qfq_sched *q = qdisc_priv(sch);
 705	struct qfq_class *cl;
 706	struct tcf_result res;
 
 707	int result;
 708
 709	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
 710		pr_debug("qfq_classify: found %d\n", skb->priority);
 711		cl = qfq_find_class(sch, skb->priority);
 712		if (cl != NULL)
 713			return cl;
 714	}
 715
 716	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 717	result = tc_classify(skb, q->filter_list, &res);
 
 718	if (result >= 0) {
 719#ifdef CONFIG_NET_CLS_ACT
 720		switch (result) {
 721		case TC_ACT_QUEUED:
 722		case TC_ACT_STOLEN:
 
 723			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 
 724		case TC_ACT_SHOT:
 725			return NULL;
 726		}
 727#endif
 728		cl = (struct qfq_class *)res.class;
 729		if (cl == NULL)
 730			cl = qfq_find_class(sch, res.classid);
 731		return cl;
 732	}
 733
 734	return NULL;
 735}
 736
 737/* Generic comparison function, handling wraparound. */
 738static inline int qfq_gt(u64 a, u64 b)
 739{
 740	return (s64)(a - b) > 0;
 741}
 742
 743/* Round a precise timestamp to its slotted value. */
 744static inline u64 qfq_round_down(u64 ts, unsigned int shift)
 745{
 746	return ts & ~((1ULL << shift) - 1);
 747}
 748
 749/* return the pointer to the group with lowest index in the bitmap */
 750static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
 751					unsigned long bitmap)
 752{
 753	int index = __ffs(bitmap);
 754	return &q->groups[index];
 755}
 756/* Calculate a mask to mimic what would be ffs_from(). */
 757static inline unsigned long mask_from(unsigned long bitmap, int from)
 758{
 759	return bitmap & ~((1UL << from) - 1);
 760}
 761
 762/*
 763 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 764 * First compute eligibility comparing grp->S, q->V,
 765 * then check if someone is blocking us and possibly add EB
 766 */
 767static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
 768{
 769	/* if S > V we are not eligible */
 770	unsigned int state = qfq_gt(grp->S, q->V);
 771	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
 772	struct qfq_group *next;
 773
 774	if (mask) {
 775		next = qfq_ffs(q, mask);
 776		if (qfq_gt(grp->F, next->F))
 777			state |= EB;
 778	}
 779
 780	return state;
 781}
 782
 783
 784/*
 785 * In principle
 786 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 787 *	q->bitmaps[src] &= ~mask;
 788 * but we should make sure that src != dst
 789 */
 790static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
 791				   int src, int dst)
 792{
 793	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 794	q->bitmaps[src] &= ~mask;
 795}
 796
 797static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
 798{
 799	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
 800	struct qfq_group *next;
 801
 802	if (mask) {
 803		next = qfq_ffs(q, mask);
 804		if (!qfq_gt(next->F, old_F))
 805			return;
 806	}
 807
 808	mask = (1UL << index) - 1;
 809	qfq_move_groups(q, mask, EB, ER);
 810	qfq_move_groups(q, mask, IB, IR);
 811}
 812
 813/*
 814 * perhaps
 815 *
 816	old_V ^= q->V;
 817	old_V >>= q->min_slot_shift;
 818	if (old_V) {
 819		...
 820	}
 821 *
 822 */
 823static void qfq_make_eligible(struct qfq_sched *q)
 824{
 825	unsigned long vslot = q->V >> q->min_slot_shift;
 826	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
 827
 828	if (vslot != old_vslot) {
 829		unsigned long mask;
 830		int last_flip_pos = fls(vslot ^ old_vslot);
 831
 832		if (last_flip_pos > 31) /* higher than the number of groups */
 833			mask = ~0UL;    /* make all groups eligible */
 834		else
 835			mask = (1UL << last_flip_pos) - 1;
 836
 837		qfq_move_groups(q, mask, IR, ER);
 838		qfq_move_groups(q, mask, IB, EB);
 839	}
 840}
 841
 842/*
 843 * The index of the slot in which the input aggregate agg is to be
 844 * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
 845 * and not a '-1' because the start time of the group may be moved
 846 * backward by one slot after the aggregate has been inserted, and
 847 * this would cause non-empty slots to be right-shifted by one
 848 * position.
 849 *
 850 * QFQ+ fully satisfies this bound to the slot index if the parameters
 851 * of the classes are not changed dynamically, and if QFQ+ never
 852 * happens to postpone the service of agg unjustly, i.e., it never
 853 * happens that the aggregate becomes backlogged and eligible, or just
 854 * eligible, while an aggregate with a higher approximated finish time
 855 * is being served. In particular, in this case QFQ+ guarantees that
 856 * the timestamps of agg are low enough that the slot index is never
 857 * higher than 2. Unfortunately, QFQ+ cannot provide the same
 858 * guarantee if it happens to unjustly postpone the service of agg, or
 859 * if the parameters of some class are changed.
 860 *
 861 * As for the first event, i.e., an out-of-order service, the
 862 * upper bound to the slot index guaranteed by QFQ+ grows to
 863 * 2 +
 864 * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
 865 * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
 866 *
 867 * The following function deals with this problem by backward-shifting
 868 * the timestamps of agg, if needed, so as to guarantee that the slot
 869 * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
 870 * cause the service of other aggregates to be postponed, yet the
 871 * worst-case guarantees of these aggregates are not violated.  In
 872 * fact, in case of no out-of-order service, the timestamps of agg
 873 * would have been even lower than they are after the backward shift,
 874 * because QFQ+ would have guaranteed a maximum value equal to 2 for
 875 * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
 876 * service is postponed because of the backward-shift would have
 877 * however waited for the service of agg before being served.
 878 *
 879 * The other event that may cause the slot index to be higher than 2
 880 * for agg is a recent change of the parameters of some class. If the
 881 * weight of a class is increased or the lmax (max_pkt_size) of the
 882 * class is decreased, then a new aggregate with smaller slot size
 883 * than the original parent aggregate of the class may happen to be
 884 * activated. The activation of this aggregate should be properly
 885 * delayed to when the service of the class has finished in the ideal
 886 * system tracked by QFQ+. If the activation of the aggregate is not
 887 * delayed to this reference time instant, then this aggregate may be
 888 * unjustly served before other aggregates waiting for service. This
 889 * may cause the above bound to the slot index to be violated for some
 890 * of these unlucky aggregates.
 891 *
 892 * Instead of delaying the activation of the new aggregate, which is
 893 * quite complex, the above-discussed capping of the slot index is
 894 * used to handle also the consequences of a change of the parameters
 895 * of a class.
 896 */
 897static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
 898			    u64 roundedS)
 899{
 900	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
 901	unsigned int i; /* slot index in the bucket list */
 902
 903	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
 904		u64 deltaS = roundedS - grp->S -
 905			((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
 906		agg->S -= deltaS;
 907		agg->F -= deltaS;
 908		slot = QFQ_MAX_SLOTS - 2;
 909	}
 910
 911	i = (grp->front + slot) % QFQ_MAX_SLOTS;
 912
 913	hlist_add_head(&agg->next, &grp->slots[i]);
 914	__set_bit(slot, &grp->full_slots);
 915}
 916
 917/* Maybe introduce hlist_first_entry?? */
 918static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
 919{
 920	return hlist_entry(grp->slots[grp->front].first,
 921			   struct qfq_aggregate, next);
 922}
 923
 924/*
 925 * remove the entry from the slot
 926 */
 927static void qfq_front_slot_remove(struct qfq_group *grp)
 928{
 929	struct qfq_aggregate *agg = qfq_slot_head(grp);
 930
 931	BUG_ON(!agg);
 932	hlist_del(&agg->next);
 933	if (hlist_empty(&grp->slots[grp->front]))
 934		__clear_bit(0, &grp->full_slots);
 935}
 936
 937/*
 938 * Returns the first aggregate in the first non-empty bucket of the
 939 * group. As a side effect, adjusts the bucket list so the first
 940 * non-empty bucket is at position 0 in full_slots.
 941 */
 942static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
 943{
 944	unsigned int i;
 945
 946	pr_debug("qfq slot_scan: grp %u full %#lx\n",
 947		 grp->index, grp->full_slots);
 948
 949	if (grp->full_slots == 0)
 950		return NULL;
 951
 952	i = __ffs(grp->full_slots);  /* zero based */
 953	if (i > 0) {
 954		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
 955		grp->full_slots >>= i;
 956	}
 957
 958	return qfq_slot_head(grp);
 959}
 960
 961/*
 962 * adjust the bucket list. When the start time of a group decreases,
 963 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 964 * move the objects. The mask of occupied slots must be shifted
 965 * because we use ffs() to find the first non-empty slot.
 966 * This covers decreases in the group's start time, but what about
 967 * increases of the start time ?
 968 * Here too we should make sure that i is less than 32
 969 */
 970static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
 971{
 972	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
 973
 974	grp->full_slots <<= i;
 975	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
 976}
 977
 978static void qfq_update_eligible(struct qfq_sched *q)
 979{
 980	struct qfq_group *grp;
 981	unsigned long ineligible;
 982
 983	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
 984	if (ineligible) {
 985		if (!q->bitmaps[ER]) {
 986			grp = qfq_ffs(q, ineligible);
 987			if (qfq_gt(grp->S, q->V))
 988				q->V = grp->S;
 989		}
 990		qfq_make_eligible(q);
 991	}
 992}
 993
 994/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
 995static void agg_dequeue(struct qfq_aggregate *agg,
 996			struct qfq_class *cl, unsigned int len)
 997{
 998	qdisc_dequeue_peeked(cl->qdisc);
 999
1000	cl->deficit -= (int) len;
1001
1002	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
1003		list_del(&cl->alist);
1004	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
1005		cl->deficit += agg->lmax;
1006		list_move_tail(&cl->alist, &agg->active);
1007	}
1008}
1009
1010static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
1011					   struct qfq_class **cl,
1012					   unsigned int *len)
1013{
1014	struct sk_buff *skb;
1015
1016	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
1017	skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
1018	if (skb == NULL)
1019		WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
1020	else
1021		*len = qdisc_pkt_len(skb);
1022
1023	return skb;
1024}
1025
1026/* Update F according to the actual service received by the aggregate. */
1027static inline void charge_actual_service(struct qfq_aggregate *agg)
1028{
1029	/* Compute the service received by the aggregate, taking into
1030	 * account that, after decreasing the number of classes in
1031	 * agg, it may happen that
1032	 * agg->initial_budget - agg->budget > agg->bugdetmax
1033	 */
1034	u32 service_received = min(agg->budgetmax,
1035				   agg->initial_budget - agg->budget);
1036
1037	agg->F = agg->S + (u64)service_received * agg->inv_w;
1038}
1039
1040/* Assign a reasonable start time for a new aggregate in group i.
1041 * Admissible values for \hat(F) are multiples of \sigma_i
1042 * no greater than V+\sigma_i . Larger values mean that
1043 * we had a wraparound so we consider the timestamp to be stale.
1044 *
1045 * If F is not stale and F >= V then we set S = F.
1046 * Otherwise we should assign S = V, but this may violate
1047 * the ordering in EB (see [2]). So, if we have groups in ER,
1048 * set S to the F_j of the first group j which would be blocking us.
1049 * We are guaranteed not to move S backward because
1050 * otherwise our group i would still be blocked.
1051 */
1052static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
1053{
1054	unsigned long mask;
1055	u64 limit, roundedF;
1056	int slot_shift = agg->grp->slot_shift;
1057
1058	roundedF = qfq_round_down(agg->F, slot_shift);
1059	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
1060
1061	if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
1062		/* timestamp was stale */
1063		mask = mask_from(q->bitmaps[ER], agg->grp->index);
1064		if (mask) {
1065			struct qfq_group *next = qfq_ffs(q, mask);
1066			if (qfq_gt(roundedF, next->F)) {
1067				if (qfq_gt(limit, next->F))
1068					agg->S = next->F;
1069				else /* preserve timestamp correctness */
1070					agg->S = limit;
1071				return;
1072			}
1073		}
1074		agg->S = q->V;
1075	} else  /* timestamp is not stale */
1076		agg->S = agg->F;
1077}
1078
1079/* Update the timestamps of agg before scheduling/rescheduling it for
1080 * service.  In particular, assign to agg->F its maximum possible
1081 * value, i.e., the virtual finish time with which the aggregate
1082 * should be labeled if it used all its budget once in service.
1083 */
1084static inline void
1085qfq_update_agg_ts(struct qfq_sched *q,
1086		    struct qfq_aggregate *agg, enum update_reason reason)
1087{
1088	if (reason != requeue)
1089		qfq_update_start(q, agg);
1090	else /* just charge agg for the service received */
1091		agg->S = agg->F;
1092
1093	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
1094}
1095
1096static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
1097
1098static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
1099{
1100	struct qfq_sched *q = qdisc_priv(sch);
1101	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
1102	struct qfq_class *cl;
1103	struct sk_buff *skb = NULL;
1104	/* next-packet len, 0 means no more active classes in in-service agg */
1105	unsigned int len = 0;
1106
1107	if (in_serv_agg == NULL)
1108		return NULL;
1109
1110	if (!list_empty(&in_serv_agg->active))
1111		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
1112
1113	/*
1114	 * If there are no active classes in the in-service aggregate,
1115	 * or if the aggregate has not enough budget to serve its next
1116	 * class, then choose the next aggregate to serve.
1117	 */
1118	if (len == 0 || in_serv_agg->budget < len) {
1119		charge_actual_service(in_serv_agg);
1120
1121		/* recharge the budget of the aggregate */
1122		in_serv_agg->initial_budget = in_serv_agg->budget =
1123			in_serv_agg->budgetmax;
1124
1125		if (!list_empty(&in_serv_agg->active)) {
1126			/*
1127			 * Still active: reschedule for
1128			 * service. Possible optimization: if no other
1129			 * aggregate is active, then there is no point
1130			 * in rescheduling this aggregate, and we can
1131			 * just keep it as the in-service one. This
1132			 * should be however a corner case, and to
1133			 * handle it, we would need to maintain an
1134			 * extra num_active_aggs field.
1135			*/
1136			qfq_update_agg_ts(q, in_serv_agg, requeue);
1137			qfq_schedule_agg(q, in_serv_agg);
1138		} else if (sch->q.qlen == 0) { /* no aggregate to serve */
1139			q->in_serv_agg = NULL;
1140			return NULL;
1141		}
1142
1143		/*
1144		 * If we get here, there are other aggregates queued:
1145		 * choose the new aggregate to serve.
1146		 */
1147		in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
1148		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
1149	}
1150	if (!skb)
1151		return NULL;
1152
 
1153	sch->q.qlen--;
1154	qdisc_bstats_update(sch, skb);
1155
1156	agg_dequeue(in_serv_agg, cl, len);
1157	/* If lmax is lowered, through qfq_change_class, for a class
1158	 * owning pending packets with larger size than the new value
1159	 * of lmax, then the following condition may hold.
1160	 */
1161	if (unlikely(in_serv_agg->budget < len))
1162		in_serv_agg->budget = 0;
1163	else
1164		in_serv_agg->budget -= len;
1165
1166	q->V += (u64)len * q->iwsum;
1167	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
1168		 len, (unsigned long long) in_serv_agg->F,
1169		 (unsigned long long) q->V);
1170
1171	return skb;
1172}
1173
1174static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
1175{
1176	struct qfq_group *grp;
1177	struct qfq_aggregate *agg, *new_front_agg;
1178	u64 old_F;
1179
1180	qfq_update_eligible(q);
1181	q->oldV = q->V;
1182
1183	if (!q->bitmaps[ER])
1184		return NULL;
1185
1186	grp = qfq_ffs(q, q->bitmaps[ER]);
1187	old_F = grp->F;
1188
1189	agg = qfq_slot_head(grp);
1190
1191	/* agg starts to be served, remove it from schedule */
1192	qfq_front_slot_remove(grp);
1193
1194	new_front_agg = qfq_slot_scan(grp);
1195
1196	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
1197		__clear_bit(grp->index, &q->bitmaps[ER]);
1198	else {
1199		u64 roundedS = qfq_round_down(new_front_agg->S,
1200					      grp->slot_shift);
1201		unsigned int s;
1202
1203		if (grp->S == roundedS)
1204			return agg;
1205		grp->S = roundedS;
1206		grp->F = roundedS + (2ULL << grp->slot_shift);
1207		__clear_bit(grp->index, &q->bitmaps[ER]);
1208		s = qfq_calc_state(q, grp);
1209		__set_bit(grp->index, &q->bitmaps[s]);
1210	}
1211
1212	qfq_unblock_groups(q, grp->index, old_F);
1213
1214	return agg;
1215}
1216
1217static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
1218{
 
1219	struct qfq_sched *q = qdisc_priv(sch);
1220	struct qfq_class *cl;
1221	struct qfq_aggregate *agg;
1222	int err = 0;
 
1223
1224	cl = qfq_classify(skb, sch, &err);
1225	if (cl == NULL) {
1226		if (err & __NET_XMIT_BYPASS)
1227			sch->qstats.drops++;
1228		kfree_skb(skb);
1229		return err;
1230	}
1231	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
1232
1233	if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
1234		pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
1235			 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
1236		err = qfq_change_agg(sch, cl, cl->agg->class_weight,
1237				     qdisc_pkt_len(skb));
1238		if (err)
1239			return err;
 
1240	}
1241
1242	err = qdisc_enqueue(skb, cl->qdisc);
 
 
1243	if (unlikely(err != NET_XMIT_SUCCESS)) {
1244		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
1245		if (net_xmit_drop_count(err)) {
1246			cl->qstats.drops++;
1247			sch->qstats.drops++;
1248		}
1249		return err;
1250	}
1251
1252	bstats_update(&cl->bstats, skb);
 
1253	++sch->q.qlen;
1254
1255	agg = cl->agg;
1256	/* if the queue was not empty, then done here */
1257	if (cl->qdisc->q.qlen != 1) {
1258		if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
1259		    list_first_entry(&agg->active, struct qfq_class, alist)
1260		    == cl && cl->deficit < qdisc_pkt_len(skb))
1261			list_move_tail(&cl->alist, &agg->active);
1262
1263		return err;
1264	}
1265
1266	/* schedule class for service within the aggregate */
1267	cl->deficit = agg->lmax;
1268	list_add_tail(&cl->alist, &agg->active);
1269
1270	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
1271	    q->in_serv_agg == agg)
1272		return err; /* non-empty or in service, nothing else to do */
1273
1274	qfq_activate_agg(q, agg, enqueue);
1275
1276	return err;
1277}
1278
1279/*
1280 * Schedule aggregate according to its timestamps.
1281 */
1282static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
1283{
1284	struct qfq_group *grp = agg->grp;
1285	u64 roundedS;
1286	int s;
1287
1288	roundedS = qfq_round_down(agg->S, grp->slot_shift);
1289
1290	/*
1291	 * Insert agg in the correct bucket.
1292	 * If agg->S >= grp->S we don't need to adjust the
1293	 * bucket list and simply go to the insertion phase.
1294	 * Otherwise grp->S is decreasing, we must make room
1295	 * in the bucket list, and also recompute the group state.
1296	 * Finally, if there were no flows in this group and nobody
1297	 * was in ER make sure to adjust V.
1298	 */
1299	if (grp->full_slots) {
1300		if (!qfq_gt(grp->S, agg->S))
1301			goto skip_update;
1302
1303		/* create a slot for this agg->S */
1304		qfq_slot_rotate(grp, roundedS);
1305		/* group was surely ineligible, remove */
1306		__clear_bit(grp->index, &q->bitmaps[IR]);
1307		__clear_bit(grp->index, &q->bitmaps[IB]);
1308	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
1309		   q->in_serv_agg == NULL)
1310		q->V = roundedS;
1311
1312	grp->S = roundedS;
1313	grp->F = roundedS + (2ULL << grp->slot_shift);
1314	s = qfq_calc_state(q, grp);
1315	__set_bit(grp->index, &q->bitmaps[s]);
1316
1317	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
1318		 s, q->bitmaps[s],
1319		 (unsigned long long) agg->S,
1320		 (unsigned long long) agg->F,
1321		 (unsigned long long) q->V);
1322
1323skip_update:
1324	qfq_slot_insert(grp, agg, roundedS);
1325}
1326
1327
1328/* Update agg ts and schedule agg for service */
1329static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
1330			     enum update_reason reason)
1331{
1332	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
1333
1334	qfq_update_agg_ts(q, agg, reason);
1335	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
1336		q->in_serv_agg = agg; /* start serving this aggregate */
1337		 /* update V: to be in service, agg must be eligible */
1338		q->oldV = q->V = agg->S;
1339	} else if (agg != q->in_serv_agg)
1340		qfq_schedule_agg(q, agg);
1341}
1342
1343static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
1344			    struct qfq_aggregate *agg)
1345{
1346	unsigned int i, offset;
1347	u64 roundedS;
1348
1349	roundedS = qfq_round_down(agg->S, grp->slot_shift);
1350	offset = (roundedS - grp->S) >> grp->slot_shift;
1351
1352	i = (grp->front + offset) % QFQ_MAX_SLOTS;
1353
1354	hlist_del(&agg->next);
1355	if (hlist_empty(&grp->slots[i]))
1356		__clear_bit(offset, &grp->full_slots);
1357}
1358
1359/*
1360 * Called to forcibly deschedule an aggregate.  If the aggregate is
1361 * not in the front bucket, or if the latter has other aggregates in
1362 * the front bucket, we can simply remove the aggregate with no other
1363 * side effects.
1364 * Otherwise we must propagate the event up.
1365 */
1366static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
1367{
1368	struct qfq_group *grp = agg->grp;
1369	unsigned long mask;
1370	u64 roundedS;
1371	int s;
1372
1373	if (agg == q->in_serv_agg) {
1374		charge_actual_service(agg);
1375		q->in_serv_agg = qfq_choose_next_agg(q);
1376		return;
1377	}
1378
1379	agg->F = agg->S;
1380	qfq_slot_remove(q, grp, agg);
1381
1382	if (!grp->full_slots) {
1383		__clear_bit(grp->index, &q->bitmaps[IR]);
1384		__clear_bit(grp->index, &q->bitmaps[EB]);
1385		__clear_bit(grp->index, &q->bitmaps[IB]);
1386
1387		if (test_bit(grp->index, &q->bitmaps[ER]) &&
1388		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
1389			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
1390			if (mask)
1391				mask = ~((1UL << __fls(mask)) - 1);
1392			else
1393				mask = ~0UL;
1394			qfq_move_groups(q, mask, EB, ER);
1395			qfq_move_groups(q, mask, IB, IR);
1396		}
1397		__clear_bit(grp->index, &q->bitmaps[ER]);
1398	} else if (hlist_empty(&grp->slots[grp->front])) {
1399		agg = qfq_slot_scan(grp);
1400		roundedS = qfq_round_down(agg->S, grp->slot_shift);
1401		if (grp->S != roundedS) {
1402			__clear_bit(grp->index, &q->bitmaps[ER]);
1403			__clear_bit(grp->index, &q->bitmaps[IR]);
1404			__clear_bit(grp->index, &q->bitmaps[EB]);
1405			__clear_bit(grp->index, &q->bitmaps[IB]);
1406			grp->S = roundedS;
1407			grp->F = roundedS + (2ULL << grp->slot_shift);
1408			s = qfq_calc_state(q, grp);
1409			__set_bit(grp->index, &q->bitmaps[s]);
1410		}
1411	}
1412}
1413
1414static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1415{
1416	struct qfq_sched *q = qdisc_priv(sch);
1417	struct qfq_class *cl = (struct qfq_class *)arg;
1418
1419	if (cl->qdisc->q.qlen == 0)
1420		qfq_deactivate_class(q, cl);
1421}
1422
1423static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
1424				       struct hlist_head *slot)
1425{
1426	struct qfq_aggregate *agg;
1427	struct qfq_class *cl;
1428	unsigned int len;
1429
1430	hlist_for_each_entry(agg, slot, next) {
1431		list_for_each_entry(cl, &agg->active, alist) {
1432
1433			if (!cl->qdisc->ops->drop)
1434				continue;
1435
1436			len = cl->qdisc->ops->drop(cl->qdisc);
1437			if (len > 0) {
1438				if (cl->qdisc->q.qlen == 0)
1439					qfq_deactivate_class(q, cl);
1440
1441				return len;
1442			}
1443		}
1444	}
1445	return 0;
1446}
1447
1448static unsigned int qfq_drop(struct Qdisc *sch)
1449{
1450	struct qfq_sched *q = qdisc_priv(sch);
1451	struct qfq_group *grp;
1452	unsigned int i, j, len;
1453
1454	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1455		grp = &q->groups[i];
1456		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
1457			len = qfq_drop_from_slot(q, &grp->slots[j]);
1458			if (len > 0) {
1459				sch->q.qlen--;
1460				return len;
1461			}
1462		}
1463
1464	}
1465
1466	return 0;
1467}
1468
1469static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 
1470{
1471	struct qfq_sched *q = qdisc_priv(sch);
1472	struct qfq_group *grp;
1473	int i, j, err;
1474	u32 max_cl_shift, maxbudg_shift, max_classes;
1475
 
 
 
 
1476	err = qdisc_class_hash_init(&q->clhash);
1477	if (err < 0)
1478		return err;
1479
1480	if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
1481		max_classes = QFQ_MAX_AGG_CLASSES;
1482	else
1483		max_classes = qdisc_dev(sch)->tx_queue_len + 1;
1484	/* max_cl_shift = floor(log_2(max_classes)) */
1485	max_cl_shift = __fls(max_classes);
1486	q->max_agg_classes = 1<<max_cl_shift;
1487
1488	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
1489	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
1490	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
1491
1492	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1493		grp = &q->groups[i];
1494		grp->index = i;
1495		grp->slot_shift = q->min_slot_shift + i;
1496		for (j = 0; j < QFQ_MAX_SLOTS; j++)
1497			INIT_HLIST_HEAD(&grp->slots[j]);
1498	}
1499
1500	INIT_HLIST_HEAD(&q->nonfull_aggs);
1501
1502	return 0;
1503}
1504
1505static void qfq_reset_qdisc(struct Qdisc *sch)
1506{
1507	struct qfq_sched *q = qdisc_priv(sch);
1508	struct qfq_class *cl;
1509	unsigned int i;
1510
1511	for (i = 0; i < q->clhash.hashsize; i++) {
1512		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1513			if (cl->qdisc->q.qlen > 0)
1514				qfq_deactivate_class(q, cl);
1515
1516			qdisc_reset(cl->qdisc);
1517		}
1518	}
1519	sch->q.qlen = 0;
1520}
1521
1522static void qfq_destroy_qdisc(struct Qdisc *sch)
1523{
1524	struct qfq_sched *q = qdisc_priv(sch);
1525	struct qfq_class *cl;
1526	struct hlist_node *next;
1527	unsigned int i;
1528
1529	tcf_destroy_chain(&q->filter_list);
1530
1531	for (i = 0; i < q->clhash.hashsize; i++) {
1532		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1533					  common.hnode) {
1534			qfq_destroy_class(sch, cl);
1535		}
1536	}
1537	qdisc_class_hash_destroy(&q->clhash);
1538}
1539
1540static const struct Qdisc_class_ops qfq_class_ops = {
1541	.change		= qfq_change_class,
1542	.delete		= qfq_delete_class,
1543	.get		= qfq_get_class,
1544	.put		= qfq_put_class,
1545	.tcf_chain	= qfq_tcf_chain,
1546	.bind_tcf	= qfq_bind_tcf,
1547	.unbind_tcf	= qfq_unbind_tcf,
1548	.graft		= qfq_graft_class,
1549	.leaf		= qfq_class_leaf,
1550	.qlen_notify	= qfq_qlen_notify,
1551	.dump		= qfq_dump_class,
1552	.dump_stats	= qfq_dump_class_stats,
1553	.walk		= qfq_walk,
1554};
1555
1556static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
1557	.cl_ops		= &qfq_class_ops,
1558	.id		= "qfq",
1559	.priv_size	= sizeof(struct qfq_sched),
1560	.enqueue	= qfq_enqueue,
1561	.dequeue	= qfq_dequeue,
1562	.peek		= qdisc_peek_dequeued,
1563	.drop		= qfq_drop,
1564	.init		= qfq_init_qdisc,
1565	.reset		= qfq_reset_qdisc,
1566	.destroy	= qfq_destroy_qdisc,
1567	.owner		= THIS_MODULE,
1568};
1569
1570static int __init qfq_init(void)
1571{
1572	return register_qdisc(&qfq_qdisc_ops);
1573}
1574
1575static void __exit qfq_exit(void)
1576{
1577	unregister_qdisc(&qfq_qdisc_ops);
1578}
1579
1580module_init(qfq_init);
1581module_exit(qfq_exit);
1582MODULE_LICENSE("GPL");

   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
   4 *
   5 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
   6 * Copyright (c) 2012 Paolo Valente.
 
 
 
 
   7 */
   8
   9#include <linux/module.h>
  10#include <linux/init.h>
  11#include <linux/bitops.h>
  12#include <linux/errno.h>
  13#include <linux/netdevice.h>
  14#include <linux/pkt_sched.h>
  15#include <net/sch_generic.h>
  16#include <net/pkt_sched.h>
  17#include <net/pkt_cls.h>
  18
  19
  20/*  Quick Fair Queueing Plus
  21    ========================
  22
  23    Sources:
  24
  25    [1] Paolo Valente,
  26    "Reducing the Execution Time of Fair-Queueing Schedulers."
  27    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
  28
  29    Sources for QFQ:
  30
  31    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
  32    Packet Scheduling with Tight Bandwidth Distribution Guarantees."
  33
  34    See also:
  35    http://retis.sssup.it/~fabio/linux/qfq/
  36 */
  37
  38/*
  39
  40  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
  41  classes. Each aggregate is timestamped with a virtual start time S
  42  and a virtual finish time F, and scheduled according to its
  43  timestamps. S and F are computed as a function of a system virtual
  44  time function V. The classes within each aggregate are instead
  45  scheduled with DRR.
  46
  47  To speed up operations, QFQ+ divides also aggregates into a limited
  48  number of groups. Which group a class belongs to depends on the
  49  ratio between the maximum packet length for the class and the weight
  50  of the class. Groups have their own S and F. In the end, QFQ+
  51  schedules groups, then aggregates within groups, then classes within
  52  aggregates. See [1] and [2] for a full description.
  53
  54  Virtual time computations.
  55
  56  S, F and V are all computed in fixed point arithmetic with
  57  FRAC_BITS decimal bits.
  58
  59  QFQ_MAX_INDEX is the maximum index allowed for a group. We need
  60	one bit per index.
  61  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
  62
  63  The layout of the bits is as below:
  64
  65                   [ MTU_SHIFT ][      FRAC_BITS    ]
  66                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
  67				 ^.__grp->index = 0
  68				 *.__grp->slot_shift
  69
  70  where MIN_SLOT_SHIFT is derived by difference from the others.
  71
  72  The max group index corresponds to Lmax/w_min, where
  73  Lmax=1<<MTU_SHIFT, w_min = 1 .
  74  From this, and knowing how many groups (MAX_INDEX) we want,
  75  we can derive the shift corresponding to each group.
  76
  77  Because we often need to compute
  78	F = S + len/w_i  and V = V + len/wsum
  79  instead of storing w_i store the value
  80	inv_w = (1<<FRAC_BITS)/w_i
  81  so we can do F = S + len * inv_w * wsum.
  82  We use W_TOT in the formulas so we can easily move between
  83  static and adaptive weight sum.
  84
  85  The per-scheduler-instance data contain all the data structures
  86  for the scheduler: bitmaps and bucket lists.
  87
  88 */
  89
  90/*
  91 * Maximum number of consecutive slots occupied by backlogged classes
  92 * inside a group.
  93 */
  94#define QFQ_MAX_SLOTS	32
  95
  96/*
  97 * Shifts used for aggregate<->group mapping.  We allow class weights that are
  98 * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
  99 * group with the smallest index that can support the L_i / r_i configured
 100 * for the classes in the aggregate.
 101 *
 102 * grp->index is the index of the group; and grp->slot_shift
 103 * is the shift for the corresponding (scaled) sigma_i.
 104 */
 105#define QFQ_MAX_INDEX		24
 106#define QFQ_MAX_WSHIFT		10
 107
 108#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
 109#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)
 110
 111#define FRAC_BITS		30	/* fixed point arithmetic */
 112#define ONE_FP			(1UL << FRAC_BITS)
 113
 114#define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
 115#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
 116
 117#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
 118
 119/*
 120 * Possible group states.  These values are used as indexes for the bitmaps
 121 * array of struct qfq_queue.
 122 */
 123enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
 124
 125struct qfq_group;
 126
 127struct qfq_aggregate;
 128
 129struct qfq_class {
 130	struct Qdisc_class_common common;
 131
 
 132	unsigned int filter_cnt;
 133
 134	struct gnet_stats_basic_sync bstats;
 135	struct gnet_stats_queue qstats;
 136	struct net_rate_estimator __rcu *rate_est;
 137	struct Qdisc *qdisc;
 138	struct list_head alist;		/* Link for active-classes list. */
 139	struct qfq_aggregate *agg;	/* Parent aggregate. */
 140	int deficit;			/* DRR deficit counter. */
 141};
 142
 143struct qfq_aggregate {
 144	struct hlist_node next;	/* Link for the slot list. */
 145	u64 S, F;		/* flow timestamps (exact) */
 146
 147	/* group we belong to. In principle we would need the index,
 148	 * which is log_2(lmax/weight), but we never reference it
 149	 * directly, only the group.
 150	 */
 151	struct qfq_group *grp;
 152
 153	/* these are copied from the flowset. */
 154	u32	class_weight; /* Weight of each class in this aggregate. */
 155	/* Max pkt size for the classes in this aggregate, DRR quantum. */
 156	int	lmax;
 157
 158	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
 159	u32	budgetmax;  /* Max budget for this aggregate. */
 160	u32	initial_budget, budget;     /* Initial and current budget. */
 161
 162	int		  num_classes;	/* Number of classes in this aggr. */
 163	struct list_head  active;	/* DRR queue of active classes. */
 164
 165	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
 166};
 167
 168struct qfq_group {
 169	u64 S, F;			/* group timestamps (approx). */
 170	unsigned int slot_shift;	/* Slot shift. */
 171	unsigned int index;		/* Group index. */
 172	unsigned int front;		/* Index of the front slot. */
 173	unsigned long full_slots;	/* non-empty slots */
 174
 175	/* Array of RR lists of active aggregates. */
 176	struct hlist_head slots[QFQ_MAX_SLOTS];
 177};
 178
 179struct qfq_sched {
 180	struct tcf_proto __rcu *filter_list;
 181	struct tcf_block	*block;
 182	struct Qdisc_class_hash clhash;
 183
 184	u64			oldV, V;	/* Precise virtual times. */
 185	struct qfq_aggregate	*in_serv_agg;   /* Aggregate being served. */
 
 186	u32			wsum;		/* weight sum */
 187	u32			iwsum;		/* inverse weight sum */
 188
 189	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
 190	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
 191	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
 192
 193	u32 max_agg_classes;		/* Max number of classes per aggr. */
 194	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
 195};
 196
 197/*
 198 * Possible reasons why the timestamps of an aggregate are updated
 199 * enqueue: the aggregate switches from idle to active and must scheduled
 200 *	    for service
 201 * requeue: the aggregate finishes its budget, so it stops being served and
 202 *	    must be rescheduled for service
 203 */
 204enum update_reason {enqueue, requeue};
 205
 206static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 207{
 208	struct qfq_sched *q = qdisc_priv(sch);
 209	struct Qdisc_class_common *clc;
 210
 211	clc = qdisc_class_find(&q->clhash, classid);
 212	if (clc == NULL)
 213		return NULL;
 214	return container_of(clc, struct qfq_class, common);
 215}
 216
 
 
 
 
 
 
 
 
 217static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
 218	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
 219	[TCA_QFQ_LMAX] = { .type = NLA_U32 },
 220};
 221
 222/*
 223 * Calculate a flow index, given its weight and maximum packet length.
 224 * index = log_2(maxlen/weight) but we need to apply the scaling.
 225 * This is used only once at flow creation.
 226 */
 227static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
 228{
 229	u64 slot_size = (u64)maxlen * inv_w;
 230	unsigned long size_map;
 231	int index = 0;
 232
 233	size_map = slot_size >> min_slot_shift;
 234	if (!size_map)
 235		goto out;
 236
 237	index = __fls(size_map) + 1;	/* basically a log_2 */
 238	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
 239
 240	if (index < 0)
 241		index = 0;
 242out:
 243	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
 244		 (unsigned long) ONE_FP/inv_w, maxlen, index);
 245
 246	return index;
 247}
 248
 249static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
 250static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
 251			     enum update_reason);
 252
 253static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
 254			 u32 lmax, u32 weight)
 255{
 256	INIT_LIST_HEAD(&agg->active);
 257	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
 258
 259	agg->lmax = lmax;
 260	agg->class_weight = weight;
 261}
 262
 263static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
 264					  u32 lmax, u32 weight)
 265{
 266	struct qfq_aggregate *agg;
 267
 268	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
 269		if (agg->lmax == lmax && agg->class_weight == weight)
 270			return agg;
 271
 272	return NULL;
 273}
 274
 275
 276/* Update aggregate as a function of the new number of classes. */
 277static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
 278			   int new_num_classes)
 279{
 280	u32 new_agg_weight;
 281
 282	if (new_num_classes == q->max_agg_classes)
 283		hlist_del_init(&agg->nonfull_next);
 284
 285	if (agg->num_classes > new_num_classes &&
 286	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
 287		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
 288
 289	/* The next assignment may let
 290	 * agg->initial_budget > agg->budgetmax
 291	 * hold, we will take it into account in charge_actual_service().
 292	 */
 293	agg->budgetmax = new_num_classes * agg->lmax;
 294	new_agg_weight = agg->class_weight * new_num_classes;
 295	agg->inv_w = ONE_FP/new_agg_weight;
 296
 297	if (agg->grp == NULL) {
 298		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
 299				       q->min_slot_shift);
 300		agg->grp = &q->groups[i];
 301	}
 302
 303	q->wsum +=
 304		(int) agg->class_weight * (new_num_classes - agg->num_classes);
 305	q->iwsum = ONE_FP / q->wsum;
 306
 307	agg->num_classes = new_num_classes;
 308}
 309
 310/* Add class to aggregate. */
 311static void qfq_add_to_agg(struct qfq_sched *q,
 312			   struct qfq_aggregate *agg,
 313			   struct qfq_class *cl)
 314{
 315	cl->agg = agg;
 316
 317	qfq_update_agg(q, agg, agg->num_classes+1);
 318	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
 319		list_add_tail(&cl->alist, &agg->active);
 320		if (list_first_entry(&agg->active, struct qfq_class, alist) ==
 321		    cl && q->in_serv_agg != agg) /* agg was inactive */
 322			qfq_activate_agg(q, agg, enqueue); /* schedule agg */
 323	}
 324}
 325
 326static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
 327
 328static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
 329{
 330	hlist_del_init(&agg->nonfull_next);
 
 331	q->wsum -= agg->class_weight;
 332	if (q->wsum != 0)
 333		q->iwsum = ONE_FP / q->wsum;
 334
 335	if (q->in_serv_agg == agg)
 336		q->in_serv_agg = qfq_choose_next_agg(q);
 337	kfree(agg);
 338}
 339
 340/* Deschedule class from within its parent aggregate. */
 341static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
 342{
 343	struct qfq_aggregate *agg = cl->agg;
 344
 345
 346	list_del(&cl->alist); /* remove from RR queue of the aggregate */
 347	if (list_empty(&agg->active)) /* agg is now inactive */
 348		qfq_deactivate_agg(q, agg);
 349}
 350
 351/* Remove class from its parent aggregate. */
 352static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
 353{
 354	struct qfq_aggregate *agg = cl->agg;
 355
 356	cl->agg = NULL;
 357	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
 358		qfq_destroy_agg(q, agg);
 359		return;
 360	}
 361	qfq_update_agg(q, agg, agg->num_classes-1);
 362}
 363
 364/* Deschedule class and remove it from its parent aggregate. */
 365static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
 366{
 367	if (cl->qdisc->q.qlen > 0) /* class is active */
 368		qfq_deactivate_class(q, cl);
 369
 370	qfq_rm_from_agg(q, cl);
 371}
 372
 373/* Move class to a new aggregate, matching the new class weight and/or lmax */
 374static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
 375			   u32 lmax)
 376{
 377	struct qfq_sched *q = qdisc_priv(sch);
 378	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
 379
 380	if (new_agg == NULL) { /* create new aggregate */
 381		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
 382		if (new_agg == NULL)
 383			return -ENOBUFS;
 384		qfq_init_agg(q, new_agg, lmax, weight);
 385	}
 386	qfq_deact_rm_from_agg(q, cl);
 387	qfq_add_to_agg(q, new_agg, cl);
 388
 389	return 0;
 390}
 391
 392static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 393			    struct nlattr **tca, unsigned long *arg,
 394			    struct netlink_ext_ack *extack)
 395{
 396	struct qfq_sched *q = qdisc_priv(sch);
 397	struct qfq_class *cl = (struct qfq_class *)*arg;
 398	bool existing = false;
 399	struct nlattr *tb[TCA_QFQ_MAX + 1];
 400	struct qfq_aggregate *new_agg = NULL;
 401	u32 weight, lmax, inv_w;
 402	int err;
 403	int delta_w;
 404
 405	if (tca[TCA_OPTIONS] == NULL) {
 406		pr_notice("qfq: no options\n");
 407		return -EINVAL;
 408	}
 409
 410	err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS],
 411					  qfq_policy, NULL);
 412	if (err < 0)
 413		return err;
 414
 415	if (tb[TCA_QFQ_WEIGHT]) {
 416		weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
 417		if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
 418			pr_notice("qfq: invalid weight %u\n", weight);
 419			return -EINVAL;
 420		}
 421	} else
 422		weight = 1;
 423
 424	if (tb[TCA_QFQ_LMAX]) {
 425		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
 426		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
 427			pr_notice("qfq: invalid max length %u\n", lmax);
 428			return -EINVAL;
 429		}
 430	} else
 431		lmax = psched_mtu(qdisc_dev(sch));
 432
 433	inv_w = ONE_FP / weight;
 434	weight = ONE_FP / inv_w;
 435
 436	if (cl != NULL &&
 437	    lmax == cl->agg->lmax &&
 438	    weight == cl->agg->class_weight)
 439		return 0; /* nothing to change */
 440
 441	delta_w = weight - (cl ? cl->agg->class_weight : 0);
 442
 443	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
 444		pr_notice("qfq: total weight out of range (%d + %u)\n",
 445			  delta_w, q->wsum);
 446		return -EINVAL;
 447	}
 448
 449	if (cl != NULL) { /* modify existing class */
 450		if (tca[TCA_RATE]) {
 451			err = gen_replace_estimator(&cl->bstats, NULL,
 452						    &cl->rate_est,
 453						    NULL,
 454						    true,
 455						    tca[TCA_RATE]);
 456			if (err)
 457				return err;
 458		}
 459		existing = true;
 460		goto set_change_agg;
 461	}
 462
 463	/* create and init new class */
 464	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
 465	if (cl == NULL)
 466		return -ENOBUFS;
 467
 468	gnet_stats_basic_sync_init(&cl->bstats);
 469	cl->common.classid = classid;
 470	cl->deficit = lmax;
 471
 472	cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
 473				      classid, NULL);
 474	if (cl->qdisc == NULL)
 475		cl->qdisc = &noop_qdisc;
 476
 477	if (tca[TCA_RATE]) {
 478		err = gen_new_estimator(&cl->bstats, NULL,
 479					&cl->rate_est,
 480					NULL,
 481					true,
 482					tca[TCA_RATE]);
 483		if (err)
 484			goto destroy_class;
 485	}
 486
 487	if (cl->qdisc != &noop_qdisc)
 488		qdisc_hash_add(cl->qdisc, true);
 
 
 
 489
 490set_change_agg:
 491	sch_tree_lock(sch);
 492	new_agg = qfq_find_agg(q, lmax, weight);
 493	if (new_agg == NULL) { /* create new aggregate */
 494		sch_tree_unlock(sch);
 495		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
 496		if (new_agg == NULL) {
 497			err = -ENOBUFS;
 498			gen_kill_estimator(&cl->rate_est);
 499			goto destroy_class;
 500		}
 501		sch_tree_lock(sch);
 502		qfq_init_agg(q, new_agg, lmax, weight);
 503	}
 504	if (existing)
 505		qfq_deact_rm_from_agg(q, cl);
 506	else
 507		qdisc_class_hash_insert(&q->clhash, &cl->common);
 508	qfq_add_to_agg(q, new_agg, cl);
 509	sch_tree_unlock(sch);
 510	qdisc_class_hash_grow(sch, &q->clhash);
 511
 512	*arg = (unsigned long)cl;
 513	return 0;
 514
 515destroy_class:
 516	qdisc_put(cl->qdisc);
 517	kfree(cl);
 518	return err;
 519}
 520
 521static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 522{
 523	struct qfq_sched *q = qdisc_priv(sch);
 524
 525	qfq_rm_from_agg(q, cl);
 526	gen_kill_estimator(&cl->rate_est);
 527	qdisc_put(cl->qdisc);
 528	kfree(cl);
 529}
 530
 531static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
 532			    struct netlink_ext_ack *extack)
 533{
 534	struct qfq_sched *q = qdisc_priv(sch);
 535	struct qfq_class *cl = (struct qfq_class *)arg;
 536
 537	if (cl->filter_cnt > 0)
 538		return -EBUSY;
 539
 540	sch_tree_lock(sch);
 541
 542	qdisc_purge_queue(cl->qdisc);
 543	qdisc_class_hash_remove(&q->clhash, &cl->common);
 544
 
 
 
 
 
 
 545	sch_tree_unlock(sch);
 
 
 
 
 
 
 546
 547	qfq_destroy_class(sch, cl);
 548	return 0;
 
 
 549}
 550
 551static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
 552{
 553	return (unsigned long)qfq_find_class(sch, classid);
 
 
 
 554}
 555
 556static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl,
 557				       struct netlink_ext_ack *extack)
 558{
 559	struct qfq_sched *q = qdisc_priv(sch);
 560
 561	if (cl)
 562		return NULL;
 563
 564	return q->block;
 565}
 566
 567static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
 568				  u32 classid)
 569{
 570	struct qfq_class *cl = qfq_find_class(sch, classid);
 571
 572	if (cl != NULL)
 573		cl->filter_cnt++;
 574
 575	return (unsigned long)cl;
 576}
 577
 578static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
 579{
 580	struct qfq_class *cl = (struct qfq_class *)arg;
 581
 582	cl->filter_cnt--;
 583}
 584
 585static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
 586			   struct Qdisc *new, struct Qdisc **old,
 587			   struct netlink_ext_ack *extack)
 588{
 589	struct qfq_class *cl = (struct qfq_class *)arg;
 590
 591	if (new == NULL) {
 592		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
 593					cl->common.classid, NULL);
 594		if (new == NULL)
 595			new = &noop_qdisc;
 596	}
 597
 598	*old = qdisc_replace(sch, new, &cl->qdisc);
 
 
 
 
 599	return 0;
 600}
 601
 602static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
 603{
 604	struct qfq_class *cl = (struct qfq_class *)arg;
 605
 606	return cl->qdisc;
 607}
 608
 609static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
 610			  struct sk_buff *skb, struct tcmsg *tcm)
 611{
 612	struct qfq_class *cl = (struct qfq_class *)arg;
 613	struct nlattr *nest;
 614
 615	tcm->tcm_parent	= TC_H_ROOT;
 616	tcm->tcm_handle	= cl->common.classid;
 617	tcm->tcm_info	= cl->qdisc->handle;
 618
 619	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
 620	if (nest == NULL)
 621		goto nla_put_failure;
 622	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
 623	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
 624		goto nla_put_failure;
 625	return nla_nest_end(skb, nest);
 626
 627nla_put_failure:
 628	nla_nest_cancel(skb, nest);
 629	return -EMSGSIZE;
 630}
 631
 632static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 633				struct gnet_dump *d)
 634{
 635	struct qfq_class *cl = (struct qfq_class *)arg;
 636	struct tc_qfq_stats xstats;
 637
 638	memset(&xstats, 0, sizeof(xstats));
 
 639
 640	xstats.weight = cl->agg->class_weight;
 641	xstats.lmax = cl->agg->lmax;
 642
 643	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
 644	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 645	    qdisc_qstats_copy(d, cl->qdisc) < 0)
 646		return -1;
 647
 648	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 649}
 650
 651static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 652{
 653	struct qfq_sched *q = qdisc_priv(sch);
 654	struct qfq_class *cl;
 655	unsigned int i;
 656
 657	if (arg->stop)
 658		return;
 659
 660	for (i = 0; i < q->clhash.hashsize; i++) {
 661		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 662			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
 
 
 
 
 
 663				return;
 
 
 664		}
 665	}
 666}
 667
 668static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 669				      int *qerr)
 670{
 671	struct qfq_sched *q = qdisc_priv(sch);
 672	struct qfq_class *cl;
 673	struct tcf_result res;
 674	struct tcf_proto *fl;
 675	int result;
 676
 677	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
 678		pr_debug("qfq_classify: found %d\n", skb->priority);
 679		cl = qfq_find_class(sch, skb->priority);
 680		if (cl != NULL)
 681			return cl;
 682	}
 683
 684	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 685	fl = rcu_dereference_bh(q->filter_list);
 686	result = tcf_classify(skb, NULL, fl, &res, false);
 687	if (result >= 0) {
 688#ifdef CONFIG_NET_CLS_ACT
 689		switch (result) {
 690		case TC_ACT_QUEUED:
 691		case TC_ACT_STOLEN:
 692		case TC_ACT_TRAP:
 693			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 694			fallthrough;
 695		case TC_ACT_SHOT:
 696			return NULL;
 697		}
 698#endif
 699		cl = (struct qfq_class *)res.class;
 700		if (cl == NULL)
 701			cl = qfq_find_class(sch, res.classid);
 702		return cl;
 703	}
 704
 705	return NULL;
 706}
 707
 708/* Generic comparison function, handling wraparound. */
 709static inline int qfq_gt(u64 a, u64 b)
 710{
 711	return (s64)(a - b) > 0;
 712}
 713
 714/* Round a precise timestamp to its slotted value. */
 715static inline u64 qfq_round_down(u64 ts, unsigned int shift)
 716{
 717	return ts & ~((1ULL << shift) - 1);
 718}
 719
 720/* return the pointer to the group with lowest index in the bitmap */
 721static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
 722					unsigned long bitmap)
 723{
 724	int index = __ffs(bitmap);
 725	return &q->groups[index];
 726}
 727/* Calculate a mask to mimic what would be ffs_from(). */
 728static inline unsigned long mask_from(unsigned long bitmap, int from)
 729{
 730	return bitmap & ~((1UL << from) - 1);
 731}
 732
 733/*
 734 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 735 * First compute eligibility comparing grp->S, q->V,
 736 * then check if someone is blocking us and possibly add EB
 737 */
 738static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
 739{
 740	/* if S > V we are not eligible */
 741	unsigned int state = qfq_gt(grp->S, q->V);
 742	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
 743	struct qfq_group *next;
 744
 745	if (mask) {
 746		next = qfq_ffs(q, mask);
 747		if (qfq_gt(grp->F, next->F))
 748			state |= EB;
 749	}
 750
 751	return state;
 752}
 753
 754
 755/*
 756 * In principle
 757 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 758 *	q->bitmaps[src] &= ~mask;
 759 * but we should make sure that src != dst
 760 */
 761static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
 762				   int src, int dst)
 763{
 764	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 765	q->bitmaps[src] &= ~mask;
 766}
 767
 768static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
 769{
 770	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
 771	struct qfq_group *next;
 772
 773	if (mask) {
 774		next = qfq_ffs(q, mask);
 775		if (!qfq_gt(next->F, old_F))
 776			return;
 777	}
 778
 779	mask = (1UL << index) - 1;
 780	qfq_move_groups(q, mask, EB, ER);
 781	qfq_move_groups(q, mask, IB, IR);
 782}
 783
 784/*
 785 * perhaps
 786 *
 787	old_V ^= q->V;
 788	old_V >>= q->min_slot_shift;
 789	if (old_V) {
 790		...
 791	}
 792 *
 793 */
 794static void qfq_make_eligible(struct qfq_sched *q)
 795{
 796	unsigned long vslot = q->V >> q->min_slot_shift;
 797	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
 798
 799	if (vslot != old_vslot) {
 800		unsigned long mask;
 801		int last_flip_pos = fls(vslot ^ old_vslot);
 802
 803		if (last_flip_pos > 31) /* higher than the number of groups */
 804			mask = ~0UL;    /* make all groups eligible */
 805		else
 806			mask = (1UL << last_flip_pos) - 1;
 807
 808		qfq_move_groups(q, mask, IR, ER);
 809		qfq_move_groups(q, mask, IB, EB);
 810	}
 811}
 812
 813/*
 814 * The index of the slot in which the input aggregate agg is to be
 815 * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
 816 * and not a '-1' because the start time of the group may be moved
 817 * backward by one slot after the aggregate has been inserted, and
 818 * this would cause non-empty slots to be right-shifted by one
 819 * position.
 820 *
 821 * QFQ+ fully satisfies this bound to the slot index if the parameters
 822 * of the classes are not changed dynamically, and if QFQ+ never
 823 * happens to postpone the service of agg unjustly, i.e., it never
 824 * happens that the aggregate becomes backlogged and eligible, or just
 825 * eligible, while an aggregate with a higher approximated finish time
 826 * is being served. In particular, in this case QFQ+ guarantees that
 827 * the timestamps of agg are low enough that the slot index is never
 828 * higher than 2. Unfortunately, QFQ+ cannot provide the same
 829 * guarantee if it happens to unjustly postpone the service of agg, or
 830 * if the parameters of some class are changed.
 831 *
 832 * As for the first event, i.e., an out-of-order service, the
 833 * upper bound to the slot index guaranteed by QFQ+ grows to
 834 * 2 +
 835 * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
 836 * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
 837 *
 838 * The following function deals with this problem by backward-shifting
 839 * the timestamps of agg, if needed, so as to guarantee that the slot
 840 * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
 841 * cause the service of other aggregates to be postponed, yet the
 842 * worst-case guarantees of these aggregates are not violated.  In
 843 * fact, in case of no out-of-order service, the timestamps of agg
 844 * would have been even lower than they are after the backward shift,
 845 * because QFQ+ would have guaranteed a maximum value equal to 2 for
 846 * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
 847 * service is postponed because of the backward-shift would have
 848 * however waited for the service of agg before being served.
 849 *
 850 * The other event that may cause the slot index to be higher than 2
 851 * for agg is a recent change of the parameters of some class. If the
 852 * weight of a class is increased or the lmax (max_pkt_size) of the
 853 * class is decreased, then a new aggregate with smaller slot size
 854 * than the original parent aggregate of the class may happen to be
 855 * activated. The activation of this aggregate should be properly
 856 * delayed to when the service of the class has finished in the ideal
 857 * system tracked by QFQ+. If the activation of the aggregate is not
 858 * delayed to this reference time instant, then this aggregate may be
 859 * unjustly served before other aggregates waiting for service. This
 860 * may cause the above bound to the slot index to be violated for some
 861 * of these unlucky aggregates.
 862 *
 863 * Instead of delaying the activation of the new aggregate, which is
 864 * quite complex, the above-discussed capping of the slot index is
 865 * used to handle also the consequences of a change of the parameters
 866 * of a class.
 867 */
 868static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
 869			    u64 roundedS)
 870{
 871	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
 872	unsigned int i; /* slot index in the bucket list */
 873
 874	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
 875		u64 deltaS = roundedS - grp->S -
 876			((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
 877		agg->S -= deltaS;
 878		agg->F -= deltaS;
 879		slot = QFQ_MAX_SLOTS - 2;
 880	}
 881
 882	i = (grp->front + slot) % QFQ_MAX_SLOTS;
 883
 884	hlist_add_head(&agg->next, &grp->slots[i]);
 885	__set_bit(slot, &grp->full_slots);
 886}
 887
 888/* Maybe introduce hlist_first_entry?? */
 889static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
 890{
 891	return hlist_entry(grp->slots[grp->front].first,
 892			   struct qfq_aggregate, next);
 893}
 894
 895/*
 896 * remove the entry from the slot
 897 */
 898static void qfq_front_slot_remove(struct qfq_group *grp)
 899{
 900	struct qfq_aggregate *agg = qfq_slot_head(grp);
 901
 902	BUG_ON(!agg);
 903	hlist_del(&agg->next);
 904	if (hlist_empty(&grp->slots[grp->front]))
 905		__clear_bit(0, &grp->full_slots);
 906}
 907
 908/*
 909 * Returns the first aggregate in the first non-empty bucket of the
 910 * group. As a side effect, adjusts the bucket list so the first
 911 * non-empty bucket is at position 0 in full_slots.
 912 */
 913static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
 914{
 915	unsigned int i;
 916
 917	pr_debug("qfq slot_scan: grp %u full %#lx\n",
 918		 grp->index, grp->full_slots);
 919
 920	if (grp->full_slots == 0)
 921		return NULL;
 922
 923	i = __ffs(grp->full_slots);  /* zero based */
 924	if (i > 0) {
 925		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
 926		grp->full_slots >>= i;
 927	}
 928
 929	return qfq_slot_head(grp);
 930}
 931
 932/*
 933 * adjust the bucket list. When the start time of a group decreases,
 934 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 935 * move the objects. The mask of occupied slots must be shifted
 936 * because we use ffs() to find the first non-empty slot.
 937 * This covers decreases in the group's start time, but what about
 938 * increases of the start time ?
 939 * Here too we should make sure that i is less than 32
 940 */
 941static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
 942{
 943	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
 944
 945	grp->full_slots <<= i;
 946	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
 947}
 948
 949static void qfq_update_eligible(struct qfq_sched *q)
 950{
 951	struct qfq_group *grp;
 952	unsigned long ineligible;
 953
 954	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
 955	if (ineligible) {
 956		if (!q->bitmaps[ER]) {
 957			grp = qfq_ffs(q, ineligible);
 958			if (qfq_gt(grp->S, q->V))
 959				q->V = grp->S;
 960		}
 961		qfq_make_eligible(q);
 962	}
 963}
 964
 965/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
 966static void agg_dequeue(struct qfq_aggregate *agg,
 967			struct qfq_class *cl, unsigned int len)
 968{
 969	qdisc_dequeue_peeked(cl->qdisc);
 970
 971	cl->deficit -= (int) len;
 972
 973	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
 974		list_del(&cl->alist);
 975	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
 976		cl->deficit += agg->lmax;
 977		list_move_tail(&cl->alist, &agg->active);
 978	}
 979}
 980
 981static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
 982					   struct qfq_class **cl,
 983					   unsigned int *len)
 984{
 985	struct sk_buff *skb;
 986
 987	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
 988	skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
 989	if (skb == NULL)
 990		WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
 991	else
 992		*len = qdisc_pkt_len(skb);
 993
 994	return skb;
 995}
 996
 997/* Update F according to the actual service received by the aggregate. */
 998static inline void charge_actual_service(struct qfq_aggregate *agg)
 999{
1000	/* Compute the service received by the aggregate, taking into
1001	 * account that, after decreasing the number of classes in
1002	 * agg, it may happen that
1003	 * agg->initial_budget - agg->budget > agg->bugdetmax
1004	 */
1005	u32 service_received = min(agg->budgetmax,
1006				   agg->initial_budget - agg->budget);
1007
1008	agg->F = agg->S + (u64)service_received * agg->inv_w;
1009}
1010
1011/* Assign a reasonable start time for a new aggregate in group i.
1012 * Admissible values for \hat(F) are multiples of \sigma_i
1013 * no greater than V+\sigma_i . Larger values mean that
1014 * we had a wraparound so we consider the timestamp to be stale.
1015 *
1016 * If F is not stale and F >= V then we set S = F.
1017 * Otherwise we should assign S = V, but this may violate
1018 * the ordering in EB (see [2]). So, if we have groups in ER,
1019 * set S to the F_j of the first group j which would be blocking us.
1020 * We are guaranteed not to move S backward because
1021 * otherwise our group i would still be blocked.
1022 */
1023static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
1024{
1025	unsigned long mask;
1026	u64 limit, roundedF;
1027	int slot_shift = agg->grp->slot_shift;
1028
1029	roundedF = qfq_round_down(agg->F, slot_shift);
1030	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
1031
1032	if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
1033		/* timestamp was stale */
1034		mask = mask_from(q->bitmaps[ER], agg->grp->index);
1035		if (mask) {
1036			struct qfq_group *next = qfq_ffs(q, mask);
1037			if (qfq_gt(roundedF, next->F)) {
1038				if (qfq_gt(limit, next->F))
1039					agg->S = next->F;
1040				else /* preserve timestamp correctness */
1041					agg->S = limit;
1042				return;
1043			}
1044		}
1045		agg->S = q->V;
1046	} else  /* timestamp is not stale */
1047		agg->S = agg->F;
1048}
1049
1050/* Update the timestamps of agg before scheduling/rescheduling it for
1051 * service.  In particular, assign to agg->F its maximum possible
1052 * value, i.e., the virtual finish time with which the aggregate
1053 * should be labeled if it used all its budget once in service.
1054 */
1055static inline void
1056qfq_update_agg_ts(struct qfq_sched *q,
1057		    struct qfq_aggregate *agg, enum update_reason reason)
1058{
1059	if (reason != requeue)
1060		qfq_update_start(q, agg);
1061	else /* just charge agg for the service received */
1062		agg->S = agg->F;
1063
1064	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
1065}
1066
1067static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
1068
1069static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
1070{
1071	struct qfq_sched *q = qdisc_priv(sch);
1072	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
1073	struct qfq_class *cl;
1074	struct sk_buff *skb = NULL;
1075	/* next-packet len, 0 means no more active classes in in-service agg */
1076	unsigned int len = 0;
1077
1078	if (in_serv_agg == NULL)
1079		return NULL;
1080
1081	if (!list_empty(&in_serv_agg->active))
1082		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
1083
1084	/*
1085	 * If there are no active classes in the in-service aggregate,
1086	 * or if the aggregate has not enough budget to serve its next
1087	 * class, then choose the next aggregate to serve.
1088	 */
1089	if (len == 0 || in_serv_agg->budget < len) {
1090		charge_actual_service(in_serv_agg);
1091
1092		/* recharge the budget of the aggregate */
1093		in_serv_agg->initial_budget = in_serv_agg->budget =
1094			in_serv_agg->budgetmax;
1095
1096		if (!list_empty(&in_serv_agg->active)) {
1097			/*
1098			 * Still active: reschedule for
1099			 * service. Possible optimization: if no other
1100			 * aggregate is active, then there is no point
1101			 * in rescheduling this aggregate, and we can
1102			 * just keep it as the in-service one. This
1103			 * should be however a corner case, and to
1104			 * handle it, we would need to maintain an
1105			 * extra num_active_aggs field.
1106			*/
1107			qfq_update_agg_ts(q, in_serv_agg, requeue);
1108			qfq_schedule_agg(q, in_serv_agg);
1109		} else if (sch->q.qlen == 0) { /* no aggregate to serve */
1110			q->in_serv_agg = NULL;
1111			return NULL;
1112		}
1113
1114		/*
1115		 * If we get here, there are other aggregates queued:
1116		 * choose the new aggregate to serve.
1117		 */
1118		in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
1119		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
1120	}
1121	if (!skb)
1122		return NULL;
1123
1124	qdisc_qstats_backlog_dec(sch, skb);
1125	sch->q.qlen--;
1126	qdisc_bstats_update(sch, skb);
1127
1128	agg_dequeue(in_serv_agg, cl, len);
1129	/* If lmax is lowered, through qfq_change_class, for a class
1130	 * owning pending packets with larger size than the new value
1131	 * of lmax, then the following condition may hold.
1132	 */
1133	if (unlikely(in_serv_agg->budget < len))
1134		in_serv_agg->budget = 0;
1135	else
1136		in_serv_agg->budget -= len;
1137
1138	q->V += (u64)len * q->iwsum;
1139	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
1140		 len, (unsigned long long) in_serv_agg->F,
1141		 (unsigned long long) q->V);
1142
1143	return skb;
1144}
1145
1146static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
1147{
1148	struct qfq_group *grp;
1149	struct qfq_aggregate *agg, *new_front_agg;
1150	u64 old_F;
1151
1152	qfq_update_eligible(q);
1153	q->oldV = q->V;
1154
1155	if (!q->bitmaps[ER])
1156		return NULL;
1157
1158	grp = qfq_ffs(q, q->bitmaps[ER]);
1159	old_F = grp->F;
1160
1161	agg = qfq_slot_head(grp);
1162
1163	/* agg starts to be served, remove it from schedule */
1164	qfq_front_slot_remove(grp);
1165
1166	new_front_agg = qfq_slot_scan(grp);
1167
1168	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
1169		__clear_bit(grp->index, &q->bitmaps[ER]);
1170	else {
1171		u64 roundedS = qfq_round_down(new_front_agg->S,
1172					      grp->slot_shift);
1173		unsigned int s;
1174
1175		if (grp->S == roundedS)
1176			return agg;
1177		grp->S = roundedS;
1178		grp->F = roundedS + (2ULL << grp->slot_shift);
1179		__clear_bit(grp->index, &q->bitmaps[ER]);
1180		s = qfq_calc_state(q, grp);
1181		__set_bit(grp->index, &q->bitmaps[s]);
1182	}
1183
1184	qfq_unblock_groups(q, grp->index, old_F);
1185
1186	return agg;
1187}
1188
1189static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1190		       struct sk_buff **to_free)
1191{
1192	unsigned int len = qdisc_pkt_len(skb), gso_segs;
1193	struct qfq_sched *q = qdisc_priv(sch);
1194	struct qfq_class *cl;
1195	struct qfq_aggregate *agg;
1196	int err = 0;
1197	bool first;
1198
1199	cl = qfq_classify(skb, sch, &err);
1200	if (cl == NULL) {
1201		if (err & __NET_XMIT_BYPASS)
1202			qdisc_qstats_drop(sch);
1203		__qdisc_drop(skb, to_free);
1204		return err;
1205	}
1206	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
1207
1208	if (unlikely(cl->agg->lmax < len)) {
1209		pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
1210			 cl->agg->lmax, len, cl->common.classid);
1211		err = qfq_change_agg(sch, cl, cl->agg->class_weight, len);
1212		if (err) {
1213			cl->qstats.drops++;
1214			return qdisc_drop(skb, sch, to_free);
1215		}
1216	}
1217
1218	gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
1219	first = !cl->qdisc->q.qlen;
1220	err = qdisc_enqueue(skb, cl->qdisc, to_free);
1221	if (unlikely(err != NET_XMIT_SUCCESS)) {
1222		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
1223		if (net_xmit_drop_count(err)) {
1224			cl->qstats.drops++;
1225			qdisc_qstats_drop(sch);
1226		}
1227		return err;
1228	}
1229
1230	_bstats_update(&cl->bstats, len, gso_segs);
1231	sch->qstats.backlog += len;
1232	++sch->q.qlen;
1233
1234	agg = cl->agg;
1235	/* if the queue was not empty, then done here */
1236	if (!first) {
1237		if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
1238		    list_first_entry(&agg->active, struct qfq_class, alist)
1239		    == cl && cl->deficit < len)
1240			list_move_tail(&cl->alist, &agg->active);
1241
1242		return err;
1243	}
1244
1245	/* schedule class for service within the aggregate */
1246	cl->deficit = agg->lmax;
1247	list_add_tail(&cl->alist, &agg->active);
1248
1249	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
1250	    q->in_serv_agg == agg)
1251		return err; /* non-empty or in service, nothing else to do */
1252
1253	qfq_activate_agg(q, agg, enqueue);
1254
1255	return err;
1256}
1257
1258/*
1259 * Schedule aggregate according to its timestamps.
1260 */
1261static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
1262{
1263	struct qfq_group *grp = agg->grp;
1264	u64 roundedS;
1265	int s;
1266
1267	roundedS = qfq_round_down(agg->S, grp->slot_shift);
1268
1269	/*
1270	 * Insert agg in the correct bucket.
1271	 * If agg->S >= grp->S we don't need to adjust the
1272	 * bucket list and simply go to the insertion phase.
1273	 * Otherwise grp->S is decreasing, we must make room
1274	 * in the bucket list, and also recompute the group state.
1275	 * Finally, if there were no flows in this group and nobody
1276	 * was in ER make sure to adjust V.
1277	 */
1278	if (grp->full_slots) {
1279		if (!qfq_gt(grp->S, agg->S))
1280			goto skip_update;
1281
1282		/* create a slot for this agg->S */
1283		qfq_slot_rotate(grp, roundedS);
1284		/* group was surely ineligible, remove */
1285		__clear_bit(grp->index, &q->bitmaps[IR]);
1286		__clear_bit(grp->index, &q->bitmaps[IB]);
1287	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
1288		   q->in_serv_agg == NULL)
1289		q->V = roundedS;
1290
1291	grp->S = roundedS;
1292	grp->F = roundedS + (2ULL << grp->slot_shift);
1293	s = qfq_calc_state(q, grp);
1294	__set_bit(grp->index, &q->bitmaps[s]);
1295
1296	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
1297		 s, q->bitmaps[s],
1298		 (unsigned long long) agg->S,
1299		 (unsigned long long) agg->F,
1300		 (unsigned long long) q->V);
1301
1302skip_update:
1303	qfq_slot_insert(grp, agg, roundedS);
1304}
1305
1306
1307/* Update agg ts and schedule agg for service */
1308static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
1309			     enum update_reason reason)
1310{
1311	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
1312
1313	qfq_update_agg_ts(q, agg, reason);
1314	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
1315		q->in_serv_agg = agg; /* start serving this aggregate */
1316		 /* update V: to be in service, agg must be eligible */
1317		q->oldV = q->V = agg->S;
1318	} else if (agg != q->in_serv_agg)
1319		qfq_schedule_agg(q, agg);
1320}
1321
1322static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
1323			    struct qfq_aggregate *agg)
1324{
1325	unsigned int i, offset;
1326	u64 roundedS;
1327
1328	roundedS = qfq_round_down(agg->S, grp->slot_shift);
1329	offset = (roundedS - grp->S) >> grp->slot_shift;
1330
1331	i = (grp->front + offset) % QFQ_MAX_SLOTS;
1332
1333	hlist_del(&agg->next);
1334	if (hlist_empty(&grp->slots[i]))
1335		__clear_bit(offset, &grp->full_slots);
1336}
1337
1338/*
1339 * Called to forcibly deschedule an aggregate.  If the aggregate is
1340 * not in the front bucket, or if the latter has other aggregates in
1341 * the front bucket, we can simply remove the aggregate with no other
1342 * side effects.
1343 * Otherwise we must propagate the event up.
1344 */
1345static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
1346{
1347	struct qfq_group *grp = agg->grp;
1348	unsigned long mask;
1349	u64 roundedS;
1350	int s;
1351
1352	if (agg == q->in_serv_agg) {
1353		charge_actual_service(agg);
1354		q->in_serv_agg = qfq_choose_next_agg(q);
1355		return;
1356	}
1357
1358	agg->F = agg->S;
1359	qfq_slot_remove(q, grp, agg);
1360
1361	if (!grp->full_slots) {
1362		__clear_bit(grp->index, &q->bitmaps[IR]);
1363		__clear_bit(grp->index, &q->bitmaps[EB]);
1364		__clear_bit(grp->index, &q->bitmaps[IB]);
1365
1366		if (test_bit(grp->index, &q->bitmaps[ER]) &&
1367		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
1368			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
1369			if (mask)
1370				mask = ~((1UL << __fls(mask)) - 1);
1371			else
1372				mask = ~0UL;
1373			qfq_move_groups(q, mask, EB, ER);
1374			qfq_move_groups(q, mask, IB, IR);
1375		}
1376		__clear_bit(grp->index, &q->bitmaps[ER]);
1377	} else if (hlist_empty(&grp->slots[grp->front])) {
1378		agg = qfq_slot_scan(grp);
1379		roundedS = qfq_round_down(agg->S, grp->slot_shift);
1380		if (grp->S != roundedS) {
1381			__clear_bit(grp->index, &q->bitmaps[ER]);
1382			__clear_bit(grp->index, &q->bitmaps[IR]);
1383			__clear_bit(grp->index, &q->bitmaps[EB]);
1384			__clear_bit(grp->index, &q->bitmaps[IB]);
1385			grp->S = roundedS;
1386			grp->F = roundedS + (2ULL << grp->slot_shift);
1387			s = qfq_calc_state(q, grp);
1388			__set_bit(grp->index, &q->bitmaps[s]);
1389		}
1390	}
1391}
1392
1393static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1394{
1395	struct qfq_sched *q = qdisc_priv(sch);
1396	struct qfq_class *cl = (struct qfq_class *)arg;
1397
1398	qfq_deactivate_class(q, cl);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1399}
1400
1401static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
1402			  struct netlink_ext_ack *extack)
1403{
1404	struct qfq_sched *q = qdisc_priv(sch);
1405	struct qfq_group *grp;
1406	int i, j, err;
1407	u32 max_cl_shift, maxbudg_shift, max_classes;
1408
1409	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1410	if (err)
1411		return err;
1412
1413	err = qdisc_class_hash_init(&q->clhash);
1414	if (err < 0)
1415		return err;
1416
1417	max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1,
1418			    QFQ_MAX_AGG_CLASSES);
 
 
1419	/* max_cl_shift = floor(log_2(max_classes)) */
1420	max_cl_shift = __fls(max_classes);
1421	q->max_agg_classes = 1<<max_cl_shift;
1422
1423	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
1424	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
1425	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
1426
1427	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
1428		grp = &q->groups[i];
1429		grp->index = i;
1430		grp->slot_shift = q->min_slot_shift + i;
1431		for (j = 0; j < QFQ_MAX_SLOTS; j++)
1432			INIT_HLIST_HEAD(&grp->slots[j]);
1433	}
1434
1435	INIT_HLIST_HEAD(&q->nonfull_aggs);
1436
1437	return 0;
1438}
1439
1440static void qfq_reset_qdisc(struct Qdisc *sch)
1441{
1442	struct qfq_sched *q = qdisc_priv(sch);
1443	struct qfq_class *cl;
1444	unsigned int i;
1445
1446	for (i = 0; i < q->clhash.hashsize; i++) {
1447		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1448			if (cl->qdisc->q.qlen > 0)
1449				qfq_deactivate_class(q, cl);
1450
1451			qdisc_reset(cl->qdisc);
1452		}
1453	}
 
1454}
1455
1456static void qfq_destroy_qdisc(struct Qdisc *sch)
1457{
1458	struct qfq_sched *q = qdisc_priv(sch);
1459	struct qfq_class *cl;
1460	struct hlist_node *next;
1461	unsigned int i;
1462
1463	tcf_block_put(q->block);
1464
1465	for (i = 0; i < q->clhash.hashsize; i++) {
1466		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1467					  common.hnode) {
1468			qfq_destroy_class(sch, cl);
1469		}
1470	}
1471	qdisc_class_hash_destroy(&q->clhash);
1472}
1473
1474static const struct Qdisc_class_ops qfq_class_ops = {
1475	.change		= qfq_change_class,
1476	.delete		= qfq_delete_class,
1477	.find		= qfq_search_class,
1478	.tcf_block	= qfq_tcf_block,
 
1479	.bind_tcf	= qfq_bind_tcf,
1480	.unbind_tcf	= qfq_unbind_tcf,
1481	.graft		= qfq_graft_class,
1482	.leaf		= qfq_class_leaf,
1483	.qlen_notify	= qfq_qlen_notify,
1484	.dump		= qfq_dump_class,
1485	.dump_stats	= qfq_dump_class_stats,
1486	.walk		= qfq_walk,
1487};
1488
1489static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
1490	.cl_ops		= &qfq_class_ops,
1491	.id		= "qfq",
1492	.priv_size	= sizeof(struct qfq_sched),
1493	.enqueue	= qfq_enqueue,
1494	.dequeue	= qfq_dequeue,
1495	.peek		= qdisc_peek_dequeued,
 
1496	.init		= qfq_init_qdisc,
1497	.reset		= qfq_reset_qdisc,
1498	.destroy	= qfq_destroy_qdisc,
1499	.owner		= THIS_MODULE,
1500};
1501
1502static int __init qfq_init(void)
1503{
1504	return register_qdisc(&qfq_qdisc_ops);
1505}
1506
1507static void __exit qfq_exit(void)
1508{
1509	unregister_qdisc(&qfq_qdisc_ops);
1510}
1511
1512module_init(qfq_init);
1513module_exit(qfq_exit);
1514MODULE_LICENSE("GPL");