   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * IO cost model based controller.
   4 *
   5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7 * Copyright (C) 2019 Facebook
   8 *
   9 * One challenge of controlling IO resources is the lack of trivially
  10 * observable cost metric.  This is distinguished from CPU and memory where
  11 * wallclock time and the number of bytes can serve as accurate enough
  12 * approximations.
  13 *
  14 * Bandwidth and iops are the most commonly used metrics for IO devices but
  15 * depending on the type and specifics of the device, different IO patterns
  16 * easily lead to multiple orders of magnitude variations rendering them
  17 * useless for the purpose of IO capacity distribution.  While on-device
   18 * time, with a lot of crutches, could serve as a useful approximation for
  19 * non-queued rotational devices, this is no longer viable with modern
  20 * devices, even the rotational ones.
  21 *
  22 * While there is no cost metric we can trivially observe, it isn't a
  23 * complete mystery.  For example, on a rotational device, seek cost
  24 * dominates while a contiguous transfer contributes a smaller amount
  25 * proportional to the size.  If we can characterize at least the relative
  26 * costs of these different types of IOs, it should be possible to
  27 * implement a reasonable work-conserving proportional IO resource
  28 * distribution.
  29 *
  30 * 1. IO Cost Model
  31 *
  32 * IO cost model estimates the cost of an IO given its basic parameters and
  33 * history (e.g. the end sector of the last IO).  The cost is measured in
  34 * device time.  If a given IO is estimated to cost 10ms, the device should
  35 * be able to process ~100 of those IOs in a second.
  36 *
  37 * Currently, there's only one builtin cost model - linear.  Each IO is
  38 * classified as sequential or random and given a base cost accordingly.
  39 * On top of that, a size cost proportional to the length of the IO is
  40 * added.  While simple, this model captures the operational
   41 * characteristics of a wide variety of devices well enough.  Default
  42 * parameters for several different classes of devices are provided and the
  43 * parameters can be configured from userspace via
  44 * /sys/fs/cgroup/io.cost.model.
  45 *
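 * As a rough, illustrative sketch of the linear model (the numbers are
 * made up and not taken from any real device): with a random-IO base
 * cost of 1ms and a per-4k-page cost of 10us, a 64k random read would be
 * estimated at
 *
 *   1000us + 16 pages * 10us = 1160us
 *
 * of device time.  See calc_lcoefs() below for how the per-page and
 * per-IO coefficients are derived from the configured bps/iops values.
 *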
  46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47 * device-specific coefficients.
  48 *
  49 * 2. Control Strategy
  50 *
  51 * The device virtual time (vtime) is used as the primary control metric.
  52 * The control strategy is composed of the following three parts.
  53 *
  54 * 2-1. Vtime Distribution
  55 *
  56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57 * calculated.  Please consider the following hierarchy where the numbers
  58 * inside parentheses denote the configured weights.
  59 *
  60 *           root
  61 *         /       \
  62 *      A (w:100)  B (w:300)
  63 *      /       \
  64 *  A0 (w:100)  A1 (w:100)
  65 *
  66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67 * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
   68 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69 * 12.5% each.  The distribution mechanism only cares about these flattened
  70 * shares.  They're called hweights (hierarchical weights) and always add
   71 * up to 1 (WEIGHT_ONE).
  72 *
  73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75 * against the device vtime - an IO which takes 10ms on the underlying
  76 * device is considered to take 80ms on A0.
  77 *
  78 * This constitutes the basis of IO capacity distribution.  Each cgroup's
  79 * vtime is running at a rate determined by its hweight.  A cgroup tracks
  80 * the vtime consumed by past IOs and can issue a new IO if doing so
  81 * wouldn't outrun the current device vtime.  Otherwise, the IO is
  82 * suspended until the vtime has progressed enough to cover it.
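 *
 * In rough pseudo-code, a simplified sketch of the issue-side check (the
 * actual issue path also deals with debt, delay and waitqueues):
 *
 *   cost = abs_cost * WEIGHT_ONE / hweight_inuse;
 *   if (vtime + cost <= device vtime (vnow))
 *           issue the IO and charge cost to vtime;
 *   else
 *           wait until the device vtime catches up;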
  83 *
  84 * 2-2. Vrate Adjustment
  85 *
  86 * It's unrealistic to expect the cost model to be perfect.  There are too
  87 * many devices and even on the same device the overall performance
  88 * fluctuates depending on numerous factors such as IO mixture and device
  89 * internal garbage collection.  The controller needs to adapt dynamically.
  90 *
  91 * This is achieved by adjusting the overall IO rate according to how busy
  92 * the device is.  If the device becomes overloaded, we're sending down too
  93 * many IOs and should generally slow down.  If there are waiting issuers
  94 * but the device isn't saturated, we're issuing too few and should
  95 * generally speed up.
  96 *
  97 * To slow down, we lower the vrate - the rate at which the device vtime
  98 * passes compared to the wall clock.  For example, if the vtime is running
  99 * at the vrate of 75%, all cgroups added up would only be able to issue
 100 * 750ms worth of IOs per second, and vice-versa for speeding up.
 101 *
  102 * Device busyness is determined using two criteria - rq wait and
 103 * completion latencies.
 104 *
 105 * When a device gets saturated, the on-device and then the request queues
 106 * fill up and a bio which is ready to be issued has to wait for a request
 107 * to become available.  When this delay becomes noticeable, it's a clear
 108 * indication that the device is saturated and we lower the vrate.  This
 109 * saturation signal is fairly conservative as it only triggers when both
 110 * hardware and software queues are filled up, and is used as the default
 111 * busy signal.
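 * (The rq wait threshold used for this corresponds to RQ_WAIT_BUSY_PCT
 * defined below.)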
 112 *
 113 * As devices can have deep queues and be unfair in how the queued commands
 114 * are executed, solely depending on rq wait may not result in satisfactory
 115 * control quality.  For a better control quality, completion latency QoS
 116 * parameters can be configured so that the device is considered saturated
 117 * if N'th percentile completion latency rises above the set point.
 118 *
 119 * The completion latency requirements are a function of both the
 120 * underlying device characteristics and the desired IO latency quality of
 121 * service.  There is an inherent trade-off - the tighter the latency QoS,
  122 * the higher the bandwidth loss.  Latency QoS is disabled by default
 123 * and can be set through /sys/fs/cgroup/io.cost.qos.
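 *
 * As a purely illustrative example (see Documentation/admin-guide/
 * cgroup-v2.rst for the authoritative syntax), a QoS configuration line
 * may look something like:
 *
 *   8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 wlat=10000
 *   min=50.00 max=150.00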
 124 *
 125 * 2-3. Work Conservation
 126 *
 127 * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 128 * periodically while B is sending out enough parallel IOs to saturate the
 129 * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 130 * cost per second, i.e., 10% of the device capacity.  The naive
 131 * distribution of half and half would lead to 60% utilization of the
 132 * device, a significant reduction in the total amount of work done
 133 * compared to free-for-all competition.  This is too high a cost to pay
 134 * for IO control.
 135 *
 136 * To conserve the total amount of work done, we keep track of how much
 137 * each active cgroup is actually using and yield part of its weight if
 138 * there are other cgroups which can make use of it.  In the above case,
 139 * A's weight will be lowered so that it hovers above the actual usage and
 140 * B would be able to use the rest.
 141 *
 142 * As we don't want to penalize a cgroup for donating its weight, the
 143 * surplus weight adjustment factors in a margin and has an immediate
 144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
 145 *
  146 * Note that adjusting down surplus weights has the same effect as
  147 * accelerating vtime for other cgroups and work conservation can also be
  148 * implemented by adjusting vrate dynamically.  However, determining who
  149 * can donate and how much each should take back requires hweight
  150 * propagations anyway, which makes it easier to implement and understand
  151 * as a separate mechanism.
 152 *
 153 * 3. Monitoring
 154 *
 155 * Instead of debugfs or other clumsy monitoring mechanisms, this
 156 * controller uses a drgn based monitoring script -
 157 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 158 * https://github.com/osandov/drgn.  The output looks like the following.
 159 *
 160 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 161 *                 active      weight      hweight% inflt% dbt  delay usages%
 162 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 163 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 164 *
 165 * - per	: Timer period
 166 * - cur_per	: Internal wall and device vtime clock
 167 * - vrate	: Device virtual time rate against wall clock
 168 * - weight	: Surplus-adjusted and configured weights
 169 * - hweight	: Surplus-adjusted and configured hierarchical weights
 170 * - inflt	: The percentage of in-flight IO cost at the end of last period
 171 * - del_ms	: Deferred issuer delay induction level and duration
 172 * - usages	: Usage history
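 *
 * The script is pointed at the target block device, e.g. something along
 * the lines of "iocost_monitor.py sdb" (exact options may differ; see the
 * script's help output).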
 173 */
 174
 175#include <linux/kernel.h>
 176#include <linux/module.h>
 177#include <linux/timer.h>
 178#include <linux/time64.h>
 179#include <linux/parser.h>
 180#include <linux/sched/signal.h>
 181#include <asm/local.h>
 182#include <asm/local64.h>
 183#include "blk-rq-qos.h"
 184#include "blk-stat.h"
 185#include "blk-wbt.h"
 186#include "blk-cgroup.h"
 187
 188#ifdef CONFIG_TRACEPOINTS
 189
 190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 191#define TRACE_IOCG_PATH_LEN 1024
 192static DEFINE_SPINLOCK(trace_iocg_path_lock);
 193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 194
 195#define TRACE_IOCG_PATH(type, iocg, ...)					\
 196	do {									\
 197		unsigned long flags;						\
 198		if (trace_iocost_##type##_enabled()) {				\
 199			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
 200			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
 201				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
 202			trace_iocost_##type(iocg, trace_iocg_path,		\
 203					      ##__VA_ARGS__);			\
 204			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
 205		}								\
 206	} while (0)
 207
  208#else	/* CONFIG_TRACEPOINTS */
  209#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
  210#endif	/* CONFIG_TRACEPOINTS */
 211
 212enum {
 213	MILLION			= 1000000,
 214
 215	/* timer period is calculated from latency requirements, bound it */
 216	MIN_PERIOD		= USEC_PER_MSEC,
 217	MAX_PERIOD		= USEC_PER_SEC,
 218
 219	/*
 220	 * iocg->vtime is targeted at 50% behind the device vtime, which
 221	 * serves as its IO credit buffer.  Surplus weight adjustment is
 222	 * immediately canceled if the vtime margin runs below 10%.
 223	 */
 224	MARGIN_MIN_PCT		= 10,
 225	MARGIN_LOW_PCT		= 20,
 226	MARGIN_TARGET_PCT	= 50,
 227
 228	INUSE_ADJ_STEP_PCT	= 25,
 229
 230	/* Have some play in timer operations */
 231	TIMER_SLACK_PCT		= 1,
 232
 233	/* 1/64k is granular enough and can easily be handled w/ u32 */
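	/* e.g. a 25% hierarchical share is expressed as WEIGHT_ONE / 4 == 16384 */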
 234	WEIGHT_ONE		= 1 << 16,
 235};
 236
 237enum {
 238	/*
 239	 * As vtime is used to calculate the cost of each IO, it needs to
 240	 * be fairly high precision.  For example, it should be able to
 241	 * represent the cost of a single page worth of discard with
  242	 * sufficient accuracy.  At the same time, it should be able to
 243	 * represent reasonably long enough durations to be useful and
 244	 * convenient during operation.
 245	 *
 246	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 247	 * granularity and days of wrap-around time even at extreme vrates.
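	 * (VTIME_PER_NSEC below works out to 2^37 / 10^9 ~= 137 vtime ticks
	 * per nanosecond.)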
 248	 */
 249	VTIME_PER_SEC_SHIFT	= 37,
 250	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
 251	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
 252	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
 253
 254	/* bound vrate adjustments within two orders of magnitude */
 255	VRATE_MIN_PPM		= 10000,	/* 1% */
 256	VRATE_MAX_PPM		= 100000000,	/* 10000% */
 257
 258	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 259	VRATE_CLAMP_ADJ_PCT	= 4,
 260
 261	/* switch iff the conditions are met for longer than this */
 262	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
 263};
 264
 265enum {
 266	/* if IOs end up waiting for requests, issue less */
 267	RQ_WAIT_BUSY_PCT	= 5,
 268
  269	/* unbusy hysteresis */
 270	UNBUSY_THR_PCT		= 75,
 271
 272	/*
 273	 * The effect of delay is indirect and non-linear and a huge amount of
 274	 * future debt can accumulate abruptly while unthrottled. Linearly scale
 275	 * up delay as debt is going up and then let it decay exponentially.
 276	 * This gives us quick ramp ups while delay is accumulating and long
  277	 * tails which can help reduce the frequency of debt explosions on
 278	 * unthrottle. The parameters are experimentally determined.
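	 *
	 * As a rough worked example with the values below: an overage of
	 * 12750% of a period (halfway between MIN_DELAY_THR_PCT and
	 * MAX_DELAY_THR_PCT) maps by linear interpolation to a delay of
	 * roughly 125ms, halfway between MIN_DELAY (250us) and MAX_DELAY
	 * (250ms).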
 279	 *
 280	 * The delay mechanism provides adequate protection and behavior in many
  281	 * cases. However, it is far from ideal and falls short on both
  282	 * fronts. The debtors are often throttled too harshly, costing a
  283	 * significant amount of fairness and possibly total work, while the
  284	 * protection against their impact on the system can be choppy and
  285	 * unreliable.
 286	 *
 287	 * The shortcoming primarily stems from the fact that, unlike for page
 288	 * cache, the kernel doesn't have well-defined back-pressure propagation
 289	 * mechanism and policies for anonymous memory. Fully addressing this
 290	 * issue will likely require substantial improvements in the area.
 291	 */
 292	MIN_DELAY_THR_PCT	= 500,
 293	MAX_DELAY_THR_PCT	= 25000,
 294	MIN_DELAY		= 250,
 295	MAX_DELAY		= 250 * USEC_PER_MSEC,
 296
 297	/* halve debts if avg usage over 100ms is under 50% */
 298	DFGV_USAGE_PCT		= 50,
 299	DFGV_PERIOD		= 100 * USEC_PER_MSEC,
 300
  301	/* don't let cmds which take a very long time keep us pinned as lagging for too long */
 302	MAX_LAGGING_PERIODS	= 10,
 303
 304	/*
  305	 * Count IO size in 4k pages.  The 12-bit shift helps keep the
  306	 * size-proportional components of the cost calculation within a
  307	 * similar number of digits to the per-IO cost components.
 308	 */
 309	IOC_PAGE_SHIFT		= 12,
 310	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
 311	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
 312
 313	/* if apart further than 16M, consider randio for linear model */
 314	LCOEF_RANDIO_PAGES	= 4096,
 315};
 316
 317enum ioc_running {
 318	IOC_IDLE,
 319	IOC_RUNNING,
 320	IOC_STOP,
 321};
 322
 323/* io.cost.qos controls including per-dev enable of the whole controller */
 324enum {
 325	QOS_ENABLE,
 326	QOS_CTRL,
 327	NR_QOS_CTRL_PARAMS,
 328};
 329
 330/* io.cost.qos params */
 331enum {
 332	QOS_RPPM,
 333	QOS_RLAT,
 334	QOS_WPPM,
 335	QOS_WLAT,
 336	QOS_MIN,
 337	QOS_MAX,
 338	NR_QOS_PARAMS,
 339};
 340
 341/* io.cost.model controls */
 342enum {
 343	COST_CTRL,
 344	COST_MODEL,
 345	NR_COST_CTRL_PARAMS,
 346};
 347
 348/* builtin linear cost model coefficients */
 349enum {
 350	I_LCOEF_RBPS,
 351	I_LCOEF_RSEQIOPS,
 352	I_LCOEF_RRANDIOPS,
 353	I_LCOEF_WBPS,
 354	I_LCOEF_WSEQIOPS,
 355	I_LCOEF_WRANDIOPS,
 356	NR_I_LCOEFS,
 357};
 358
 359enum {
 360	LCOEF_RPAGE,
 361	LCOEF_RSEQIO,
 362	LCOEF_RRANDIO,
 363	LCOEF_WPAGE,
 364	LCOEF_WSEQIO,
 365	LCOEF_WRANDIO,
 366	NR_LCOEFS,
 367};
 368
 369enum {
 370	AUTOP_INVALID,
 371	AUTOP_HDD,
 372	AUTOP_SSD_QD1,
 373	AUTOP_SSD_DFL,
 374	AUTOP_SSD_FAST,
 375};
 376
 377struct ioc_params {
 378	u32				qos[NR_QOS_PARAMS];
 379	u64				i_lcoefs[NR_I_LCOEFS];
 380	u64				lcoefs[NR_LCOEFS];
 381	u32				too_fast_vrate_pct;
 382	u32				too_slow_vrate_pct;
 383};
 384
 385struct ioc_margins {
 386	s64				min;
 387	s64				low;
 388	s64				target;
 389};
 390
 391struct ioc_missed {
 392	local_t				nr_met;
 393	local_t				nr_missed;
 394	u32				last_met;
 395	u32				last_missed;
 396};
 397
 398struct ioc_pcpu_stat {
 399	struct ioc_missed		missed[2];
 400
 401	local64_t			rq_wait_ns;
 402	u64				last_rq_wait_ns;
 403};
 404
 405/* per device */
 406struct ioc {
 407	struct rq_qos			rqos;
 408
 409	bool				enabled;
 410
 411	struct ioc_params		params;
 412	struct ioc_margins		margins;
 413	u32				period_us;
 414	u32				timer_slack_ns;
 415	u64				vrate_min;
 416	u64				vrate_max;
 417
 418	spinlock_t			lock;
 419	struct timer_list		timer;
 420	struct list_head		active_iocgs;	/* active cgroups */
 421	struct ioc_pcpu_stat __percpu	*pcpu_stat;
 422
 423	enum ioc_running		running;
 424	atomic64_t			vtime_rate;
 425	u64				vtime_base_rate;
 426	s64				vtime_err;
 427
 428	seqcount_spinlock_t		period_seqcount;
 429	u64				period_at;	/* wallclock starttime */
 430	u64				period_at_vtime; /* vtime starttime */
 431
 432	atomic64_t			cur_period;	/* inc'd each period */
 433	int				busy_level;	/* saturation history */
 434
 435	bool				weights_updated;
 436	atomic_t			hweight_gen;	/* for lazy hweights */
 437
  438	/* debt forgiveness */
 439	u64				dfgv_period_at;
 440	u64				dfgv_period_rem;
 441	u64				dfgv_usage_us_sum;
 442
 443	u64				autop_too_fast_at;
 444	u64				autop_too_slow_at;
 445	int				autop_idx;
 446	bool				user_qos_params:1;
 447	bool				user_cost_model:1;
 448};
 449
 450struct iocg_pcpu_stat {
 451	local64_t			abs_vusage;
 452};
 453
 454struct iocg_stat {
 455	u64				usage_us;
 456	u64				wait_us;
 457	u64				indebt_us;
 458	u64				indelay_us;
 459};
 460
 461/* per device-cgroup pair */
 462struct ioc_gq {
 463	struct blkg_policy_data		pd;
 464	struct ioc			*ioc;
 465
 466	/*
  467	 * An iocg can get its weight from two sources - an explicit
  468	 * per-device-cgroup configuration or the default weight of the
  469	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
  470	 * configuration.  `weight` is the effective weight considering
  471	 * both sources.
 472	 *
 473	 * When an idle cgroup becomes active its `active` goes from 0 to
 474	 * `weight`.  `inuse` is the surplus adjusted active weight.
 475	 * `active` and `inuse` are used to calculate `hweight_active` and
 476	 * `hweight_inuse`.
 477	 *
 478	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
 479	 * surplus adjustments.
 480	 *
  481	 * `inuse` may be adjusted dynamically during the period. `saved_*` are
  482	 * used to determine and track adjustments.
 483	 */
 484	u32				cfg_weight;
 485	u32				weight;
 486	u32				active;
 487	u32				inuse;
 488
 489	u32				last_inuse;
 490	s64				saved_margin;
 491
 492	sector_t			cursor;		/* to detect randio */
 493
 494	/*
 495	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
 496	 * issued.  If lagging behind device vtime, the delta represents
 497	 * the currently available IO budget.  If running ahead, the
 498	 * overage.
 499	 *
  500	 * `done_vtime` is the same but progressed on completion rather
 501	 * than issue.  The delta behind `vtime` represents the cost of
 502	 * currently in-flight IOs.
 503	 */
 504	atomic64_t			vtime;
 505	atomic64_t			done_vtime;
 506	u64				abs_vdebt;
 507
 508	/* current delay in effect and when it started */
 509	u64				delay;
 510	u64				delay_at;
 511
 512	/*
 513	 * The period this iocg was last active in.  Used for deactivation
 514	 * and invalidating `vtime`.
 515	 */
 516	atomic64_t			active_period;
 517	struct list_head		active_list;
 518
 519	/* see __propagate_weights() and current_hweight() for details */
 520	u64				child_active_sum;
 521	u64				child_inuse_sum;
 522	u64				child_adjusted_sum;
 523	int				hweight_gen;
 524	u32				hweight_active;
 525	u32				hweight_inuse;
 526	u32				hweight_donating;
 527	u32				hweight_after_donation;
 528
 529	struct list_head		walk_list;
 530	struct list_head		surplus_list;
 531
 532	struct wait_queue_head		waitq;
 533	struct hrtimer			waitq_timer;
 534
 535	/* timestamp at the latest activation */
 536	u64				activated_at;
 537
 538	/* statistics */
 539	struct iocg_pcpu_stat __percpu	*pcpu_stat;
 540	struct iocg_stat		stat;
 541	struct iocg_stat		last_stat;
 542	u64				last_stat_abs_vusage;
 543	u64				usage_delta_us;
 544	u64				wait_since;
 545	u64				indebt_since;
 546	u64				indelay_since;
 547
 548	/* this iocg's depth in the hierarchy and ancestors including self */
 549	int				level;
 550	struct ioc_gq			*ancestors[];
 551};
 552
 553/* per cgroup */
 554struct ioc_cgrp {
 555	struct blkcg_policy_data	cpd;
 556	unsigned int			dfl_weight;
 557};
 558
 559struct ioc_now {
 560	u64				now_ns;
 561	u64				now;
 562	u64				vnow;
 563};
 564
 565struct iocg_wait {
 566	struct wait_queue_entry		wait;
 567	struct bio			*bio;
 568	u64				abs_cost;
 569	bool				committed;
 570};
 571
 572struct iocg_wake_ctx {
 573	struct ioc_gq			*iocg;
 574	u32				hw_inuse;
 575	s64				vbudget;
 576};
 577
 578static const struct ioc_params autop[] = {
 579	[AUTOP_HDD] = {
 580		.qos				= {
 581			[QOS_RLAT]		=        250000, /* 250ms */
 582			[QOS_WLAT]		=        250000,
 583			[QOS_MIN]		= VRATE_MIN_PPM,
 584			[QOS_MAX]		= VRATE_MAX_PPM,
 585		},
 586		.i_lcoefs			= {
 587			[I_LCOEF_RBPS]		=     174019176,
 588			[I_LCOEF_RSEQIOPS]	=         41708,
 589			[I_LCOEF_RRANDIOPS]	=           370,
 590			[I_LCOEF_WBPS]		=     178075866,
 591			[I_LCOEF_WSEQIOPS]	=         42705,
 592			[I_LCOEF_WRANDIOPS]	=           378,
 593		},
 594	},
 595	[AUTOP_SSD_QD1] = {
 596		.qos				= {
 597			[QOS_RLAT]		=         25000, /* 25ms */
 598			[QOS_WLAT]		=         25000,
 599			[QOS_MIN]		= VRATE_MIN_PPM,
 600			[QOS_MAX]		= VRATE_MAX_PPM,
 601		},
 602		.i_lcoefs			= {
 603			[I_LCOEF_RBPS]		=     245855193,
 604			[I_LCOEF_RSEQIOPS]	=         61575,
 605			[I_LCOEF_RRANDIOPS]	=          6946,
 606			[I_LCOEF_WBPS]		=     141365009,
 607			[I_LCOEF_WSEQIOPS]	=         33716,
 608			[I_LCOEF_WRANDIOPS]	=         26796,
 609		},
 610	},
 611	[AUTOP_SSD_DFL] = {
 612		.qos				= {
 613			[QOS_RLAT]		=         25000, /* 25ms */
 614			[QOS_WLAT]		=         25000,
 615			[QOS_MIN]		= VRATE_MIN_PPM,
 616			[QOS_MAX]		= VRATE_MAX_PPM,
 617		},
 618		.i_lcoefs			= {
 619			[I_LCOEF_RBPS]		=     488636629,
 620			[I_LCOEF_RSEQIOPS]	=          8932,
 621			[I_LCOEF_RRANDIOPS]	=          8518,
 622			[I_LCOEF_WBPS]		=     427891549,
 623			[I_LCOEF_WSEQIOPS]	=         28755,
 624			[I_LCOEF_WRANDIOPS]	=         21940,
 625		},
 626		.too_fast_vrate_pct		=           500,
 627	},
 628	[AUTOP_SSD_FAST] = {
 629		.qos				= {
 630			[QOS_RLAT]		=          5000, /* 5ms */
 631			[QOS_WLAT]		=          5000,
 632			[QOS_MIN]		= VRATE_MIN_PPM,
 633			[QOS_MAX]		= VRATE_MAX_PPM,
 634		},
 635		.i_lcoefs			= {
 636			[I_LCOEF_RBPS]		=    3102524156LLU,
 637			[I_LCOEF_RSEQIOPS]	=        724816,
 638			[I_LCOEF_RRANDIOPS]	=        778122,
 639			[I_LCOEF_WBPS]		=    1742780862LLU,
 640			[I_LCOEF_WSEQIOPS]	=        425702,
  641			[I_LCOEF_WRANDIOPS]	=        443193,
 642		},
 643		.too_slow_vrate_pct		=            10,
 644	},
 645};
 646
 647/*
 648 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 649 * vtime credit shortage and down on device saturation.
 650 */
 651static u32 vrate_adj_pct[] =
 652	{ 0, 0, 0, 0,
 653	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 654	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 655	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
 656
 657static struct blkcg_policy blkcg_policy_iocost;
 658
 659/* accessors and helpers */
 660static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 661{
 662	return container_of(rqos, struct ioc, rqos);
 663}
 664
 665static struct ioc *q_to_ioc(struct request_queue *q)
 666{
 667	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 668}
 669
 670static const char __maybe_unused *ioc_name(struct ioc *ioc)
 671{
 672	struct gendisk *disk = ioc->rqos.disk;
 673
 674	if (!disk)
 675		return "<unknown>";
 676	return disk->disk_name;
 677}
 678
 679static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 680{
 681	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 682}
 683
 684static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 685{
 686	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 687}
 688
 689static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 690{
 691	return pd_to_blkg(&iocg->pd);
 692}
 693
 694static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 695{
 696	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 697			    struct ioc_cgrp, cpd);
 698}
 699
 700/*
 701 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 702 * weight, the more expensive each IO.  Must round up.
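 *
 * For example, at hw_inuse == WEIGHT_ONE / 2 (a 50% hierarchical share),
 * an abs_cost worth 1ms of device time is charged as 2ms worth of vtime.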
 703 */
 704static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 705{
 706	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
 707}
 708
 709/*
 710 * The inverse of abs_cost_to_cost().  Must round up.
 711 */
 712static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 713{
 714	return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
 715}
 716
 717static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
 718			    u64 abs_cost, u64 cost)
 719{
 720	struct iocg_pcpu_stat *gcs;
 721
 722	bio->bi_iocost_cost = cost;
 723	atomic64_add(cost, &iocg->vtime);
 724
 725	gcs = get_cpu_ptr(iocg->pcpu_stat);
 726	local64_add(abs_cost, &gcs->abs_vusage);
 727	put_cpu_ptr(gcs);
 728}
 729
 730static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
 731{
 732	if (lock_ioc) {
 733		spin_lock_irqsave(&iocg->ioc->lock, *flags);
 734		spin_lock(&iocg->waitq.lock);
 735	} else {
 736		spin_lock_irqsave(&iocg->waitq.lock, *flags);
 737	}
 738}
 739
 740static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
 741{
 742	if (unlock_ioc) {
 743		spin_unlock(&iocg->waitq.lock);
 744		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
 745	} else {
 746		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
 747	}
 748}
 749
 750#define CREATE_TRACE_POINTS
 751#include <trace/events/iocost.h>
 752
 753static void ioc_refresh_margins(struct ioc *ioc)
 754{
 755	struct ioc_margins *margins = &ioc->margins;
 756	u32 period_us = ioc->period_us;
 757	u64 vrate = ioc->vtime_base_rate;
 758
 759	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
 760	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
 761	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
 762}
 763
  764/* latency QoS params changed, update period_us and all the dependent params */
 765static void ioc_refresh_period_us(struct ioc *ioc)
 766{
 767	u32 ppm, lat, multi, period_us;
 768
 769	lockdep_assert_held(&ioc->lock);
 770
 771	/* pick the higher latency target */
 772	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 773		ppm = ioc->params.qos[QOS_RPPM];
 774		lat = ioc->params.qos[QOS_RLAT];
 775	} else {
 776		ppm = ioc->params.qos[QOS_WPPM];
 777		lat = ioc->params.qos[QOS_WLAT];
 778	}
 779
 780	/*
 781	 * We want the period to be long enough to contain a healthy number
 782	 * of IOs while short enough for granular control.  Define it as a
 783	 * multiple of the latency target.  Ideally, the multiplier should
 784	 * be scaled according to the percentile so that it would nominally
 785	 * contain a certain number of requests.  Let's be simpler and
 786	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
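	 *
	 * E.g. a 90th percentile target (ppm = 900000) gives
	 * multi = max((1000000 - 900000) / 50000, 2) = 2 while a 50th
	 * percentile target gives multi = 10.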
 787	 */
 788	if (ppm)
 789		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 790	else
 791		multi = 2;
 792	period_us = multi * lat;
 793	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 794
 795	/* calculate dependent params */
 796	ioc->period_us = period_us;
 797	ioc->timer_slack_ns = div64_u64(
 798		(u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
 799		100);
 800	ioc_refresh_margins(ioc);
 801}
 802
 803/*
 804 *  ioc->rqos.disk isn't initialized when this function is called from
 805 *  the init path.
 806 */
 807static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
 808{
 809	int idx = ioc->autop_idx;
 810	const struct ioc_params *p = &autop[idx];
 811	u32 vrate_pct;
 812	u64 now_ns;
 813
 814	/* rotational? */
 815	if (!blk_queue_nonrot(disk->queue))
 816		return AUTOP_HDD;
 817
 818	/* handle SATA SSDs w/ broken NCQ */
 819	if (blk_queue_depth(disk->queue) == 1)
 820		return AUTOP_SSD_QD1;
 821
 822	/* use one of the normal ssd sets */
 823	if (idx < AUTOP_SSD_DFL)
 824		return AUTOP_SSD_DFL;
 825
 826	/* if user is overriding anything, maintain what was there */
 827	if (ioc->user_qos_params || ioc->user_cost_model)
 828		return idx;
 829
 830	/* step up/down based on the vrate */
 831	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
 832	now_ns = ktime_get_ns();
 833
 834	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 835		if (!ioc->autop_too_fast_at)
 836			ioc->autop_too_fast_at = now_ns;
 837		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 838			return idx + 1;
 839	} else {
 840		ioc->autop_too_fast_at = 0;
 841	}
 842
 843	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 844		if (!ioc->autop_too_slow_at)
 845			ioc->autop_too_slow_at = now_ns;
 846		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 847			return idx - 1;
 848	} else {
 849		ioc->autop_too_slow_at = 0;
 850	}
 851
 852	return idx;
 853}
 854
 855/*
  856 * Take the following as input
 857 *
 858 *  @bps	maximum sequential throughput
 859 *  @seqiops	maximum sequential 4k iops
 860 *  @randiops	maximum random 4k iops
 861 *
 862 * and calculate the linear model cost coefficients.
 863 *
 864 *  *@page	per-page cost		1s / (@bps / 4096)
 865 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
  866 *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
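 *
 * As a rough, made-up numeric example: @bps = 400MB/s gives
 * *@page = 1s / (400000000 / 4096) ~= 10us worth of vtime per 4k page.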
 867 */
 868static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 869			u64 *page, u64 *seqio, u64 *randio)
 870{
 871	u64 v;
 872
 873	*page = *seqio = *randio = 0;
 874
 875	if (bps) {
 876		u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
 877
 878		if (bps_pages)
 879			*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
 880		else
 881			*page = 1;
 882	}
 883
 884	if (seqiops) {
 885		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 886		if (v > *page)
 887			*seqio = v - *page;
 888	}
 889
 890	if (randiops) {
 891		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 892		if (v > *page)
 893			*randio = v - *page;
 894	}
 895}
 896
 897static void ioc_refresh_lcoefs(struct ioc *ioc)
 898{
 899	u64 *u = ioc->params.i_lcoefs;
 900	u64 *c = ioc->params.lcoefs;
 901
 902	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 903		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 904	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 905		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 906}
 907
 908/*
 909 * struct gendisk is required as an argument because ioc->rqos.disk
 910 * is not properly initialized when called from the init path.
 911 */
 912static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
 913				    struct gendisk *disk)
 914{
 915	const struct ioc_params *p;
 916	int idx;
 917
 918	lockdep_assert_held(&ioc->lock);
 919
 920	idx = ioc_autop_idx(ioc, disk);
 921	p = &autop[idx];
 922
 923	if (idx == ioc->autop_idx && !force)
 924		return false;
 925
 926	if (idx != ioc->autop_idx) {
 927		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 928		ioc->vtime_base_rate = VTIME_PER_USEC;
 929	}
 930
 931	ioc->autop_idx = idx;
 932	ioc->autop_too_fast_at = 0;
 933	ioc->autop_too_slow_at = 0;
 934
 935	if (!ioc->user_qos_params)
 936		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 937	if (!ioc->user_cost_model)
 938		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 939
 940	ioc_refresh_period_us(ioc);
 941	ioc_refresh_lcoefs(ioc);
 942
 943	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 944					    VTIME_PER_USEC, MILLION);
 945	ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
 946					    VTIME_PER_USEC, MILLION);
 947
 948	return true;
 949}
 950
 951static bool ioc_refresh_params(struct ioc *ioc, bool force)
 952{
 953	return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
 954}
 955
 956/*
 957 * When an iocg accumulates too much vtime or gets deactivated, we throw away
 958 * some vtime, which lowers the overall device utilization. As the exact amount
 959 * which is being thrown away is known, we can compensate by accelerating the
 960 * vrate accordingly so that the extra vtime generated in the current period
 961 * matches what got lost.
 962 */
 963static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
 964{
 965	s64 pleft = ioc->period_at + ioc->period_us - now->now;
 966	s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
 967	s64 vcomp, vcomp_min, vcomp_max;
 968
 969	lockdep_assert_held(&ioc->lock);
 970
 971	/* we need some time left in this period */
 972	if (pleft <= 0)
 973		goto done;
 974
 975	/*
 976	 * Calculate how much vrate should be adjusted to offset the error.
 977	 * Limit the amount of adjustment and deduct the adjusted amount from
 978	 * the error.
 979	 */
 980	vcomp = -div64_s64(ioc->vtime_err, pleft);
 981	vcomp_min = -(ioc->vtime_base_rate >> 1);
 982	vcomp_max = ioc->vtime_base_rate;
 983	vcomp = clamp(vcomp, vcomp_min, vcomp_max);
 984
 985	ioc->vtime_err += vcomp * pleft;
 986
 987	atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
 988done:
 989	/* bound how much error can accumulate */
 990	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
 991}
 992
 993static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
 994				  int nr_lagging, int nr_shortages,
 995				  int prev_busy_level, u32 *missed_ppm)
 996{
 997	u64 vrate = ioc->vtime_base_rate;
 998	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
 999
1000	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
1001		if (ioc->busy_level != prev_busy_level || nr_lagging)
1002			trace_iocost_ioc_vrate_adj(ioc, vrate,
1003						   missed_ppm, rq_wait_pct,
1004						   nr_lagging, nr_shortages);
1005
1006		return;
1007	}
1008
1009	/*
1010	 * If vrate is out of bounds, apply clamp gradually as the
1011	 * bounds can change abruptly.  Otherwise, apply busy_level
1012	 * based adjustment.
1013	 */
1014	if (vrate < vrate_min) {
1015		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
1016		vrate = min(vrate, vrate_min);
1017	} else if (vrate > vrate_max) {
1018		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1019		vrate = max(vrate, vrate_max);
1020	} else {
1021		int idx = min_t(int, abs(ioc->busy_level),
1022				ARRAY_SIZE(vrate_adj_pct) - 1);
1023		u32 adj_pct = vrate_adj_pct[idx];
1024
1025		if (ioc->busy_level > 0)
1026			adj_pct = 100 - adj_pct;
1027		else
1028			adj_pct = 100 + adj_pct;
1029
1030		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1031			      vrate_min, vrate_max);
1032	}
1033
1034	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1035				   nr_lagging, nr_shortages);
1036
1037	ioc->vtime_base_rate = vrate;
1038	ioc_refresh_margins(ioc);
1039}
1040
1041/* take a snapshot of the current [v]time and vrate */
1042static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1043{
1044	unsigned seq;
1045	u64 vrate;
1046
1047	now->now_ns = ktime_get();
1048	now->now = ktime_to_us(now->now_ns);
1049	vrate = atomic64_read(&ioc->vtime_rate);
1050
1051	/*
1052	 * The current vtime is
1053	 *
1054	 *   vtime at period start + (wallclock time since the start) * vrate
1055	 *
1056	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
1057	 * needed, they're seqcount protected.
1058	 */
1059	do {
1060		seq = read_seqcount_begin(&ioc->period_seqcount);
1061		now->vnow = ioc->period_at_vtime +
1062			(now->now - ioc->period_at) * vrate;
1063	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
1064}
1065
1066static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1067{
1068	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1069
1070	write_seqcount_begin(&ioc->period_seqcount);
1071	ioc->period_at = now->now;
1072	ioc->period_at_vtime = now->vnow;
1073	write_seqcount_end(&ioc->period_seqcount);
1074
1075	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1076	add_timer(&ioc->timer);
1077}
1078
1079/*
1080 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1081 * weight sums and propagate upwards accordingly. If @save, the current margin
1082 * is saved to be used as reference for later inuse in-period adjustments.
1083 */
1084static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1085				bool save, struct ioc_now *now)
1086{
1087	struct ioc *ioc = iocg->ioc;
1088	int lvl;
1089
1090	lockdep_assert_held(&ioc->lock);
1091
1092	/*
1093	 * For an active leaf node, its inuse shouldn't be zero or exceed
1094	 * @active. An active internal node's inuse is solely determined by the
1095	 * inuse to active ratio of its children regardless of @inuse.
1096	 */
1097	if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1098		inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1099					   iocg->child_active_sum);
1100	} else {
1101		inuse = clamp_t(u32, inuse, 1, active);
1102	}
1103
1104	iocg->last_inuse = iocg->inuse;
1105	if (save)
1106		iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1107
1108	if (active == iocg->active && inuse == iocg->inuse)
1109		return;
1110
1111	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1112		struct ioc_gq *parent = iocg->ancestors[lvl];
1113		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1114		u32 parent_active = 0, parent_inuse = 0;
1115
1116		/* update the level sums */
1117		parent->child_active_sum += (s32)(active - child->active);
1118		parent->child_inuse_sum += (s32)(inuse - child->inuse);
1119		/* apply the updates */
1120		child->active = active;
1121		child->inuse = inuse;
1122
1123		/*
 1124		 * The delta between the inuse and active sums indicates how
 1125		 * much of the weight is being given away.  Parent's inuse
1126		 * and active should reflect the ratio.
1127		 */
1128		if (parent->child_active_sum) {
1129			parent_active = parent->weight;
1130			parent_inuse = DIV64_U64_ROUND_UP(
1131				parent_active * parent->child_inuse_sum,
1132				parent->child_active_sum);
1133		}
1134
1135		/* do we need to keep walking up? */
1136		if (parent_active == parent->active &&
1137		    parent_inuse == parent->inuse)
1138			break;
1139
1140		active = parent_active;
1141		inuse = parent_inuse;
1142	}
1143
1144	ioc->weights_updated = true;
1145}
1146
1147static void commit_weights(struct ioc *ioc)
1148{
1149	lockdep_assert_held(&ioc->lock);
1150
1151	if (ioc->weights_updated) {
1152		/* paired with rmb in current_hweight(), see there */
1153		smp_wmb();
1154		atomic_inc(&ioc->hweight_gen);
1155		ioc->weights_updated = false;
1156	}
1157}
1158
1159static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1160			      bool save, struct ioc_now *now)
1161{
1162	__propagate_weights(iocg, active, inuse, save, now);
1163	commit_weights(iocg->ioc);
1164}
1165
1166static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1167{
1168	struct ioc *ioc = iocg->ioc;
1169	int lvl;
1170	u32 hwa, hwi;
1171	int ioc_gen;
1172
1173	/* hot path - if uptodate, use cached */
1174	ioc_gen = atomic_read(&ioc->hweight_gen);
1175	if (ioc_gen == iocg->hweight_gen)
1176		goto out;
1177
1178	/*
1179	 * Paired with wmb in commit_weights(). If we saw the updated
1180	 * hweight_gen, all the weight updates from __propagate_weights() are
1181	 * visible too.
1182	 *
1183	 * We can race with weight updates during calculation and get it
1184	 * wrong.  However, hweight_gen would have changed and a future
1185	 * reader will recalculate and we're guaranteed to discard the
1186	 * wrong result soon.
1187	 */
1188	smp_rmb();
1189
1190	hwa = hwi = WEIGHT_ONE;
1191	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1192		struct ioc_gq *parent = iocg->ancestors[lvl];
1193		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1194		u64 active_sum = READ_ONCE(parent->child_active_sum);
1195		u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1196		u32 active = READ_ONCE(child->active);
1197		u32 inuse = READ_ONCE(child->inuse);
1198
1199		/* we can race with deactivations and either may read as zero */
1200		if (!active_sum || !inuse_sum)
1201			continue;
1202
1203		active_sum = max_t(u64, active, active_sum);
1204		hwa = div64_u64((u64)hwa * active, active_sum);
1205
1206		inuse_sum = max_t(u64, inuse, inuse_sum);
1207		hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1208	}
1209
1210	iocg->hweight_active = max_t(u32, hwa, 1);
1211	iocg->hweight_inuse = max_t(u32, hwi, 1);
1212	iocg->hweight_gen = ioc_gen;
1213out:
1214	if (hw_activep)
1215		*hw_activep = iocg->hweight_active;
1216	if (hw_inusep)
1217		*hw_inusep = iocg->hweight_inuse;
1218}
1219
1220/*
1221 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1222 * other weights stay unchanged.
1223 */
1224static u32 current_hweight_max(struct ioc_gq *iocg)
1225{
1226	u32 hwm = WEIGHT_ONE;
1227	u32 inuse = iocg->active;
1228	u64 child_inuse_sum;
1229	int lvl;
1230
1231	lockdep_assert_held(&iocg->ioc->lock);
1232
1233	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1234		struct ioc_gq *parent = iocg->ancestors[lvl];
1235		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1236
1237		child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1238		hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1239		inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1240					   parent->child_active_sum);
1241	}
1242
1243	return max_t(u32, hwm, 1);
1244}
1245
1246static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1247{
1248	struct ioc *ioc = iocg->ioc;
1249	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1250	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1251	u32 weight;
1252
1253	lockdep_assert_held(&ioc->lock);
1254
1255	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1256	if (weight != iocg->weight && iocg->active)
1257		propagate_weights(iocg, weight, iocg->inuse, true, now);
1258	iocg->weight = weight;
1259}
1260
1261static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1262{
1263	struct ioc *ioc = iocg->ioc;
1264	u64 __maybe_unused last_period, cur_period;
1265	u64 vtime, vtarget;
1266	int i;
1267
1268	/*
 1269	 * If we seem to be already active, just update the stamp to tell the
 1270	 * timer that we're still active.  We don't mind occasional races.
1271	 */
1272	if (!list_empty(&iocg->active_list)) {
1273		ioc_now(ioc, now);
1274		cur_period = atomic64_read(&ioc->cur_period);
1275		if (atomic64_read(&iocg->active_period) != cur_period)
1276			atomic64_set(&iocg->active_period, cur_period);
1277		return true;
1278	}
1279
1280	/* racy check on internal node IOs, treat as root level IOs */
1281	if (iocg->child_active_sum)
1282		return false;
1283
1284	spin_lock_irq(&ioc->lock);
1285
1286	ioc_now(ioc, now);
1287
1288	/* update period */
1289	cur_period = atomic64_read(&ioc->cur_period);
1290	last_period = atomic64_read(&iocg->active_period);
1291	atomic64_set(&iocg->active_period, cur_period);
1292
1293	/* already activated or breaking leaf-only constraint? */
1294	if (!list_empty(&iocg->active_list))
1295		goto succeed_unlock;
1296	for (i = iocg->level - 1; i > 0; i--)
1297		if (!list_empty(&iocg->ancestors[i]->active_list))
1298			goto fail_unlock;
1299
1300	if (iocg->child_active_sum)
1301		goto fail_unlock;
1302
1303	/*
1304	 * Always start with the target budget. On deactivation, we throw away
1305	 * anything above it.
1306	 */
1307	vtarget = now->vnow - ioc->margins.target;
1308	vtime = atomic64_read(&iocg->vtime);
1309
1310	atomic64_add(vtarget - vtime, &iocg->vtime);
1311	atomic64_add(vtarget - vtime, &iocg->done_vtime);
1312	vtime = vtarget;
1313
1314	/*
1315	 * Activate, propagate weight and start period timer if not
1316	 * running.  Reset hweight_gen to avoid accidental match from
1317	 * wrapping.
1318	 */
1319	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1320	list_add(&iocg->active_list, &ioc->active_iocgs);
1321
1322	propagate_weights(iocg, iocg->weight,
1323			  iocg->last_inuse ?: iocg->weight, true, now);
1324
1325	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1326			last_period, cur_period, vtime);
1327
1328	iocg->activated_at = now->now;
1329
1330	if (ioc->running == IOC_IDLE) {
1331		ioc->running = IOC_RUNNING;
1332		ioc->dfgv_period_at = now->now;
1333		ioc->dfgv_period_rem = 0;
1334		ioc_start_period(ioc, now);
1335	}
1336
1337succeed_unlock:
1338	spin_unlock_irq(&ioc->lock);
1339	return true;
1340
1341fail_unlock:
1342	spin_unlock_irq(&ioc->lock);
1343	return false;
1344}
1345
1346static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1347{
1348	struct ioc *ioc = iocg->ioc;
1349	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1350	u64 tdelta, delay, new_delay;
1351	s64 vover, vover_pct;
1352	u32 hwa;
1353
1354	lockdep_assert_held(&iocg->waitq.lock);
1355
1356	/*
1357	 * If the delay is set by another CPU, we may be in the past. No need to
1358	 * change anything if so. This avoids decay calculation underflow.
1359	 */
1360	if (time_before64(now->now, iocg->delay_at))
1361		return false;
1362
1363	/* calculate the current delay in effect - 1/2 every second */
1364	tdelta = now->now - iocg->delay_at;
1365	if (iocg->delay)
1366		delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1367	else
1368		delay = 0;
1369
1370	/* calculate the new delay from the debt amount */
1371	current_hweight(iocg, &hwa, NULL);
1372	vover = atomic64_read(&iocg->vtime) +
1373		abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1374	vover_pct = div64_s64(100 * vover,
1375			      ioc->period_us * ioc->vtime_base_rate);
1376
1377	if (vover_pct <= MIN_DELAY_THR_PCT)
1378		new_delay = 0;
1379	else if (vover_pct >= MAX_DELAY_THR_PCT)
1380		new_delay = MAX_DELAY;
1381	else
1382		new_delay = MIN_DELAY +
1383			div_u64((MAX_DELAY - MIN_DELAY) *
1384				(vover_pct - MIN_DELAY_THR_PCT),
1385				MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1386
1387	/* pick the higher one and apply */
1388	if (new_delay > delay) {
1389		iocg->delay = new_delay;
1390		iocg->delay_at = now->now;
1391		delay = new_delay;
1392	}
1393
1394	if (delay >= MIN_DELAY) {
1395		if (!iocg->indelay_since)
1396			iocg->indelay_since = now->now;
1397		blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1398		return true;
1399	} else {
1400		if (iocg->indelay_since) {
1401			iocg->stat.indelay_us += now->now - iocg->indelay_since;
1402			iocg->indelay_since = 0;
1403		}
1404		iocg->delay = 0;
1405		blkcg_clear_delay(blkg);
1406		return false;
1407	}
1408}
1409
1410static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1411			    struct ioc_now *now)
1412{
1413	struct iocg_pcpu_stat *gcs;
1414
1415	lockdep_assert_held(&iocg->ioc->lock);
1416	lockdep_assert_held(&iocg->waitq.lock);
1417	WARN_ON_ONCE(list_empty(&iocg->active_list));
1418
1419	/*
1420	 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
 1421	 * inuse, donating all of its share to others until its debt is paid off.
1422	 */
1423	if (!iocg->abs_vdebt && abs_cost) {
1424		iocg->indebt_since = now->now;
1425		propagate_weights(iocg, iocg->active, 0, false, now);
1426	}
1427
1428	iocg->abs_vdebt += abs_cost;
1429
1430	gcs = get_cpu_ptr(iocg->pcpu_stat);
1431	local64_add(abs_cost, &gcs->abs_vusage);
1432	put_cpu_ptr(gcs);
1433}
1434
1435static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1436			  struct ioc_now *now)
1437{
1438	lockdep_assert_held(&iocg->ioc->lock);
1439	lockdep_assert_held(&iocg->waitq.lock);
1440
1441	/* make sure that nobody messed with @iocg */
1442	WARN_ON_ONCE(list_empty(&iocg->active_list));
1443	WARN_ON_ONCE(iocg->inuse > 1);
1444
1445	iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1446
1447	/* if debt is paid in full, restore inuse */
1448	if (!iocg->abs_vdebt) {
1449		iocg->stat.indebt_us += now->now - iocg->indebt_since;
1450		iocg->indebt_since = 0;
1451
1452		propagate_weights(iocg, iocg->active, iocg->last_inuse,
1453				  false, now);
1454	}
1455}
1456
1457static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1458			int flags, void *key)
1459{
1460	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1461	struct iocg_wake_ctx *ctx = key;
1462	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1463
1464	ctx->vbudget -= cost;
1465
1466	if (ctx->vbudget < 0)
1467		return -1;
1468
1469	iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1470	wait->committed = true;
1471
1472	/*
1473	 * autoremove_wake_function() removes the wait entry only when it
1474	 * actually changed the task state. We want the wait always removed.
1475	 * Remove explicitly and use default_wake_function(). Note that the
1476	 * order of operations is important as finish_wait() tests whether
1477	 * @wq_entry is removed without grabbing the lock.
1478	 */
1479	default_wake_function(wq_entry, mode, flags, key);
1480	list_del_init_careful(&wq_entry->entry);
1481	return 0;
1482}
1483
1484/*
1485 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1486 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1487 * addition to iocg->waitq.lock.
1488 */
1489static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1490			    struct ioc_now *now)
1491{
1492	struct ioc *ioc = iocg->ioc;
1493	struct iocg_wake_ctx ctx = { .iocg = iocg };
1494	u64 vshortage, expires, oexpires;
1495	s64 vbudget;
1496	u32 hwa;
1497
1498	lockdep_assert_held(&iocg->waitq.lock);
1499
1500	current_hweight(iocg, &hwa, NULL);
1501	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1502
1503	/* pay off debt */
1504	if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1505		u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1506		u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1507		u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1508
1509		lockdep_assert_held(&ioc->lock);
1510
1511		atomic64_add(vpay, &iocg->vtime);
1512		atomic64_add(vpay, &iocg->done_vtime);
1513		iocg_pay_debt(iocg, abs_vpay, now);
1514		vbudget -= vpay;
1515	}
1516
1517	if (iocg->abs_vdebt || iocg->delay)
1518		iocg_kick_delay(iocg, now);
1519
1520	/*
1521	 * Debt can still be outstanding if we haven't paid all yet or the
1522	 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1523	 * under debt. Make sure @vbudget reflects the outstanding amount and is
1524	 * not positive.
1525	 */
1526	if (iocg->abs_vdebt) {
1527		s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1528		vbudget = min_t(s64, 0, vbudget - vdebt);
1529	}
1530
1531	/*
1532	 * Wake up the ones which are due and see how much vtime we'll need for
1533	 * the next one. As paying off debt restores hw_inuse, it must be read
1534	 * after the above debt payment.
1535	 */
1536	ctx.vbudget = vbudget;
1537	current_hweight(iocg, NULL, &ctx.hw_inuse);
1538
1539	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1540
1541	if (!waitqueue_active(&iocg->waitq)) {
1542		if (iocg->wait_since) {
1543			iocg->stat.wait_us += now->now - iocg->wait_since;
1544			iocg->wait_since = 0;
1545		}
1546		return;
1547	}
1548
1549	if (!iocg->wait_since)
1550		iocg->wait_since = now->now;
1551
1552	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1553		return;
1554
1555	/* determine next wakeup, add a timer margin to guarantee chunking */
1556	vshortage = -ctx.vbudget;
1557	expires = now->now_ns +
1558		DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1559		NSEC_PER_USEC;
1560	expires += ioc->timer_slack_ns;
1561
1562	/* if already active and close enough, don't bother */
1563	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1564	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1565	    abs(oexpires - expires) <= ioc->timer_slack_ns)
1566		return;
1567
1568	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1569			       ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1570}
1571
1572static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1573{
1574	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1575	bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1576	struct ioc_now now;
1577	unsigned long flags;
1578
1579	ioc_now(iocg->ioc, &now);
1580
1581	iocg_lock(iocg, pay_debt, &flags);
1582	iocg_kick_waitq(iocg, pay_debt, &now);
1583	iocg_unlock(iocg, pay_debt, &flags);
1584
1585	return HRTIMER_NORESTART;
1586}
1587
1588static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1589{
1590	u32 nr_met[2] = { };
1591	u32 nr_missed[2] = { };
1592	u64 rq_wait_ns = 0;
1593	int cpu, rw;
1594
1595	for_each_online_cpu(cpu) {
1596		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1597		u64 this_rq_wait_ns;
1598
1599		for (rw = READ; rw <= WRITE; rw++) {
1600			u32 this_met = local_read(&stat->missed[rw].nr_met);
1601			u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1602
1603			nr_met[rw] += this_met - stat->missed[rw].last_met;
1604			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1605			stat->missed[rw].last_met = this_met;
1606			stat->missed[rw].last_missed = this_missed;
1607		}
1608
1609		this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1610		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1611		stat->last_rq_wait_ns = this_rq_wait_ns;
1612	}
1613
1614	for (rw = READ; rw <= WRITE; rw++) {
1615		if (nr_met[rw] + nr_missed[rw])
1616			missed_ppm_ar[rw] =
1617				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1618						   nr_met[rw] + nr_missed[rw]);
1619		else
1620			missed_ppm_ar[rw] = 0;
1621	}
1622
1623	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1624				   ioc->period_us * NSEC_PER_USEC);
1625}
1626
1627/* was iocg idle this period? */
1628static bool iocg_is_idle(struct ioc_gq *iocg)
1629{
1630	struct ioc *ioc = iocg->ioc;
1631
1632	/* did something get issued this period? */
1633	if (atomic64_read(&iocg->active_period) ==
1634	    atomic64_read(&ioc->cur_period))
1635		return false;
1636
1637	/* is something in flight? */
1638	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1639		return false;
1640
1641	return true;
1642}
1643
1644/*
1645 * Call this function on the target leaf @iocg's to build pre-order traversal
1646 * list of all the ancestors in @inner_walk. The inner nodes are linked through
1647 * ->walk_list and the caller is responsible for dissolving the list after use.
1648 */
1649static void iocg_build_inner_walk(struct ioc_gq *iocg,
1650				  struct list_head *inner_walk)
1651{
1652	int lvl;
1653
1654	WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1655
1656	/* find the first ancestor which hasn't been visited yet */
1657	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1658		if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1659			break;
1660	}
1661
1662	/* walk down and visit the inner nodes to get pre-order traversal */
1663	while (++lvl <= iocg->level - 1) {
1664		struct ioc_gq *inner = iocg->ancestors[lvl];
1665
1666		/* record traversal order */
1667		list_add_tail(&inner->walk_list, inner_walk);
1668	}
1669}
1670
1671/* propagate the deltas to the parent */
1672static void iocg_flush_stat_upward(struct ioc_gq *iocg)
1673{
1674	if (iocg->level > 0) {
1675		struct iocg_stat *parent_stat =
1676			&iocg->ancestors[iocg->level - 1]->stat;
1677
1678		parent_stat->usage_us +=
1679			iocg->stat.usage_us - iocg->last_stat.usage_us;
1680		parent_stat->wait_us +=
1681			iocg->stat.wait_us - iocg->last_stat.wait_us;
1682		parent_stat->indebt_us +=
1683			iocg->stat.indebt_us - iocg->last_stat.indebt_us;
1684		parent_stat->indelay_us +=
1685			iocg->stat.indelay_us - iocg->last_stat.indelay_us;
1686	}
1687
1688	iocg->last_stat = iocg->stat;
1689}
1690
1691/* collect per-cpu counters and propagate the deltas to the parent */
1692static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
1693{
1694	struct ioc *ioc = iocg->ioc;
1695	u64 abs_vusage = 0;
1696	u64 vusage_delta;
1697	int cpu;
1698
1699	lockdep_assert_held(&iocg->ioc->lock);
1700
1701	/* collect per-cpu counters */
1702	for_each_possible_cpu(cpu) {
1703		abs_vusage += local64_read(
1704				per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1705	}
1706	vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1707	iocg->last_stat_abs_vusage = abs_vusage;
1708
1709	iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1710	iocg->stat.usage_us += iocg->usage_delta_us;
1711
1712	iocg_flush_stat_upward(iocg);
1713}
1714
1715/* get stat counters ready for reading on all active iocgs */
1716static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1717{
1718	LIST_HEAD(inner_walk);
1719	struct ioc_gq *iocg, *tiocg;
1720
1721	/* flush leaves and build inner node walk list */
1722	list_for_each_entry(iocg, target_iocgs, active_list) {
1723		iocg_flush_stat_leaf(iocg, now);
1724		iocg_build_inner_walk(iocg, &inner_walk);
1725	}
1726
1727	/* keep flushing upwards by walking the inner list backwards */
1728	list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1729		iocg_flush_stat_upward(iocg);
1730		list_del_init(&iocg->walk_list);
1731	}
1732}
1733
1734/*
1735 * Determine what @iocg's hweight_inuse should be after donating unused
1736 * capacity. @hwm is the upper bound and used to signal no donation. This
1737 * function also throws away @iocg's excess budget.
1738 */
1739static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1740				  u32 usage, struct ioc_now *now)
1741{
1742	struct ioc *ioc = iocg->ioc;
1743	u64 vtime = atomic64_read(&iocg->vtime);
1744	s64 excess, delta, target, new_hwi;
1745
1746	/* debt handling owns inuse for debtors */
1747	if (iocg->abs_vdebt)
1748		return 1;
1749
1750	/* see whether minimum margin requirement is met */
1751	if (waitqueue_active(&iocg->waitq) ||
1752	    time_after64(vtime, now->vnow - ioc->margins.min))
1753		return hwm;
1754
1755	/* throw away excess above target */
1756	excess = now->vnow - vtime - ioc->margins.target;
1757	if (excess > 0) {
1758		atomic64_add(excess, &iocg->vtime);
1759		atomic64_add(excess, &iocg->done_vtime);
1760		vtime += excess;
1761		ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1762	}
1763
1764	/*
1765	 * Let's define delta as the distance between the iocg's and the device's
1766	 * vtimes, as a fraction of the period duration. Assuming that the iocg will
1767	 * consume the usage determined above, we want to determine new_hwi so
1768	 * that delta equals MARGIN_TARGET at the end of the next period.
1769	 *
1770	 * We need to execute usage worth of IOs while spending the sum of the
1771	 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1772	 * (delta):
1773	 *
1774	 *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
1775	 *
1776	 * Therefore, the new_hwi is:
1777	 *
1778	 *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
1779	 */
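	/*
	 * Editorial note (worked example, assuming a 50% margin target): with
	 * target = WEIGHT_ONE / 2, a leftover delta of 0.25 of a period and
	 * usage of 0.3, new_hwi = 0.3 / (1 - 0.5 + 0.25) = 0.4, i.e. the iocg
	 * would need 40% of the device to land on the target margin at the
	 * end of the next period.
	 */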
1780	delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1781			  now->vnow - ioc->period_at_vtime);
1782	target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1783	new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1784
1785	return clamp_t(s64, new_hwi, 1, hwm);
1786}
1787
1788/*
1789 * For work-conservation, an iocg which isn't using all of its share should
1790 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1791 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1792 *
1793 * #1 is mathematically simpler but has the drawback of requiring synchronous
1794 * global hweight_inuse updates when idle iocg's get activated or inuse weights
1795 * change due to donation snapbacks as it has the possibility of grossly
1796 * overshooting what's allowed by the model and vrate.
1797 *
1798 * #2 is inherently safe with local operations. The donating iocg can easily
1799 * snap back to higher weights when needed without worrying about impacts on
1800 * other nodes as the impacts will be inherently correct. This also makes idle
1801 * iocg activations safe. The only effect activations have is decreasing
1802 * hweight_inuse of others, the right solution to which is for those iocgs to
1803 * snap back to higher weights.
1804 *
1805 * So, we go with #2. The challenge is calculating how each donating iocg's
1806 * inuse should be adjusted to achieve the target donation amounts. This is done
1807 * using Andy's method described in the following pdf.
1808 *
1809 *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1810 *
1811 * Given the weights and target after-donation hweight_inuse values, Andy's
1812 * method determines what the proportional distribution should look like at each
1813 * sibling level to maintain the relative relationship between all non-donating
1814 * pairs. To roughly summarize, it divides the tree into donating and
1815 * non-donating parts, calculates global donation rate which is used to
1816 * determine the target hweight_inuse for each node, and then derives per-level
1817 * proportions.
1818 *
1819 * The following pdf shows that global distribution calculated this way can be
1820 * achieved by scaling inuse weights of donating leaves and propagating the
1821 * adjustments upwards proportionally.
1822 *
1823 *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1824 *
1825 * Combining the above two, we can determine how each leaf iocg's inuse should
1826 * be adjusted to achieve the target donation.
1827 *
1828 *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1829 *
1830 * The inline comments use symbols from the last pdf.
1831 *
1832 *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
1833 *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
1834 *   t is the sum of the absolute budgets of donating nodes in the subtree.
1835 *   w is the weight of the node. w = w_f + w_t
1836 *   w_f is the non-donating portion of w. w_f = w * f / b
1837 *   w_t is the donating portion of w. w_t = w * t / b
1838 *   s is the sum of all sibling weights. s = Sum(w) for siblings
1839 *   s_f and s_t are the non-donating and donating portions of s.
1840 *
1841 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1842 * w_pt is the donating portion of the parent's weight and w'_pt the same value
1843 * after adjustments. Subscript r denotes the root node's values.
1844 */
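/*
 * Editorial note (illustrative example): a node with weight w = 100 whose
 * subtree holds b = 0.4 of the budget, t = 0.1 of it donating, has
 * w_t = w * t / b = 25 and w_f = w - w_t = 75. The adjustments below scale
 * these portions while keeping the ratios between non-donating siblings
 * intact.
 */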
1845static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1846{
1847	LIST_HEAD(over_hwa);
1848	LIST_HEAD(inner_walk);
1849	struct ioc_gq *iocg, *tiocg, *root_iocg;
1850	u32 after_sum, over_sum, over_target, gamma;
1851
1852	/*
1853	 * It's pretty unlikely but possible for the total sum of the
1854	 * hweight_after_donation values to be higher than WEIGHT_ONE, which will
1855	 * confuse the following calculations. If such condition is detected,
1856	 * scale down everyone over its full share equally to keep the sum below
1857	 * WEIGHT_ONE.
1858	 */
1859	after_sum = 0;
1860	over_sum = 0;
1861	list_for_each_entry(iocg, surpluses, surplus_list) {
1862		u32 hwa;
1863
1864		current_hweight(iocg, &hwa, NULL);
1865		after_sum += iocg->hweight_after_donation;
1866
1867		if (iocg->hweight_after_donation > hwa) {
1868			over_sum += iocg->hweight_after_donation;
1869			list_add(&iocg->walk_list, &over_hwa);
1870		}
1871	}
1872
1873	if (after_sum >= WEIGHT_ONE) {
1874		/*
1875		 * The delta should be deducted from over_sum; calculate the
1876		 * target over_sum value.
1877		 */
1878		u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1879		WARN_ON_ONCE(over_sum <= over_delta);
1880		over_target = over_sum - over_delta;
1881	} else {
1882		over_target = 0;
1883	}
1884
1885	list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1886		if (over_target)
1887			iocg->hweight_after_donation =
1888				div_u64((u64)iocg->hweight_after_donation *
1889					over_target, over_sum);
1890		list_del_init(&iocg->walk_list);
1891	}
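	/*
	 * Editorial note (illustrative example): if after_sum came out at
	 * WEIGHT_ONE + 100, over_delta is 101 and every iocg above its full
	 * share was just scaled by over_target / over_sum, bringing the sum
	 * back below WEIGHT_ONE while preserving their relative sizes.
	 */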
1892
1893	/*
1894	 * Build pre-order inner node walk list and prepare for donation
1895	 * adjustment calculations.
1896	 */
1897	list_for_each_entry(iocg, surpluses, surplus_list) {
1898		iocg_build_inner_walk(iocg, &inner_walk);
1899	}
1900
1901	root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1902	WARN_ON_ONCE(root_iocg->level > 0);
1903
1904	list_for_each_entry(iocg, &inner_walk, walk_list) {
1905		iocg->child_adjusted_sum = 0;
1906		iocg->hweight_donating = 0;
1907		iocg->hweight_after_donation = 0;
1908	}
1909
1910	/*
1911	 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1912	 * up the hierarchy.
1913	 */
1914	list_for_each_entry(iocg, surpluses, surplus_list) {
1915		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1916
1917		parent->hweight_donating += iocg->hweight_donating;
1918		parent->hweight_after_donation += iocg->hweight_after_donation;
1919	}
1920
1921	list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1922		if (iocg->level > 0) {
1923			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1924
1925			parent->hweight_donating += iocg->hweight_donating;
1926			parent->hweight_after_donation += iocg->hweight_after_donation;
1927		}
1928	}
1929
1930	/*
1931	 * Calculate inner hwa's (b) and make sure the donation values are
1932	 * within the accepted ranges as we're doing low-resolution calculations with
1933	 * roundups.
1934	 */
1935	list_for_each_entry(iocg, &inner_walk, walk_list) {
1936		if (iocg->level) {
1937			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1938
1939			iocg->hweight_active = DIV64_U64_ROUND_UP(
1940				(u64)parent->hweight_active * iocg->active,
1941				parent->child_active_sum);
1942
1943		}
1944
1945		iocg->hweight_donating = min(iocg->hweight_donating,
1946					     iocg->hweight_active);
1947		iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1948						   iocg->hweight_donating - 1);
1949		if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1950				 iocg->hweight_donating <= 1 ||
1951				 iocg->hweight_after_donation == 0)) {
1952			pr_warn("iocg: invalid donation weights in ");
1953			pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1954			pr_cont(": active=%u donating=%u after=%u\n",
1955				iocg->hweight_active, iocg->hweight_donating,
1956				iocg->hweight_after_donation);
1957		}
1958	}
1959
1960	/*
1961	 * Calculate the global donation rate (gamma) - the rate to adjust
1962	 * non-donating budgets by.
1963	 *
1964	 * No need to use 64bit multiplication here as the first operand is
1965	 * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1966	 *
1967	 * We know that there are beneficiary nodes and the sum of the donating
1968	 * hweights can't be whole; however, due to the round-ups during hweight
1969	 * calculations, root_iocg->hweight_donating might still end up equal to
1970	 * or greater than whole. Limit the range when calculating the divider.
1971	 *
1972	 * gamma = (1 - t_r') / (1 - t_r)
1973	 */
1974	gamma = DIV_ROUND_UP(
1975		(WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1976		WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
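	/*
	 * Editorial note (worked example): if 30% of the root budget is
	 * donating (t_r = 0.3) and 10% should remain with the donors after
	 * donation (t_r' = 0.1), gamma = (1 - 0.1) / (1 - 0.3) ~= 1.286, so
	 * each non-donating budget is scaled up by roughly 28.6%.
	 */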
1977
1978	/*
1979	 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1980	 * nodes.
1981	 */
1982	list_for_each_entry(iocg, &inner_walk, walk_list) {
1983		struct ioc_gq *parent;
1984		u32 inuse, wpt, wptp;
1985		u64 st, sf;
1986
1987		if (iocg->level == 0) {
1988			/* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1989			iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1990				iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1991				WEIGHT_ONE - iocg->hweight_after_donation);
1992			continue;
1993		}
1994
1995		parent = iocg->ancestors[iocg->level - 1];
1996
1997		/* b' = gamma * b_f + b_t' */
1998		iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1999			(u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
2000			WEIGHT_ONE) + iocg->hweight_after_donation;
2001
2002		/* w' = s' * b' / b'_p */
2003		inuse = DIV64_U64_ROUND_UP(
2004			(u64)parent->child_adjusted_sum * iocg->hweight_inuse,
2005			parent->hweight_inuse);
2006
2007		/* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
2008		st = DIV64_U64_ROUND_UP(
2009			iocg->child_active_sum * iocg->hweight_donating,
2010			iocg->hweight_active);
2011		sf = iocg->child_active_sum - st;
2012		wpt = DIV64_U64_ROUND_UP(
2013			(u64)iocg->active * iocg->hweight_donating,
2014			iocg->hweight_active);
2015		wptp = DIV64_U64_ROUND_UP(
2016			(u64)inuse * iocg->hweight_after_donation,
2017			iocg->hweight_inuse);
2018
2019		iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
2020	}
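	/*
	 * Editorial note (worked example, continuing the gamma example above):
	 * an inner node with b_f = 0.2 and b_t' = 0.05 ends up with
	 * b' = 1.286 * 0.2 + 0.05 ~= 0.307; its inuse and child_adjusted_sum
	 * then follow from the w' and s' formulas above.
	 */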
2021
2022	/*
2023	 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
2024	 * we can finally determine leaf adjustments.
2025	 */
2026	list_for_each_entry(iocg, surpluses, surplus_list) {
2027		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2028		u32 inuse;
2029
2030		/*
2031		 * In-debt iocgs participated in the donation calculation with
2032		 * the minimum target hweight_inuse. Configuring inuse
2033		 * accordingly would work fine but debt handling expects
2034		 * @iocg->inuse to stay at the minimum and we don't want to
2035		 * interfere.
2036		 */
2037		if (iocg->abs_vdebt) {
2038			WARN_ON_ONCE(iocg->inuse > 1);
2039			continue;
2040		}
2041
2042		/* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2043		inuse = DIV64_U64_ROUND_UP(
2044			parent->child_adjusted_sum * iocg->hweight_after_donation,
2045			parent->hweight_inuse);
2046
2047		TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2048				iocg->inuse, inuse,
2049				iocg->hweight_inuse,
2050				iocg->hweight_after_donation);
2051
2052		__propagate_weights(iocg, iocg->active, inuse, true, now);
2053	}
2054
2055	/* walk list should be dissolved after use */
2056	list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2057		list_del_init(&iocg->walk_list);
2058}
2059
2060/*
2061 * A low weight iocg can amass a large amount of debt, for example, when
2062 * anonymous memory gets reclaimed aggressively. If the system has a lot of
2063 * memory paired with a slow IO device, the debt can span multiple seconds or
2064 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2065 * up blocked paying its debt while the IO device is idle.
2066 *
2067 * The following protects against such cases. If the device has been
2068 * sufficiently idle for a while, the debts are halved and delays are
2069 * recalculated.
2070 */
2071static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
2072			      struct ioc_now *now)
2073{
2074	struct ioc_gq *iocg;
2075	u64 dur, usage_pct, nr_cycles;
2076
2077	/* if no debtor, reset the cycle */
2078	if (!nr_debtors) {
2079		ioc->dfgv_period_at = now->now;
2080		ioc->dfgv_period_rem = 0;
2081		ioc->dfgv_usage_us_sum = 0;
2082		return;
2083	}
2084
2085	/*
2086	 * Debtors can pass through a lot of writes choking the device and we
2087	 * don't want to be forgiving debts while the device is struggling from
2088	 * write bursts. If we're missing latency targets, consider the device
2089	 * fully utilized.
2090	 */
2091	if (ioc->busy_level > 0)
2092		usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2093
2094	ioc->dfgv_usage_us_sum += usage_us_sum;
2095	if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2096		return;
2097
2098	/*
2099	 * At least DFGV_PERIOD has passed since the last period. Calculate the
2100	 * average usage and reset the period counters.
2101	 */
2102	dur = now->now - ioc->dfgv_period_at;
2103	usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2104
2105	ioc->dfgv_period_at = now->now;
2106	ioc->dfgv_usage_us_sum = 0;
2107
2108	/* if was too busy, reset everything */
2109	if (usage_pct > DFGV_USAGE_PCT) {
2110		ioc->dfgv_period_rem = 0;
2111		return;
2112	}
2113
2114	/*
2115	 * Usage is lower than threshold. Let's forgive some debts. Debt
2116	 * forgiveness runs off of the usual ioc timer but its period usually
2117	 * doesn't match ioc's. Compensate the difference by performing the
2118	 * reduction as many times as would fit in the duration since the last
2119	 * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2120	 * - e.g. if DFGV_PERIOD is 75% of the ioc period, one out of three
2121	 * consecutive reductions is doubled.
2122	 */
2123	nr_cycles = dur + ioc->dfgv_period_rem;
2124	ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
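	/*
	 * Editorial note (illustrative example): if two forgiveness periods
	 * fit in the elapsed duration, nr_cycles is 2 and a debt of 800 below
	 * becomes 800 >> 2 = 200; both debt and delay are floored at 1 so
	 * they are never zeroed outright by forgiveness.
	 */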
2125
2126	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2127		u64 __maybe_unused old_debt, __maybe_unused old_delay;
2128
2129		if (!iocg->abs_vdebt && !iocg->delay)
2130			continue;
2131
2132		spin_lock(&iocg->waitq.lock);
2133
2134		old_debt = iocg->abs_vdebt;
2135		old_delay = iocg->delay;
2136
2137		if (iocg->abs_vdebt)
2138			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2139		if (iocg->delay)
2140			iocg->delay = iocg->delay >> nr_cycles ?: 1;
2141
2142		iocg_kick_waitq(iocg, true, now);
2143
2144		TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2145				old_debt, iocg->abs_vdebt,
2146				old_delay, iocg->delay);
2147
2148		spin_unlock(&iocg->waitq.lock);
2149	}
2150}
2151
2152/*
2153 * Check the active iocgs' state to avoid oversleeping and deactivate
2154 * idle iocgs.
2155 *
2156 * Since waiters determine the sleep durations based on the vrate
2157 * they saw at the time of sleep, if vrate has increased, some
2158 * waiters could be sleeping for too long. Wake up tardy waiters
2159 * which should have woken up in the last period and expire idle
2160 * iocgs.
2161 */
2162static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
2163{
2164	int nr_debtors = 0;
2165	struct ioc_gq *iocg, *tiocg;
2166
2167	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2168		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2169		    !iocg->delay && !iocg_is_idle(iocg))
2170			continue;
2171
2172		spin_lock(&iocg->waitq.lock);
2173
2174		/* flush wait and indebt stat deltas */
2175		if (iocg->wait_since) {
2176			iocg->stat.wait_us += now->now - iocg->wait_since;
2177			iocg->wait_since = now->now;
2178		}
2179		if (iocg->indebt_since) {
2180			iocg->stat.indebt_us +=
2181				now->now - iocg->indebt_since;
2182			iocg->indebt_since = now->now;
2183		}
2184		if (iocg->indelay_since) {
2185			iocg->stat.indelay_us +=
2186				now->now - iocg->indelay_since;
2187			iocg->indelay_since = now->now;
2188		}
2189
2190		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2191		    iocg->delay) {
2192			/* might be oversleeping due to vtime / hweight changes, kick */
2193			iocg_kick_waitq(iocg, true, now);
2194			if (iocg->abs_vdebt || iocg->delay)
2195				nr_debtors++;
2196		} else if (iocg_is_idle(iocg)) {
2197			/* no waiter and idle, deactivate */
2198			u64 vtime = atomic64_read(&iocg->vtime);
2199			s64 excess;
2200
2201			/*
2202			 * @iocg has been inactive for a full duration and will
2203			 * have a high budget. Account anything above target as
2204			 * error and throw away. On reactivation, it'll start
2205			 * with the target budget.
2206			 */
2207			excess = now->vnow - vtime - ioc->margins.target;
2208			if (excess > 0) {
2209				u32 old_hwi;
2210
2211				current_hweight(iocg, NULL, &old_hwi);
2212				ioc->vtime_err -= div64_u64(excess * old_hwi,
2213							    WEIGHT_ONE);
2214			}
2215
2216			TRACE_IOCG_PATH(iocg_idle, iocg, now,
2217					atomic64_read(&iocg->active_period),
2218					atomic64_read(&ioc->cur_period), vtime);
2219			__propagate_weights(iocg, 0, 0, false, now);
2220			list_del_init(&iocg->active_list);
2221		}
2222
2223		spin_unlock(&iocg->waitq.lock);
2224	}
2225
2226	commit_weights(ioc);
2227	return nr_debtors;
2228}
2229
2230static void ioc_timer_fn(struct timer_list *timer)
2231{
2232	struct ioc *ioc = container_of(timer, struct ioc, timer);
2233	struct ioc_gq *iocg, *tiocg;
2234	struct ioc_now now;
2235	LIST_HEAD(surpluses);
2236	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2237	u64 usage_us_sum = 0;
2238	u32 ppm_rthr;
2239	u32 ppm_wthr;
2240	u32 missed_ppm[2], rq_wait_pct;
2241	u64 period_vtime;
2242	int prev_busy_level;
2243
2244	/* how were the latencies during the period? */
2245	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2246
2247	/* take care of active iocgs */
2248	spin_lock_irq(&ioc->lock);
2249
2250	ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2251	ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2252	ioc_now(ioc, &now);
2253
2254	period_vtime = now.vnow - ioc->period_at_vtime;
2255	if (WARN_ON_ONCE(!period_vtime)) {
2256		spin_unlock_irq(&ioc->lock);
2257		return;
2258	}
2259
2260	nr_debtors = ioc_check_iocgs(ioc, &now);
2261
2262	/*
2263	 * Wait and indebt stat are flushed above and the donation calculation
2264	 * below needs updated usage stat. Let's bring stat up-to-date.
2265	 */
2266	iocg_flush_stat(&ioc->active_iocgs, &now);
2267
2268	/* calc usage and see whether some weights need to be moved around */
2269	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2270		u64 vdone, vtime, usage_us;
2271		u32 hw_active, hw_inuse;
2272
2273		/*
2274		 * Collect unused and wind vtime closer to vnow to prevent
2275		 * iocgs from accumulating a large amount of budget.
2276		 */
2277		vdone = atomic64_read(&iocg->done_vtime);
2278		vtime = atomic64_read(&iocg->vtime);
2279		current_hweight(iocg, &hw_active, &hw_inuse);
2280
2281		/*
2282		 * Latency QoS detection doesn't account for IOs which are
2283		 * in-flight for longer than a period.  Detect them by
2284		 * comparing vdone against period start.  If lagging behind
2285		 * IOs from past periods, don't increase vrate.
2286		 */
2287		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2288		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2289		    time_after64(vtime, vdone) &&
2290		    time_after64(vtime, now.vnow -
2291				 MAX_LAGGING_PERIODS * period_vtime) &&
2292		    time_before64(vdone, now.vnow - period_vtime))
2293			nr_lagging++;
2294
2295		/*
2296		 * Determine absolute usage factoring in in-flight IOs to avoid
2297		 * high-latency completions appearing as idle.
2298		 */
2299		usage_us = iocg->usage_delta_us;
2300		usage_us_sum += usage_us;
2301
2302		/* see whether there's surplus vtime */
2303		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2304		if (hw_inuse < hw_active ||
2305		    (!waitqueue_active(&iocg->waitq) &&
2306		     time_before64(vtime, now.vnow - ioc->margins.low))) {
2307			u32 hwa, old_hwi, hwm, new_hwi, usage;
2308			u64 usage_dur;
2309
2310			if (vdone != vtime) {
2311				u64 inflight_us = DIV64_U64_ROUND_UP(
2312					cost_to_abs_cost(vtime - vdone, hw_inuse),
2313					ioc->vtime_base_rate);
2314
2315				usage_us = max(usage_us, inflight_us);
2316			}
2317
2318			/* convert to hweight based usage ratio */
2319			if (time_after64(iocg->activated_at, ioc->period_at))
2320				usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2321			else
2322				usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2323
2324			usage = clamp_t(u32,
2325				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2326						   usage_dur),
2327				1, WEIGHT_ONE);
2328
2329			/*
2330			 * Already donating or accumulated enough to start.
2331			 * Determine the donation amount.
2332			 */
2333			current_hweight(iocg, &hwa, &old_hwi);
2334			hwm = current_hweight_max(iocg);
2335			new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2336							 usage, &now);
2337			/*
2338			 * Donation calculation assumes hweight_after_donation
2339			 * to be positive, a condition that a donor w/ hwa < 2
2340			 * can't meet. Don't bother with donation if hwa is
2341			 * below 2. It's not going to make a meaningful difference
2342			 * anyway.
2343			 */
2344			if (new_hwi < hwm && hwa >= 2) {
2345				iocg->hweight_donating = hwa;
2346				iocg->hweight_after_donation = new_hwi;
2347				list_add(&iocg->surplus_list, &surpluses);
2348			} else if (!iocg->abs_vdebt) {
2349				/*
2350				 * @iocg doesn't have enough to donate. Reset
2351				 * its inuse to active.
2352				 *
2353				 * Don't reset debtors as their inuse's are
2354				 * owned by debt handling. This shouldn't affect
2355				 * donation calculation in any meaningful way
2356				 * as @iocg doesn't have a meaningful amount of
2357				 * share anyway.
2358				 */
2359				TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2360						iocg->inuse, iocg->active,
2361						iocg->hweight_inuse, new_hwi);
2362
2363				__propagate_weights(iocg, iocg->active,
2364						    iocg->active, true, &now);
2365				nr_shortages++;
2366			}
2367		} else {
2368			/* genuinely short on vtime */
2369			nr_shortages++;
2370		}
2371	}
2372
2373	if (!list_empty(&surpluses) && nr_shortages)
2374		transfer_surpluses(&surpluses, &now);
2375
2376	commit_weights(ioc);
2377
2378	/* surplus list should be dissolved after use */
2379	list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2380		list_del_init(&iocg->surplus_list);
2381
2382	/*
2383	 * If q is getting clogged or we're missing too much, we're issuing
2384	 * too much IO and should lower vtime rate.  If we're not missing
2385	 * and experiencing shortages but not surpluses, we're too stingy
2386	 * and should increase vtime rate.
2387	 */
2388	prev_busy_level = ioc->busy_level;
2389	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2390	    missed_ppm[READ] > ppm_rthr ||
2391	    missed_ppm[WRITE] > ppm_wthr) {
2392		/* clearly missing QoS targets, slow down vrate */
2393		ioc->busy_level = max(ioc->busy_level, 0);
2394		ioc->busy_level++;
2395	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2396		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2397		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2398		/* QoS targets are being met with >25% margin */
2399		if (nr_shortages) {
2400			/*
2401			 * We're throttling while the device has spare
2402			 * capacity.  If vrate was being slowed down, stop.
2403			 */
2404			ioc->busy_level = min(ioc->busy_level, 0);
2405
2406			/*
2407			 * If there are IOs spanning multiple periods, wait
2408			 * them out before pushing the device harder.
2409			 */
2410			if (!nr_lagging)
2411				ioc->busy_level--;
2412		} else {
2413			/*
2414			 * Nobody is being throttled and the users aren't
2415			 * issuing enough IOs to saturate the device.  We
2416			 * simply don't know how close the device is to
2417			 * saturation.  Coast.
2418			 */
2419			ioc->busy_level = 0;
2420		}
2421	} else {
2422		/* inside the hysteresis margin, we're good */
2423		ioc->busy_level = 0;
2424	}
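	/*
	 * Editorial note (illustrative example, assuming UNBUSY_THR_PCT is 75
	 * per the >25% margin comment above): with rpct configured at 95.00,
	 * ppm_rthr is MILLION - 950000 = 50000, so vrate is slowed once more
	 * than 5% of reads miss their latency target and the device only
	 * counts as clearly unbusy when read misses stay at or below 37500
	 * ppm (3.75%).
	 */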
2425
2426	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2427
2428	ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2429			      prev_busy_level, missed_ppm);
2430
2431	ioc_refresh_params(ioc, false);
2432
2433	ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2434
2435	/*
2436	 * This period is done.  Move on to the next one.  If nothing's
2437	 * going on with the device, stop the timer.
2438	 */
2439	atomic64_inc(&ioc->cur_period);
2440
2441	if (ioc->running != IOC_STOP) {
2442		if (!list_empty(&ioc->active_iocgs)) {
2443			ioc_start_period(ioc, &now);
2444		} else {
2445			ioc->busy_level = 0;
2446			ioc->vtime_err = 0;
2447			ioc->running = IOC_IDLE;
2448		}
2449
2450		ioc_refresh_vrate(ioc, &now);
2451	}
2452
2453	spin_unlock_irq(&ioc->lock);
2454}
2455
2456static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2457				      u64 abs_cost, struct ioc_now *now)
2458{
2459	struct ioc *ioc = iocg->ioc;
2460	struct ioc_margins *margins = &ioc->margins;
2461	u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2462	u32 hwi, adj_step;
2463	s64 margin;
2464	u64 cost, new_inuse;
2465	unsigned long flags;
2466
2467	current_hweight(iocg, NULL, &hwi);
2468	old_hwi = hwi;
2469	cost = abs_cost_to_cost(abs_cost, hwi);
2470	margin = now->vnow - vtime - cost;
2471
2472	/* debt handling owns inuse for debtors */
2473	if (iocg->abs_vdebt)
2474		return cost;
2475
2476	/*
2477	 * We only increase inuse during period and do so if the margin has
2478	 * deteriorated since the previous adjustment.
2479	 */
2480	if (margin >= iocg->saved_margin || margin >= margins->low ||
2481	    iocg->inuse == iocg->active)
2482		return cost;
2483
2484	spin_lock_irqsave(&ioc->lock, flags);
2485
2486	/* we own inuse only when @iocg is in the normal active state */
2487	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2488		spin_unlock_irqrestore(&ioc->lock, flags);
2489		return cost;
2490	}
2491
2492	/*
2493	 * Bump up inuse till @abs_cost fits in the existing budget.
2494	 * adj_step must be determined after acquiring ioc->lock - we might
2495	 * have raced and lost the activation to another thread and could be
2496	 * reading iocg->active as 0 before taking ioc->lock, which would lead
2497	 * to an infinite loop.
2498	 */
2499	new_inuse = iocg->inuse;
2500	adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
2501	do {
2502		new_inuse = new_inuse + adj_step;
2503		propagate_weights(iocg, iocg->active, new_inuse, true, now);
2504		current_hweight(iocg, NULL, &hwi);
2505		cost = abs_cost_to_cost(abs_cost, hwi);
2506	} while (time_after64(vtime + cost, now->vnow) &&
2507		 iocg->inuse != iocg->active);
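	/*
	 * Editorial note (illustrative example, assuming INUSE_ADJ_STEP_PCT
	 * is 25): inuse grows by a quarter of ->active per iteration until
	 * either the cost fits the budget (vtime + cost no longer leads vnow)
	 * or inuse has reached ->active.
	 */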
2508
2509	spin_unlock_irqrestore(&ioc->lock, flags);
2510
2511	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2512			old_inuse, iocg->inuse, old_hwi, hwi);
2513
2514	return cost;
2515}
2516
2517static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2518				    bool is_merge, u64 *costp)
2519{
2520	struct ioc *ioc = iocg->ioc;
2521	u64 coef_seqio, coef_randio, coef_page;
2522	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2523	u64 seek_pages = 0;
2524	u64 cost = 0;
2525
2526	/* Can't calculate cost for empty bio */
2527	if (!bio->bi_iter.bi_size)
2528		goto out;
2529
2530	switch (bio_op(bio)) {
2531	case REQ_OP_READ:
2532		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
2533		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
2534		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
2535		break;
2536	case REQ_OP_WRITE:
2537		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
2538		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
2539		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
2540		break;
2541	default:
2542		goto out;
2543	}
2544
2545	if (iocg->cursor) {
2546		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2547		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2548	}
2549
2550	if (!is_merge) {
2551		if (seek_pages > LCOEF_RANDIO_PAGES) {
2552			cost += coef_randio;
2553		} else {
2554			cost += coef_seqio;
2555		}
2556	}
2557	cost += pages * coef_page;
2558out:
2559	*costp = cost;
2560}
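/*
 * Editorial note (illustrative example, assuming 4KiB pages and 512 byte
 * sectors): a 256KiB read that seeks farther than LCOEF_RANDIO_PAGES from the
 * previous end position costs coef_randio + 64 * coef_page, the same read
 * issued sequentially costs coef_seqio + 64 * coef_page, and a merge into an
 * existing request pays only the per-page portion.
 */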
2561
2562static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2563{
2564	u64 cost;
2565
2566	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2567	return cost;
2568}
2569
2570static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2571					 u64 *costp)
2572{
2573	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2574
2575	switch (req_op(rq)) {
2576	case REQ_OP_READ:
2577		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2578		break;
2579	case REQ_OP_WRITE:
2580		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2581		break;
2582	default:
2583		*costp = 0;
2584	}
2585}
2586
2587static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2588{
2589	u64 cost;
2590
2591	calc_size_vtime_cost_builtin(rq, ioc, &cost);
2592	return cost;
2593}
2594
2595static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2596{
2597	struct blkcg_gq *blkg = bio->bi_blkg;
2598	struct ioc *ioc = rqos_to_ioc(rqos);
2599	struct ioc_gq *iocg = blkg_to_iocg(blkg);
2600	struct ioc_now now;
2601	struct iocg_wait wait;
2602	u64 abs_cost, cost, vtime;
2603	bool use_debt, ioc_locked;
2604	unsigned long flags;
2605
2606	/* bypass IOs if disabled, still initializing, or for root cgroup */
2607	if (!ioc->enabled || !iocg || !iocg->level)
2608		return;
2609
2610	/* calculate the absolute vtime cost */
2611	abs_cost = calc_vtime_cost(bio, iocg, false);
2612	if (!abs_cost)
2613		return;
2614
2615	if (!iocg_activate(iocg, &now))
2616		return;
2617
2618	iocg->cursor = bio_end_sector(bio);
2619	vtime = atomic64_read(&iocg->vtime);
2620	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2621
2622	/*
2623	 * If no one's waiting and within budget, issue right away.  The
2624	 * tests are racy but the races aren't systemic - we only miss once
2625	 * in a while which is fine.
2626	 */
2627	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2628	    time_before_eq64(vtime + cost, now.vnow)) {
2629		iocg_commit_bio(iocg, bio, abs_cost, cost);
2630		return;
2631	}
2632
2633	/*
2634	 * We're over budget. This can be handled in two ways. IOs which may
2635	 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2636	 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2637	 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2638	 * whether debt handling is needed and acquire locks accordingly.
2639	 */
2640	use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2641	ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2642retry_lock:
2643	iocg_lock(iocg, ioc_locked, &flags);
2644
2645	/*
2646	 * @iocg must stay activated for debt and waitq handling. Deactivation
2647	 * is synchronized against both ioc->lock and waitq.lock and we won't
2648	 * get deactivated as long as we're waiting or have debt, so we're good
2649	 * if we're activated here. In the unlikely cases that we aren't, just
2650	 * issue the IO.
2651	 */
2652	if (unlikely(list_empty(&iocg->active_list))) {
2653		iocg_unlock(iocg, ioc_locked, &flags);
2654		iocg_commit_bio(iocg, bio, abs_cost, cost);
2655		return;
2656	}
2657
2658	/*
2659	 * We're over budget. If @bio has to be issued regardless, remember
2660	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2661	 * off the debt before waking more IOs.
2662	 *
2663	 * This way, the debt is continuously paid off each period with the
2664	 * actual budget available to the cgroup. If we just wound vtime, we
2665	 * would incorrectly use the current hw_inuse for the entire amount
2666	 * which, for example, can lead to the cgroup staying blocked for a
2667	 * long time even with substantially raised hw_inuse.
2668	 *
2669	 * An iocg with vdebt should stay online so that the timer can keep
2670	 * deducting its vdebt and [de]activate use_delay mechanism
2671	 * accordingly. We don't want to race against the timer trying to
2672	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2673	 * penalizing the cgroup and its descendants.
2674	 */
2675	if (use_debt) {
2676		iocg_incur_debt(iocg, abs_cost, &now);
2677		if (iocg_kick_delay(iocg, &now))
2678			blkcg_schedule_throttle(rqos->disk,
2679					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2680		iocg_unlock(iocg, ioc_locked, &flags);
2681		return;
2682	}
2683
2684	/* guarantee that iocgs w/ waiters have maximum inuse */
2685	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2686		if (!ioc_locked) {
2687			iocg_unlock(iocg, false, &flags);
2688			ioc_locked = true;
2689			goto retry_lock;
2690		}
2691		propagate_weights(iocg, iocg->active, iocg->active, true,
2692				  &now);
2693	}
2694
2695	/*
2696	 * Append self to the waitq and schedule the wakeup timer if we're
2697	 * the first waiter.  The timer duration is calculated based on the
2698	 * current vrate.  vtime and hweight changes can make it too short
2699	 * or too long.  Each wait entry records the absolute cost it's
2700	 * waiting for to allow re-evaluation using a custom wait entry.
2701	 *
2702	 * If too short, the timer simply reschedules itself.  If too long,
2703	 * the period timer will notice and trigger wakeups.
2704	 *
2705	 * All waiters are on iocg->waitq and the wait states are
2706	 * synchronized using waitq.lock.
2707	 */
2708	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2709	wait.wait.private = current;
2710	wait.bio = bio;
2711	wait.abs_cost = abs_cost;
2712	wait.committed = false;	/* will be set true by waker */
2713
2714	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2715	iocg_kick_waitq(iocg, ioc_locked, &now);
2716
2717	iocg_unlock(iocg, ioc_locked, &flags);
2718
2719	while (true) {
2720		set_current_state(TASK_UNINTERRUPTIBLE);
2721		if (wait.committed)
2722			break;
2723		io_schedule();
2724	}
2725
2726	/* waker already committed us, proceed */
2727	finish_wait(&iocg->waitq, &wait.wait);
2728}
2729
2730static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2731			   struct bio *bio)
2732{
2733	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2734	struct ioc *ioc = rqos_to_ioc(rqos);
2735	sector_t bio_end = bio_end_sector(bio);
2736	struct ioc_now now;
2737	u64 vtime, abs_cost, cost;
2738	unsigned long flags;
2739
2740	/* bypass if disabled, still initializing, or for root cgroup */
2741	if (!ioc->enabled || !iocg || !iocg->level)
2742		return;
2743
2744	abs_cost = calc_vtime_cost(bio, iocg, true);
2745	if (!abs_cost)
2746		return;
2747
2748	ioc_now(ioc, &now);
2749
2750	vtime = atomic64_read(&iocg->vtime);
2751	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2752
2753	/* update cursor if backmerging into the request at the cursor */
2754	if (blk_rq_pos(rq) < bio_end &&
2755	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2756		iocg->cursor = bio_end;
2757
2758	/*
2759	 * Charge if there's enough vtime budget and the existing request has
2760	 * cost assigned.
2761	 */
2762	if (rq->bio && rq->bio->bi_iocost_cost &&
2763	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2764		iocg_commit_bio(iocg, bio, abs_cost, cost);
2765		return;
2766	}
2767
2768	/*
2769	 * Otherwise, account it as debt if @iocg is online, which it should
2770	 * be for the vast majority of cases. See debt handling in
2771	 * ioc_rqos_throttle() for details.
2772	 */
2773	spin_lock_irqsave(&ioc->lock, flags);
2774	spin_lock(&iocg->waitq.lock);
2775
2776	if (likely(!list_empty(&iocg->active_list))) {
2777		iocg_incur_debt(iocg, abs_cost, &now);
2778		if (iocg_kick_delay(iocg, &now))
2779			blkcg_schedule_throttle(rqos->disk,
2780					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2781	} else {
2782		iocg_commit_bio(iocg, bio, abs_cost, cost);
2783	}
2784
2785	spin_unlock(&iocg->waitq.lock);
2786	spin_unlock_irqrestore(&ioc->lock, flags);
2787}
2788
2789static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2790{
2791	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2792
2793	if (iocg && bio->bi_iocost_cost)
2794		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2795}
2796
2797static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2798{
2799	struct ioc *ioc = rqos_to_ioc(rqos);
2800	struct ioc_pcpu_stat *ccs;
2801	u64 on_q_ns, rq_wait_ns, size_nsec;
2802	int pidx, rw;
2803
2804	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2805		return;
2806
2807	switch (req_op(rq)) {
2808	case REQ_OP_READ:
2809		pidx = QOS_RLAT;
2810		rw = READ;
2811		break;
2812	case REQ_OP_WRITE:
2813		pidx = QOS_WLAT;
2814		rw = WRITE;
2815		break;
2816	default:
2817		return;
2818	}
2819
2820	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2821	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2822	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2823
2824	ccs = get_cpu_ptr(ioc->pcpu_stat);
2825
2826	if (on_q_ns <= size_nsec ||
2827	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2828		local_inc(&ccs->missed[rw].nr_met);
2829	else
2830		local_inc(&ccs->missed[rw].nr_missed);
2831
2832	local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2833
2834	put_cpu_ptr(ccs);
2835}
2836
2837static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2838{
2839	struct ioc *ioc = rqos_to_ioc(rqos);
2840
2841	spin_lock_irq(&ioc->lock);
2842	ioc_refresh_params(ioc, false);
2843	spin_unlock_irq(&ioc->lock);
2844}
2845
2846static void ioc_rqos_exit(struct rq_qos *rqos)
2847{
2848	struct ioc *ioc = rqos_to_ioc(rqos);
2849
2850	blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
2851
2852	spin_lock_irq(&ioc->lock);
2853	ioc->running = IOC_STOP;
2854	spin_unlock_irq(&ioc->lock);
2855
2856	timer_shutdown_sync(&ioc->timer);
2857	free_percpu(ioc->pcpu_stat);
2858	kfree(ioc);
2859}
2860
2861static const struct rq_qos_ops ioc_rqos_ops = {
2862	.throttle = ioc_rqos_throttle,
2863	.merge = ioc_rqos_merge,
2864	.done_bio = ioc_rqos_done_bio,
2865	.done = ioc_rqos_done,
2866	.queue_depth_changed = ioc_rqos_queue_depth_changed,
2867	.exit = ioc_rqos_exit,
2868};
2869
2870static int blk_iocost_init(struct gendisk *disk)
2871{
2872	struct ioc *ioc;
2873	int i, cpu, ret;
2874
2875	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2876	if (!ioc)
2877		return -ENOMEM;
2878
2879	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2880	if (!ioc->pcpu_stat) {
2881		kfree(ioc);
2882		return -ENOMEM;
2883	}
2884
2885	for_each_possible_cpu(cpu) {
2886		struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2887
2888		for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2889			local_set(&ccs->missed[i].nr_met, 0);
2890			local_set(&ccs->missed[i].nr_missed, 0);
2891		}
2892		local64_set(&ccs->rq_wait_ns, 0);
2893	}
2894
2895	spin_lock_init(&ioc->lock);
2896	timer_setup(&ioc->timer, ioc_timer_fn, 0);
2897	INIT_LIST_HEAD(&ioc->active_iocgs);
2898
2899	ioc->running = IOC_IDLE;
2900	ioc->vtime_base_rate = VTIME_PER_USEC;
2901	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2902	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2903	ioc->period_at = ktime_to_us(ktime_get());
2904	atomic64_set(&ioc->cur_period, 0);
2905	atomic_set(&ioc->hweight_gen, 0);
2906
2907	spin_lock_irq(&ioc->lock);
2908	ioc->autop_idx = AUTOP_INVALID;
2909	ioc_refresh_params_disk(ioc, true, disk);
2910	spin_unlock_irq(&ioc->lock);
2911
2912	/*
2913	 * rqos must be added before activation to allow ioc_pd_init() to
2914	 * look up the ioc from q. This means that the rqos methods may get
2915	 * called before policy activation completes, so they can't assume that
2916	 * the target bio has an associated iocg and need to test for a NULL iocg.
2917	 */
2918	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
2919	if (ret)
2920		goto err_free_ioc;
2921
2922	ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
2923	if (ret)
2924		goto err_del_qos;
2925	return 0;
2926
2927err_del_qos:
2928	rq_qos_del(&ioc->rqos);
2929err_free_ioc:
2930	free_percpu(ioc->pcpu_stat);
2931	kfree(ioc);
2932	return ret;
2933}
2934
2935static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2936{
2937	struct ioc_cgrp *iocc;
2938
2939	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2940	if (!iocc)
2941		return NULL;
2942
2943	iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2944	return &iocc->cpd;
2945}
2946
2947static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2948{
2949	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2950}
2951
2952static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
2953		struct blkcg *blkcg, gfp_t gfp)
2954{
2955	int levels = blkcg->css.cgroup->level + 1;
2956	struct ioc_gq *iocg;
2957
2958	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
2959			    disk->node_id);
2960	if (!iocg)
2961		return NULL;
2962
2963	iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2964	if (!iocg->pcpu_stat) {
2965		kfree(iocg);
2966		return NULL;
2967	}
2968
2969	return &iocg->pd;
2970}
2971
2972static void ioc_pd_init(struct blkg_policy_data *pd)
2973{
2974	struct ioc_gq *iocg = pd_to_iocg(pd);
2975	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2976	struct ioc *ioc = q_to_ioc(blkg->q);
2977	struct ioc_now now;
2978	struct blkcg_gq *tblkg;
2979	unsigned long flags;
2980
2981	ioc_now(ioc, &now);
2982
2983	iocg->ioc = ioc;
2984	atomic64_set(&iocg->vtime, now.vnow);
2985	atomic64_set(&iocg->done_vtime, now.vnow);
2986	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2987	INIT_LIST_HEAD(&iocg->active_list);
2988	INIT_LIST_HEAD(&iocg->walk_list);
2989	INIT_LIST_HEAD(&iocg->surplus_list);
2990	iocg->hweight_active = WEIGHT_ONE;
2991	iocg->hweight_inuse = WEIGHT_ONE;
2992
2993	init_waitqueue_head(&iocg->waitq);
2994	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2995	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2996
2997	iocg->level = blkg->blkcg->css.cgroup->level;
2998
2999	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
3000		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
3001		iocg->ancestors[tiocg->level] = tiocg;
3002	}
3003
3004	spin_lock_irqsave(&ioc->lock, flags);
3005	weight_updated(iocg, &now);
3006	spin_unlock_irqrestore(&ioc->lock, flags);
3007}
3008
3009static void ioc_pd_free(struct blkg_policy_data *pd)
3010{
3011	struct ioc_gq *iocg = pd_to_iocg(pd);
3012	struct ioc *ioc = iocg->ioc;
3013	unsigned long flags;
3014
3015	if (ioc) {
3016		spin_lock_irqsave(&ioc->lock, flags);
3017
3018		if (!list_empty(&iocg->active_list)) {
3019			struct ioc_now now;
3020
3021			ioc_now(ioc, &now);
3022			propagate_weights(iocg, 0, 0, false, &now);
3023			list_del_init(&iocg->active_list);
3024		}
3025
3026		WARN_ON_ONCE(!list_empty(&iocg->walk_list));
3027		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
3028
3029		spin_unlock_irqrestore(&ioc->lock, flags);
3030
3031		hrtimer_cancel(&iocg->waitq_timer);
3032	}
3033	free_percpu(iocg->pcpu_stat);
3034	kfree(iocg);
3035}
3036
3037static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
3038{
3039	struct ioc_gq *iocg = pd_to_iocg(pd);
3040	struct ioc *ioc = iocg->ioc;
3041
3042	if (!ioc->enabled)
3043		return;
3044
3045	if (iocg->level == 0) {
3046		unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
3047			ioc->vtime_base_rate * 10000,
3048			VTIME_PER_USEC);
3049		seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
3050	}
3051
3052	seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
3053
3054	if (blkcg_debug_stats)
3055		seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3056			iocg->last_stat.wait_us,
3057			iocg->last_stat.indebt_us,
3058			iocg->last_stat.indelay_us);
3059}
3060
3061static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3062			     int off)
3063{
3064	const char *dname = blkg_dev_name(pd->blkg);
3065	struct ioc_gq *iocg = pd_to_iocg(pd);
3066
3067	if (dname && iocg->cfg_weight)
3068		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
3069	return 0;
3070}
3071
3073static int ioc_weight_show(struct seq_file *sf, void *v)
3074{
3075	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3076	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3077
3078	seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3079	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3080			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3081	return 0;
3082}
3083
3084static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3085				size_t nbytes, loff_t off)
3086{
3087	struct blkcg *blkcg = css_to_blkcg(of_css(of));
3088	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3089	struct blkg_conf_ctx ctx;
3090	struct ioc_now now;
3091	struct ioc_gq *iocg;
3092	u32 v;
3093	int ret;
3094
3095	if (!strchr(buf, ':')) {
3096		struct blkcg_gq *blkg;
3097
3098		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3099			return -EINVAL;
3100
3101		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3102			return -EINVAL;
3103
3104		spin_lock_irq(&blkcg->lock);
3105		iocc->dfl_weight = v * WEIGHT_ONE;
3106		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3107			struct ioc_gq *iocg = blkg_to_iocg(blkg);
3108
3109			if (iocg) {
3110				spin_lock(&iocg->ioc->lock);
3111				ioc_now(iocg->ioc, &now);
3112				weight_updated(iocg, &now);
3113				spin_unlock(&iocg->ioc->lock);
3114			}
3115		}
3116		spin_unlock_irq(&blkcg->lock);
3117
3118		return nbytes;
3119	}
3120
3121	blkg_conf_init(&ctx, buf);
3122
3123	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
3124	if (ret)
3125		goto err;
3126
3127	iocg = blkg_to_iocg(ctx.blkg);
3128
3129	if (!strncmp(ctx.body, "default", 7)) {
3130		v = 0;
3131	} else {
3132		if (!sscanf(ctx.body, "%u", &v))
3133			goto einval;
3134		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3135			goto einval;
3136	}
3137
3138	spin_lock(&iocg->ioc->lock);
3139	iocg->cfg_weight = v * WEIGHT_ONE;
3140	ioc_now(iocg->ioc, &now);
3141	weight_updated(iocg, &now);
3142	spin_unlock(&iocg->ioc->lock);
3143
3144	blkg_conf_exit(&ctx);
3145	return nbytes;
3146
3147einval:
3148	ret = -EINVAL;
3149err:
3150	blkg_conf_exit(&ctx);
3151	return ret;
3152}
3153
3154static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3155			  int off)
3156{
3157	const char *dname = blkg_dev_name(pd->blkg);
3158	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3159
3160	if (!dname)
3161		return 0;
3162
3163	spin_lock_irq(&ioc->lock);
3164	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3165		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3166		   ioc->params.qos[QOS_RPPM] / 10000,
3167		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
3168		   ioc->params.qos[QOS_RLAT],
3169		   ioc->params.qos[QOS_WPPM] / 10000,
3170		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
3171		   ioc->params.qos[QOS_WLAT],
3172		   ioc->params.qos[QOS_MIN] / 10000,
3173		   ioc->params.qos[QOS_MIN] % 10000 / 100,
3174		   ioc->params.qos[QOS_MAX] / 10000,
3175		   ioc->params.qos[QOS_MAX] % 10000 / 100);
3176	spin_unlock_irq(&ioc->lock);
3177	return 0;
3178}
3179
3180static int ioc_qos_show(struct seq_file *sf, void *v)
3181{
3182	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3183
3184	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3185			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3186	return 0;
3187}
3188
3189static const match_table_t qos_ctrl_tokens = {
3190	{ QOS_ENABLE,		"enable=%u"	},
3191	{ QOS_CTRL,		"ctrl=%s"	},
3192	{ NR_QOS_CTRL_PARAMS,	NULL		},
3193};
3194
3195static const match_table_t qos_tokens = {
3196	{ QOS_RPPM,		"rpct=%s"	},
3197	{ QOS_RLAT,		"rlat=%u"	},
3198	{ QOS_WPPM,		"wpct=%s"	},
3199	{ QOS_WLAT,		"wlat=%u"	},
3200	{ QOS_MIN,		"min=%s"	},
3201	{ QOS_MAX,		"max=%s"	},
3202	{ NR_QOS_PARAMS,	NULL		},
3203};
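/*
 * Editorial note (illustrative example, device numbers hypothetical): a line
 * such as
 *
 *   8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=10000
 *
 * written to io.cost.qos is parsed below: the ctrl tokens first, then the
 * per-parameter qos tokens, with the percentage fields taking two decimal
 * places.
 */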
3204
3205static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3206			     size_t nbytes, loff_t off)
3207{
3208	struct blkg_conf_ctx ctx;
3209	struct gendisk *disk;
3210	struct ioc *ioc;
3211	u32 qos[NR_QOS_PARAMS];
3212	bool enable, user;
3213	char *body, *p;
3214	int ret;
3215
3216	blkg_conf_init(&ctx, input);
3217
3218	ret = blkg_conf_open_bdev(&ctx);
3219	if (ret)
3220		goto err;
3221
3222	body = ctx.body;
3223	disk = ctx.bdev->bd_disk;
3224	if (!queue_is_mq(disk->queue)) {
3225		ret = -EOPNOTSUPP;
3226		goto err;
3227	}
3228
3229	ioc = q_to_ioc(disk->queue);
3230	if (!ioc) {
3231		ret = blk_iocost_init(disk);
3232		if (ret)
3233			goto err;
3234		ioc = q_to_ioc(disk->queue);
3235	}
3236
3237	blk_mq_freeze_queue(disk->queue);
3238	blk_mq_quiesce_queue(disk->queue);
3239
3240	spin_lock_irq(&ioc->lock);
3241	memcpy(qos, ioc->params.qos, sizeof(qos));
3242	enable = ioc->enabled;
3243	user = ioc->user_qos_params;
3244
3245	while ((p = strsep(&body, " \t\n"))) {
3246		substring_t args[MAX_OPT_ARGS];
3247		char buf[32];
3248		int tok;
3249		s64 v;
3250
3251		if (!*p)
3252			continue;
3253
3254		switch (match_token(p, qos_ctrl_tokens, args)) {
3255		case QOS_ENABLE:
3256			if (match_u64(&args[0], &v))
3257				goto einval;
3258			enable = v;
3259			continue;
3260		case QOS_CTRL:
3261			match_strlcpy(buf, &args[0], sizeof(buf));
3262			if (!strcmp(buf, "auto"))
3263				user = false;
3264			else if (!strcmp(buf, "user"))
3265				user = true;
3266			else
3267				goto einval;
3268			continue;
3269		}
3270
3271		tok = match_token(p, qos_tokens, args);
3272		switch (tok) {
3273		case QOS_RPPM:
3274		case QOS_WPPM:
3275			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3276			    sizeof(buf))
3277				goto einval;
3278			if (cgroup_parse_float(buf, 2, &v))
3279				goto einval;
3280			if (v < 0 || v > 10000)
3281				goto einval;
3282			qos[tok] = v * 100;
3283			break;
3284		case QOS_RLAT:
3285		case QOS_WLAT:
3286			if (match_u64(&args[0], &v))
3287				goto einval;
3288			qos[tok] = v;
3289			break;
3290		case QOS_MIN:
3291		case QOS_MAX:
3292			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3293			    sizeof(buf))
3294				goto einval;
3295			if (cgroup_parse_float(buf, 2, &v))
3296				goto einval;
3297			if (v < 0)
3298				goto einval;
3299			qos[tok] = clamp_t(s64, v * 100,
3300					   VRATE_MIN_PPM, VRATE_MAX_PPM);
3301			break;
3302		default:
3303			goto einval;
3304		}
3305		user = true;
3306	}
3307
3308	if (qos[QOS_MIN] > qos[QOS_MAX])
3309		goto einval;
3310
3311	if (enable && !ioc->enabled) {
3312		blk_stat_enable_accounting(disk->queue);
3313		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
3314		ioc->enabled = true;
3315	} else if (!enable && ioc->enabled) {
3316		blk_stat_disable_accounting(disk->queue);
3317		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
3318		ioc->enabled = false;
3319	}
3320
3321	if (user) {
3322		memcpy(ioc->params.qos, qos, sizeof(qos));
3323		ioc->user_qos_params = true;
3324	} else {
3325		ioc->user_qos_params = false;
3326	}
3327
3328	ioc_refresh_params(ioc, true);
3329	spin_unlock_irq(&ioc->lock);
3330
3331	if (enable)
3332		wbt_disable_default(disk);
3333	else
3334		wbt_enable_default(disk);
3335
3336	blk_mq_unquiesce_queue(disk->queue);
3337	blk_mq_unfreeze_queue(disk->queue);
3338
3339	blkg_conf_exit(&ctx);
3340	return nbytes;
3341einval:
3342	spin_unlock_irq(&ioc->lock);
3343
3344	blk_mq_unquiesce_queue(disk->queue);
3345	blk_mq_unfreeze_queue(disk->queue);
3346
3347	ret = -EINVAL;
3348err:
3349	blkg_conf_exit(&ctx);
3350	return ret;
3351}
3352
3353static u64 ioc_cost_model_prfill(struct seq_file *sf,
3354				 struct blkg_policy_data *pd, int off)
3355{
3356	const char *dname = blkg_dev_name(pd->blkg);
3357	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3358	u64 *u = ioc->params.i_lcoefs;
3359
3360	if (!dname)
3361		return 0;
3362
3363	spin_lock_irq(&ioc->lock);
3364	seq_printf(sf, "%s ctrl=%s model=linear "
3365		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
3366		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3367		   dname, ioc->user_cost_model ? "user" : "auto",
3368		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3369		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3370	spin_unlock_irq(&ioc->lock);
3371	return 0;
3372}
3373
3374static int ioc_cost_model_show(struct seq_file *sf, void *v)
3375{
3376	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3377
3378	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3379			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3380	return 0;
3381}
3382
3383static const match_table_t cost_ctrl_tokens = {
3384	{ COST_CTRL,		"ctrl=%s"	},
3385	{ COST_MODEL,		"model=%s"	},
3386	{ NR_COST_CTRL_PARAMS,	NULL		},
3387};
3388
3389static const match_table_t i_lcoef_tokens = {
3390	{ I_LCOEF_RBPS,		"rbps=%u"	},
3391	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
3392	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
3393	{ I_LCOEF_WBPS,		"wbps=%u"	},
3394	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
3395	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
3396	{ NR_I_LCOEFS,		NULL		},
3397};
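/*
 * Editorial note (illustrative example, all numbers hypothetical):
 * io.cost.model takes the same "MAJ:MIN key=value ..." form, e.g.
 *
 *   8:16 ctrl=user model=linear rbps=2390625 rseqiops=37 rrandiops=12
 *
 * and writing any coefficient explicitly switches the controller to user
 * cost model parameters, as handled by ioc_cost_model_write() below.
 */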
3398
3399static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3400				    size_t nbytes, loff_t off)
3401{
3402	struct blkg_conf_ctx ctx;
3403	struct request_queue *q;
3404	struct ioc *ioc;
3405	u64 u[NR_I_LCOEFS];
3406	bool user;
3407	char *body, *p;
3408	int ret;
3409
3410	blkg_conf_init(&ctx, input);
3411
3412	ret = blkg_conf_open_bdev(&ctx);
3413	if (ret)
3414		goto err;
3415
3416	body = ctx.body;
3417	q = bdev_get_queue(ctx.bdev);
3418	if (!queue_is_mq(q)) {
3419		ret = -EOPNOTSUPP;
3420		goto err;
3421	}
3422
3423	ioc = q_to_ioc(q);
3424	if (!ioc) {
3425		ret = blk_iocost_init(ctx.bdev->bd_disk);
3426		if (ret)
3427			goto err;
3428		ioc = q_to_ioc(q);
3429	}
3430
3431	blk_mq_freeze_queue(q);
3432	blk_mq_quiesce_queue(q);
3433
3434	spin_lock_irq(&ioc->lock);
3435	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3436	user = ioc->user_cost_model;
3437
3438	while ((p = strsep(&body, " \t\n"))) {
3439		substring_t args[MAX_OPT_ARGS];
3440		char buf[32];
3441		int tok;
3442		u64 v;
3443
3444		if (!*p)
3445			continue;
3446
3447		switch (match_token(p, cost_ctrl_tokens, args)) {
3448		case COST_CTRL:
3449			match_strlcpy(buf, &args[0], sizeof(buf));
3450			if (!strcmp(buf, "auto"))
3451				user = false;
3452			else if (!strcmp(buf, "user"))
3453				user = true;
3454			else
3455				goto einval;
3456			continue;
3457		case COST_MODEL:
3458			match_strlcpy(buf, &args[0], sizeof(buf));
3459			if (strcmp(buf, "linear"))
3460				goto einval;
3461			continue;
3462		}
3463
3464		tok = match_token(p, i_lcoef_tokens, args);
3465		if (tok == NR_I_LCOEFS)
3466			goto einval;
3467		if (match_u64(&args[0], &v))
3468			goto einval;
3469		u[tok] = v;
3470		user = true;
3471	}
3472
3473	if (user) {
3474		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3475		ioc->user_cost_model = true;
3476	} else {
3477		ioc->user_cost_model = false;
3478	}
3479	ioc_refresh_params(ioc, true);
3480	spin_unlock_irq(&ioc->lock);
3481
3482	blk_mq_unquiesce_queue(q);
3483	blk_mq_unfreeze_queue(q);
3484
3485	blkg_conf_exit(&ctx);
3486	return nbytes;
3487
3488einval:
3489	spin_unlock_irq(&ioc->lock);
3490
3491	blk_mq_unquiesce_queue(q);
3492	blk_mq_unfreeze_queue(q);
3493
3494	ret = -EINVAL;
3495err:
3496	blkg_conf_exit(&ctx);
3497	return ret;
3498}
3499
3500static struct cftype ioc_files[] = {
3501	{
3502		.name = "weight",
3503		.flags = CFTYPE_NOT_ON_ROOT,
3504		.seq_show = ioc_weight_show,
3505		.write = ioc_weight_write,
3506	},
3507	{
3508		.name = "cost.qos",
3509		.flags = CFTYPE_ONLY_ON_ROOT,
3510		.seq_show = ioc_qos_show,
3511		.write = ioc_qos_write,
3512	},
3513	{
3514		.name = "cost.model",
3515		.flags = CFTYPE_ONLY_ON_ROOT,
3516		.seq_show = ioc_cost_model_show,
3517		.write = ioc_cost_model_write,
3518	},
3519	{}
3520};
3521
3522static struct blkcg_policy blkcg_policy_iocost = {
3523	.dfl_cftypes	= ioc_files,
3524	.cpd_alloc_fn	= ioc_cpd_alloc,
3525	.cpd_free_fn	= ioc_cpd_free,
3526	.pd_alloc_fn	= ioc_pd_alloc,
3527	.pd_init_fn	= ioc_pd_init,
3528	.pd_free_fn	= ioc_pd_free,
3529	.pd_stat_fn	= ioc_pd_stat,
3530};
3531
3532static int __init ioc_init(void)
3533{
3534	return blkcg_policy_register(&blkcg_policy_iocost);
3535}
3536
3537static void __exit ioc_exit(void)
3538{
3539	blkcg_policy_unregister(&blkcg_policy_iocost);
3540}
3541
3542module_init(ioc_init);
3543module_exit(ioc_exit);