   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * IO cost model based controller.
   4 *
   5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7 * Copyright (C) 2019 Facebook
   8 *
   9 * One challenge of controlling IO resources is the lack of a trivially
  10 * observable cost metric.  This is distinguished from CPU and memory where
  11 * wallclock time and the number of bytes can serve as accurate enough
  12 * approximations.
  13 *
  14 * Bandwidth and iops are the most commonly used metrics for IO devices but
  15 * depending on the type and specifics of the device, different IO patterns
  16 * easily lead to multiple orders of magnitude variations rendering them
  17 * useless for the purpose of IO capacity distribution.  While on-device
  18 * time, with a lot of crutches, could serve as a useful approximation for
  19 * non-queued rotational devices, this is no longer viable with modern
  20 * devices, even the rotational ones.
  21 *
  22 * While there is no cost metric we can trivially observe, it isn't a
  23 * complete mystery.  For example, on a rotational device, seek cost
  24 * dominates while a contiguous transfer contributes a smaller amount
  25 * proportional to the size.  If we can characterize at least the relative
  26 * costs of these different types of IOs, it should be possible to
  27 * implement a reasonable work-conserving proportional IO resource
  28 * distribution.
  29 *
  30 * 1. IO Cost Model
  31 *
  32 * IO cost model estimates the cost of an IO given its basic parameters and
  33 * history (e.g. the end sector of the last IO).  The cost is measured in
  34 * device time.  If a given IO is estimated to cost 10ms, the device should
  35 * be able to process ~100 of those IOs in a second.
  36 *
  37 * Currently, there's only one builtin cost model - linear.  Each IO is
  38 * classified as sequential or random and given a base cost accordingly.
  39 * On top of that, a size cost proportional to the length of the IO is
  40 * added.  While simple, this model captures the operational
  41 * characteristics of a wide variety of devices well enough.  Default
  42 * parameters for several different classes of devices are provided and the
  43 * parameters can be configured from userspace via
  44 * /sys/fs/cgroup/io.cost.model.
  45 *
  46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47 * device-specific coefficients.
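     *
     * As a rough sketch, using pseudo-names for the LCOEF_* coefficients
     * computed in calc_lcoefs() below, the estimated absolute cost of a
     * single IO is
     *
     *   cost = (seek distance > 16M ? randio_coef : seqio_coef)
     *            + nr_4k_pages * page_coef
     *
     * with the coefficients derived from the device's peak bandwidth and
     * sequential/random iops.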
  48 *
  49 * 2. Control Strategy
  50 *
  51 * The device virtual time (vtime) is used as the primary control metric.
  52 * The control strategy is composed of the following three parts.
  53 *
  54 * 2-1. Vtime Distribution
  55 *
  56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57 * calculated.  Please consider the following hierarchy where the numbers
  58 * inside parentheses denote the configured weights.
  59 *
  60 *           root
  61 *         /       \
  62 *      A (w:100)  B (w:300)
  63 *      /       \
  64 *  A0 (w:100)  A1 (w:100)
  65 *
  66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67 * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
  68 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69 * 12.5% each.  The distribution mechanism only cares about these flattened
  70 * shares.  They're called hweights (hierarchical weights) and always add
  71 * up to 1 (HWEIGHT_WHOLE).
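     *
     * In the fixed-point arithmetic used throughout this file, hweights are
     * expressed as fractions of HWEIGHT_WHOLE (2^16), so the 75% and 12.5%
     * shares above are stored as 49152 and 8192 respectively.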
  72 *
  73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75 * against the device vtime - an IO which takes 10ms on the underlying
  76 * device is considered to take 80ms on A0.
  77 *
  78 * This constitutes the basis of IO capacity distribution.  Each cgroup's
  79 * vtime is running at a rate determined by its hweight.  A cgroup tracks
  80 * the vtime consumed by past IOs and can issue a new IO iff doing so
  81 * wouldn't outrun the current device vtime.  Otherwise, the IO is
  82 * suspended until the vtime has progressed enough to cover it.
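     *
     * In simplified pseudo-code, the issue-time admission test (see
     * ioc_rqos_throttle() below) is roughly:
     *
     *   cost = abs_cost * HWEIGHT_WHOLE / hweight_inuse;
     *   if (iocg_vtime + cost <= device_vnow)
     *           issue and advance iocg_vtime by cost;
     *   else
     *           wait until device_vnow catches up (or record debt);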
  83 *
  84 * 2-2. Vrate Adjustment
  85 *
  86 * It's unrealistic to expect the cost model to be perfect.  There are too
  87 * many devices and even on the same device the overall performance
  88 * fluctuates depending on numerous factors such as IO mixture and device
  89 * internal garbage collection.  The controller needs to adapt dynamically.
  90 *
  91 * This is achieved by adjusting the overall IO rate according to how busy
  92 * the device is.  If the device becomes overloaded, we're sending down too
  93 * many IOs and should generally slow down.  If there are waiting issuers
  94 * but the device isn't saturated, we're issuing too few and should
  95 * generally speed up.
  96 *
  97 * To slow down, we lower the vrate - the rate at which the device vtime
  98 * passes compared to the wall clock.  For example, if the vtime is running
  99 * at the vrate of 75%, all cgroups added up would only be able to issue
 100 * 750ms worth of IOs per second, and vice-versa for speeding up.
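     *
     * For example, at the 62.12% vrate shown in the monitoring sample in
     * section 3 below, one wall-clock second advances the device vtime by
     * only ~0.62 * VTIME_PER_SEC, shrinking every cgroup's vtime budget
     * accordingly.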
 101 *
 102 * Device busyness is determined using two criteria - rq wait and
 103 * completion latencies.
 104 *
 105 * When a device gets saturated, the on-device and then the request queues
 106 * fill up and a bio which is ready to be issued has to wait for a request
 107 * to become available.  When this delay becomes noticeable, it's a clear
 108 * indication that the device is saturated and we lower the vrate.  This
 109 * saturation signal is fairly conservative as it only triggers when both
 110 * hardware and software queues are filled up, and is used as the default
 111 * busy signal.
 112 *
 113 * As devices can have deep queues and be unfair in how the queued commands
 114 * are executed, solely depending on rq wait may not result in satisfactory
 115 * control quality.  For a better control quality, completion latency QoS
 116 * parameters can be configured so that the device is considered saturated
 117 * if N'th percentile completion latency rises above the set point.
 118 *
 119 * The completion latency requirements are a function of both the
 120 * underlying device characteristics and the desired IO latency quality of
 121 * service.  There is an inherent trade-off - the tighter the latency QoS,
 122 * the higher the bandwidth lossage.  Latency QoS is disabled by default
 123 * and can be set through /sys/fs/cgroup/io.cost.qos.
 124 *
 125 * 2-3. Work Conservation
 126 *
 127 * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 128 * periodically while B is sending out enough parallel IOs to saturate the
 129 * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 130 * cost per second, i.e., 10% of the device capacity.  The naive
 131 * distribution of half and half would lead to 60% utilization of the
 132 * device, a significant reduction in the total amount of work done
 133 * compared to free-for-all competition.  This is too high a cost to pay
 134 * for IO control.
 135 *
 136 * To conserve the total amount of work done, we keep track of how much
 137 * each active cgroup is actually using and yield part of its weight if
 138 * there are other cgroups which can make use of it.  In the above case,
 139 * A's weight will be lowered so that it hovers above the actual usage and
 140 * B would be able to use the rest.
 141 *
 142 * As we don't want to penalize a cgroup for donating its weight, the
 143 * surplus weight adjustment factors in a margin and has an immediate
 144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
 145 *
 146 * Note that adjusting down surplus weights has the same effects as
 147 * accelerating vtime for other cgroups and work conservation can also be
 148 * implemented by adjusting vrate dynamically.  However, squaring away
 149 * who can donate and who should take back how much requires hweight
 150 * propagation anyway, so it's easier to implement and understand as a
 151 * separate mechanism.
 152 *
 153 * 3. Monitoring
 154 *
 155 * Instead of debugfs or other clumsy monitoring mechanisms, this
 156 * controller uses a drgn based monitoring script -
 157 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 158 * https://github.com/osandov/drgn.  The output looks like the following.
 159 *
 160 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 161 *                 active      weight      hweight% inflt% dbt  delay usages%
 162 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 163 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 164 *
 165 * - per	: Timer period
 166 * - cur_per	: Internal wall and device vtime clock
 167 * - vrate	: Device virtual time rate against wall clock
 168 * - weight	: Surplus-adjusted and configured weights
 169 * - hweight	: Surplus-adjusted and configured hierarchical weights
 170 * - inflt	: The percentage of in-flight IO cost at the end of last period
 171 * - del_ms	: Deferred issuer delay induction level and duration
 172 * - usages	: Usage history
 173 */
 174
 175#include <linux/kernel.h>
 176#include <linux/module.h>
 177#include <linux/timer.h>
 178#include <linux/time64.h>
 179#include <linux/parser.h>
 180#include <linux/sched/signal.h>
 181#include <linux/blk-cgroup.h>
 182#include "blk-rq-qos.h"
 183#include "blk-stat.h"
 184#include "blk-wbt.h"
 185
 186#ifdef CONFIG_TRACEPOINTS
 187
 188/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 189#define TRACE_IOCG_PATH_LEN 1024
 190static DEFINE_SPINLOCK(trace_iocg_path_lock);
 191static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 192
 193#define TRACE_IOCG_PATH(type, iocg, ...)					\
 194	do {									\
 195		unsigned long flags;						\
 196		if (trace_iocost_##type##_enabled()) {				\
 197			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
 198			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
 199				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
 200			trace_iocost_##type(iocg, trace_iocg_path,		\
 201					      ##__VA_ARGS__);			\
 202			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
 203		}								\
 204	} while (0)
 205
 206#else	/* CONFIG_TRACEPOINTS */
 207#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
 208#endif	/* CONFIG_TRACEPOINTS */
 209
 210enum {
 211	MILLION			= 1000000,
 212
 213	/* timer period is calculated from latency requirements, bound it */
 214	MIN_PERIOD		= USEC_PER_MSEC,
 215	MAX_PERIOD		= USEC_PER_SEC,
 216
 217	/*
 218	 * A cgroup's vtime can run 50% behind the device vtime, which
 219	 * serves as its IO credit buffer.  Surplus weight adjustment is
 220	 * immediately canceled if the vtime margin runs below 10%.
 221	 */
 222	MARGIN_PCT		= 50,
 223	INUSE_MARGIN_PCT	= 10,
 224
 225	/* Have some play in waitq timer operations */
 226	WAITQ_TIMER_MARGIN_PCT	= 5,
 227
 228	/*
 229	 * vtime can wrap well within a reasonable uptime when vrate is
 230	 * consistently raised.  Don't trust recorded cgroup vtime if the
 231	 * period counter indicates that it's older than 5mins.
 232	 */
 233	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
 234
 235	/*
 236	 * Remember the past three non-zero usages and use the max for
 237	 * surplus calculation.  Three slots guarantee that we remember one
 238	 * full period usage from the last active stretch even after
 239	 * partial deactivation and re-activation periods.  Don't start
 240	 * giving away weight before collecting two data points to prevent
 241	 * hweight adjustments based on one partial activation period.
 242	 */
 243	NR_USAGE_SLOTS		= 3,
 244	MIN_VALID_USAGES	= 2,
 245
 246	/* 1/64k is granular enough and can easily be handled w/ u32 */
 247	HWEIGHT_WHOLE		= 1 << 16,
 248
 249	/*
 250	 * As vtime is used to calculate the cost of each IO, it needs to
 251	 * be fairly high precision.  For example, it should be able to
 252	 * represent the cost of a single page worth of discard with
 253	 * sufficient accuracy.  At the same time, it should be able to
 254	 * represent reasonably long enough durations to be useful and
 255	 * convenient during operation.
 256	 *
 257	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 258	 * granularity and days of wrap-around time even at extreme vrates.
 259	 */
 260	VTIME_PER_SEC_SHIFT	= 37,
 261	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
 262	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
 263	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
 264
 265	/* bound vrate adjustments within two orders of magnitude */
 266	VRATE_MIN_PPM		= 10000,	/* 1% */
 267	VRATE_MAX_PPM		= 100000000,	/* 10000% */
 268
 269	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 270	VRATE_CLAMP_ADJ_PCT	= 4,
 271
 272	/* if IOs end up waiting for requests, issue less */
 273	RQ_WAIT_BUSY_PCT	= 5,
 274
 275	/* unbusy hysteresis */
 276	UNBUSY_THR_PCT		= 75,
 277
 278	/* don't let cmds which take a very long time pin lagging for too long */
 279	MAX_LAGGING_PERIODS	= 10,
 280
 281	/*
 282	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
 283	 * donate the surplus.
 284	 */
 285	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
 286	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
 287	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */
 288
 289	/* switch iff the conditions are met for longer than this */
 290	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
 291
 292	/*
 293	 * Count IO size in 4k pages.  The 12bit shift helps keep the
 294	 * size-proportional components of the cost calculation within a
 295	 * similar number of digits as the per-IO cost components.
 296	 */
 297	IOC_PAGE_SHIFT		= 12,
 298	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
 299	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
 300
 301	/* if apart further than 16M, consider randio for linear model */
 302	LCOEF_RANDIO_PAGES	= 4096,
 303};
 304
 305enum ioc_running {
 306	IOC_IDLE,
 307	IOC_RUNNING,
 308	IOC_STOP,
 309};
 310
 311/* io.cost.qos controls including per-dev enable of the whole controller */
 312enum {
 313	QOS_ENABLE,
 314	QOS_CTRL,
 315	NR_QOS_CTRL_PARAMS,
 316};
 317
 318/* io.cost.qos params */
 319enum {
 320	QOS_RPPM,
 321	QOS_RLAT,
 322	QOS_WPPM,
 323	QOS_WLAT,
 324	QOS_MIN,
 325	QOS_MAX,
 326	NR_QOS_PARAMS,
 327};
 328
 329/* io.cost.model controls */
 330enum {
 331	COST_CTRL,
 332	COST_MODEL,
 333	NR_COST_CTRL_PARAMS,
 334};
 335
 336/* builtin linear cost model coefficients */
 337enum {
 338	I_LCOEF_RBPS,
 339	I_LCOEF_RSEQIOPS,
 340	I_LCOEF_RRANDIOPS,
 341	I_LCOEF_WBPS,
 342	I_LCOEF_WSEQIOPS,
 343	I_LCOEF_WRANDIOPS,
 344	NR_I_LCOEFS,
 345};
 346
 347enum {
 348	LCOEF_RPAGE,
 349	LCOEF_RSEQIO,
 350	LCOEF_RRANDIO,
 351	LCOEF_WPAGE,
 352	LCOEF_WSEQIO,
 353	LCOEF_WRANDIO,
 354	NR_LCOEFS,
 355};
 356
 357enum {
 358	AUTOP_INVALID,
 359	AUTOP_HDD,
 360	AUTOP_SSD_QD1,
 361	AUTOP_SSD_DFL,
 362	AUTOP_SSD_FAST,
 363};
 364
 365struct ioc_gq;
 366
 367struct ioc_params {
 368	u32				qos[NR_QOS_PARAMS];
 369	u64				i_lcoefs[NR_I_LCOEFS];
 370	u64				lcoefs[NR_LCOEFS];
 371	u32				too_fast_vrate_pct;
 372	u32				too_slow_vrate_pct;
 373};
 374
 375struct ioc_missed {
 376	u32				nr_met;
 377	u32				nr_missed;
 378	u32				last_met;
 379	u32				last_missed;
 380};
 381
 382struct ioc_pcpu_stat {
 383	struct ioc_missed		missed[2];
 384
 385	u64				rq_wait_ns;
 386	u64				last_rq_wait_ns;
 387};
 388
 389/* per device */
 390struct ioc {
 391	struct rq_qos			rqos;
 392
 393	bool				enabled;
 394
 395	struct ioc_params		params;
 396	u32				period_us;
 397	u32				margin_us;
 398	u64				vrate_min;
 399	u64				vrate_max;
 400
 401	spinlock_t			lock;
 402	struct timer_list		timer;
 403	struct list_head		active_iocgs;	/* active cgroups */
 404	struct ioc_pcpu_stat __percpu	*pcpu_stat;
 405
 406	enum ioc_running		running;
 407	atomic64_t			vtime_rate;
 408
 409	seqcount_spinlock_t		period_seqcount;
 410	u32				period_at;	/* wallclock starttime */
 411	u64				period_at_vtime; /* vtime starttime */
 412
 413	atomic64_t			cur_period;	/* inc'd each period */
 414	int				busy_level;	/* saturation history */
 415
 416	u64				inuse_margin_vtime;
 417	bool				weights_updated;
 418	atomic_t			hweight_gen;	/* for lazy hweights */
 419
 420	u64				autop_too_fast_at;
 421	u64				autop_too_slow_at;
 422	int				autop_idx;
 423	bool				user_qos_params:1;
 424	bool				user_cost_model:1;
 425};
 426
 427/* per device-cgroup pair */
 428struct ioc_gq {
 429	struct blkg_policy_data		pd;
 430	struct ioc			*ioc;
 431
 432	/*
 433	 * An iocg can get its weight from two sources - an explicit
 434	 * per-device-cgroup configuration or the default weight of the
 435	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
 436	 * configuration.  `weight` is the effective weight considering both
 437	 * sources.
 438	 *
 439	 * When an idle cgroup becomes active its `active` goes from 0 to
 440	 * `weight`.  `inuse` is the surplus adjusted active weight.
 441	 * `active` and `inuse` are used to calculate `hweight_active` and
 442	 * `hweight_inuse`.
 443	 *
 444	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
 445	 * surplus adjustments.
 446	 */
 447	u32				cfg_weight;
 448	u32				weight;
 449	u32				active;
 450	u32				inuse;
 451	u32				last_inuse;
 452
 453	sector_t			cursor;		/* to detect randio */
 454
 455	/*
 456	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
 457	 * issued.  If lagging behind device vtime, the delta represents
 458	 * the currently available IO budget.  If running ahead, the
 459	 * overage.
 460	 *
 461	 * `vtime_done` is the same but progressed on completion rather
 462	 * than issue.  The delta behind `vtime` represents the cost of
 463	 * currently in-flight IOs.
 464	 *
 465	 * `last_vtime` is used to remember `vtime` at the end of the last
 466	 * period to calculate utilization.
 467	 */
 468	atomic64_t			vtime;
 469	atomic64_t			done_vtime;
 470	u64				abs_vdebt;
 471	u64				last_vtime;
 472
 473	/*
 474	 * The period this iocg was last active in.  Used for deactivation
 475	 * and invalidating `vtime`.
 476	 */
 477	atomic64_t			active_period;
 478	struct list_head		active_list;
 479
 480	/* see __propagate_active_weight() and current_hweight() for details */
 481	u64				child_active_sum;
 482	u64				child_inuse_sum;
 483	int				hweight_gen;
 484	u32				hweight_active;
 485	u32				hweight_inuse;
 486	bool				has_surplus;
 487
 488	struct wait_queue_head		waitq;
 489	struct hrtimer			waitq_timer;
 490	struct hrtimer			delay_timer;
 491
 492	/* usage is recorded as fractions of HWEIGHT_WHOLE */
 493	int				usage_idx;
 494	u32				usages[NR_USAGE_SLOTS];
 495
 496	/* this iocg's depth in the hierarchy and ancestors including self */
 497	int				level;
 498	struct ioc_gq			*ancestors[];
 499};
 500
 501/* per cgroup */
 502struct ioc_cgrp {
 503	struct blkcg_policy_data	cpd;
 504	unsigned int			dfl_weight;
 505};
 506
 507struct ioc_now {
 508	u64				now_ns;
 509	u32				now;
 510	u64				vnow;
 511	u64				vrate;
 512};
 513
 514struct iocg_wait {
 515	struct wait_queue_entry		wait;
 516	struct bio			*bio;
 517	u64				abs_cost;
 518	bool				committed;
 519};
 520
 521struct iocg_wake_ctx {
 522	struct ioc_gq			*iocg;
 523	u32				hw_inuse;
 524	s64				vbudget;
 525};
 526
 527static const struct ioc_params autop[] = {
 528	[AUTOP_HDD] = {
 529		.qos				= {
 530			[QOS_RLAT]		=        250000, /* 250ms */
 531			[QOS_WLAT]		=        250000,
 532			[QOS_MIN]		= VRATE_MIN_PPM,
 533			[QOS_MAX]		= VRATE_MAX_PPM,
 534		},
 535		.i_lcoefs			= {
 536			[I_LCOEF_RBPS]		=     174019176,
 537			[I_LCOEF_RSEQIOPS]	=         41708,
 538			[I_LCOEF_RRANDIOPS]	=           370,
 539			[I_LCOEF_WBPS]		=     178075866,
 540			[I_LCOEF_WSEQIOPS]	=         42705,
 541			[I_LCOEF_WRANDIOPS]	=           378,
 542		},
 543	},
 544	[AUTOP_SSD_QD1] = {
 545		.qos				= {
 546			[QOS_RLAT]		=         25000, /* 25ms */
 547			[QOS_WLAT]		=         25000,
 548			[QOS_MIN]		= VRATE_MIN_PPM,
 549			[QOS_MAX]		= VRATE_MAX_PPM,
 550		},
 551		.i_lcoefs			= {
 552			[I_LCOEF_RBPS]		=     245855193,
 553			[I_LCOEF_RSEQIOPS]	=         61575,
 554			[I_LCOEF_RRANDIOPS]	=          6946,
 555			[I_LCOEF_WBPS]		=     141365009,
 556			[I_LCOEF_WSEQIOPS]	=         33716,
 557			[I_LCOEF_WRANDIOPS]	=         26796,
 558		},
 559	},
 560	[AUTOP_SSD_DFL] = {
 561		.qos				= {
 562			[QOS_RLAT]		=         25000, /* 25ms */
 563			[QOS_WLAT]		=         25000,
 564			[QOS_MIN]		= VRATE_MIN_PPM,
 565			[QOS_MAX]		= VRATE_MAX_PPM,
 566		},
 567		.i_lcoefs			= {
 568			[I_LCOEF_RBPS]		=     488636629,
 569			[I_LCOEF_RSEQIOPS]	=          8932,
 570			[I_LCOEF_RRANDIOPS]	=          8518,
 571			[I_LCOEF_WBPS]		=     427891549,
 572			[I_LCOEF_WSEQIOPS]	=         28755,
 573			[I_LCOEF_WRANDIOPS]	=         21940,
 574		},
 575		.too_fast_vrate_pct		=           500,
 576	},
 577	[AUTOP_SSD_FAST] = {
 578		.qos				= {
 579			[QOS_RLAT]		=          5000, /* 5ms */
 580			[QOS_WLAT]		=          5000,
 581			[QOS_MIN]		= VRATE_MIN_PPM,
 582			[QOS_MAX]		= VRATE_MAX_PPM,
 583		},
 584		.i_lcoefs			= {
 585			[I_LCOEF_RBPS]		=    3102524156LLU,
 586			[I_LCOEF_RSEQIOPS]	=        724816,
 587			[I_LCOEF_RRANDIOPS]	=        778122,
 588			[I_LCOEF_WBPS]		=    1742780862LLU,
 589			[I_LCOEF_WSEQIOPS]	=        425702,
 590			[I_LCOEF_WRANDIOPS]	=	 443193,
 591		},
 592		.too_slow_vrate_pct		=            10,
 593	},
 594};
 595
 596/*
 597 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 598 * vtime credit shortage and down on device saturation.
 599 */
 600static u32 vrate_adj_pct[] =
 601	{ 0, 0, 0, 0,
 602	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 603	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 604	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
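
    /*
     * For example, a busy_level of +10 indexes vrate_adj_pct[10] == 1 and
     * vrate is scaled to 99% for that period, while a busy_level of -48
     * picks 8 and scales it to 108%; see the busy_level handling in
     * ioc_timer_fn() for the exact clamping.
     */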
 605
 606static struct blkcg_policy blkcg_policy_iocost;
 607
 608/* accessors and helpers */
 609static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 610{
 611	return container_of(rqos, struct ioc, rqos);
 612}
 613
 614static struct ioc *q_to_ioc(struct request_queue *q)
 615{
 616	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 617}
 618
 619static const char *q_name(struct request_queue *q)
 620{
 621	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
 622		return kobject_name(q->kobj.parent);
 623	else
 624		return "<unknown>";
 625}
 626
 627static const char __maybe_unused *ioc_name(struct ioc *ioc)
 628{
 629	return q_name(ioc->rqos.q);
 630}
 631
 632static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 633{
 634	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 635}
 636
 637static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 638{
 639	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 640}
 641
 642static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 643{
 644	return pd_to_blkg(&iocg->pd);
 645}
 646
 647static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 648{
 649	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 650			    struct ioc_cgrp, cpd);
 651}
 652
 653/*
 654 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 655 * weight, the more expensive each IO.  Must round up.
 656 */
 657static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 658{
 659	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
 660}
 661
 662/*
 663 * The inverse of abs_cost_to_cost().  Must round up.
 664 */
 665static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 666{
 667	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
 668}
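
    /*
     * For example, abs_cost_to_cost() with hw_inuse at 25% of HWEIGHT_WHOLE
     * (16384) turns an absolute cost of 1,000,000 vtime into a charged cost
     * of 4,000,000, and cost_to_abs_cost() maps it back down (modulo
     * rounding).
     */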
 669
 670static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
 671{
 672	bio->bi_iocost_cost = cost;
 673	atomic64_add(cost, &iocg->vtime);
 674}
 675
 676#define CREATE_TRACE_POINTS
 677#include <trace/events/iocost.h>
 678
 679/* latency QoS params changed, update period_us and all the dependent params */
 680static void ioc_refresh_period_us(struct ioc *ioc)
 681{
 682	u32 ppm, lat, multi, period_us;
 683
 684	lockdep_assert_held(&ioc->lock);
 685
 686	/* pick the higher latency target */
 687	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 688		ppm = ioc->params.qos[QOS_RPPM];
 689		lat = ioc->params.qos[QOS_RLAT];
 690	} else {
 691		ppm = ioc->params.qos[QOS_WPPM];
 692		lat = ioc->params.qos[QOS_WLAT];
 693	}
 694
 695	/*
 696	 * We want the period to be long enough to contain a healthy number
 697	 * of IOs while short enough for granular control.  Define it as a
 698	 * multiple of the latency target.  Ideally, the multiplier should
 699	 * be scaled according to the percentile so that it would nominally
 700	 * contain a certain number of requests.  Let's be simpler and
 701	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
 702	 */
 703	if (ppm)
 704		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 705	else
 706		multi = 2;
 707	period_us = multi * lat;
 708	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 709
 710	/* calculate dependent params */
 711	ioc->period_us = period_us;
 712	ioc->margin_us = period_us * MARGIN_PCT / 100;
 713	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
 714			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
 715}
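
    /*
     * For example, with a 25ms latency target and a 95th percentile goal
     * (ppm = 950000), the calculation above gives multi = max((1000000 -
     * 950000) / 50000, 2) = 2 and a 50ms period; at the 50th percentile the
     * multiplier would be 10, i.e. 250ms, before clamping to [MIN_PERIOD,
     * MAX_PERIOD].
     */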
 716
 717static int ioc_autop_idx(struct ioc *ioc)
 718{
 719	int idx = ioc->autop_idx;
 720	const struct ioc_params *p = &autop[idx];
 721	u32 vrate_pct;
 722	u64 now_ns;
 723
 724	/* rotational? */
 725	if (!blk_queue_nonrot(ioc->rqos.q))
 726		return AUTOP_HDD;
 727
 728	/* handle SATA SSDs w/ broken NCQ */
 729	if (blk_queue_depth(ioc->rqos.q) == 1)
 730		return AUTOP_SSD_QD1;
 731
 732	/* use one of the normal ssd sets */
 733	if (idx < AUTOP_SSD_DFL)
 734		return AUTOP_SSD_DFL;
 735
 736	/* if user is overriding anything, maintain what was there */
 737	if (ioc->user_qos_params || ioc->user_cost_model)
 738		return idx;
 739
 740	/* step up/down based on the vrate */
 741	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
 742			      VTIME_PER_USEC);
 743	now_ns = ktime_get_ns();
 744
 745	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 746		if (!ioc->autop_too_fast_at)
 747			ioc->autop_too_fast_at = now_ns;
 748		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 749			return idx + 1;
 750	} else {
 751		ioc->autop_too_fast_at = 0;
 752	}
 753
 754	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 755		if (!ioc->autop_too_slow_at)
 756			ioc->autop_too_slow_at = now_ns;
 757		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 758			return idx - 1;
 759	} else {
 760		ioc->autop_too_slow_at = 0;
 761	}
 762
 763	return idx;
 764}
 765
 766/*
 767 * Take the following as input
 768 *
 769 *  @bps	maximum sequential throughput
 770 *  @seqiops	maximum sequential 4k iops
 771 *  @randiops	maximum random 4k iops
 772 *
 773 * and calculate the linear model cost coefficients.
 774 *
 775 *  *@page	per-page cost		1s / (@bps / 4096)
 776 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
 777 *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
 778 */
 779static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 780			u64 *page, u64 *seqio, u64 *randio)
 781{
 782	u64 v;
 783
 784	*page = *seqio = *randio = 0;
 785
 786	if (bps)
 787		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
 788					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
 789
 790	if (seqiops) {
 791		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 792		if (v > *page)
 793			*seqio = v - *page;
 794	}
 795
 796	if (randiops) {
 797		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 798		if (v > *page)
 799			*randio = v - *page;
 800	}
 801}
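
    /*
     * As a rough worked example of calc_lcoefs() with the AUTOP_SSD_DFL
     * parameters (rbps ~489MB/s, rrandiops 8518): *page comes out to ~1.15M
     * vtime per 4k page and *randio to ~15M, so a single 4k random read
     * costs ~16.1M vtime, i.e. ~117us of device time (VTIME_PER_USEC ~137k).
     */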
 802
 803static void ioc_refresh_lcoefs(struct ioc *ioc)
 804{
 805	u64 *u = ioc->params.i_lcoefs;
 806	u64 *c = ioc->params.lcoefs;
 807
 808	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 809		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 810	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 811		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 812}
 813
 814static bool ioc_refresh_params(struct ioc *ioc, bool force)
 815{
 816	const struct ioc_params *p;
 817	int idx;
 818
 819	lockdep_assert_held(&ioc->lock);
 820
 821	idx = ioc_autop_idx(ioc);
 822	p = &autop[idx];
 823
 824	if (idx == ioc->autop_idx && !force)
 825		return false;
 826
 827	if (idx != ioc->autop_idx)
 828		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 829
 830	ioc->autop_idx = idx;
 831	ioc->autop_too_fast_at = 0;
 832	ioc->autop_too_slow_at = 0;
 833
 834	if (!ioc->user_qos_params)
 835		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 836	if (!ioc->user_cost_model)
 837		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 838
 839	ioc_refresh_period_us(ioc);
 840	ioc_refresh_lcoefs(ioc);
 841
 842	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 843					    VTIME_PER_USEC, MILLION);
 844	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
 845				   VTIME_PER_USEC, MILLION);
 846
 847	return true;
 848}
 849
 850/* take a snapshot of the current [v]time and vrate */
 851static void ioc_now(struct ioc *ioc, struct ioc_now *now)
 852{
 853	unsigned seq;
 854
 855	now->now_ns = ktime_get();
 856	now->now = ktime_to_us(now->now_ns);
 857	now->vrate = atomic64_read(&ioc->vtime_rate);
 858
 859	/*
 860	 * The current vtime is
 861	 *
 862	 *   vtime at period start + (wallclock time since the start) * vrate
 863	 *
 864	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
 865	 * needed, they're seqcount protected.
 866	 */
 867	do {
 868		seq = read_seqcount_begin(&ioc->period_seqcount);
 869		now->vnow = ioc->period_at_vtime +
 870			(now->now - ioc->period_at) * now->vrate;
 871	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
 872}
 873
 874static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
 875{
 876	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
 877
 878	write_seqcount_begin(&ioc->period_seqcount);
 879	ioc->period_at = now->now;
 880	ioc->period_at_vtime = now->vnow;
 881	write_seqcount_end(&ioc->period_seqcount);
 882
 883	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
 884	add_timer(&ioc->timer);
 885}
 886
 887/*
 888 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 889 * weight sums and propagate upwards accordingly.
 890 */
 891static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
 892{
 893	struct ioc *ioc = iocg->ioc;
 894	int lvl;
 895
 896	lockdep_assert_held(&ioc->lock);
 897
 898	inuse = min(active, inuse);
 899
 900	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
 901		struct ioc_gq *parent = iocg->ancestors[lvl];
 902		struct ioc_gq *child = iocg->ancestors[lvl + 1];
 903		u32 parent_active = 0, parent_inuse = 0;
 904
 905		/* update the level sums */
 906		parent->child_active_sum += (s32)(active - child->active);
 907		parent->child_inuse_sum += (s32)(inuse - child->inuse);
 908		/* apply the updates */
 909		child->active = active;
 910		child->inuse = inuse;
 911
 912		/*
 913		 * The delta between inuse and active sums indicates how much
 914		 * weight is being given away.  Parent's inuse
 915		 * and active should reflect the ratio.
 916		 */
 917		if (parent->child_active_sum) {
 918			parent_active = parent->weight;
 919			parent_inuse = DIV64_U64_ROUND_UP(
 920				parent_active * parent->child_inuse_sum,
 921				parent->child_active_sum);
 922		}
 923
 924		/* do we need to keep walking up? */
 925		if (parent_active == parent->active &&
 926		    parent_inuse == parent->inuse)
 927			break;
 928
 929		active = parent_active;
 930		inuse = parent_inuse;
 931	}
 932
 933	ioc->weights_updated = true;
 934}
 935
 936static void commit_active_weights(struct ioc *ioc)
 937{
 938	lockdep_assert_held(&ioc->lock);
 939
 940	if (ioc->weights_updated) {
 941		/* paired with rmb in current_hweight(), see there */
 942		smp_wmb();
 943		atomic_inc(&ioc->hweight_gen);
 944		ioc->weights_updated = false;
 945	}
 946}
 947
 948static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
 949{
 950	__propagate_active_weight(iocg, active, inuse);
 951	commit_active_weights(iocg->ioc);
 952}
 953
 954static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
 955{
 956	struct ioc *ioc = iocg->ioc;
 957	int lvl;
 958	u32 hwa, hwi;
 959	int ioc_gen;
 960
 961	/* hot path - if uptodate, use cached */
 962	ioc_gen = atomic_read(&ioc->hweight_gen);
 963	if (ioc_gen == iocg->hweight_gen)
 964		goto out;
 965
 966	/*
 967	 * Paired with wmb in commit_active_weights().  If we saw the
 968	 * updated hweight_gen, all the weight updates from
 969	 * __propagate_active_weight() are visible too.
 970	 *
 971	 * We can race with weight updates during calculation and get it
 972	 * wrong.  However, hweight_gen would have changed and a future
 973	 * reader will recalculate and we're guaranteed to discard the
 974	 * wrong result soon.
 975	 */
 976	smp_rmb();
 977
 978	hwa = hwi = HWEIGHT_WHOLE;
 979	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
 980		struct ioc_gq *parent = iocg->ancestors[lvl];
 981		struct ioc_gq *child = iocg->ancestors[lvl + 1];
 982		u32 active_sum = READ_ONCE(parent->child_active_sum);
 983		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
 984		u32 active = READ_ONCE(child->active);
 985		u32 inuse = READ_ONCE(child->inuse);
 986
 987		/* we can race with deactivations and either may read as zero */
 988		if (!active_sum || !inuse_sum)
 989			continue;
 990
 991		active_sum = max(active, active_sum);
 992		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
 993
 994		inuse_sum = max(inuse, inuse_sum);
 995		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
 996	}
 997
 998	iocg->hweight_active = max_t(u32, hwa, 1);
 999	iocg->hweight_inuse = max_t(u32, hwi, 1);
1000	iocg->hweight_gen = ioc_gen;
1001out:
1002	if (hw_activep)
1003		*hw_activep = iocg->hweight_active;
1004	if (hw_inusep)
1005		*hw_inusep = iocg->hweight_inuse;
1006}
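
    /*
     * With the example hierarchy from the comment at the top of this file
     * and everyone active (A0: active=100 under A whose child_active_sum is
     * 200; A: active=100 under root whose child_active_sum is 400), the
     * walk in current_hweight() above yields
     * hwa = HWEIGHT_WHOLE * 100/400 * 100/200 = 8192, i.e. the 12.5%
     * hweight described there.
     */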
1007
1008static void weight_updated(struct ioc_gq *iocg)
1009{
1010	struct ioc *ioc = iocg->ioc;
1011	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1012	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1013	u32 weight;
1014
1015	lockdep_assert_held(&ioc->lock);
1016
1017	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1018	if (weight != iocg->weight && iocg->active)
1019		propagate_active_weight(iocg, weight,
1020			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1021	iocg->weight = weight;
1022}
1023
1024static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1025{
1026	struct ioc *ioc = iocg->ioc;
1027	u64 last_period, cur_period, max_period_delta;
1028	u64 vtime, vmargin, vmin;
1029	int i;
1030
1031	/*
1032	 * If we seem to be already active, just update the stamp to tell the
1033	 * timer that we're still active.  We don't mind occasional races.
1034	 */
1035	if (!list_empty(&iocg->active_list)) {
1036		ioc_now(ioc, now);
1037		cur_period = atomic64_read(&ioc->cur_period);
1038		if (atomic64_read(&iocg->active_period) != cur_period)
1039			atomic64_set(&iocg->active_period, cur_period);
1040		return true;
1041	}
1042
1043	/* racy check on internal node IOs, treat as root level IOs */
1044	if (iocg->child_active_sum)
1045		return false;
1046
1047	spin_lock_irq(&ioc->lock);
1048
1049	ioc_now(ioc, now);
1050
1051	/* update period */
1052	cur_period = atomic64_read(&ioc->cur_period);
1053	last_period = atomic64_read(&iocg->active_period);
1054	atomic64_set(&iocg->active_period, cur_period);
1055
1056	/* already activated or breaking leaf-only constraint? */
1057	if (!list_empty(&iocg->active_list))
1058		goto succeed_unlock;
1059	for (i = iocg->level - 1; i > 0; i--)
1060		if (!list_empty(&iocg->ancestors[i]->active_list))
1061			goto fail_unlock;
1062
1063	if (iocg->child_active_sum)
1064		goto fail_unlock;
1065
1066	/*
1067	 * vtime may wrap when vrate is raised substantially due to
1068	 * underestimated IO costs.  Look at the period and ignore its
1069	 * vtime if the iocg has been idle for too long.  Also, cap the
1070	 * budget it can start with to the margin.
1071	 */
1072	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1073	vtime = atomic64_read(&iocg->vtime);
1074	vmargin = ioc->margin_us * now->vrate;
1075	vmin = now->vnow - vmargin;
1076
1077	if (last_period + max_period_delta < cur_period ||
1078	    time_before64(vtime, vmin)) {
1079		atomic64_add(vmin - vtime, &iocg->vtime);
1080		atomic64_add(vmin - vtime, &iocg->done_vtime);
1081		vtime = vmin;
1082	}
1083
1084	/*
1085	 * Activate, propagate weight and start period timer if not
1086	 * running.  Reset hweight_gen to avoid accidental match from
1087	 * wrapping.
1088	 */
1089	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1090	list_add(&iocg->active_list, &ioc->active_iocgs);
1091	propagate_active_weight(iocg, iocg->weight,
1092				iocg->last_inuse ?: iocg->weight);
1093
1094	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1095			last_period, cur_period, vtime);
1096
1097	iocg->last_vtime = vtime;
1098
1099	if (ioc->running == IOC_IDLE) {
1100		ioc->running = IOC_RUNNING;
1101		ioc_start_period(ioc, now);
1102	}
1103
1104succeed_unlock:
1105	spin_unlock_irq(&ioc->lock);
1106	return true;
1107
1108fail_unlock:
1109	spin_unlock_irq(&ioc->lock);
1110	return false;
1111}
1112
1113static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1114			int flags, void *key)
1115{
1116	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1117	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1118	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1119
1120	ctx->vbudget -= cost;
1121
1122	if (ctx->vbudget < 0)
1123		return -1;
1124
1125	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1126
1127	/*
1128	 * autoremove_wake_function() removes the wait entry only when it
1129	 * actually changed the task state.  We want the wait always
1130	 * removed.  Remove explicitly and use default_wake_function().
1131	 */
1132	list_del_init(&wq_entry->entry);
1133	wait->committed = true;
1134
1135	default_wake_function(wq_entry, mode, flags, key);
1136	return 0;
1137}
1138
1139static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1140{
1141	struct ioc *ioc = iocg->ioc;
1142	struct iocg_wake_ctx ctx = { .iocg = iocg };
1143	u64 margin_ns = (u64)(ioc->period_us *
1144			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1145	u64 vdebt, vshortage, expires, oexpires;
1146	s64 vbudget;
1147	u32 hw_inuse;
1148
1149	lockdep_assert_held(&iocg->waitq.lock);
1150
1151	current_hweight(iocg, NULL, &hw_inuse);
1152	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1153
1154	/* pay off debt */
1155	vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1156	if (vdebt && vbudget > 0) {
1157		u64 delta = min_t(u64, vbudget, vdebt);
1158		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1159				    iocg->abs_vdebt);
1160
1161		atomic64_add(delta, &iocg->vtime);
1162		atomic64_add(delta, &iocg->done_vtime);
1163		iocg->abs_vdebt -= abs_delta;
1164	}
1165
1166	/*
1167	 * Wake up the ones which are due and see how much vtime we'll need
1168	 * for the next one.
1169	 */
1170	ctx.hw_inuse = hw_inuse;
1171	ctx.vbudget = vbudget - vdebt;
1172	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1173	if (!waitqueue_active(&iocg->waitq))
1174		return;
1175	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1176		return;
1177
1178	/* determine next wakeup, add a quarter margin to guarantee chunking */
1179	vshortage = -ctx.vbudget;
1180	expires = now->now_ns +
1181		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1182	expires += margin_ns / 4;
1183
1184	/* if already active and close enough, don't bother */
1185	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1186	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1187	    abs(oexpires - expires) <= margin_ns / 4)
1188		return;
1189
1190	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1191			       margin_ns / 4, HRTIMER_MODE_ABS);
1192}
1193
1194static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1195{
1196	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1197	struct ioc_now now;
1198	unsigned long flags;
1199
1200	ioc_now(iocg->ioc, &now);
1201
1202	spin_lock_irqsave(&iocg->waitq.lock, flags);
1203	iocg_kick_waitq(iocg, &now);
1204	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1205
1206	return HRTIMER_NORESTART;
1207}
1208
1209static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1210{
1211	struct ioc *ioc = iocg->ioc;
1212	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1213	u64 vtime = atomic64_read(&iocg->vtime);
1214	u64 vmargin = ioc->margin_us * now->vrate;
1215	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1216	u64 delta_ns, expires, oexpires;
1217	u32 hw_inuse;
1218
1219	lockdep_assert_held(&iocg->waitq.lock);
1220
1221	/* debt-adjust vtime */
1222	current_hweight(iocg, NULL, &hw_inuse);
1223	vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1224
1225	/*
1226	 * Clear or maintain depending on the overage. Non-zero vdebt is what
1227	 * guarantees that @iocg is online and future iocg_kick_delay() will
1228	 * clear use_delay. Don't leave it on when there's no vdebt.
1229	 */
1230	if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1231		blkcg_clear_delay(blkg);
1232		return false;
1233	}
1234	if (!atomic_read(&blkg->use_delay) &&
1235	    time_before_eq64(vtime, now->vnow + vmargin))
1236		return false;
1237
1238	/* use delay */
1239	delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1240				      now->vrate) * NSEC_PER_USEC;
1241	blkcg_set_delay(blkg, delta_ns);
1242	expires = now->now_ns + delta_ns;
1243
1244	/* if already active and close enough, don't bother */
1245	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1246	if (hrtimer_is_queued(&iocg->delay_timer) &&
1247	    abs(oexpires - expires) <= margin_ns / 4)
1248		return true;
1249
1250	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1251			       margin_ns / 4, HRTIMER_MODE_ABS);
1252	return true;
1253}
1254
1255static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1256{
1257	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1258	struct ioc_now now;
1259	unsigned long flags;
1260
1261	spin_lock_irqsave(&iocg->waitq.lock, flags);
1262	ioc_now(iocg->ioc, &now);
1263	iocg_kick_delay(iocg, &now);
1264	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1265
1266	return HRTIMER_NORESTART;
1267}
1268
1269static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1270{
1271	u32 nr_met[2] = { };
1272	u32 nr_missed[2] = { };
1273	u64 rq_wait_ns = 0;
1274	int cpu, rw;
1275
1276	for_each_online_cpu(cpu) {
1277		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1278		u64 this_rq_wait_ns;
1279
1280		for (rw = READ; rw <= WRITE; rw++) {
1281			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1282			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1283
1284			nr_met[rw] += this_met - stat->missed[rw].last_met;
1285			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1286			stat->missed[rw].last_met = this_met;
1287			stat->missed[rw].last_missed = this_missed;
1288		}
1289
1290		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1291		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1292		stat->last_rq_wait_ns = this_rq_wait_ns;
1293	}
1294
1295	for (rw = READ; rw <= WRITE; rw++) {
1296		if (nr_met[rw] + nr_missed[rw])
1297			missed_ppm_ar[rw] =
1298				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1299						   nr_met[rw] + nr_missed[rw]);
1300		else
1301			missed_ppm_ar[rw] = 0;
1302	}
1303
1304	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1305				   ioc->period_us * NSEC_PER_USEC);
1306}
1307
1308/* was iocg idle this period? */
1309static bool iocg_is_idle(struct ioc_gq *iocg)
1310{
1311	struct ioc *ioc = iocg->ioc;
1312
1313	/* did something get issued this period? */
1314	if (atomic64_read(&iocg->active_period) ==
1315	    atomic64_read(&ioc->cur_period))
1316		return false;
1317
1318	/* is something in flight? */
1319	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1320		return false;
1321
1322	return true;
1323}
1324
1325/* returns usage with margin added if surplus is large enough */
1326static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1327{
1328	/* add margin */
1329	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1330	usage += SURPLUS_SCALE_ABS;
1331
1332	/* don't bother if the surplus is too small */
1333	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1334		return 0;
1335
1336	return usage;
1337}
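
    /*
     * For example, surplus_adjusted_hweight_inuse() scales a usage of ~20%
     * (13107) with hw_inuse at 50% (32768) to 13107 * 125/100 + 1310 =
     * 17694 (~27%); as that is still more than SURPLUS_MIN_ADJ_DELTA (~3%)
     * below hw_inuse, 17694 is returned as the donation target.
     */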
1338
1339static void ioc_timer_fn(struct timer_list *timer)
1340{
1341	struct ioc *ioc = container_of(timer, struct ioc, timer);
1342	struct ioc_gq *iocg, *tiocg;
1343	struct ioc_now now;
1344	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1345	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1346	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1347	u32 missed_ppm[2], rq_wait_pct;
1348	u64 period_vtime;
1349	int prev_busy_level, i;
1350
1351	/* how were the latencies during the period? */
1352	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1353
1354	/* take care of active iocgs */
1355	spin_lock_irq(&ioc->lock);
1356
1357	ioc_now(ioc, &now);
1358
1359	period_vtime = now.vnow - ioc->period_at_vtime;
1360	if (WARN_ON_ONCE(!period_vtime)) {
1361		spin_unlock_irq(&ioc->lock);
1362		return;
1363	}
1364
1365	/*
1366	 * Waiters determine the sleep durations based on the vrate they
1367	 * saw at the time of sleep.  If vrate has increased, some waiters
1368	 * could be sleeping for too long.  Wake up tardy waiters which
1369	 * should have woken up in the last period and expire idle iocgs.
1370	 */
1371	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1372		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1373		    !iocg_is_idle(iocg))
1374			continue;
1375
1376		spin_lock(&iocg->waitq.lock);
1377
1378		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1379			/* might be oversleeping vtime / hweight changes, kick */
1380			iocg_kick_waitq(iocg, &now);
1381			iocg_kick_delay(iocg, &now);
1382		} else if (iocg_is_idle(iocg)) {
1383			/* no waiter and idle, deactivate */
1384			iocg->last_inuse = iocg->inuse;
1385			__propagate_active_weight(iocg, 0, 0);
1386			list_del_init(&iocg->active_list);
1387		}
1388
1389		spin_unlock(&iocg->waitq.lock);
1390	}
1391	commit_active_weights(ioc);
1392
1393	/* calc usages and see whether some weights need to be moved around */
1394	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1395		u64 vdone, vtime, vusage, vmargin, vmin;
1396		u32 hw_active, hw_inuse, usage;
1397
1398		/*
1399		 * Collect unused and wind vtime closer to vnow to prevent
1400		 * iocgs from accumulating a large amount of budget.
1401		 */
1402		vdone = atomic64_read(&iocg->done_vtime);
1403		vtime = atomic64_read(&iocg->vtime);
1404		current_hweight(iocg, &hw_active, &hw_inuse);
1405
1406		/*
1407		 * Latency QoS detection doesn't account for IOs which are
1408		 * in-flight for longer than a period.  Detect them by
1409		 * comparing vdone against period start.  If lagging behind
1410		 * IOs from past periods, don't increase vrate.
1411		 */
1412		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1413		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1414		    time_after64(vtime, vdone) &&
1415		    time_after64(vtime, now.vnow -
1416				 MAX_LAGGING_PERIODS * period_vtime) &&
1417		    time_before64(vdone, now.vnow - period_vtime))
1418			nr_lagging++;
1419
1420		if (waitqueue_active(&iocg->waitq))
1421			vusage = now.vnow - iocg->last_vtime;
1422		else if (time_before64(iocg->last_vtime, vtime))
1423			vusage = vtime - iocg->last_vtime;
1424		else
1425			vusage = 0;
1426
1427		iocg->last_vtime += vusage;
1428		/*
1429		 * Factor in-flight vtime into vusage to avoid high-latency
1430		 * completions appearing as idle.  This should be done after
1431		 * the above ->last_vtime adjustment.
1432		 */
1433		vusage = max(vusage, vtime - vdone);
1434
1435		/* calculate hweight based usage ratio and record */
1436		if (vusage) {
1437			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1438						   period_vtime);
1439			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1440			iocg->usages[iocg->usage_idx] = usage;
1441		} else {
1442			usage = 0;
1443		}
1444
1445		/* see whether there's surplus vtime */
1446		vmargin = ioc->margin_us * now.vrate;
1447		vmin = now.vnow - vmargin;
1448
1449		iocg->has_surplus = false;
1450
1451		if (!waitqueue_active(&iocg->waitq) &&
1452		    time_before64(vtime, vmin)) {
1453			u64 delta = vmin - vtime;
1454
1455			/* throw away surplus vtime */
1456			atomic64_add(delta, &iocg->vtime);
1457			atomic64_add(delta, &iocg->done_vtime);
1458			iocg->last_vtime += delta;
1459			/* if usage is sufficiently low, maybe it can donate */
1460			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1461				iocg->has_surplus = true;
1462				nr_surpluses++;
1463			}
1464		} else if (hw_inuse < hw_active) {
1465			u32 new_hwi, new_inuse;
1466
1467			/* was donating but might need to take back some */
1468			if (waitqueue_active(&iocg->waitq)) {
1469				new_hwi = hw_active;
1470			} else {
1471				new_hwi = max(hw_inuse,
1472					      usage * SURPLUS_SCALE_PCT / 100 +
1473					      SURPLUS_SCALE_ABS);
1474			}
1475
1476			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1477					      hw_inuse);
1478			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1479
1480			if (new_inuse > iocg->inuse) {
1481				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1482						iocg->inuse, new_inuse,
1483						hw_inuse, new_hwi);
1484				__propagate_active_weight(iocg, iocg->weight,
1485							  new_inuse);
1486			}
1487		} else {
1488			/* genuinely out of vtime */
1489			nr_shortages++;
1490		}
1491	}
1492
1493	if (!nr_shortages || !nr_surpluses)
1494		goto skip_surplus_transfers;
1495
1496	/* there are both shortages and surpluses, transfer surpluses */
1497	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1498		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1499		int nr_valid = 0;
1500
1501		if (!iocg->has_surplus)
1502			continue;
1503
1504		/* base the decision on max historical usage */
1505		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1506			if (iocg->usages[i]) {
1507				usage = max(usage, iocg->usages[i]);
1508				nr_valid++;
1509			}
1510		}
1511		if (nr_valid < MIN_VALID_USAGES)
1512			continue;
1513
1514		current_hweight(iocg, &hw_active, &hw_inuse);
1515		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1516		if (!new_hwi)
1517			continue;
1518
1519		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1520					       hw_inuse);
1521		if (new_inuse < iocg->inuse) {
1522			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1523					iocg->inuse, new_inuse,
1524					hw_inuse, new_hwi);
1525			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1526		}
1527	}
1528skip_surplus_transfers:
1529	commit_active_weights(ioc);
1530
1531	/*
1532	 * If q is getting clogged or we're missing too much, we're issuing
1533	 * too much IO and should lower vtime rate.  If we're not missing
1534	 * and experiencing shortages but not surpluses, we're too stingy
1535	 * and should increase vtime rate.
1536	 */
1537	prev_busy_level = ioc->busy_level;
1538	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1539	    missed_ppm[READ] > ppm_rthr ||
1540	    missed_ppm[WRITE] > ppm_wthr) {
1541		/* clearly missing QoS targets, slow down vrate */
1542		ioc->busy_level = max(ioc->busy_level, 0);
1543		ioc->busy_level++;
1544	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1545		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1546		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1547		/* QoS targets are being met with >25% margin */
1548		if (nr_shortages) {
1549			/*
1550			 * We're throttling while the device has spare
1551			 * capacity.  If vrate was being slowed down, stop.
1552			 */
1553			ioc->busy_level = min(ioc->busy_level, 0);
1554
1555			/*
1556			 * If there are IOs spanning multiple periods, wait
1557			 * them out before pushing the device harder.  If
1558			 * there are surpluses, let redistribution work it
1559			 * out first.
1560			 */
1561			if (!nr_lagging && !nr_surpluses)
1562				ioc->busy_level--;
1563		} else {
1564			/*
1565			 * Nobody is being throttled and the users aren't
1566			 * issuing enough IOs to saturate the device.  We
1567			 * simply don't know how close the device is to
1568			 * saturation.  Coast.
1569			 */
1570			ioc->busy_level = 0;
1571		}
1572	} else {
1573		/* inside the hysteresis margin, we're good */
1574		ioc->busy_level = 0;
1575	}
1576
1577	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1578
1579	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1580		u64 vrate = atomic64_read(&ioc->vtime_rate);
1581		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1582
1583		/* rq_wait signal is always reliable, ignore user vrate_min */
1584		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1585			vrate_min = VRATE_MIN;
1586
1587		/*
1588		 * If vrate is out of bounds, apply clamp gradually as the
1589		 * bounds can change abruptly.  Otherwise, apply busy_level
1590		 * based adjustment.
1591		 */
1592		if (vrate < vrate_min) {
1593			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1594					  100);
1595			vrate = min(vrate, vrate_min);
1596		} else if (vrate > vrate_max) {
1597			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1598					  100);
1599			vrate = max(vrate, vrate_max);
1600		} else {
1601			int idx = min_t(int, abs(ioc->busy_level),
1602					ARRAY_SIZE(vrate_adj_pct) - 1);
1603			u32 adj_pct = vrate_adj_pct[idx];
1604
1605			if (ioc->busy_level > 0)
1606				adj_pct = 100 - adj_pct;
1607			else
1608				adj_pct = 100 + adj_pct;
1609
1610			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1611				      vrate_min, vrate_max);
1612		}
1613
1614		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1615					   nr_lagging, nr_shortages,
1616					   nr_surpluses);
1617
1618		atomic64_set(&ioc->vtime_rate, vrate);
1619		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1620			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1621	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1622		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1623					   missed_ppm, rq_wait_pct, nr_lagging,
1624					   nr_shortages, nr_surpluses);
1625	}
1626
1627	ioc_refresh_params(ioc, false);
1628
1629	/*
1630	 * This period is done.  Move onto the next one.  If nothing's
1631	 * going on with the device, stop the timer.
1632	 */
1633	atomic64_inc(&ioc->cur_period);
1634
1635	if (ioc->running != IOC_STOP) {
1636		if (!list_empty(&ioc->active_iocgs)) {
1637			ioc_start_period(ioc, &now);
1638		} else {
1639			ioc->busy_level = 0;
1640			ioc->running = IOC_IDLE;
1641		}
1642	}
1643
1644	spin_unlock_irq(&ioc->lock);
1645}
1646
1647static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1648				    bool is_merge, u64 *costp)
1649{
1650	struct ioc *ioc = iocg->ioc;
1651	u64 coef_seqio, coef_randio, coef_page;
1652	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1653	u64 seek_pages = 0;
1654	u64 cost = 0;
1655
1656	switch (bio_op(bio)) {
1657	case REQ_OP_READ:
1658		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1659		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1660		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1661		break;
1662	case REQ_OP_WRITE:
1663		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1664		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1665		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1666		break;
1667	default:
1668		goto out;
1669	}
1670
1671	if (iocg->cursor) {
1672		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1673		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1674	}
1675
1676	if (!is_merge) {
1677		if (seek_pages > LCOEF_RANDIO_PAGES) {
1678			cost += coef_randio;
1679		} else {
1680			cost += coef_seqio;
1681		}
1682	}
1683	cost += pages * coef_page;
1684out:
1685	*costp = cost;
1686}
1687
1688static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1689{
1690	u64 cost;
1691
1692	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1693	return cost;
1694}
1695
1696static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1697					 u64 *costp)
1698{
1699	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1700
1701	switch (req_op(rq)) {
1702	case REQ_OP_READ:
1703		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1704		break;
1705	case REQ_OP_WRITE:
1706		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1707		break;
1708	default:
1709		*costp = 0;
1710	}
1711}
1712
1713static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1714{
1715	u64 cost;
1716
1717	calc_size_vtime_cost_builtin(rq, ioc, &cost);
1718	return cost;
1719}
1720
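/*
 * Main throttling hook.  Activate @iocg, estimate the cost of @bio and,
 * if charging it would overrun the cgroup's vtime budget, either record
 * the cost as debt (for bios which must be issued anyway, e.g. issued
 * as the root blkg or with a fatal signal pending) or sleep on
 * iocg->waitq until enough budget becomes available.
 */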
1721static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1722{
1723	struct blkcg_gq *blkg = bio->bi_blkg;
1724	struct ioc *ioc = rqos_to_ioc(rqos);
1725	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1726	struct ioc_now now;
1727	struct iocg_wait wait;
1728	u32 hw_active, hw_inuse;
1729	u64 abs_cost, cost, vtime;
1730
1731	/* bypass IOs if disabled or for root cgroup */
1732	if (!ioc->enabled || !iocg->level)
1733		return;
1734
1735	/* always activate so that even 0 cost IOs get protected to some level */
1736	if (!iocg_activate(iocg, &now))
1737		return;
1738
1739	/* calculate the absolute vtime cost */
1740	abs_cost = calc_vtime_cost(bio, iocg, false);
1741	if (!abs_cost)
1742		return;
1743
1744	iocg->cursor = bio_end_sector(bio);
1745
1746	vtime = atomic64_read(&iocg->vtime);
1747	current_hweight(iocg, &hw_active, &hw_inuse);
1748
1749	if (hw_inuse < hw_active &&
1750	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1751		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1752				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1753		spin_lock_irq(&ioc->lock);
1754		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1755		spin_unlock_irq(&ioc->lock);
1756		current_hweight(iocg, &hw_active, &hw_inuse);
1757	}
1758
1759	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1760
1761	/*
1762	 * If no one's waiting and within budget, issue right away.  The
1763	 * tests are racy but the races aren't systemic - we only miss once
1764	 * in a while, which is fine.
1765	 */
1766	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1767	    time_before_eq64(vtime + cost, now.vnow)) {
1768		iocg_commit_bio(iocg, bio, cost);
1769		return;
1770	}
1771
1772	/*
1773	 * We activated above but w/o any synchronization. Deactivation is
1774	 * synchronized with waitq.lock and we won't get deactivated as long
1775	 * as we're waiting or have debt, so we're good if we're activated
1776	 * here. In the unlikely case that we aren't, just issue the IO.
1777	 */
1778	spin_lock_irq(&iocg->waitq.lock);
1779
1780	if (unlikely(list_empty(&iocg->active_list))) {
1781		spin_unlock_irq(&iocg->waitq.lock);
1782		iocg_commit_bio(iocg, bio, cost);
1783		return;
1784	}
1785
1786	/*
1787	 * We're over budget. If @bio has to be issued regardless, remember
1788	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1789	 * off the debt before waking more IOs.
1790	 *
1791	 * This way, the debt is continuously paid off each period with the
1792	 * actual budget available to the cgroup. If we just wound vtime, we
1793	 * would incorrectly use the current hw_inuse for the entire amount
1794	 * which, for example, can lead to the cgroup staying blocked for a
1795	 * long time even with substantially raised hw_inuse.
1796	 *
1797	 * An iocg with vdebt should stay online so that the timer can keep
1798	 * deducting its vdebt and [de]activating the use_delay mechanism
1799	 * accordingly. We don't want to race against the timer trying to
1800	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1801	 * penalizing the cgroup and its descendants.
1802	 */
1803	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1804		iocg->abs_vdebt += abs_cost;
1805		if (iocg_kick_delay(iocg, &now))
1806			blkcg_schedule_throttle(rqos->q,
1807					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1808		spin_unlock_irq(&iocg->waitq.lock);
1809		return;
1810	}
1811
1812	/*
1813	 * Append self to the waitq and schedule the wakeup timer if we're
1814	 * the first waiter.  The timer duration is calculated based on the
1815	 * current vrate.  vtime and hweight changes can make it too short
1816	 * or too long.  Each wait entry records the absolute cost it's
1817	 * waiting for to allow re-evaluation using a custom wait entry.
1818	 *
1819	 * If too short, the timer simply reschedules itself.  If too long,
1820	 * the period timer will notice and trigger wakeups.
1821	 *
1822	 * All waiters are on iocg->waitq and the wait states are
1823	 * synchronized using waitq.lock.
1824	 */
1825	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1826	wait.wait.private = current;
1827	wait.bio = bio;
1828	wait.abs_cost = abs_cost;
1829	wait.committed = false;	/* will be set true by waker */
1830
1831	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1832	iocg_kick_waitq(iocg, &now);
1833
1834	spin_unlock_irq(&iocg->waitq.lock);
1835
1836	while (true) {
1837		set_current_state(TASK_UNINTERRUPTIBLE);
1838		if (wait.committed)
1839			break;
1840		io_schedule();
1841	}
1842
1843	/* waker already committed us, proceed */
1844	finish_wait(&iocg->waitq, &wait.wait);
1845}
1846
1847static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1848			   struct bio *bio)
1849{
1850	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1851	struct ioc *ioc = iocg->ioc;
1852	sector_t bio_end = bio_end_sector(bio);
1853	struct ioc_now now;
1854	u32 hw_inuse;
1855	u64 abs_cost, cost;
1856	unsigned long flags;
1857
1858	/* bypass if disabled or for root cgroup */
1859	if (!ioc->enabled || !iocg->level)
1860		return;
1861
1862	abs_cost = calc_vtime_cost(bio, iocg, true);
1863	if (!abs_cost)
1864		return;
1865
1866	ioc_now(ioc, &now);
1867	current_hweight(iocg, NULL, &hw_inuse);
1868	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1869
1870	/* update cursor if backmerging into the request at the cursor */
1871	if (blk_rq_pos(rq) < bio_end &&
1872	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1873		iocg->cursor = bio_end;
1874
1875	/*
1876	 * Charge if there's enough vtime budget and the existing request has
1877	 * cost assigned.
1878	 */
1879	if (rq->bio && rq->bio->bi_iocost_cost &&
1880	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1881		iocg_commit_bio(iocg, bio, cost);
1882		return;
1883	}
1884
1885	/*
1886	 * Otherwise, account it as debt if @iocg is online, which it should
1887	 * be for the vast majority of cases. See debt handling in
1888	 * ioc_rqos_throttle() for details.
1889	 */
1890	spin_lock_irqsave(&iocg->waitq.lock, flags);
1891	if (likely(!list_empty(&iocg->active_list))) {
1892		iocg->abs_vdebt += abs_cost;
1893		iocg_kick_delay(iocg, &now);
1894	} else {
1895		iocg_commit_bio(iocg, bio, cost);
1896	}
1897	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1898}
1899
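/*
 * Bio completion hook.  Credit the vtime charged at issue time to
 * done_vtime so the controller can tell how much of the charged work
 * has actually finished.
 */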
1900static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1901{
1902	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1903
1904	if (iocg && bio->bi_iocost_cost)
1905		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1906}
1907
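/*
 * Request completion hook.  Subtract the model's intrinsic size cost
 * from the on-device time and compare the remainder against the
 * configured read/write latency target to count the completion as met
 * or missed; also accumulate the allocation-to-start wait time.  The
 * period timer folds these per-cpu counters into missed_ppm and
 * rq_wait_pct for the vrate adjustments above.
 */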
1908static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1909{
1910	struct ioc *ioc = rqos_to_ioc(rqos);
1911	u64 on_q_ns, rq_wait_ns, size_nsec;
1912	int pidx, rw;
1913
1914	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1915		return;
1916
1917	switch (req_op(rq) & REQ_OP_MASK) {
1918	case REQ_OP_READ:
1919		pidx = QOS_RLAT;
1920		rw = READ;
1921		break;
1922	case REQ_OP_WRITE:
1923		pidx = QOS_WLAT;
1924		rw = WRITE;
1925		break;
1926	default:
1927		return;
1928	}
1929
1930	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1931	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1932	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
1933
1934	if (on_q_ns <= size_nsec ||
1935	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1936		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1937	else
1938		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1939
1940	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1941}
1942
1943static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1944{
1945	struct ioc *ioc = rqos_to_ioc(rqos);
1946
1947	spin_lock_irq(&ioc->lock);
1948	ioc_refresh_params(ioc, false);
1949	spin_unlock_irq(&ioc->lock);
1950}
1951
1952static void ioc_rqos_exit(struct rq_qos *rqos)
1953{
1954	struct ioc *ioc = rqos_to_ioc(rqos);
1955
1956	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1957
1958	spin_lock_irq(&ioc->lock);
1959	ioc->running = IOC_STOP;
1960	spin_unlock_irq(&ioc->lock);
1961
1962	del_timer_sync(&ioc->timer);
1963	free_percpu(ioc->pcpu_stat);
1964	kfree(ioc);
1965}
1966
1967static struct rq_qos_ops ioc_rqos_ops = {
1968	.throttle = ioc_rqos_throttle,
1969	.merge = ioc_rqos_merge,
1970	.done_bio = ioc_rqos_done_bio,
1971	.done = ioc_rqos_done,
1972	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1973	.exit = ioc_rqos_exit,
1974};
1975
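/*
 * Set up iocost for @q: per-cpu stats, the rq_qos hooks above and the
 * blkcg policy.  The controller starts out disabled and is enabled
 * (along with rq alloc time stamping) through writes to io.cost.qos.
 */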
1976static int blk_iocost_init(struct request_queue *q)
1977{
1978	struct ioc *ioc;
1979	struct rq_qos *rqos;
1980	int ret;
1981
1982	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1983	if (!ioc)
1984		return -ENOMEM;
1985
1986	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1987	if (!ioc->pcpu_stat) {
1988		kfree(ioc);
1989		return -ENOMEM;
1990	}
1991
1992	rqos = &ioc->rqos;
1993	rqos->id = RQ_QOS_COST;
1994	rqos->ops = &ioc_rqos_ops;
1995	rqos->q = q;
1996
1997	spin_lock_init(&ioc->lock);
1998	timer_setup(&ioc->timer, ioc_timer_fn, 0);
1999	INIT_LIST_HEAD(&ioc->active_iocgs);
2000
2001	ioc->running = IOC_IDLE;
2002	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2003	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2004	ioc->period_at = ktime_to_us(ktime_get());
2005	atomic64_set(&ioc->cur_period, 0);
2006	atomic_set(&ioc->hweight_gen, 0);
2007
2008	spin_lock_irq(&ioc->lock);
2009	ioc->autop_idx = AUTOP_INVALID;
2010	ioc_refresh_params(ioc, true);
2011	spin_unlock_irq(&ioc->lock);
2012
2013	rq_qos_add(q, rqos);
2014	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2015	if (ret) {
2016		rq_qos_del(q, rqos);
2017		free_percpu(ioc->pcpu_stat);
2018		kfree(ioc);
2019		return ret;
2020	}
2021	return 0;
2022}
2023
2024static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2025{
2026	struct ioc_cgrp *iocc;
2027
2028	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2029	if (!iocc)
2030		return NULL;
2031
2032	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2033	return &iocc->cpd;
2034}
2035
2036static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2037{
2038	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2039}
2040
2041static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2042					     struct blkcg *blkcg)
2043{
2044	int levels = blkcg->css.cgroup->level + 1;
2045	struct ioc_gq *iocg;
2046
2047	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2048	if (!iocg)
2049		return NULL;
2050
2051	return &iocg->pd;
2052}
2053
2054static void ioc_pd_init(struct blkg_policy_data *pd)
2055{
2056	struct ioc_gq *iocg = pd_to_iocg(pd);
2057	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2058	struct ioc *ioc = q_to_ioc(blkg->q);
2059	struct ioc_now now;
2060	struct blkcg_gq *tblkg;
2061	unsigned long flags;
2062
2063	ioc_now(ioc, &now);
2064
2065	iocg->ioc = ioc;
2066	atomic64_set(&iocg->vtime, now.vnow);
2067	atomic64_set(&iocg->done_vtime, now.vnow);
2068	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2069	INIT_LIST_HEAD(&iocg->active_list);
2070	iocg->hweight_active = HWEIGHT_WHOLE;
2071	iocg->hweight_inuse = HWEIGHT_WHOLE;
2072
2073	init_waitqueue_head(&iocg->waitq);
2074	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2075	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2076	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2077	iocg->delay_timer.function = iocg_delay_timer_fn;
2078
2079	iocg->level = blkg->blkcg->css.cgroup->level;
2080
2081	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2082		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2083		iocg->ancestors[tiocg->level] = tiocg;
2084	}
2085
2086	spin_lock_irqsave(&ioc->lock, flags);
2087	weight_updated(iocg);
2088	spin_unlock_irqrestore(&ioc->lock, flags);
2089}
2090
2091static void ioc_pd_free(struct blkg_policy_data *pd)
2092{
2093	struct ioc_gq *iocg = pd_to_iocg(pd);
2094	struct ioc *ioc = iocg->ioc;
2095	unsigned long flags;
2096
2097	if (ioc) {
2098		spin_lock_irqsave(&ioc->lock, flags);
2099		if (!list_empty(&iocg->active_list)) {
2100			propagate_active_weight(iocg, 0, 0);
2101			list_del_init(&iocg->active_list);
2102		}
2103		spin_unlock_irqrestore(&ioc->lock, flags);
2104
2105		hrtimer_cancel(&iocg->waitq_timer);
2106		hrtimer_cancel(&iocg->delay_timer);
2107	}
2108	kfree(iocg);
2109}
2110
2111static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2112			     int off)
2113{
2114	const char *dname = blkg_dev_name(pd->blkg);
2115	struct ioc_gq *iocg = pd_to_iocg(pd);
2116
2117	if (dname && iocg->cfg_weight)
2118		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2119	return 0;
2120}
2121	
2123static int ioc_weight_show(struct seq_file *sf, void *v)
2124{
2125	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2126	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2127
2128	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2129	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2130			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2131	return 0;
2132}
2133
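/*
 * Example writes accepted below; the device numbers are illustrative:
 *
 *   "default 150"   - set this cgroup's default weight for all devices
 *   "150"           - same as above
 *   "8:16 300"      - set the weight for device 8:16
 *   "8:16 default"  - clear the per-device weight and fall back to the
 *                     cgroup default
 */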
2134static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2135				size_t nbytes, loff_t off)
2136{
2137	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2138	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2139	struct blkg_conf_ctx ctx;
2140	struct ioc_gq *iocg;
2141	u32 v;
2142	int ret;
2143
2144	if (!strchr(buf, ':')) {
2145		struct blkcg_gq *blkg;
2146
2147		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2148			return -EINVAL;
2149
2150		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2151			return -EINVAL;
2152
2153		spin_lock(&blkcg->lock);
2154		iocc->dfl_weight = v;
2155		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2156			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2157
2158			if (iocg) {
2159				spin_lock_irq(&iocg->ioc->lock);
2160				weight_updated(iocg);
2161				spin_unlock_irq(&iocg->ioc->lock);
2162			}
2163		}
2164		spin_unlock(&blkcg->lock);
2165
2166		return nbytes;
2167	}
2168
2169	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2170	if (ret)
2171		return ret;
2172
2173	iocg = blkg_to_iocg(ctx.blkg);
2174
2175	if (!strncmp(ctx.body, "default", 7)) {
2176		v = 0;
2177	} else {
2178		if (!sscanf(ctx.body, "%u", &v))
2179			goto einval;
2180		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2181			goto einval;
2182	}
2183
2184	spin_lock(&iocg->ioc->lock);
2185	iocg->cfg_weight = v;
2186	weight_updated(iocg);
2187	spin_unlock(&iocg->ioc->lock);
2188
2189	blkg_conf_finish(&ctx);
2190	return nbytes;
2191
2192einval:
2193	blkg_conf_finish(&ctx);
2194	return -EINVAL;
2195}
2196
2197static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2198			  int off)
2199{
2200	const char *dname = blkg_dev_name(pd->blkg);
2201	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2202
2203	if (!dname)
2204		return 0;
2205
2206	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2207		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2208		   ioc->params.qos[QOS_RPPM] / 10000,
2209		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2210		   ioc->params.qos[QOS_RLAT],
2211		   ioc->params.qos[QOS_WPPM] / 10000,
2212		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2213		   ioc->params.qos[QOS_WLAT],
2214		   ioc->params.qos[QOS_MIN] / 10000,
2215		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2216		   ioc->params.qos[QOS_MAX] / 10000,
2217		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2218	return 0;
2219}
2220
2221static int ioc_qos_show(struct seq_file *sf, void *v)
2222{
2223	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2224
2225	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2226			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2227	return 0;
2228}
2229
2230static const match_table_t qos_ctrl_tokens = {
2231	{ QOS_ENABLE,		"enable=%u"	},
2232	{ QOS_CTRL,		"ctrl=%s"	},
2233	{ NR_QOS_CTRL_PARAMS,	NULL		},
2234};
2235
2236static const match_table_t qos_tokens = {
2237	{ QOS_RPPM,		"rpct=%s"	},
2238	{ QOS_RLAT,		"rlat=%u"	},
2239	{ QOS_WPPM,		"wpct=%s"	},
2240	{ QOS_WLAT,		"wlat=%u"	},
2241	{ QOS_MIN,		"min=%s"	},
2242	{ QOS_MAX,		"max=%s"	},
2243	{ NR_QOS_PARAMS,	NULL		},
2244};
2245
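/*
 * A write to io.cost.qos is a device specifier followed by key=value
 * pairs matching the token tables above, for example (all numbers
 * illustrative):
 *
 *   8:16 enable=1 ctrl=auto rpct=95.00 rlat=5000 wpct=95.00 wlat=5000
 *        min=50.00 max=150.00
 *
 * Percentages take two decimal places and min/max bound the vtime rate
 * in percent; setting any QoS parameter switches ctrl to "user".
 */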
2246static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2247			     size_t nbytes, loff_t off)
2248{
2249	struct gendisk *disk;
2250	struct ioc *ioc;
2251	u32 qos[NR_QOS_PARAMS];
2252	bool enable, user;
2253	char *p;
2254	int ret;
2255
2256	disk = blkcg_conf_get_disk(&input);
2257	if (IS_ERR(disk))
2258		return PTR_ERR(disk);
2259
2260	ioc = q_to_ioc(disk->queue);
2261	if (!ioc) {
2262		ret = blk_iocost_init(disk->queue);
2263		if (ret)
2264			goto err;
2265		ioc = q_to_ioc(disk->queue);
2266	}
2267
2268	spin_lock_irq(&ioc->lock);
2269	memcpy(qos, ioc->params.qos, sizeof(qos));
2270	enable = ioc->enabled;
2271	user = ioc->user_qos_params;
2272	spin_unlock_irq(&ioc->lock);
2273
2274	while ((p = strsep(&input, " \t\n"))) {
2275		substring_t args[MAX_OPT_ARGS];
2276		char buf[32];
2277		int tok;
2278		s64 v;
2279
2280		if (!*p)
2281			continue;
2282
2283		switch (match_token(p, qos_ctrl_tokens, args)) {
2284		case QOS_ENABLE:
2285			match_u64(&args[0], &v);
2286			enable = v;
2287			continue;
2288		case QOS_CTRL:
2289			match_strlcpy(buf, &args[0], sizeof(buf));
2290			if (!strcmp(buf, "auto"))
2291				user = false;
2292			else if (!strcmp(buf, "user"))
2293				user = true;
2294			else
2295				goto einval;
2296			continue;
2297		}
2298
2299		tok = match_token(p, qos_tokens, args);
2300		switch (tok) {
2301		case QOS_RPPM:
2302		case QOS_WPPM:
2303			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2304			    sizeof(buf))
2305				goto einval;
2306			if (cgroup_parse_float(buf, 2, &v))
2307				goto einval;
2308			if (v < 0 || v > 10000)
2309				goto einval;
2310			qos[tok] = v * 100;
2311			break;
2312		case QOS_RLAT:
2313		case QOS_WLAT:
2314			if (match_u64(&args[0], &v))
2315				goto einval;
2316			qos[tok] = v;
2317			break;
2318		case QOS_MIN:
2319		case QOS_MAX:
2320			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2321			    sizeof(buf))
2322				goto einval;
2323			if (cgroup_parse_float(buf, 2, &v))
2324				goto einval;
2325			if (v < 0)
2326				goto einval;
2327			qos[tok] = clamp_t(s64, v * 100,
2328					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2329			break;
2330		default:
2331			goto einval;
2332		}
2333		user = true;
2334	}
2335
2336	if (qos[QOS_MIN] > qos[QOS_MAX])
2337		goto einval;
2338
2339	spin_lock_irq(&ioc->lock);
2340
2341	if (enable) {
2342		blk_stat_enable_accounting(ioc->rqos.q);
2343		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2344		ioc->enabled = true;
2345	} else {
2346		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2347		ioc->enabled = false;
2348	}
2349
2350	if (user) {
2351		memcpy(ioc->params.qos, qos, sizeof(qos));
2352		ioc->user_qos_params = true;
2353	} else {
2354		ioc->user_qos_params = false;
2355	}
2356
2357	ioc_refresh_params(ioc, true);
2358	spin_unlock_irq(&ioc->lock);
2359
2360	put_disk_and_module(disk);
2361	return nbytes;
2362einval:
2363	ret = -EINVAL;
2364err:
2365	put_disk_and_module(disk);
2366	return ret;
2367}
2368
2369static u64 ioc_cost_model_prfill(struct seq_file *sf,
2370				 struct blkg_policy_data *pd, int off)
2371{
2372	const char *dname = blkg_dev_name(pd->blkg);
2373	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2374	u64 *u = ioc->params.i_lcoefs;
2375
2376	if (!dname)
2377		return 0;
2378
2379	seq_printf(sf, "%s ctrl=%s model=linear "
2380		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2381		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2382		   dname, ioc->user_cost_model ? "user" : "auto",
2383		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2384		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2385	return 0;
2386}
2387
2388static int ioc_cost_model_show(struct seq_file *sf, void *v)
2389{
2390	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2391
2392	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2393			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2394	return 0;
2395}
2396
2397static const match_table_t cost_ctrl_tokens = {
2398	{ COST_CTRL,		"ctrl=%s"	},
2399	{ COST_MODEL,		"model=%s"	},
2400	{ NR_COST_CTRL_PARAMS,	NULL		},
2401};
2402
2403static const match_table_t i_lcoef_tokens = {
2404	{ I_LCOEF_RBPS,		"rbps=%u"	},
2405	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
2406	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
2407	{ I_LCOEF_WBPS,		"wbps=%u"	},
2408	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
2409	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
2410	{ NR_I_LCOEFS,		NULL		},
2411};
2412
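/*
 * A write to io.cost.model takes the same device-specifier-plus-tokens
 * form, for example (coefficient values illustrative only):
 *
 *   8:16 ctrl=user model=linear rbps=174019176 rseqiops=41198
 *        rrandiops=370 wbps=178075866 wseqiops=42242 wrandiops=378
 *
 * Setting any coefficient switches the device to user parameters while
 * "ctrl=auto" reverts to the built-in defaults.
 */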
2413static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2414				    size_t nbytes, loff_t off)
2415{
2416	struct gendisk *disk;
2417	struct ioc *ioc;
2418	u64 u[NR_I_LCOEFS];
2419	bool user;
2420	char *p;
2421	int ret;
2422
2423	disk = blkcg_conf_get_disk(&input);
2424	if (IS_ERR(disk))
2425		return PTR_ERR(disk);
2426
2427	ioc = q_to_ioc(disk->queue);
2428	if (!ioc) {
2429		ret = blk_iocost_init(disk->queue);
2430		if (ret)
2431			goto err;
2432		ioc = q_to_ioc(disk->queue);
2433	}
2434
2435	spin_lock_irq(&ioc->lock);
2436	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2437	user = ioc->user_cost_model;
2438	spin_unlock_irq(&ioc->lock);
2439
2440	while ((p = strsep(&input, " \t\n"))) {
2441		substring_t args[MAX_OPT_ARGS];
2442		char buf[32];
2443		int tok;
2444		u64 v;
2445
2446		if (!*p)
2447			continue;
2448
2449		switch (match_token(p, cost_ctrl_tokens, args)) {
2450		case COST_CTRL:
2451			match_strlcpy(buf, &args[0], sizeof(buf));
2452			if (!strcmp(buf, "auto"))
2453				user = false;
2454			else if (!strcmp(buf, "user"))
2455				user = true;
2456			else
2457				goto einval;
2458			continue;
2459		case COST_MODEL:
2460			match_strlcpy(buf, &args[0], sizeof(buf));
2461			if (strcmp(buf, "linear"))
2462				goto einval;
2463			continue;
2464		}
2465
2466		tok = match_token(p, i_lcoef_tokens, args);
2467		if (tok == NR_I_LCOEFS)
2468			goto einval;
2469		if (match_u64(&args[0], &v))
2470			goto einval;
2471		u[tok] = v;
2472		user = true;
2473	}
2474
2475	spin_lock_irq(&ioc->lock);
2476	if (user) {
2477		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2478		ioc->user_cost_model = true;
2479	} else {
2480		ioc->user_cost_model = false;
2481	}
2482	ioc_refresh_params(ioc, true);
2483	spin_unlock_irq(&ioc->lock);
2484
2485	put_disk_and_module(disk);
2486	return nbytes;
2487
2488einval:
2489	ret = -EINVAL;
2490err:
2491	put_disk_and_module(disk);
2492	return ret;
2493}
2494
2495static struct cftype ioc_files[] = {
2496	{
2497		.name = "weight",
2498		.flags = CFTYPE_NOT_ON_ROOT,
2499		.seq_show = ioc_weight_show,
2500		.write = ioc_weight_write,
2501	},
2502	{
2503		.name = "cost.qos",
2504		.flags = CFTYPE_ONLY_ON_ROOT,
2505		.seq_show = ioc_qos_show,
2506		.write = ioc_qos_write,
2507	},
2508	{
2509		.name = "cost.model",
2510		.flags = CFTYPE_ONLY_ON_ROOT,
2511		.seq_show = ioc_cost_model_show,
2512		.write = ioc_cost_model_write,
2513	},
2514	{}
2515};
2516
2517static struct blkcg_policy blkcg_policy_iocost = {
2518	.dfl_cftypes	= ioc_files,
2519	.cpd_alloc_fn	= ioc_cpd_alloc,
2520	.cpd_free_fn	= ioc_cpd_free,
2521	.pd_alloc_fn	= ioc_pd_alloc,
2522	.pd_init_fn	= ioc_pd_init,
2523	.pd_free_fn	= ioc_pd_free,
2524};
2525
2526static int __init ioc_init(void)
2527{
2528	return blkcg_policy_register(&blkcg_policy_iocost);
2529}
2530
2531static void __exit ioc_exit(void)
2532{
2533	return blkcg_policy_unregister(&blkcg_policy_iocost);
2534}
2535
2536module_init(ioc_init);
2537module_exit(ioc_exit);