   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * IO cost model based controller.
   4 *
   5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7 * Copyright (C) 2019 Facebook
   8 *
   9 * One challenge of controlling IO resources is the lack of trivially
  10 * observable cost metric.  This is distinguished from CPU and memory where
  11 * wallclock time and the number of bytes can serve as accurate enough
  12 * approximations.
  13 *
  14 * Bandwidth and iops are the most commonly used metrics for IO devices but
  15 * depending on the type and specifics of the device, different IO patterns
  16 * easily lead to multiple orders of magnitude variations rendering them
  17 * useless for the purpose of IO capacity distribution.  While on-device
  18 * time, with a lot of crutches, could serve as a useful approximation for
  19 * non-queued rotational devices, this is no longer viable with modern
  20 * devices, even the rotational ones.
  21 *
  22 * While there is no cost metric we can trivially observe, it isn't a
  23 * complete mystery.  For example, on a rotational device, seek cost
  24 * dominates while a contiguous transfer contributes a smaller amount
  25 * proportional to the size.  If we can characterize at least the relative
  26 * costs of these different types of IOs, it should be possible to
  27 * implement a reasonable work-conserving proportional IO resource
  28 * distribution.
  29 *
  30 * 1. IO Cost Model
  31 *
  32 * IO cost model estimates the cost of an IO given its basic parameters and
  33 * history (e.g. the end sector of the last IO).  The cost is measured in
  34 * device time.  If a given IO is estimated to cost 10ms, the device should
  35 * be able to process ~100 of those IOs in a second.
  36 *
  37 * Currently, there's only one builtin cost model - linear.  Each IO is
  38 * classified as sequential or random and given a base cost accordingly.
  39 * On top of that, a size cost proportional to the length of the IO is
  40 * added.  While simple, this model captures the operational
  41 * characteristics of a wide variety of devices well enough.  Default
  42 * parameters for several different classes of devices are provided and the
  43 * parameters can be configured from userspace via
  44 * /sys/fs/cgroup/io.cost.model.
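     *
     * As a made-up illustration (these are not the default coefficients
     * below): if a sequential read has a base cost worth 10us of device
     * time and each 4k page adds 1us, a 64k sequential read (16 pages)
     * would be charged roughly 10us + 16 * 1us = 26us.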
  45 *
  46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47 * device-specific coefficients.
  48 *
  52 * 2. Control Strategy
  53 *
  54 * The device virtual time (vtime) is used as the primary control metric.
  55 * The control strategy is composed of the following three parts.
  56 *
  57 * 2-1. Vtime Distribution
  58 *
  59 * When a cgroup becomes active in terms of IOs, its hierarchical share is
  60 * calculated.  Please consider the following hierarchy where the numbers
  61 * inside parentheses denote the configured weights.
  62 *
  63 *           root
  64 *         /       \
  65 *      A (w:100)  B (w:300)
  66 *      /       \
  67 *  A0 (w:100)  A1 (w:100)
  68 *
  69 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  70 * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
  71 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  72 * 12.5% each.  The distribution mechanism only cares about these flattened
  73 * shares.  They're called hweights (hierarchical weights) and always add
  74 * up to 1 (HWEIGHT_WHOLE).
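     *
     * Internally these fractions are fixed-point: with HWEIGHT_WHOLE of
     * 1 << 16, A0's 12.5% hweight above is stored as 65536 / 8 = 8192.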
  75 *
  76 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  77 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  78 * against the device vtime - an IO which takes 10ms on the underlying
  79 * device is considered to take 80ms on A0.
  80 *
  81 * This constitutes the basis of IO capacity distribution.  Each cgroup's
  82 * vtime is running at a rate determined by its hweight.  A cgroup tracks
  83 * the vtime consumed by past IOs and can issue a new IO iff doing so
  84 * wouldn't outrun the current device vtime.  Otherwise, the IO is
  85 * suspended until the vtime has progressed enough to cover it.
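     *
     * Roughly, the admission check amounts to the following (see
     * ioc_rqos_throttle() for the real thing):
     *
     *	if (iocg_vtime + cost <= device_vnow)
     *		issue the bio now;
     *	else
     *		wait until device_vnow catches up;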
  86 *
  87 * 2-2. Vrate Adjustment
  88 *
  89 * It's unrealistic to expect the cost model to be perfect.  There are too
  90 * many devices and even on the same device the overall performance
  91 * fluctuates depending on numerous factors such as IO mixture and device
  92 * internal garbage collection.  The controller needs to adapt dynamically.
  93 *
  94 * This is achieved by adjusting the overall IO rate according to how busy
  95 * the device is.  If the device becomes overloaded, we're sending down too
  96 * many IOs and should generally slow down.  If there are waiting issuers
  97 * but the device isn't saturated, we're issuing too few and should
  98 * generally speed up.
  99 *
 100 * To slow down, we lower the vrate - the rate at which the device vtime
 101 * passes compared to the wall clock.  For example, if the vtime is running
 102 * at the vrate of 75%, all cgroups added up would only be able to issue
 103 * 750ms worth of IOs per second, and vice-versa for speeding up.
 104 *
 105 * Device busyness is determined using two criteria - rq wait and
 106 * completion latencies.
 107 *
 108 * When a device gets saturated, the on-device and then the request queues
 109 * fill up and a bio which is ready to be issued has to wait for a request
 110 * to become available.  When this delay becomes noticeable, it's a clear
 111 * indication that the device is saturated and we lower the vrate.  This
 112 * saturation signal is fairly conservative as it only triggers when both
 113 * hardware and software queues are filled up, and is used as the default
 114 * busy signal.
 115 *
 116 * As devices can have deep queues and be unfair in how the queued commands
 117 * are executed, solely depending on rq wait may not result in satisfactory
 118 * control quality.  For a better control quality, completion latency QoS
 119 * parameters can be configured so that the device is considered saturated
 120 * if N'th percentile completion latency rises above the set point.
 121 *
 122 * The completion latency requirements are a function of both the
 123 * underlying device characteristics and the desired IO latency quality of
 124 * service.  There is an inherent trade-off - the tighter the latency QoS,
 125 * the higher the bandwidth loss.  Latency QoS is disabled by default
 126 * and can be set through /sys/fs/cgroup/io.cost.qos.
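     *
     * For example, a QoS line along the lines of the following (device
     * number and values are illustrative) declares the device saturated
     * whenever the p95 read or write completion latency exceeds 75ms:
     *
     *	8:16 enable=1 ctrl=user rpct=95.00 rlat=75000 wpct=95.00 wlat=75000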
 127 *
 128 * 2-3. Work Conservation
 129 *
 130 * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 131 * periodically while B is sending out enough parallel IOs to saturate the
 132 * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 133 * cost per second, i.e., 10% of the device capacity.  The naive
 134 * distribution of half and half would lead to 60% utilization of the
 135 * device, a significant reduction in the total amount of work done
 136 * compared to free-for-all competition.  This is too high a cost to pay
 137 * for IO control.
 138 *
 139 * To conserve the total amount of work done, we keep track of how much
 140 * each active cgroup is actually using and yield part of its weight if
 141 * there are other cgroups which can make use of it.  In the above case,
 142 * A's weight will be lowered so that it hovers above the actual usage and
 143 * B would be able to use the rest.
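     *
     * Continuing the example with the surplus parameters defined below,
     * A's inuse hweight would settle at roughly 10% * 1.25 + 2% = 14.5%,
     * leaving ~85.5% for B and recovering most of the lost utilization.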
 144 *
 145 * As we don't want to penalize a cgroup for donating its weight, the
 146 * surplus weight adjustment factors in a margin and has an immediate
 147 * snapback mechanism in case the cgroup needs more IO vtime for itself.
 148 *
 149 * Note that adjusting down surplus weights has the same effects as
 150 * accelerating vtime for other cgroups and work conservation can also be
 151 * implemented by adjusting vrate dynamically.  However, working out who
 152 * can donate and how much should be taken back requires hweight
 153 * propagation anyway, so it is easier to implement and understand as a
 154 * separate mechanism.
 155 *
 156 * 3. Monitoring
 157 *
 158 * Instead of debugfs or other clumsy monitoring mechanisms, this
 159 * controller uses a drgn based monitoring script -
 160 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 161 * https://github.com/osandov/drgn.  The output looks like the following.
 162 *
 163 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 164 *                 active      weight      hweight% inflt% dbt  delay usages%
 165 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 166 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 167 *
 168 * - per	: Timer period
 169 * - cur_per	: Internal wall and device vtime clock
 170 * - vrate	: Device virtual time rate against wall clock
 171 * - weight	: Surplus-adjusted and configured weights
 172 * - hweight	: Surplus-adjusted and configured hierarchical weights
 173 * - inflt	: The percentage of in-flight IO cost at the end of last period
 174 * - del_ms	: Deferred issuer delay induction level and duration
 175 * - usages	: Usage history
 176 */
 177
 178#include <linux/kernel.h>
 179#include <linux/module.h>
 180#include <linux/timer.h>
 181#include <linux/time64.h>
 182#include <linux/parser.h>
 183#include <linux/sched/signal.h>
 184#include <linux/blk-cgroup.h>
 185#include "blk-rq-qos.h"
 186#include "blk-stat.h"
 187#include "blk-wbt.h"
 188
 189#ifdef CONFIG_TRACEPOINTS
 190
 191/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 192#define TRACE_IOCG_PATH_LEN 1024
 193static DEFINE_SPINLOCK(trace_iocg_path_lock);
 194static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 195
 196#define TRACE_IOCG_PATH(type, iocg, ...)					\
 197	do {									\
 198		unsigned long flags;						\
 199		if (trace_iocost_##type##_enabled()) {				\
 200			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
 201			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
 202				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
 203			trace_iocost_##type(iocg, trace_iocg_path,		\
 204					      ##__VA_ARGS__);			\
 205			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
 206		}								\
 207	} while (0)
 208
 209#else	/* CONFIG_TRACEPOINTS */
 210#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
 211#endif	/* CONFIG_TRACEPOINTS */
 212
 213enum {
 214	MILLION			= 1000000,
 215
 216	/* timer period is calculated from latency requirements, bound it */
 217	MIN_PERIOD		= USEC_PER_MSEC,
 218	MAX_PERIOD		= USEC_PER_SEC,
 219
 220	/*
 221	 * A cgroup's vtime can run 50% behind the device vtime, which
 222	 * serves as its IO credit buffer.  Surplus weight adjustment is
 223	 * immediately canceled if the vtime margin runs below 10%.
 224	 */
 225	MARGIN_PCT		= 50,
 226	INUSE_MARGIN_PCT	= 10,
 227
 228	/* Have some play in waitq timer operations */
 229	WAITQ_TIMER_MARGIN_PCT	= 5,
 230
 231	/*
 232	 * vtime can wrap well within a reasonable uptime when vrate is
 233	 * consistently raised.  Don't trust recorded cgroup vtime if the
 234	 * period counter indicates that it's older than 5mins.
 235	 */
 236	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
 237
 238	/*
 239	 * Remember the past three non-zero usages and use the max for
 240	 * surplus calculation.  Three slots guarantee that we remember one
 241	 * full period usage from the last active stretch even after
 242	 * partial deactivation and re-activation periods.  Don't start
 243	 * giving away weight before collecting two data points to prevent
 244	 * hweight adjustments based on one partial activation period.
 245	 */
 246	NR_USAGE_SLOTS		= 3,
 247	MIN_VALID_USAGES	= 2,
 248
 249	/* 1/64k is granular enough and can easily be handled w/ u32 */
 250	HWEIGHT_WHOLE		= 1 << 16,
 251
 252	/*
 253	 * As vtime is used to calculate the cost of each IO, it needs to
 254	 * be fairly high precision.  For example, it should be able to
 255	 * represent the cost of a single page worth of discard with
 256	 * sufficient accuracy.  At the same time, it should be able to
 257	 * represent reasonably long enough durations to be useful and
 258	 * convenient during operation.
 259	 *
 260	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 261	 * granularity and days of wrap-around time even at extreme vrates.
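	 * (2^37 per second works out to ~137 vtime ticks per nanosecond,
	 * and a 64bit counter takes 2^27 seconds - over four years - to
	 * wrap at 100% vrate, still about two weeks at the 10000% ceiling.)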
 262	 */
 263	VTIME_PER_SEC_SHIFT	= 37,
 264	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
 265	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
 266
 267	/* bound vrate adjustments within two orders of magnitude */
 268	VRATE_MIN_PPM		= 10000,	/* 1% */
 269	VRATE_MAX_PPM		= 100000000,	/* 10000% */
 270
 271	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 272	VRATE_CLAMP_ADJ_PCT	= 4,
 273
 274	/* if IOs end up waiting for requests, issue less */
 275	RQ_WAIT_BUSY_PCT	= 5,
 276
 277	/* unbusy hysteresis */
 278	UNBUSY_THR_PCT		= 75,
 279
 280	/* don't let cmds which take a very long time pin lagging for too long */
 281	MAX_LAGGING_PERIODS	= 10,
 282
 283	/*
 284	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
 285	 * donate the surplus.
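	 * e.g. an iocg using 20% with a 40% hweight: 20% * 1.25 + 2% = 27%,
	 * which is more than 3% below 40%, so it's considered to have surplus.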
 286	 */
 287	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
 288	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
 289	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */
 290
 291	/* switch iff the conditions are met for longer than this */
 292	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
 293
 294	/*
 295	 * Count IO size in 4k pages.  The 12bit shift helps keep the
 296	 * size-proportional components of the cost calculation within a
 297	 * similar number of digits to the per-IO cost components.
 298	 */
 299	IOC_PAGE_SHIFT		= 12,
 300	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
 301	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
 302
 303	/* if more than 16M apart, consider randio for the linear model */
 304	LCOEF_RANDIO_PAGES	= 4096,
 305};
 306
 307enum ioc_running {
 308	IOC_IDLE,
 309	IOC_RUNNING,
 310	IOC_STOP,
 311};
 312
 313/* io.cost.qos controls including per-dev enable of the whole controller */
 314enum {
 315	QOS_ENABLE,
 316	QOS_CTRL,
 317	NR_QOS_CTRL_PARAMS,
 318};
 319
 320/* io.cost.qos params */
 321enum {
 322	QOS_RPPM,
 323	QOS_RLAT,
 324	QOS_WPPM,
 325	QOS_WLAT,
 326	QOS_MIN,
 327	QOS_MAX,
 328	NR_QOS_PARAMS,
 329};
 330
 331/* io.cost.model controls */
 332enum {
 333	COST_CTRL,
 334	COST_MODEL,
 335	NR_COST_CTRL_PARAMS,
 336};
 337
 338/* builtin linear cost model coefficients */
 339enum {
 340	I_LCOEF_RBPS,
 341	I_LCOEF_RSEQIOPS,
 342	I_LCOEF_RRANDIOPS,
 343	I_LCOEF_WBPS,
 344	I_LCOEF_WSEQIOPS,
 345	I_LCOEF_WRANDIOPS,
 346	NR_I_LCOEFS,
 347};
 348
 349enum {
 350	LCOEF_RPAGE,
 351	LCOEF_RSEQIO,
 352	LCOEF_RRANDIO,
 353	LCOEF_WPAGE,
 354	LCOEF_WSEQIO,
 355	LCOEF_WRANDIO,
 356	NR_LCOEFS,
 357};
 358
 359enum {
 360	AUTOP_INVALID,
 361	AUTOP_HDD,
 362	AUTOP_SSD_QD1,
 363	AUTOP_SSD_DFL,
 364	AUTOP_SSD_FAST,
 365};
 366
 367struct ioc_gq;
 368
 369struct ioc_params {
 370	u32				qos[NR_QOS_PARAMS];
 371	u64				i_lcoefs[NR_I_LCOEFS];
 372	u64				lcoefs[NR_LCOEFS];
 373	u32				too_fast_vrate_pct;
 374	u32				too_slow_vrate_pct;
 375};
 376
 377struct ioc_missed {
 378	u32				nr_met;
 379	u32				nr_missed;
 380	u32				last_met;
 381	u32				last_missed;
 382};
 383
 384struct ioc_pcpu_stat {
 385	struct ioc_missed		missed[2];
 386
 387	u64				rq_wait_ns;
 388	u64				last_rq_wait_ns;
 389};
 390
 391/* per device */
 392struct ioc {
 393	struct rq_qos			rqos;
 394
 395	bool				enabled;
 396
 397	struct ioc_params		params;
 398	u32				period_us;
 399	u32				margin_us;
 400	u64				vrate_min;
 401	u64				vrate_max;
 402
 403	spinlock_t			lock;
 404	struct timer_list		timer;
 405	struct list_head		active_iocgs;	/* active cgroups */
 406	struct ioc_pcpu_stat __percpu	*pcpu_stat;
 407
 408	enum ioc_running		running;
 409	atomic64_t			vtime_rate;
 410
 411	seqcount_t			period_seqcount;
 412	u32				period_at;	/* wallclock starttime */
 413	u64				period_at_vtime; /* vtime starttime */
 414
 415	atomic64_t			cur_period;	/* inc'd each period */
 416	int				busy_level;	/* saturation history */
 417
 418	u64				inuse_margin_vtime;
 419	bool				weights_updated;
 420	atomic_t			hweight_gen;	/* for lazy hweights */
 421
 422	u64				autop_too_fast_at;
 423	u64				autop_too_slow_at;
 424	int				autop_idx;
 425	bool				user_qos_params:1;
 426	bool				user_cost_model:1;
 427};
 428
 429/* per device-cgroup pair */
 430struct ioc_gq {
 431	struct blkg_policy_data		pd;
 432	struct ioc			*ioc;
 433
 434	/*
 435	 * An iocg can get its weight from two sources - an explicit
 436	 * per-device-cgroup configuration or the default weight of the
 437	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
 438	 * configuration.  `weight` is the effective weight considering both
 439	 * sources.
 440	 *
 441	 * When an idle cgroup becomes active its `active` goes from 0 to
 442	 * `weight`.  `inuse` is the surplus adjusted active weight.
 443	 * `active` and `inuse` are used to calculate `hweight_active` and
 444	 * `hweight_inuse`.
 445	 *
 446	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
 447	 * surplus adjustments.
 448	 */
 449	u32				cfg_weight;
 450	u32				weight;
 451	u32				active;
 452	u32				inuse;
 453	u32				last_inuse;
 454
 455	sector_t			cursor;		/* to detect randio */
 456
 457	/*
 458	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
 459	 * issued.  If lagging behind device vtime, the delta represents
 460	 * the currently available IO budget.  If running ahead, the
 461	 * overage.
 462	 *
 463	 * `vtime_done` is the same but progressed on completion rather
 464	 * than issue.  The delta behind `vtime` represents the cost of
 465	 * currently in-flight IOs.
 466	 *
 467	 * `last_vtime` is used to remember `vtime` at the end of the last
 468	 * period to calculate utilization.
 469	 */
 470	atomic64_t			vtime;
 471	atomic64_t			done_vtime;
 472	atomic64_t			abs_vdebt;
 473	u64				last_vtime;
 474
 475	/*
 476	 * The period this iocg was last active in.  Used for deactivation
 477	 * and invalidating `vtime`.
 478	 */
 479	atomic64_t			active_period;
 480	struct list_head		active_list;
 481
 482	/* see __propagate_active_weight() and current_hweight() for details */
 483	u64				child_active_sum;
 484	u64				child_inuse_sum;
 485	int				hweight_gen;
 486	u32				hweight_active;
 487	u32				hweight_inuse;
 488	bool				has_surplus;
 489
 490	struct wait_queue_head		waitq;
 491	struct hrtimer			waitq_timer;
 492	struct hrtimer			delay_timer;
 493
 494	/* usage is recorded as fractions of HWEIGHT_WHOLE */
 495	int				usage_idx;
 496	u32				usages[NR_USAGE_SLOTS];
 497
 498	/* this iocg's depth in the hierarchy and ancestors including self */
 499	int				level;
 500	struct ioc_gq			*ancestors[];
 501};
 502
 503/* per cgroup */
 504struct ioc_cgrp {
 505	struct blkcg_policy_data	cpd;
 506	unsigned int			dfl_weight;
 507};
 508
 509struct ioc_now {
 510	u64				now_ns;
 511	u32				now;
 512	u64				vnow;
 513	u64				vrate;
 514};
 515
 516struct iocg_wait {
 517	struct wait_queue_entry		wait;
 518	struct bio			*bio;
 519	u64				abs_cost;
 520	bool				committed;
 521};
 522
 523struct iocg_wake_ctx {
 524	struct ioc_gq			*iocg;
 525	u32				hw_inuse;
 526	s64				vbudget;
 527};
 528
 529static const struct ioc_params autop[] = {
 530	[AUTOP_HDD] = {
 531		.qos				= {
 532			[QOS_RLAT]		=        250000, /* 250ms */
 533			[QOS_WLAT]		=        250000,
 534			[QOS_MIN]		= VRATE_MIN_PPM,
 535			[QOS_MAX]		= VRATE_MAX_PPM,
 536		},
 537		.i_lcoefs			= {
 538			[I_LCOEF_RBPS]		=     174019176,
 539			[I_LCOEF_RSEQIOPS]	=         41708,
 540			[I_LCOEF_RRANDIOPS]	=           370,
 541			[I_LCOEF_WBPS]		=     178075866,
 542			[I_LCOEF_WSEQIOPS]	=         42705,
 543			[I_LCOEF_WRANDIOPS]	=           378,
 544		},
 545	},
 546	[AUTOP_SSD_QD1] = {
 547		.qos				= {
 548			[QOS_RLAT]		=         25000, /* 25ms */
 549			[QOS_WLAT]		=         25000,
 550			[QOS_MIN]		= VRATE_MIN_PPM,
 551			[QOS_MAX]		= VRATE_MAX_PPM,
 552		},
 553		.i_lcoefs			= {
 554			[I_LCOEF_RBPS]		=     245855193,
 555			[I_LCOEF_RSEQIOPS]	=         61575,
 556			[I_LCOEF_RRANDIOPS]	=          6946,
 557			[I_LCOEF_WBPS]		=     141365009,
 558			[I_LCOEF_WSEQIOPS]	=         33716,
 559			[I_LCOEF_WRANDIOPS]	=         26796,
 560		},
 561	},
 562	[AUTOP_SSD_DFL] = {
 563		.qos				= {
 564			[QOS_RLAT]		=         25000, /* 25ms */
 565			[QOS_WLAT]		=         25000,
 566			[QOS_MIN]		= VRATE_MIN_PPM,
 567			[QOS_MAX]		= VRATE_MAX_PPM,
 568		},
 569		.i_lcoefs			= {
 570			[I_LCOEF_RBPS]		=     488636629,
 571			[I_LCOEF_RSEQIOPS]	=          8932,
 572			[I_LCOEF_RRANDIOPS]	=          8518,
 573			[I_LCOEF_WBPS]		=     427891549,
 574			[I_LCOEF_WSEQIOPS]	=         28755,
 575			[I_LCOEF_WRANDIOPS]	=         21940,
 576		},
 577		.too_fast_vrate_pct		=           500,
 578	},
 579	[AUTOP_SSD_FAST] = {
 580		.qos				= {
 581			[QOS_RLAT]		=          5000, /* 5ms */
 582			[QOS_WLAT]		=          5000,
 583			[QOS_MIN]		= VRATE_MIN_PPM,
 584			[QOS_MAX]		= VRATE_MAX_PPM,
 585		},
 586		.i_lcoefs			= {
 587			[I_LCOEF_RBPS]		=    3102524156LLU,
 588			[I_LCOEF_RSEQIOPS]	=        724816,
 589			[I_LCOEF_RRANDIOPS]	=        778122,
 590			[I_LCOEF_WBPS]		=    1742780862LLU,
 591			[I_LCOEF_WSEQIOPS]	=        425702,
 592			[I_LCOEF_WRANDIOPS]	=        443193,
 593		},
 594		.too_slow_vrate_pct		=            10,
 595	},
 596};
 597
 598/*
 599 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 600 * vtime credit shortage and down on device saturation.
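     * e.g. a sustained busy_level of +/-5 indexes a 1% step, nudging vrate
     * to 99% or 101% of its current value each period; the step grows to
     * 16% as |busy_level| keeps rising.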
 601 */
 602static u32 vrate_adj_pct[] =
 603	{ 0, 0, 0, 0,
 604	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 605	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 606	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
 607
 608static struct blkcg_policy blkcg_policy_iocost;
 609
 610/* accessors and helpers */
 611static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 612{
 613	return container_of(rqos, struct ioc, rqos);
 614}
 615
 616static struct ioc *q_to_ioc(struct request_queue *q)
 617{
 618	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 619}
 620
 621static const char *q_name(struct request_queue *q)
 622{
 623	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
 624		return kobject_name(q->kobj.parent);
 625	else
 626		return "<unknown>";
 627}
 628
 629static const char __maybe_unused *ioc_name(struct ioc *ioc)
 630{
 631	return q_name(ioc->rqos.q);
 632}
 633
 634static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 635{
 636	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 637}
 638
 639static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 640{
 641	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 642}
 643
 644static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 645{
 646	return pd_to_blkg(&iocg->pd);
 647}
 648
 649static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 650{
 651	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 652			    struct ioc_cgrp, cpd);
 653}
 654
 655/*
 656 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 657 * weight, the more expensive each IO.  Must round up.
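     * e.g. at an hw_inuse of HWEIGHT_WHOLE / 2 (50%), an abs_cost of 10
     * charges 20 vtime units to the cgroup.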
 658 */
 659static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 660{
 661	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
 662}
 663
 664/*
 665 * The inverse of abs_cost_to_cost().  Must round up.
 666 */
 667static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 668{
 669	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
 670}
 671
 672static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
 673{
 674	bio->bi_iocost_cost = cost;
 675	atomic64_add(cost, &iocg->vtime);
 676}
 677
 678#define CREATE_TRACE_POINTS
 679#include <trace/events/iocost.h>
 680
 681/* latency QoS params changed, update period_us and all the dependent params */
 682static void ioc_refresh_period_us(struct ioc *ioc)
 683{
 684	u32 ppm, lat, multi, period_us;
 685
 686	lockdep_assert_held(&ioc->lock);
 687
 688	/* pick the higher latency target */
 689	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 690		ppm = ioc->params.qos[QOS_RPPM];
 691		lat = ioc->params.qos[QOS_RLAT];
 692	} else {
 693		ppm = ioc->params.qos[QOS_WPPM];
 694		lat = ioc->params.qos[QOS_WLAT];
 695	}
 696
 697	/*
 698	 * We want the period to be long enough to contain a healthy number
 699	 * of IOs while short enough for granular control.  Define it as a
 700	 * multiple of the latency target.  Ideally, the multiplier should
 701	 * be scaled according to the percentile so that it would nominally
 702	 * contain a certain number of requests.  Let's be simpler and
 703	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
 704	 */
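	/* e.g. a 95th percentile latency target (ppm = 950000) yields multi = 2 */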
 705	if (ppm)
 706		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 707	else
 708		multi = 2;
 709	period_us = multi * lat;
 710	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 711
 712	/* calculate dependent params */
 713	ioc->period_us = period_us;
 714	ioc->margin_us = period_us * MARGIN_PCT / 100;
 715	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
 716			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
 717}
 718
 719static int ioc_autop_idx(struct ioc *ioc)
 720{
 721	int idx = ioc->autop_idx;
 722	const struct ioc_params *p = &autop[idx];
 723	u32 vrate_pct;
 724	u64 now_ns;
 725
 726	/* rotational? */
 727	if (!blk_queue_nonrot(ioc->rqos.q))
 728		return AUTOP_HDD;
 729
 730	/* handle SATA SSDs w/ broken NCQ */
 731	if (blk_queue_depth(ioc->rqos.q) == 1)
 732		return AUTOP_SSD_QD1;
 733
 734	/* use one of the normal ssd sets */
 735	if (idx < AUTOP_SSD_DFL)
 736		return AUTOP_SSD_DFL;
 737
 738	/* if user is overriding anything, maintain what was there */
 739	if (ioc->user_qos_params || ioc->user_cost_model)
 740		return idx;
 741
 742	/* step up/down based on the vrate */
 743	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
 744			      VTIME_PER_USEC);
 745	now_ns = ktime_get_ns();
 746
 747	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 748		if (!ioc->autop_too_fast_at)
 749			ioc->autop_too_fast_at = now_ns;
 750		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 751			return idx + 1;
 752	} else {
 753		ioc->autop_too_fast_at = 0;
 754	}
 755
 756	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 757		if (!ioc->autop_too_slow_at)
 758			ioc->autop_too_slow_at = now_ns;
 759		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 760			return idx - 1;
 761	} else {
 762		ioc->autop_too_slow_at = 0;
 763	}
 764
 765	return idx;
 766}
 767
 768/*
 769 * Take the following as input
 770 *
 771 *  @bps	maximum sequential throughput
 772 *  @seqiops	maximum sequential 4k iops
 773 *  @randiops	maximum random 4k iops
 774 *
 775 * and calculate the linear model cost coefficients.
 776 *
 777 *  *@page	per-page cost		1s / (@bps / 4096)
 778 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
 779 *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
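     *
     * e.g. @bps of 131072000 (125 MiB/s) gives *@page = VTIME_PER_SEC / 32000,
     * i.e. a per-page cost worth ~31us of device time.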
 780 */
 781static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 782			u64 *page, u64 *seqio, u64 *randio)
 783{
 784	u64 v;
 785
 786	*page = *seqio = *randio = 0;
 787
 788	if (bps)
 789		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
 790					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
 791
 792	if (seqiops) {
 793		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 794		if (v > *page)
 795			*seqio = v - *page;
 796	}
 797
 798	if (randiops) {
 799		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 800		if (v > *page)
 801			*randio = v - *page;
 802	}
 803}
 804
 805static void ioc_refresh_lcoefs(struct ioc *ioc)
 806{
 807	u64 *u = ioc->params.i_lcoefs;
 808	u64 *c = ioc->params.lcoefs;
 809
 810	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 811		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 812	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 813		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 814}
 815
 816static bool ioc_refresh_params(struct ioc *ioc, bool force)
 817{
 818	const struct ioc_params *p;
 819	int idx;
 820
 821	lockdep_assert_held(&ioc->lock);
 822
 823	idx = ioc_autop_idx(ioc);
 824	p = &autop[idx];
 825
 826	if (idx == ioc->autop_idx && !force)
 827		return false;
 828
 829	if (idx != ioc->autop_idx)
 830		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 831
 832	ioc->autop_idx = idx;
 833	ioc->autop_too_fast_at = 0;
 834	ioc->autop_too_slow_at = 0;
 835
 836	if (!ioc->user_qos_params)
 837		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 838	if (!ioc->user_cost_model)
 839		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 840
 841	ioc_refresh_period_us(ioc);
 842	ioc_refresh_lcoefs(ioc);
 843
 844	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 845					    VTIME_PER_USEC, MILLION);
 846	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
 847				   VTIME_PER_USEC, MILLION);
 848
 849	return true;
 850}
 851
 852/* take a snapshot of the current [v]time and vrate */
 853static void ioc_now(struct ioc *ioc, struct ioc_now *now)
 854{
 855	unsigned seq;
 856
 857	now->now_ns = ktime_get();
 858	now->now = ktime_to_us(now->now_ns);
 859	now->vrate = atomic64_read(&ioc->vtime_rate);
 860
 861	/*
 862	 * The current vtime is
 863	 *
 864	 *   vtime at period start + (wallclock time since the start) * vrate
 865	 *
 866	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
 867	 * needed, they're seqcount protected.
 868	 */
 869	do {
 870		seq = read_seqcount_begin(&ioc->period_seqcount);
 871		now->vnow = ioc->period_at_vtime +
 872			(now->now - ioc->period_at) * now->vrate;
 873	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
 874}
 875
 876static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
 877{
 878	lockdep_assert_held(&ioc->lock);
 879	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
 880
 881	write_seqcount_begin(&ioc->period_seqcount);
 882	ioc->period_at = now->now;
 883	ioc->period_at_vtime = now->vnow;
 884	write_seqcount_end(&ioc->period_seqcount);
 885
 886	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
 887	add_timer(&ioc->timer);
 888}
 889
 890/*
 891 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 892 * weight sums and propagate upwards accordingly.
 893 */
 894static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
 895{
 896	struct ioc *ioc = iocg->ioc;
 897	int lvl;
 898
 899	lockdep_assert_held(&ioc->lock);
 900
 901	inuse = min(active, inuse);
 902
 903	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
 904		struct ioc_gq *parent = iocg->ancestors[lvl];
 905		struct ioc_gq *child = iocg->ancestors[lvl + 1];
 906		u32 parent_active = 0, parent_inuse = 0;
 907
 908		/* update the level sums */
 909		parent->child_active_sum += (s32)(active - child->active);
 910		parent->child_inuse_sum += (s32)(inuse - child->inuse);
 911		/* apply the updates */
 912		child->active = active;
 913		child->inuse = inuse;
 914
 915		/*
 916		 * The delta between the inuse and active sums indicates how
 917		 * much weight is being given away.  Parent's inuse and
 918		 * active should reflect the ratio.
 919		 */
 920		if (parent->child_active_sum) {
 921			parent_active = parent->weight;
 922			parent_inuse = DIV64_U64_ROUND_UP(
 923				parent_active * parent->child_inuse_sum,
 924				parent->child_active_sum);
 925		}
 926
 927		/* do we need to keep walking up? */
 928		if (parent_active == parent->active &&
 929		    parent_inuse == parent->inuse)
 930			break;
 931
 932		active = parent_active;
 933		inuse = parent_inuse;
 934	}
 935
 936	ioc->weights_updated = true;
 937}
 938
 939static void commit_active_weights(struct ioc *ioc)
 940{
 941	lockdep_assert_held(&ioc->lock);
 942
 943	if (ioc->weights_updated) {
 944		/* paired with rmb in current_hweight(), see there */
 945		smp_wmb();
 946		atomic_inc(&ioc->hweight_gen);
 947		ioc->weights_updated = false;
 948	}
 949}
 950
 951static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
 952{
 953	__propagate_active_weight(iocg, active, inuse);
 954	commit_active_weights(iocg->ioc);
 955}
 956
 957static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
 958{
 959	struct ioc *ioc = iocg->ioc;
 960	int lvl;
 961	u32 hwa, hwi;
 962	int ioc_gen;
 963
 964	/* hot path - if uptodate, use cached */
 965	ioc_gen = atomic_read(&ioc->hweight_gen);
 966	if (ioc_gen == iocg->hweight_gen)
 967		goto out;
 968
 969	/*
 970	 * Paired with wmb in commit_active_weights().  If we saw the
 971	 * updated hweight_gen, all the weight updates from
 972	 * __propagate_active_weight() are visible too.
 973	 *
 974	 * We can race with weight updates during calculation and get it
 975	 * wrong.  However, hweight_gen would have changed and a future
 976	 * reader will recalculate and we're guaranteed to discard the
 977	 * wrong result soon.
 978	 */
 979	smp_rmb();
 980
 981	hwa = hwi = HWEIGHT_WHOLE;
 982	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
 983		struct ioc_gq *parent = iocg->ancestors[lvl];
 984		struct ioc_gq *child = iocg->ancestors[lvl + 1];
 985		u32 active_sum = READ_ONCE(parent->child_active_sum);
 986		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
 987		u32 active = READ_ONCE(child->active);
 988		u32 inuse = READ_ONCE(child->inuse);
 989
 990		/* we can race with deactivations and either may read as zero */
 991		if (!active_sum || !inuse_sum)
 992			continue;
 993
 994		active_sum = max(active, active_sum);
 995		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
 996
 997		inuse_sum = max(inuse, inuse_sum);
 998		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
 999	}
1000
1001	iocg->hweight_active = max_t(u32, hwa, 1);
1002	iocg->hweight_inuse = max_t(u32, hwi, 1);
1003	iocg->hweight_gen = ioc_gen;
1004out:
1005	if (hw_activep)
1006		*hw_activep = iocg->hweight_active;
1007	if (hw_inusep)
1008		*hw_inusep = iocg->hweight_inuse;
1009}
1010
1011static void weight_updated(struct ioc_gq *iocg)
1012{
1013	struct ioc *ioc = iocg->ioc;
1014	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1015	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1016	u32 weight;
1017
1018	lockdep_assert_held(&ioc->lock);
1019
1020	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1021	if (weight != iocg->weight && iocg->active)
1022		propagate_active_weight(iocg, weight,
1023			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1024	iocg->weight = weight;
1025}
1026
1027static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1028{
1029	struct ioc *ioc = iocg->ioc;
1030	u64 last_period, cur_period, max_period_delta;
1031	u64 vtime, vmargin, vmin;
1032	int i;
1033
1034	/*
1035	 * If we seem to be already active, just update the stamp to tell the
1036	 * timer that we're still active.  We don't mind occasional races.
1037	 */
1038	if (!list_empty(&iocg->active_list)) {
1039		ioc_now(ioc, now);
1040		cur_period = atomic64_read(&ioc->cur_period);
1041		if (atomic64_read(&iocg->active_period) != cur_period)
1042			atomic64_set(&iocg->active_period, cur_period);
1043		return true;
1044	}
1045
1046	/* racy check on internal node IOs, treat as root level IOs */
1047	if (iocg->child_active_sum)
1048		return false;
1049
1050	spin_lock_irq(&ioc->lock);
1051
1052	ioc_now(ioc, now);
1053
1054	/* update period */
1055	cur_period = atomic64_read(&ioc->cur_period);
1056	last_period = atomic64_read(&iocg->active_period);
1057	atomic64_set(&iocg->active_period, cur_period);
1058
1059	/* already activated or breaking leaf-only constraint? */
1060	if (!list_empty(&iocg->active_list))
1061		goto succeed_unlock;
1062	for (i = iocg->level - 1; i > 0; i--)
1063		if (!list_empty(&iocg->ancestors[i]->active_list))
1064			goto fail_unlock;
1065
1066	if (iocg->child_active_sum)
1067		goto fail_unlock;
1068
1069	/*
1070	 * vtime may wrap when vrate is raised substantially due to
1071	 * underestimated IO costs.  Look at the period and ignore its
1072	 * vtime if the iocg has been idle for too long.  Also, cap the
1073	 * budget it can start with to the margin.
1074	 */
1075	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1076	vtime = atomic64_read(&iocg->vtime);
1077	vmargin = ioc->margin_us * now->vrate;
1078	vmin = now->vnow - vmargin;
1079
1080	if (last_period + max_period_delta < cur_period ||
1081	    time_before64(vtime, vmin)) {
1082		atomic64_add(vmin - vtime, &iocg->vtime);
1083		atomic64_add(vmin - vtime, &iocg->done_vtime);
1084		vtime = vmin;
1085	}
1086
1087	/*
1088	 * Activate, propagate weight and start period timer if not
1089	 * running.  Reset hweight_gen to avoid accidental match from
1090	 * wrapping.
1091	 */
1092	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1093	list_add(&iocg->active_list, &ioc->active_iocgs);
1094	propagate_active_weight(iocg, iocg->weight,
1095				iocg->last_inuse ?: iocg->weight);
1096
1097	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1098			last_period, cur_period, vtime);
1099
1100	iocg->last_vtime = vtime;
1101
1102	if (ioc->running == IOC_IDLE) {
1103		ioc->running = IOC_RUNNING;
1104		ioc_start_period(ioc, now);
1105	}
1106
1107succeed_unlock:
1108	spin_unlock_irq(&ioc->lock);
1109	return true;
1110
1111fail_unlock:
1112	spin_unlock_irq(&ioc->lock);
1113	return false;
1114}
1115
1116static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1117			int flags, void *key)
1118{
1119	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1120	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1121	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1122
1123	ctx->vbudget -= cost;
1124
1125	if (ctx->vbudget < 0)
1126		return -1;
1127
1128	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1129
1130	/*
1131	 * autoremove_wake_function() removes the wait entry only when it
1132	 * actually changed the task state.  We want the wait always
1133	 * removed.  Remove explicitly and use default_wake_function().
1134	 */
1135	list_del_init(&wq_entry->entry);
1136	wait->committed = true;
1137
1138	default_wake_function(wq_entry, mode, flags, key);
1139	return 0;
1140}
1141
1142static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1143{
1144	struct ioc *ioc = iocg->ioc;
1145	struct iocg_wake_ctx ctx = { .iocg = iocg };
1146	u64 margin_ns = (u64)(ioc->period_us *
1147			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1148	u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
1149	s64 vbudget;
1150	u32 hw_inuse;
1151
1152	lockdep_assert_held(&iocg->waitq.lock);
1153
1154	current_hweight(iocg, NULL, &hw_inuse);
1155	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1156
1157	/* pay off debt */
1158	abs_vdebt = atomic64_read(&iocg->abs_vdebt);
1159	vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
1160	if (vdebt && vbudget > 0) {
1161		u64 delta = min_t(u64, vbudget, vdebt);
1162		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1163				    abs_vdebt);
1164
1165		atomic64_add(delta, &iocg->vtime);
1166		atomic64_add(delta, &iocg->done_vtime);
1167		atomic64_sub(abs_delta, &iocg->abs_vdebt);
1168		if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
1169			atomic64_set(&iocg->abs_vdebt, 0);
1170	}
1171
1172	/*
1173	 * Wake up the ones which are due and see how much vtime we'll need
1174	 * for the next one.
1175	 */
1176	ctx.hw_inuse = hw_inuse;
1177	ctx.vbudget = vbudget - vdebt;
1178	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1179	if (!waitqueue_active(&iocg->waitq))
1180		return;
1181	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1182		return;
1183
1184	/* determine next wakeup, add a quarter margin to guarantee chunking */
1185	vshortage = -ctx.vbudget;
1186	expires = now->now_ns +
1187		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1188	expires += margin_ns / 4;
1189
1190	/* if already active and close enough, don't bother */
1191	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1192	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1193	    abs(oexpires - expires) <= margin_ns / 4)
1194		return;
1195
1196	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1197			       margin_ns / 4, HRTIMER_MODE_ABS);
1198}
1199
1200static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1201{
1202	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1203	struct ioc_now now;
1204	unsigned long flags;
1205
1206	ioc_now(iocg->ioc, &now);
1207
1208	spin_lock_irqsave(&iocg->waitq.lock, flags);
1209	iocg_kick_waitq(iocg, &now);
1210	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1211
1212	return HRTIMER_NORESTART;
1213}
1214
1215static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1216{
1217	struct ioc *ioc = iocg->ioc;
1218	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1219	u64 vtime = atomic64_read(&iocg->vtime);
1220	u64 vmargin = ioc->margin_us * now->vrate;
1221	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1222	u64 expires, oexpires;
1223	u32 hw_inuse;
1224
1225	/* debt-adjust vtime */
1226	current_hweight(iocg, NULL, &hw_inuse);
1227	vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
1228
1229	/* clear or maintain depending on the overage */
1230	if (time_before_eq64(vtime, now->vnow)) {
1231		blkcg_clear_delay(blkg);
1232		return;
1233	}
1234	if (!atomic_read(&blkg->use_delay) &&
1235	    time_before_eq64(vtime, now->vnow + vmargin))
1236		return;
1237
1238	/* use delay */
1239	if (cost) {
1240		u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1241						 now->vrate);
1242		blkcg_add_delay(blkg, now->now_ns, cost_ns);
1243	}
1244	blkcg_use_delay(blkg);
1245
1246	expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1247						   now->vrate) * NSEC_PER_USEC;
1248
1249	/* if already active and close enough, don't bother */
1250	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1251	if (hrtimer_is_queued(&iocg->delay_timer) &&
1252	    abs(oexpires - expires) <= margin_ns / 4)
1253		return;
1254
1255	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1256			       margin_ns / 4, HRTIMER_MODE_ABS);
1257}
1258
1259static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1260{
1261	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1262	struct ioc_now now;
1263
1264	ioc_now(iocg->ioc, &now);
1265	iocg_kick_delay(iocg, &now, 0);
1266
1267	return HRTIMER_NORESTART;
1268}
1269
1270static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1271{
1272	u32 nr_met[2] = { };
1273	u32 nr_missed[2] = { };
1274	u64 rq_wait_ns = 0;
1275	int cpu, rw;
1276
1277	for_each_online_cpu(cpu) {
1278		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1279		u64 this_rq_wait_ns;
1280
1281		for (rw = READ; rw <= WRITE; rw++) {
1282			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1283			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1284
1285			nr_met[rw] += this_met - stat->missed[rw].last_met;
1286			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1287			stat->missed[rw].last_met = this_met;
1288			stat->missed[rw].last_missed = this_missed;
1289		}
1290
1291		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1292		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1293		stat->last_rq_wait_ns = this_rq_wait_ns;
1294	}
1295
1296	for (rw = READ; rw <= WRITE; rw++) {
1297		if (nr_met[rw] + nr_missed[rw])
1298			missed_ppm_ar[rw] =
1299				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1300						   nr_met[rw] + nr_missed[rw]);
1301		else
1302			missed_ppm_ar[rw] = 0;
1303	}
1304
1305	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1306				   ioc->period_us * NSEC_PER_USEC);
1307}
1308
1309/* was iocg idle this period? */
1310static bool iocg_is_idle(struct ioc_gq *iocg)
1311{
1312	struct ioc *ioc = iocg->ioc;
1313
1314	/* did something get issued this period? */
1315	if (atomic64_read(&iocg->active_period) ==
1316	    atomic64_read(&ioc->cur_period))
1317		return false;
1318
1319	/* is something in flight? */
1320	if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime))
1321		return false;
1322
1323	return true;
1324}
1325
1326/* returns usage with margin added if surplus is large enough */
1327static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1328{
1329	/* add margin */
1330	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1331	usage += SURPLUS_SCALE_ABS;
1332
1333	/* don't bother if the surplus is too small */
1334	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1335		return 0;
1336
1337	return usage;
1338}
1339
1340static void ioc_timer_fn(struct timer_list *timer)
1341{
1342	struct ioc *ioc = container_of(timer, struct ioc, timer);
1343	struct ioc_gq *iocg, *tiocg;
1344	struct ioc_now now;
1345	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1346	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1347	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1348	u32 missed_ppm[2], rq_wait_pct;
1349	u64 period_vtime;
1350	int prev_busy_level, i;
1351
1352	/* how were the latencies during the period? */
1353	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1354
1355	/* take care of active iocgs */
1356	spin_lock_irq(&ioc->lock);
1357
1358	ioc_now(ioc, &now);
1359
1360	period_vtime = now.vnow - ioc->period_at_vtime;
1361	if (WARN_ON_ONCE(!period_vtime)) {
1362		spin_unlock_irq(&ioc->lock);
1363		return;
1364	}
1365
1366	/*
1367	 * Waiters determine the sleep durations based on the vrate they
1368	 * saw at the time of sleep.  If vrate has increased, some waiters
1369	 * could be sleeping for too long.  Wake up tardy waiters which
1370	 * should have woken up in the last period and expire idle iocgs.
1371	 */
1372	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1373		if (!waitqueue_active(&iocg->waitq) &&
1374		    !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
1375			continue;
1376
1377		spin_lock(&iocg->waitq.lock);
1378
1379		if (waitqueue_active(&iocg->waitq) ||
1380		    atomic64_read(&iocg->abs_vdebt)) {
1381			/* might be oversleeping vtime / hweight changes, kick */
1382			iocg_kick_waitq(iocg, &now);
1383			iocg_kick_delay(iocg, &now, 0);
1384		} else if (iocg_is_idle(iocg)) {
1385			/* no waiter and idle, deactivate */
1386			iocg->last_inuse = iocg->inuse;
1387			__propagate_active_weight(iocg, 0, 0);
1388			list_del_init(&iocg->active_list);
1389		}
1390
1391		spin_unlock(&iocg->waitq.lock);
1392	}
1393	commit_active_weights(ioc);
1394
1395	/* calc usages and see whether some weights need to be moved around */
1396	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1397		u64 vdone, vtime, vusage, vmargin, vmin;
1398		u32 hw_active, hw_inuse, usage;
1399
1400		/*
1401		 * Collect unused and wind vtime closer to vnow to prevent
1402		 * iocgs from accumulating a large amount of budget.
1403		 */
1404		vdone = atomic64_read(&iocg->done_vtime);
1405		vtime = atomic64_read(&iocg->vtime);
1406		current_hweight(iocg, &hw_active, &hw_inuse);
1407
1408		/*
1409		 * Latency QoS detection doesn't account for IOs which are
1410		 * in-flight for longer than a period.  Detect them by
1411		 * comparing vdone against period start.  If lagging behind
1412		 * IOs from past periods, don't increase vrate.
1413		 */
1414		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1415		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1416		    time_after64(vtime, vdone) &&
1417		    time_after64(vtime, now.vnow -
1418				 MAX_LAGGING_PERIODS * period_vtime) &&
1419		    time_before64(vdone, now.vnow - period_vtime))
1420			nr_lagging++;
1421
1422		if (waitqueue_active(&iocg->waitq))
1423			vusage = now.vnow - iocg->last_vtime;
1424		else if (time_before64(iocg->last_vtime, vtime))
1425			vusage = vtime - iocg->last_vtime;
1426		else
1427			vusage = 0;
1428
1429		iocg->last_vtime += vusage;
1430		/*
1431		 * Factor in in-flight vtime into vusage to avoid
1432		 * high-latency completions appearing as idle.  This should
1433		 * be done after the above ->last_vtime adjustment.
1434		 */
1435		vusage = max(vusage, vtime - vdone);
1436
1437		/* calculate hweight based usage ratio and record */
1438		if (vusage) {
1439			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1440						   period_vtime);
1441			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1442			iocg->usages[iocg->usage_idx] = usage;
1443		} else {
1444			usage = 0;
1445		}
1446
1447		/* see whether there's surplus vtime */
1448		vmargin = ioc->margin_us * now.vrate;
1449		vmin = now.vnow - vmargin;
1450
1451		iocg->has_surplus = false;
1452
1453		if (!waitqueue_active(&iocg->waitq) &&
1454		    time_before64(vtime, vmin)) {
1455			u64 delta = vmin - vtime;
1456
1457			/* throw away surplus vtime */
1458			atomic64_add(delta, &iocg->vtime);
1459			atomic64_add(delta, &iocg->done_vtime);
1460			iocg->last_vtime += delta;
1461			/* if usage is sufficiently low, maybe it can donate */
1462			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1463				iocg->has_surplus = true;
1464				nr_surpluses++;
1465			}
1466		} else if (hw_inuse < hw_active) {
1467			u32 new_hwi, new_inuse;
1468
1469			/* was donating but might need to take back some */
1470			if (waitqueue_active(&iocg->waitq)) {
1471				new_hwi = hw_active;
1472			} else {
1473				new_hwi = max(hw_inuse,
1474					      usage * SURPLUS_SCALE_PCT / 100 +
1475					      SURPLUS_SCALE_ABS);
1476			}
1477
1478			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1479					      hw_inuse);
1480			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1481
1482			if (new_inuse > iocg->inuse) {
1483				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1484						iocg->inuse, new_inuse,
1485						hw_inuse, new_hwi);
1486				__propagate_active_weight(iocg, iocg->weight,
1487							  new_inuse);
1488			}
1489		} else {
1490			/* genuinely out of vtime */
1491			nr_shortages++;
1492		}
1493	}
1494
1495	if (!nr_shortages || !nr_surpluses)
1496		goto skip_surplus_transfers;
1497
1498	/* there are both shortages and surpluses, transfer surpluses */
1499	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1500		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1501		int nr_valid = 0;
1502
1503		if (!iocg->has_surplus)
1504			continue;
1505
1506		/* base the decision on max historical usage */
1507		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1508			if (iocg->usages[i]) {
1509				usage = max(usage, iocg->usages[i]);
1510				nr_valid++;
1511			}
1512		}
1513		if (nr_valid < MIN_VALID_USAGES)
1514			continue;
1515
1516		current_hweight(iocg, &hw_active, &hw_inuse);
1517		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1518		if (!new_hwi)
1519			continue;
1520
1521		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1522					       hw_inuse);
1523		if (new_inuse < iocg->inuse) {
1524			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1525					iocg->inuse, new_inuse,
1526					hw_inuse, new_hwi);
1527			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1528		}
1529	}
1530skip_surplus_transfers:
1531	commit_active_weights(ioc);
1532
1533	/*
1534	 * If q is getting clogged or we're missing too much, we're issuing
1535	 * too much IO and should lower vtime rate.  If we're not missing
1536	 * and experiencing shortages but not surpluses, we're too stingy
1537	 * and should increase vtime rate.
1538	 */
1539	prev_busy_level = ioc->busy_level;
1540	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1541	    missed_ppm[READ] > ppm_rthr ||
1542	    missed_ppm[WRITE] > ppm_wthr) {
1543		ioc->busy_level = max(ioc->busy_level, 0);
1544		ioc->busy_level++;
1545	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1546		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1547		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1548		/* take action iff there is contention */
1549		if (nr_shortages && !nr_lagging) {
1550			ioc->busy_level = min(ioc->busy_level, 0);
1551			/* redistribute surpluses first */
1552			if (!nr_surpluses)
1553				ioc->busy_level--;
1554		}
1555	} else {
1556		ioc->busy_level = 0;
1557	}
1558
1559	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1560
1561	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1562		u64 vrate = atomic64_read(&ioc->vtime_rate);
1563		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1564
1565		/* rq_wait signal is always reliable, ignore user vrate_min */
1566		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1567			vrate_min = VRATE_MIN;
1568
1569		/*
1570		 * If vrate is out of bounds, apply clamp gradually as the
1571		 * bounds can change abruptly.  Otherwise, apply busy_level
1572		 * based adjustment.
1573		 */
1574		if (vrate < vrate_min) {
1575			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1576					  100);
1577			vrate = min(vrate, vrate_min);
1578		} else if (vrate > vrate_max) {
1579			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1580					  100);
1581			vrate = max(vrate, vrate_max);
1582		} else {
1583			int idx = min_t(int, abs(ioc->busy_level),
1584					ARRAY_SIZE(vrate_adj_pct) - 1);
1585			u32 adj_pct = vrate_adj_pct[idx];
1586
1587			if (ioc->busy_level > 0)
1588				adj_pct = 100 - adj_pct;
1589			else
1590				adj_pct = 100 + adj_pct;
1591
1592			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1593				      vrate_min, vrate_max);
1594		}
1595
1596		trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
1597					   nr_lagging, nr_shortages,
1598					   nr_surpluses);
1599
1600		atomic64_set(&ioc->vtime_rate, vrate);
1601		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1602			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1603	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1604		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1605					   &missed_ppm, rq_wait_pct, nr_lagging,
1606					   nr_shortages, nr_surpluses);
1607	}
1608
1609	ioc_refresh_params(ioc, false);
1610
1611	/*
1612	 * This period is done.  Move onto the next one.  If nothing's
1613	 * going on with the device, stop the timer.
1614	 */
1615	atomic64_inc(&ioc->cur_period);
1616
1617	if (ioc->running != IOC_STOP) {
1618		if (!list_empty(&ioc->active_iocgs)) {
1619			ioc_start_period(ioc, &now);
1620		} else {
1621			ioc->busy_level = 0;
1622			ioc->running = IOC_IDLE;
1623		}
1624	}
1625
1626	spin_unlock_irq(&ioc->lock);
1627}
1628
1629static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1630				    bool is_merge, u64 *costp)
1631{
1632	struct ioc *ioc = iocg->ioc;
1633	u64 coef_seqio, coef_randio, coef_page;
1634	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1635	u64 seek_pages = 0;
1636	u64 cost = 0;
1637
1638	switch (bio_op(bio)) {
1639	case REQ_OP_READ:
1640		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1641		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1642		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1643		break;
1644	case REQ_OP_WRITE:
1645		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1646		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1647		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1648		break;
1649	default:
1650		goto out;
1651	}
1652
1653	if (iocg->cursor) {
1654		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1655		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1656	}
1657
1658	if (!is_merge) {
1659		if (seek_pages > LCOEF_RANDIO_PAGES) {
1660			cost += coef_randio;
1661		} else {
1662			cost += coef_seqio;
1663		}
1664	}
1665	cost += pages * coef_page;
1666out:
1667	*costp = cost;
1668}
1669
1670static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1671{
1672	u64 cost;
1673
1674	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1675	return cost;
1676}
1677
1678static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1679{
1680	struct blkcg_gq *blkg = bio->bi_blkg;
1681	struct ioc *ioc = rqos_to_ioc(rqos);
1682	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1683	struct ioc_now now;
1684	struct iocg_wait wait;
1685	u32 hw_active, hw_inuse;
1686	u64 abs_cost, cost, vtime;
1687
1688	/* bypass IOs if disabled or for root cgroup */
1689	if (!ioc->enabled || !iocg->level)
1690		return;
1691
1692	/* always activate so that even 0 cost IOs get protected to some level */
1693	if (!iocg_activate(iocg, &now))
1694		return;
1695
1696	/* calculate the absolute vtime cost */
1697	abs_cost = calc_vtime_cost(bio, iocg, false);
1698	if (!abs_cost)
1699		return;
1700
1701	iocg->cursor = bio_end_sector(bio);
1702
1703	vtime = atomic64_read(&iocg->vtime);
1704	current_hweight(iocg, &hw_active, &hw_inuse);
1705
1706	if (hw_inuse < hw_active &&
1707	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1708		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1709				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1710		spin_lock_irq(&ioc->lock);
1711		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1712		spin_unlock_irq(&ioc->lock);
1713		current_hweight(iocg, &hw_active, &hw_inuse);
1714	}
1715
1716	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1717
1718	/*
1719	 * If no one's waiting and within budget, issue right away.  The
1720	 * tests are racy but the races aren't systemic - we only miss once
1721	 * in a while which is fine.
1722	 */
1723	if (!waitqueue_active(&iocg->waitq) &&
1724	    !atomic64_read(&iocg->abs_vdebt) &&
1725	    time_before_eq64(vtime + cost, now.vnow)) {
1726		iocg_commit_bio(iocg, bio, cost);
1727		return;
1728	}
1729
1730	/*
1731	 * We're over budget.  If @bio has to be issued regardless,
1732	 * remember the abs_cost instead of advancing vtime.
1733	 * iocg_kick_waitq() will pay off the debt before waking more IOs.
1734	 * This way, the debt is continuously paid off each period with the
1735	 * actual budget available to the cgroup.  If we just wound vtime,
1736	 * we would incorrectly use the current hw_inuse for the entire
1737	 * amount which, for example, can lead to the cgroup staying
1738	 * blocked for a long time even with substantially raised hw_inuse.
1739	 */
1740	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1741		atomic64_add(abs_cost, &iocg->abs_vdebt);
1742		iocg_kick_delay(iocg, &now, cost);
1743		return;
1744	}
1745
1746	/*
1747	 * Append self to the waitq and schedule the wakeup timer if we're
1748	 * the first waiter.  The timer duration is calculated based on the
1749	 * current vrate.  vtime and hweight changes can make it too short
1750	 * or too long.  Each wait entry records the absolute cost it's
1751	 * waiting for to allow re-evaluation using a custom wait entry.
1752	 *
1753	 * If too short, the timer simply reschedules itself.  If too long,
1754	 * the period timer will notice and trigger wakeups.
1755	 *
1756	 * All waiters are on iocg->waitq and the wait states are
1757	 * synchronized using waitq.lock.
1758	 */
1759	spin_lock_irq(&iocg->waitq.lock);
1760
1761	/*
1762	 * We activated above but w/o any synchronization.  Deactivation is
1763	 * synchronized with waitq.lock and we won't get deactivated as
1764	 * long as we're waiting, so we're good if we're activated here.
1765	 * In the unlikely case that we are deactivated, just issue the IO.
1766	 */
1767	if (unlikely(list_empty(&iocg->active_list))) {
1768		spin_unlock_irq(&iocg->waitq.lock);
1769		iocg_commit_bio(iocg, bio, cost);
1770		return;
1771	}
1772
1773	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1774	wait.wait.private = current;
1775	wait.bio = bio;
1776	wait.abs_cost = abs_cost;
1777	wait.committed = false;	/* will be set true by waker */
1778
1779	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1780	iocg_kick_waitq(iocg, &now);
1781
1782	spin_unlock_irq(&iocg->waitq.lock);
1783
1784	while (true) {
1785		set_current_state(TASK_UNINTERRUPTIBLE);
1786		if (wait.committed)
1787			break;
1788		io_schedule();
1789	}
1790
1791	/* waker already committed us, proceed */
1792	finish_wait(&iocg->waitq, &wait.wait);
1793}
1794
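/*
 * A bio is being merged into an existing request (rq_qos ->merge).  The
 * merge carries only the size cost.  It's charged immediately if the base
 * request already has a cost assigned and there's vtime budget; otherwise
 * it's accounted as debt, mirroring the debt handling in
 * ioc_rqos_throttle().
 */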
1795static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1796			   struct bio *bio)
1797{
1798	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1799	struct ioc *ioc = iocg->ioc;
1800	sector_t bio_end = bio_end_sector(bio);
1801	struct ioc_now now;
1802	u32 hw_inuse;
1803	u64 abs_cost, cost;
1804
1805	/* bypass if disabled or for root cgroup */
1806	if (!ioc->enabled || !iocg->level)
1807		return;
1808
1809	abs_cost = calc_vtime_cost(bio, iocg, true);
1810	if (!abs_cost)
1811		return;
1812
1813	ioc_now(ioc, &now);
1814	current_hweight(iocg, NULL, &hw_inuse);
1815	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1816
1817	/* update cursor if backmerging into the request at the cursor */
1818	if (blk_rq_pos(rq) < bio_end &&
1819	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1820		iocg->cursor = bio_end;
1821
1822	/*
1823	 * Charge if there's enough vtime budget and the existing request
1824	 * has cost assigned.  Otherwise, account it as debt.  See debt
1825	 * handling in ioc_rqos_throttle() for details.
1826	 */
1827	if (rq->bio && rq->bio->bi_iocost_cost &&
1828	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
1829		iocg_commit_bio(iocg, bio, cost);
1830	else
1831		atomic64_add(abs_cost, &iocg->abs_vdebt);
1832}
1833
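/* a charged bio has completed - account its cost as done */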
1834static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1835{
1836	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1837
1838	if (iocg && bio->bi_iocost_cost)
1839		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1840}
1841
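/*
 * Request completion (rq_qos ->done).  Compare the total time the request
 * spent on the queue against the read or write latency target and bump
 * the per-cpu met/missed counters, and accumulate the allocation wait
 * time.  These per-cpu stats feed the missed_ppm and rq_wait_pct values
 * the period timer uses when adjusting the vtime rate.
 */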
1842static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1843{
1844	struct ioc *ioc = rqos_to_ioc(rqos);
1845	u64 on_q_ns, rq_wait_ns;
1846	int pidx, rw;
1847
1848	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1849		return;
1850
1851	switch (req_op(rq) & REQ_OP_MASK) {
1852	case REQ_OP_READ:
1853		pidx = QOS_RLAT;
1854		rw = READ;
1855		break;
1856	case REQ_OP_WRITE:
1857		pidx = QOS_WLAT;
1858		rw = WRITE;
1859		break;
1860	default:
1861		return;
1862	}
1863
1864	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1865	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1866
1867	if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1868		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1869	else
1870		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1871
1872	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1873}
1874
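/* the device queue depth changed - re-evaluate the operating parameters */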
1875static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1876{
1877	struct ioc *ioc = rqos_to_ioc(rqos);
1878
1879	spin_lock_irq(&ioc->lock);
1880	ioc_refresh_params(ioc, false);
1881	spin_unlock_irq(&ioc->lock);
1882}
1883
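/*
 * Teardown.  Deactivating the policy frees the iocgs, IOC_STOP prevents
 * the period timer from rearming itself, and only then are the timer,
 * per-cpu stats and the ioc itself released.
 */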
1884static void ioc_rqos_exit(struct rq_qos *rqos)
1885{
1886	struct ioc *ioc = rqos_to_ioc(rqos);
1887
1888	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1889
1890	spin_lock_irq(&ioc->lock);
1891	ioc->running = IOC_STOP;
1892	spin_unlock_irq(&ioc->lock);
1893
1894	del_timer_sync(&ioc->timer);
1895	free_percpu(ioc->pcpu_stat);
1896	kfree(ioc);
1897}
1898
1899static struct rq_qos_ops ioc_rqos_ops = {
1900	.throttle = ioc_rqos_throttle,
1901	.merge = ioc_rqos_merge,
1902	.done_bio = ioc_rqos_done_bio,
1903	.done = ioc_rqos_done,
1904	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1905	.exit = ioc_rqos_exit,
1906};
1907
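/*
 * Set up iocost for a queue.  Called lazily from the io.cost.qos and
 * io.cost.model write handlers when the queue doesn't have an ioc yet.
 * Allocates the ioc and its per-cpu stats, registers the rq_qos ops and
 * activates the iocost blkcg policy; on policy activation failure
 * everything is unwound.
 */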
1908static int blk_iocost_init(struct request_queue *q)
1909{
1910	struct ioc *ioc;
1911	struct rq_qos *rqos;
1912	int ret;
1913
1914	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1915	if (!ioc)
1916		return -ENOMEM;
1917
1918	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1919	if (!ioc->pcpu_stat) {
1920		kfree(ioc);
1921		return -ENOMEM;
1922	}
1923
1924	rqos = &ioc->rqos;
1925	rqos->id = RQ_QOS_COST;
1926	rqos->ops = &ioc_rqos_ops;
1927	rqos->q = q;
1928
1929	spin_lock_init(&ioc->lock);
1930	timer_setup(&ioc->timer, ioc_timer_fn, 0);
1931	INIT_LIST_HEAD(&ioc->active_iocgs);
1932
1933	ioc->running = IOC_IDLE;
1934	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1935	seqcount_init(&ioc->period_seqcount);
1936	ioc->period_at = ktime_to_us(ktime_get());
1937	atomic64_set(&ioc->cur_period, 0);
1938	atomic_set(&ioc->hweight_gen, 0);
1939
1940	spin_lock_irq(&ioc->lock);
1941	ioc->autop_idx = AUTOP_INVALID;
1942	ioc_refresh_params(ioc, true);
1943	spin_unlock_irq(&ioc->lock);
1944
1945	rq_qos_add(q, rqos);
1946	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1947	if (ret) {
1948		rq_qos_del(q, rqos);
1949		free_percpu(ioc->pcpu_stat);
1950		kfree(ioc);
1951		return ret;
1952	}
1953	return 0;
1954}
1955
1956static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1957{
1958	struct ioc_cgrp *iocc;
1959
1960	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
1961	if (!iocc)
1962		return NULL;
1963
1964	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1965	return &iocc->cpd;
1966}
1967
1968static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1969{
1970	kfree(container_of(cpd, struct ioc_cgrp, cpd));
1971}
1972
1973static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
1974					     struct blkcg *blkcg)
1975{
1976	int levels = blkcg->css.cgroup->level + 1;
1977	struct ioc_gq *iocg;
1978
1979	iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
1980			    gfp, q->node);
1981	if (!iocg)
1982		return NULL;
1983
1984	return &iocg->pd;
1985}
1986
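/*
 * Initialize a newly created iocg: start its vtime and done_vtime at the
 * current device vtime, give it whole hierarchical weights, set up the
 * waitq and the waitq/delay hrtimers, record the ancestor table used for
 * hierarchical weight calculations and apply the configured weight.
 */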
1987static void ioc_pd_init(struct blkg_policy_data *pd)
1988{
1989	struct ioc_gq *iocg = pd_to_iocg(pd);
1990	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
1991	struct ioc *ioc = q_to_ioc(blkg->q);
1992	struct ioc_now now;
1993	struct blkcg_gq *tblkg;
1994	unsigned long flags;
1995
1996	ioc_now(ioc, &now);
1997
1998	iocg->ioc = ioc;
1999	atomic64_set(&iocg->vtime, now.vnow);
2000	atomic64_set(&iocg->done_vtime, now.vnow);
2001	atomic64_set(&iocg->abs_vdebt, 0);
2002	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2003	INIT_LIST_HEAD(&iocg->active_list);
2004	iocg->hweight_active = HWEIGHT_WHOLE;
2005	iocg->hweight_inuse = HWEIGHT_WHOLE;
2006
2007	init_waitqueue_head(&iocg->waitq);
2008	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2009	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2010	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2011	iocg->delay_timer.function = iocg_delay_timer_fn;
2012
2013	iocg->level = blkg->blkcg->css.cgroup->level;
2014
2015	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2016		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2017		iocg->ancestors[tiocg->level] = tiocg;
2018	}
2019
2020	spin_lock_irqsave(&ioc->lock, flags);
2021	weight_updated(iocg);
2022	spin_unlock_irqrestore(&ioc->lock, flags);
2023}
2024
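/*
 * Free an iocg.  If it's still active, zero out its contribution via
 * propagate_active_weight() and take it off the active list first so the
 * remaining cgroups' hierarchical weights get refreshed, then cancel the
 * timers.
 */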
2025static void ioc_pd_free(struct blkg_policy_data *pd)
2026{
2027	struct ioc_gq *iocg = pd_to_iocg(pd);
2028	struct ioc *ioc = iocg->ioc;
2029
2030	if (ioc) {
2031		spin_lock(&ioc->lock);
2032		if (!list_empty(&iocg->active_list)) {
2033			propagate_active_weight(iocg, 0, 0);
2034			list_del_init(&iocg->active_list);
2035		}
2036		spin_unlock(&ioc->lock);
2037
2038		hrtimer_cancel(&iocg->waitq_timer);
2039		hrtimer_cancel(&iocg->delay_timer);
2040	}
2041	kfree(iocg);
2042}
2043
2044static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2045			     int off)
2046{
2047	const char *dname = blkg_dev_name(pd->blkg);
2048	struct ioc_gq *iocg = pd_to_iocg(pd);
2049
2050	if (dname && iocg->cfg_weight)
2051		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2052	return 0;
2053}
2054
2055
2056static int ioc_weight_show(struct seq_file *sf, void *v)
2057{
2058	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2059	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2060
2061	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2062	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2063			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2064	return 0;
2065}
2066
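/*
 * io.weight accepts either a cgroup-wide default ("default <weight>" or
 * just "<weight>") or a per-device override in the usual "MAJ:MIN
 * <weight>" form, with "MAJ:MIN default" clearing the override.  Weights
 * must be within [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX].  A hypothetical
 * example ("8:16" standing in for an actual device):
 *
 *	echo "8:16 200" > io.weight
 */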
2067static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2068				size_t nbytes, loff_t off)
2069{
2070	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2071	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2072	struct blkg_conf_ctx ctx;
2073	struct ioc_gq *iocg;
2074	u32 v;
2075	int ret;
2076
2077	if (!strchr(buf, ':')) {
2078		struct blkcg_gq *blkg;
2079
2080		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2081			return -EINVAL;
2082
2083		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2084			return -EINVAL;
2085
2086		spin_lock(&blkcg->lock);
2087		iocc->dfl_weight = v;
2088		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2089			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2090
2091			if (iocg) {
2092				spin_lock_irq(&iocg->ioc->lock);
2093				weight_updated(iocg);
2094				spin_unlock_irq(&iocg->ioc->lock);
2095			}
2096		}
2097		spin_unlock(&blkcg->lock);
2098
2099		return nbytes;
2100	}
2101
2102	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2103	if (ret)
2104		return ret;
2105
2106	iocg = blkg_to_iocg(ctx.blkg);
2107
2108	if (!strncmp(ctx.body, "default", 7)) {
2109		v = 0;
2110	} else {
2111		if (!sscanf(ctx.body, "%u", &v))
2112			goto einval;
2113		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2114			goto einval;
2115	}
2116
2117	spin_lock(&iocg->ioc->lock);
2118	iocg->cfg_weight = v;
2119	weight_updated(iocg);
2120	spin_unlock(&iocg->ioc->lock);
2121
2122	blkg_conf_finish(&ctx);
2123	return nbytes;
2124
2125einval:
2126	blkg_conf_finish(&ctx);
2127	return -EINVAL;
2128}
2129
2130static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2131			  int off)
2132{
2133	const char *dname = blkg_dev_name(pd->blkg);
2134	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2135
2136	if (!dname)
2137		return 0;
2138
2139	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2140		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2141		   ioc->params.qos[QOS_RPPM] / 10000,
2142		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2143		   ioc->params.qos[QOS_RLAT],
2144		   ioc->params.qos[QOS_WPPM] / 10000,
2145		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2146		   ioc->params.qos[QOS_WLAT],
2147		   ioc->params.qos[QOS_MIN] / 10000,
2148		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2149		   ioc->params.qos[QOS_MAX] / 10000,
2150		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2151	return 0;
2152}
2153
2154static int ioc_qos_show(struct seq_file *sf, void *v)
2155{
2156	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2157
2158	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2159			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2160	return 0;
2161}
2162
2163static const match_table_t qos_ctrl_tokens = {
2164	{ QOS_ENABLE,		"enable=%u"	},
2165	{ QOS_CTRL,		"ctrl=%s"	},
2166	{ NR_QOS_CTRL_PARAMS,	NULL		},
2167};
2168
2169static const match_table_t qos_tokens = {
2170	{ QOS_RPPM,		"rpct=%s"	},
2171	{ QOS_RLAT,		"rlat=%u"	},
2172	{ QOS_WPPM,		"wpct=%s"	},
2173	{ QOS_WLAT,		"wlat=%u"	},
2174	{ QOS_MIN,		"min=%s"	},
2175	{ QOS_MAX,		"max=%s"	},
2176	{ NR_QOS_PARAMS,	NULL		},
2177};
2178
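/*
 * io.cost.qos takes "MAJ:MIN" followed by space-separated key=value
 * pairs: enable=0|1, ctrl=auto|user, the latency targets rlat/wlat in
 * microseconds, the latency percentile thresholds rpct/wpct and the
 * vtime rate range min/max, the latter four as percentages with up to
 * two decimal places (see qos_ctrl_tokens and qos_tokens above).
 * Setting any QoS parameter switches ctrl to user.  A hypothetical
 * example enabling the controller with 5ms latency targets:
 *
 *	echo "8:16 enable=1 rlat=5000 wlat=5000" > io.cost.qos
 */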
2179static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2180			     size_t nbytes, loff_t off)
2181{
2182	struct gendisk *disk;
2183	struct ioc *ioc;
2184	u32 qos[NR_QOS_PARAMS];
2185	bool enable, user;
2186	char *p;
2187	int ret;
2188
2189	disk = blkcg_conf_get_disk(&input);
2190	if (IS_ERR(disk))
2191		return PTR_ERR(disk);
2192
2193	ioc = q_to_ioc(disk->queue);
2194	if (!ioc) {
2195		ret = blk_iocost_init(disk->queue);
2196		if (ret)
2197			goto err;
2198		ioc = q_to_ioc(disk->queue);
2199	}
2200
2201	spin_lock_irq(&ioc->lock);
2202	memcpy(qos, ioc->params.qos, sizeof(qos));
2203	enable = ioc->enabled;
2204	user = ioc->user_qos_params;
2205	spin_unlock_irq(&ioc->lock);
2206
2207	while ((p = strsep(&input, " \t\n"))) {
2208		substring_t args[MAX_OPT_ARGS];
2209		char buf[32];
2210		int tok;
2211		s64 v;
2212
2213		if (!*p)
2214			continue;
2215
2216		switch (match_token(p, qos_ctrl_tokens, args)) {
2217		case QOS_ENABLE:
2218			match_u64(&args[0], &v);
2219			enable = v;
2220			continue;
2221		case QOS_CTRL:
2222			match_strlcpy(buf, &args[0], sizeof(buf));
2223			if (!strcmp(buf, "auto"))
2224				user = false;
2225			else if (!strcmp(buf, "user"))
2226				user = true;
2227			else
2228				goto einval;
2229			continue;
2230		}
2231
2232		tok = match_token(p, qos_tokens, args);
2233		switch (tok) {
2234		case QOS_RPPM:
2235		case QOS_WPPM:
2236			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2237			    sizeof(buf))
2238				goto einval;
2239			if (cgroup_parse_float(buf, 2, &v))
2240				goto einval;
2241			if (v < 0 || v > 10000)
2242				goto einval;
2243			qos[tok] = v * 100;
2244			break;
2245		case QOS_RLAT:
2246		case QOS_WLAT:
2247			if (match_u64(&args[0], &v))
2248				goto einval;
2249			qos[tok] = v;
2250			break;
2251		case QOS_MIN:
2252		case QOS_MAX:
2253			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2254			    sizeof(buf))
2255				goto einval;
2256			if (cgroup_parse_float(buf, 2, &v))
2257				goto einval;
2258			if (v < 0)
2259				goto einval;
2260			qos[tok] = clamp_t(s64, v * 100,
2261					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2262			break;
2263		default:
2264			goto einval;
2265		}
2266		user = true;
2267	}
2268
2269	if (qos[QOS_MIN] > qos[QOS_MAX])
2270		goto einval;
2271
2272	spin_lock_irq(&ioc->lock);
2273
2274	if (enable) {
2275		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2276		ioc->enabled = true;
2277	} else {
2278		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2279		ioc->enabled = false;
2280	}
2281
2282	if (user) {
2283		memcpy(ioc->params.qos, qos, sizeof(qos));
2284		ioc->user_qos_params = true;
2285	} else {
2286		ioc->user_qos_params = false;
2287	}
2288
2289	ioc_refresh_params(ioc, true);
2290	spin_unlock_irq(&ioc->lock);
2291
2292	put_disk_and_module(disk);
2293	return nbytes;
2294einval:
2295	ret = -EINVAL;
2296err:
2297	put_disk_and_module(disk);
2298	return ret;
2299}
2300
2301static u64 ioc_cost_model_prfill(struct seq_file *sf,
2302				 struct blkg_policy_data *pd, int off)
2303{
2304	const char *dname = blkg_dev_name(pd->blkg);
2305	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2306	u64 *u = ioc->params.i_lcoefs;
2307
2308	if (!dname)
2309		return 0;
2310
2311	seq_printf(sf, "%s ctrl=%s model=linear "
2312		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2313		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2314		   dname, ioc->user_cost_model ? "user" : "auto",
2315		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2316		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2317	return 0;
2318}
2319
2320static int ioc_cost_model_show(struct seq_file *sf, void *v)
2321{
2322	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2323
2324	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2325			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2326	return 0;
2327}
2328
2329static const match_table_t cost_ctrl_tokens = {
2330	{ COST_CTRL,		"ctrl=%s"	},
2331	{ COST_MODEL,		"model=%s"	},
2332	{ NR_COST_CTRL_PARAMS,	NULL		},
2333};
2334
2335static const match_table_t i_lcoef_tokens = {
2336	{ I_LCOEF_RBPS,		"rbps=%u"	},
2337	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
2338	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
2339	{ I_LCOEF_WBPS,		"wbps=%u"	},
2340	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
2341	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
2342	{ NR_I_LCOEFS,		NULL		},
2343};
2344
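/*
 * io.cost.model takes "MAJ:MIN" followed by ctrl=auto|user, model=linear
 * and the six linear model coefficients rbps, rseqiops, rrandiops, wbps,
 * wseqiops and wrandiops (see i_lcoef_tokens above).  Writing any
 * coefficient switches ctrl to user; ctrl=auto drops the user model and
 * lets ioc_refresh_params() pick parameters again.  A hypothetical
 * example ("8:16" standing in for an actual device):
 *
 *	echo "8:16 rbps=1000000000 wbps=500000000" > io.cost.model
 */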
2345static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2346				    size_t nbytes, loff_t off)
2347{
2348	struct gendisk *disk;
2349	struct ioc *ioc;
2350	u64 u[NR_I_LCOEFS];
2351	bool user;
2352	char *p;
2353	int ret;
2354
2355	disk = blkcg_conf_get_disk(&input);
2356	if (IS_ERR(disk))
2357		return PTR_ERR(disk);
2358
2359	ioc = q_to_ioc(disk->queue);
2360	if (!ioc) {
2361		ret = blk_iocost_init(disk->queue);
2362		if (ret)
2363			goto err;
2364		ioc = q_to_ioc(disk->queue);
2365	}
2366
2367	spin_lock_irq(&ioc->lock);
2368	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2369	user = ioc->user_cost_model;
2370	spin_unlock_irq(&ioc->lock);
2371
2372	while ((p = strsep(&input, " \t\n"))) {
2373		substring_t args[MAX_OPT_ARGS];
2374		char buf[32];
2375		int tok;
2376		u64 v;
2377
2378		if (!*p)
2379			continue;
2380
2381		switch (match_token(p, cost_ctrl_tokens, args)) {
2382		case COST_CTRL:
2383			match_strlcpy(buf, &args[0], sizeof(buf));
2384			if (!strcmp(buf, "auto"))
2385				user = false;
2386			else if (!strcmp(buf, "user"))
2387				user = true;
2388			else
2389				goto einval;
2390			continue;
2391		case COST_MODEL:
2392			match_strlcpy(buf, &args[0], sizeof(buf));
2393			if (strcmp(buf, "linear"))
2394				goto einval;
2395			continue;
2396		}
2397
2398		tok = match_token(p, i_lcoef_tokens, args);
2399		if (tok == NR_I_LCOEFS)
2400			goto einval;
2401		if (match_u64(&args[0], &v))
2402			goto einval;
2403		u[tok] = v;
2404		user = true;
2405	}
2406
2407	spin_lock_irq(&ioc->lock);
2408	if (user) {
2409		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2410		ioc->user_cost_model = true;
2411	} else {
2412		ioc->user_cost_model = false;
2413	}
2414	ioc_refresh_params(ioc, true);
2415	spin_unlock_irq(&ioc->lock);
2416
2417	put_disk_and_module(disk);
2418	return nbytes;
2419
2420einval:
2421	ret = -EINVAL;
2422err:
2423	put_disk_and_module(disk);
2424	return ret;
2425}
2426
2427static struct cftype ioc_files[] = {
2428	{
2429		.name = "weight",
2430		.flags = CFTYPE_NOT_ON_ROOT,
2431		.seq_show = ioc_weight_show,
2432		.write = ioc_weight_write,
2433	},
2434	{
2435		.name = "cost.qos",
2436		.flags = CFTYPE_ONLY_ON_ROOT,
2437		.seq_show = ioc_qos_show,
2438		.write = ioc_qos_write,
2439	},
2440	{
2441		.name = "cost.model",
2442		.flags = CFTYPE_ONLY_ON_ROOT,
2443		.seq_show = ioc_cost_model_show,
2444		.write = ioc_cost_model_write,
2445	},
2446	{}
2447};
2448
2449static struct blkcg_policy blkcg_policy_iocost = {
2450	.dfl_cftypes	= ioc_files,
2451	.cpd_alloc_fn	= ioc_cpd_alloc,
2452	.cpd_free_fn	= ioc_cpd_free,
2453	.pd_alloc_fn	= ioc_pd_alloc,
2454	.pd_init_fn	= ioc_pd_init,
2455	.pd_free_fn	= ioc_pd_free,
2456};
2457
2458static int __init ioc_init(void)
2459{
2460	return blkcg_policy_register(&blkcg_policy_iocost);
2461}
2462
2463static void __exit ioc_exit(void)
2464{
2465	return blkcg_policy_unregister(&blkcg_policy_iocost);
2466}
2467
2468module_init(ioc_init);
2469module_exit(ioc_exit);