Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.1.
   1/*
   2 * Copyright © 2015-2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * Authors:
  24 *   Robert Bragg <robert@sixbynine.org>
  25 */
  26
  27
  28/**
  29 * DOC: i915 Perf Overview
  30 *
  31 * Gen graphics supports a large number of performance counters that can help
  32 * driver and application developers understand and optimize their use of the
  33 * GPU.
  34 *
  35 * This i915 perf interface enables userspace to configure and open a file
  36 * descriptor representing a stream of GPU metrics which can then be read() as
  37 * a stream of sample records.
  38 *
  39 * The interface is particularly suited to exposing buffered metrics that are
  40 * captured by DMA from the GPU, unsynchronized with and unrelated to the CPU.
  41 *
  42 * Streams representing a single context are accessible to applications with a
  43 * corresponding drm file descriptor, such that OpenGL can use the interface
  44 * without special privileges. Access to system-wide metrics requires root
  45 * privileges by default, unless changed via the dev.i915.perf_stream_paranoid
  46 * sysctl option.
  47 *
  48 */
  49
  50/**
  51 * DOC: i915 Perf History and Comparison with Core Perf
  52 *
  53 * The interface was initially inspired by the core Perf infrastructure but
  54 * some notable differences are:
  55 *
  56 * i915 perf file descriptors represent a "stream" instead of an "event"; where
  57 * a perf event primarily corresponds to a single 64bit value, while a stream
  58 * might sample sets of tightly-coupled counters, depending on the
  59 * configuration.  For example the Gen OA unit isn't designed to support
  60 * orthogonal configurations of individual counters; it's configured for a set
  61 * of related counters. Samples for an i915 perf stream capturing OA metrics
  62 * will include a set of counter values packed in a compact HW specific format.
  63 * The OA unit supports a number of different packing formats which can be
  64 * selected by the user opening the stream. Perf has support for grouping
  65 * events, but each event in the group is configured, validated and
  66 * authenticated individually with separate system calls.
  67 *
  68 * i915 perf stream configurations are provided as an array of u64 (key,value)
  69 * pairs, instead of a fixed struct with multiple miscellaneous config members,
  70 * interleaved with event-type specific members.
  71 *
  72 * i915 perf doesn't support exposing metrics via an mmap'd circular buffer.
  73 * The supported metrics are being written to memory by the GPU unsynchronized
  74 * with the CPU, using HW specific packing formats for counter sets. Sometimes
  75 * the constraints on HW configuration require reports to be filtered before it
  76 * would be acceptable to expose them to unprivileged applications - to hide
  77 * the metrics of other processes/contexts. For these use cases a read() based
  78 * interface is a good fit, and provides an opportunity to filter data as it
  79 * gets copied from the GPU mapped buffers to userspace buffers.
  80 *
  81 *
  82 * Issues hit with first prototype based on Core Perf
  83 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  84 *
  85 * The first prototype of this driver was based on the core perf
  86 * infrastructure, and while we did make that mostly work, with some changes to
  87 * perf, we found we were breaking or working around too many assumptions baked
  88 * into perf's currently cpu centric design.
  89 *
  90 * In the end we didn't see a clear benefit to making perf's implementation and
  91 * interface more complex by changing design assumptions while we knew we still
  92 * wouldn't be able to use any existing perf based userspace tools.
  93 *
  94 * Also considering the Gen specific nature of the Observability hardware and
  95 * how userspace will sometimes need to combine i915 perf OA metrics with
  96 * side-band OA data captured via MI_REPORT_PERF_COUNT commands; we're
  97 * expecting the interface to be used by a platform specific userspace such as
  98 * OpenGL or tools. This is to say; we aren't inherently missing out on having
  99 * a standard vendor/architecture agnostic interface by not using perf.
 100 *
 101 *
 102 * For posterity, in case we might re-visit trying to adapt core perf to be
 103 * better suited to exposing i915 metrics these were the main pain points we
 104 * hit:
 105 *
 106 * - The perf based OA PMU driver broke some significant design assumptions:
 107 *
 108 *   Existing perf pmus are used for profiling work on a cpu and we were
 109 *   introducing the idea of _IS_DEVICE pmus with different security
 110 *   implications, the need to fake cpu-related data (such as user/kernel
 111 *   registers) to fit with perf's current design, and adding _DEVICE records
 112 *   as a way to forward device-specific status records.
 113 *
 114 *   The OA unit writes reports of counters into a circular buffer, without
 115 *   involvement from the CPU, making our PMU driver the first of a kind.
 116 *
 117 *   Given the way we were periodically forwarding data from the GPU-mapped, OA
 118 *   buffer to perf's buffer, those bursts of sample writes looked to perf like
 119 *   we were sampling too fast and so we had to subvert its throttling checks.
 120 *
 121 *   Perf supports groups of counters and allows those to be read via
 122 *   transactions internally but transactions currently seem designed to be
 123 *   explicitly initiated from the cpu (say in response to a userspace read())
 124 *   and while we could pull a report out of the OA buffer we can't
 125 *   trigger a report from the cpu on demand.
 126 *
 127 *   Related to being report based; the OA counters are configured in HW as a
 128 *   set while perf generally expects counter configurations to be orthogonal.
 129 *   Although counters can be associated with a group leader as they are
 130 *   opened, there's no clear precedent for being able to provide group-wide
 131 *   configuration attributes (for example we want to let userspace choose the
 132 *   OA unit report format used to capture all counters in a set, or specify a
 133 *   GPU context to filter metrics on). We avoided using perf's grouping
 134 *   feature and forwarded OA reports to userspace via perf's 'raw' sample
 135 *   field. This suited our userspace well considering how coupled the counters
 136 *   are when dealing with normalizing. It would be inconvenient to split
 137 *   counters up into separate events, only to require userspace to recombine
 138 *   them. For Mesa it's also convenient to be forwarded raw, periodic reports
 139 *   for combining with the side-band raw reports it captures using
 140 *   MI_REPORT_PERF_COUNT commands.
 141 *
 142 *   - As a side note on perf's grouping feature; there was also some concern
 143 *     that using PERF_FORMAT_GROUP as a way to pack together counter values
 144 *     would quite drastically inflate our sample sizes, which would likely
 145 *     lower the effective sampling resolutions we could use when the available
 146 *     memory bandwidth is limited.
 147 *
 148 *     With the OA unit's report formats, counters are packed together as 32
 149 *     or 40bit values, with the largest report size being 256 bytes.
 150 *
 151 *     PERF_FORMAT_GROUP values are 64bit, but there doesn't appear to be a
 152 *     documented ordering to the values, implying PERF_FORMAT_ID must also be
 153 *     used to add a 64bit ID before each value; giving 16 bytes per counter.
 154 *
 155 *   Related to counter orthogonality; we can't time share the OA unit, while
 156 *   event scheduling is a central design idea within perf for allowing
 157 *   userspace to open + enable more events than can be configured in HW at any
 158 *   one time.  The OA unit is not designed to allow re-configuration while in
 159 *   use. We can't reconfigure the OA unit without losing internal OA unit
 160 *   state which we can't access explicitly to save and restore. Reconfiguring
 161 *   the OA unit is also relatively slow, involving ~100 register writes. From
 162 *   userspace Mesa also depends on a stable OA configuration when emitting
 163 *   MI_REPORT_PERF_COUNT commands and importantly the OA unit can't be
 164 *   disabled while there are outstanding MI_RPC commands lest we hang the
 165 *   command streamer.
 166 *
 167 *   The contents of sample records aren't extensible by device drivers (i.e.
 168 *   the sample_type bits). As an example; Sourab Gupta had been looking to
 169 *   attach GPU timestamps to our OA samples. We were shoehorning OA reports
 170 *   into sample records by using the 'raw' field, but it's tricky to pack more
 171 *   than one thing into this field because events/core.c currently only lets a
 172 *   pmu give a single raw data pointer plus len which will be copied into the
 173 *   ring buffer. To include more than the OA report we'd have to copy the
 174 *   report into an intermediate larger buffer. I'd been considering allowing a
 175 *   vector of data+len values to be specified for copying the raw data, but
 176 *   it felt like a kludge to be using the raw field for this purpose.
 177 *
 178 * - It felt like our perf based PMU was making some technical compromises
 179 *   just for the sake of using perf:
 180 *
 181 *   perf_event_open() requires events to either relate to a pid or a specific
 182 *   cpu core, while our device pmu related to neither.  Events opened with a
 183 *   pid will be automatically enabled/disabled according to the scheduling of
 184 *   that process - so not appropriate for us. When an event is related to a
 185 *   cpu id, perf ensures pmu methods will be invoked via an inter process
 186 *   interrupt on that core. To avoid invasive changes our userspace opened OA
 187 *   perf events for a specific cpu. This was workable but it meant the
 188 *   majority of the OA driver ran in atomic context, including all OA report
 189 *   forwarding, which wasn't really necessary in our case and seems to make
 190 *   our locking requirements somewhat complex as we handled the interaction
 191 *   with the rest of the i915 driver.
 192 */
 193
 194#include <linux/anon_inodes.h>
 195#include <linux/sizes.h>
 196#include <linux/uuid.h>
 197
 198#include "i915_drv.h"
 199#include "i915_oa_hsw.h"
 200#include "i915_oa_bdw.h"
 201#include "i915_oa_chv.h"
 202#include "i915_oa_sklgt2.h"
 203#include "i915_oa_sklgt3.h"
 204#include "i915_oa_sklgt4.h"
 205#include "i915_oa_bxt.h"
 206#include "i915_oa_kblgt2.h"
 207#include "i915_oa_kblgt3.h"
 208#include "i915_oa_glk.h"
 209#include "i915_oa_cflgt2.h"
 210#include "i915_oa_cflgt3.h"
 211#include "i915_oa_cnl.h"
 212
 213/* HW requires this to be a power of two, between 128k and 16M, though driver
 214 * is currently generally designed assuming the largest 16M size is used such
 215 * that the overflow cases are unlikely in normal operation.
 216 */
 217#define OA_BUFFER_SIZE		SZ_16M
 218
 219#define OA_TAKEN(tail, head)	((tail - head) & (OA_BUFFER_SIZE - 1))
 220
 221/**
 222 * DOC: OA Tail Pointer Race
 223 *
 224 * There's a HW race condition between OA unit tail pointer register updates and
 225 * writes to memory whereby the tail pointer can sometimes get ahead of what's
 226 * been written out to the OA buffer so far (in terms of what's visible to the
 227 * CPU).
 228 *
 229 * Although this can be observed explicitly while copying reports to userspace
 230 * by checking for a zeroed report-id field in tail reports, we want to account
 231 * for this earlier, as part of the oa_buffer_check to avoid lots of redundant
 232 * read() attempts.
 233 *
 234 * In effect we define a tail pointer for reading that lags the real tail
 235 * pointer by at least %OA_TAIL_MARGIN_NSEC nanoseconds, which gives enough
 236 * time for the corresponding reports to become visible to the CPU.
 237 *
 238 * To manage this we actually track two tail pointers:
 239 *  1) An 'aging' tail with an associated timestamp that is tracked until we
 240 *     can trust the corresponding data is visible to the CPU; at which point
 241 *     it is considered 'aged'.
 242 *  2) An 'aged' tail that can be used for read()ing.
 243 *
 244 * The two separate pointers let us decouple read()s from tail pointer aging.
 245 *
 246 * The tail pointers are checked and updated at a limited rate within a hrtimer
 247 * callback (the same callback that is used for delivering EPOLLIN events)
 248 *
 249 * Initially the tails are marked invalid with %INVALID_TAIL_PTR which
 250 * indicates that an updated tail pointer is needed.
 251 *
 252 * Most of the implementation details for this workaround are in
 253 * oa_buffer_check_unlocked() and _append_oa_reports()
 254 *
 255 * Note for posterity: previously the driver used to define an effective tail
 256 * pointer that lagged the real pointer by a 'tail margin' measured in bytes
 257 * derived from %OA_TAIL_MARGIN_NSEC and the configured sampling frequency.
 258 * This was flawed considering that the OA unit may also automatically generate
 259 * non-periodic reports (such as on context switch) or the OA unit may be
 260 * enabled without any periodic sampling.
 261 */
 262#define OA_TAIL_MARGIN_NSEC	100000ULL
 263#define INVALID_TAIL_PTR	0xffffffff
 264
 265/* frequency for checking whether the OA unit has written new reports to the
 266 * circular OA buffer...
 267 */
 268#define POLL_FREQUENCY 200
 269#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
 270
 271/* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */
 272static int zero;
 273static int one = 1;
 274static u32 i915_perf_stream_paranoid = true;
 275
 276/* The maximum exponent the hardware accepts is 63 (essentially it selects one
 277 * of the 64bit timestamp bits to trigger reports from) but there's currently
 278 * no known use case for sampling as infrequently as once per 47 thousand years.
 279 *
 280 * Since the timestamps included in OA reports are only 32bits it seems
 281 * reasonable to limit the OA exponent where it's still possible to account for
 282 * overflow in OA report timestamps.
 283 */
 284#define OA_EXPONENT_MAX 31
 285
 286#define INVALID_CTX_ID 0xffffffff
 287
 288/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
 289#define OAREPORT_REASON_MASK           0x3f
 290#define OAREPORT_REASON_SHIFT          19
 291#define OAREPORT_REASON_TIMER          (1<<0)
 292#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
 293#define OAREPORT_REASON_CLK_RATIO      (1<<5)
 294
 295
 296/* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
 297 *
 298 * The highest sampling frequency we can theoretically program the OA unit
 299 * with is always half the timestamp frequency: E.g. 6.25Mhz for Haswell.
 300 *
 301 * Initialized just before we register the sysctl parameter.
 302 */
 303static int oa_sample_rate_hard_limit;
 304
 305/* Theoretically we can program the OA unit to sample every 160ns but don't
 306 * allow that by default unless root...
 307 *
 308 * The default threshold of 100000Hz is based on perf's similar
 309 * kernel.perf_event_max_sample_rate sysctl parameter.
 310 */
 311static u32 i915_oa_max_sample_rate = 100000;
 312
 313/* XXX: beware if future OA HW adds new report formats that the current
 314 * code assumes all reports have a power-of-two size and ~(size - 1) can
 315 * be used as a mask to align the OA tail pointer.
 316 */
 317static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
 318	[I915_OA_FORMAT_A13]	    = { 0, 64 },
 319	[I915_OA_FORMAT_A29]	    = { 1, 128 },
 320	[I915_OA_FORMAT_A13_B8_C8]  = { 2, 128 },
 321	/* A29_B8_C8 Disallowed as 192 bytes doesn't factor into buffer size */
 322	[I915_OA_FORMAT_B4_C8]	    = { 4, 64 },
 323	[I915_OA_FORMAT_A45_B8_C8]  = { 5, 256 },
 324	[I915_OA_FORMAT_B4_C8_A16]  = { 6, 128 },
 325	[I915_OA_FORMAT_C4_B8]	    = { 7, 64 },
 326};
 327
 328static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
 329	[I915_OA_FORMAT_A12]		    = { 0, 64 },
 330	[I915_OA_FORMAT_A12_B8_C8]	    = { 2, 128 },
 331	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
 332	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
 333};
 334
 335#define SAMPLE_OA_REPORT      (1<<0)
 336
 337/**
 338 * struct perf_open_properties - for validated properties given to open a stream
 339 * @sample_flags: `DRM_I915_PERF_PROP_SAMPLE_*` properties are tracked as flags
 340 * @single_context: Whether a single or all gpu contexts should be monitored
 341 * @ctx_handle: A gem ctx handle for use with @single_context
 342 * @metrics_set: An ID for an OA unit metric set advertised via sysfs
 343 * @oa_format: An OA unit HW report format
 344 * @oa_periodic: Whether to enable periodic OA unit sampling
 345 * @oa_period_exponent: The OA unit sampling period is derived from this
 346 *
 347 * As read_properties_unlocked() enumerates and validates the properties given
 348 * to open a stream of metrics the configuration is built up in the structure
 349 * which starts out zero initialized.
 350 */
 351struct perf_open_properties {
 352	u32 sample_flags;
 353
 354	u64 single_context:1;
 355	u64 ctx_handle;
 356
 357	/* OA sampling state */
 358	int metrics_set;
 359	int oa_format;
 360	bool oa_periodic;
 361	int oa_period_exponent;
 362};
 363
 364static void free_oa_config(struct drm_i915_private *dev_priv,
 365			   struct i915_oa_config *oa_config)
 366{
 367	if (!PTR_ERR(oa_config->flex_regs))
 368		kfree(oa_config->flex_regs);
 369	if (!PTR_ERR(oa_config->b_counter_regs))
 370		kfree(oa_config->b_counter_regs);
 371	if (!PTR_ERR(oa_config->mux_regs))
 372		kfree(oa_config->mux_regs);
 373	kfree(oa_config);
 374}
 375
 376static void put_oa_config(struct drm_i915_private *dev_priv,
 377			  struct i915_oa_config *oa_config)
 378{
 379	if (!atomic_dec_and_test(&oa_config->ref_count))
 380		return;
 381
 382	free_oa_config(dev_priv, oa_config);
 383}
 384
 385static int get_oa_config(struct drm_i915_private *dev_priv,
 386			 int metrics_set,
 387			 struct i915_oa_config **out_config)
 388{
 389	int ret;
 390
 391	if (metrics_set == 1) {
 392		*out_config = &dev_priv->perf.oa.test_config;
 393		atomic_inc(&dev_priv->perf.oa.test_config.ref_count);
 394		return 0;
 395	}
 396
 397	ret = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
 398	if (ret)
 399		return ret;
 400
 401	*out_config = idr_find(&dev_priv->perf.metrics_idr, metrics_set);
 402	if (!*out_config)
 403		ret = -EINVAL;
 404	else
 405		atomic_inc(&(*out_config)->ref_count);
 406
 407	mutex_unlock(&dev_priv->perf.metrics_lock);
 408
 409	return ret;
 410}
 411
 412static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
 413{
 414	return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
 415}
 416
 417static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
 418{
 419	u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
 420
 421	return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
 422}
 423
 424/**
 425 * oa_buffer_check_unlocked - check for data and update tail ptr state
 426 * @dev_priv: i915 device instance
 427 *
 428 * This is either called via fops (for blocking reads in user ctx) or the poll
 429 * check hrtimer (atomic ctx) to check the OA buffer tail pointer and check
 430 * if there is data available for userspace to read.
 431 *
 432 * This function is central to providing a workaround for the OA unit tail
 433 * pointer having a race with respect to what data is visible to the CPU.
 434 * It is responsible for reading tail pointers from the hardware and giving
 435 * the pointers time to 'age' before they are made available for reading.
 436 * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
 437 *
 438 * Besides returning true when there is data available to read() this function
 439 * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp
 440 * and .aged_tail_idx state used for reading.
 441 *
 442 * Note: It's safe to read OA config state here unlocked, assuming that this is
 443 * only called while the stream is enabled, while the global OA configuration
 444 * can't be modified.
 445 *
 446 * Returns: %true if the OA buffer contains data, else %false
 447 */
 448static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
 449{
 450	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 451	unsigned long flags;
 452	unsigned int aged_idx;
 453	u32 head, hw_tail, aged_tail, aging_tail;
 454	u64 now;
 455
 456	/* We have to consider the (unlikely) possibility that read() errors
 457	 * could result in an OA buffer reset which might reset the head,
 458	 * tails[] and aged_tail state.
 459	 */
 460	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 461
 462	/* NB: The head we observe here might effectively be a little out of
 463	 * date (between head and tails[aged_idx].offset if there is currently
 464	 * a read() in progress.
 465	 */
 466	head = dev_priv->perf.oa.oa_buffer.head;
 467
 468	aged_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
 469	aged_tail = dev_priv->perf.oa.oa_buffer.tails[aged_idx].offset;
 470	aging_tail = dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset;
 471
 472	hw_tail = dev_priv->perf.oa.ops.oa_hw_tail_read(dev_priv);
 473
 474	/* The tail pointer increases in 64 byte increments,
 475	 * not in report_size steps...
 476	 */
 477	hw_tail &= ~(report_size - 1);
 478
 479	now = ktime_get_mono_fast_ns();
 480
 481	/* Update the aged tail
 482	 *
 483	 * Flip the tail pointer available for read()s once the aging tail is
 484	 * old enough to trust that the corresponding data will be visible to
 485	 * the CPU...
 486	 *
 487	 * Do this before updating the aging pointer in case we may be able to
 488	 * immediately start aging a new pointer too (if new data has become
 489	 * available) without needing to wait for a later hrtimer callback.
 490	 */
 491	if (aging_tail != INVALID_TAIL_PTR &&
 492	    ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
 493	     OA_TAIL_MARGIN_NSEC)) {
 494
 495		aged_idx ^= 1;
 496		dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
 497
 498		aged_tail = aging_tail;
 499
 500		/* Mark that we need a new pointer to start aging... */
 501		dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = INVALID_TAIL_PTR;
 502		aging_tail = INVALID_TAIL_PTR;
 503	}
 504
 505	/* Update the aging tail
 506	 *
 507	 * We throttle aging tail updates until we have a new tail that
 508	 * represents >= one report more data than is already available for
 509	 * reading. This ensures there will be enough data for a successful
 510	 * read once this new pointer has aged and ensures we will give the new
 511	 * pointer time to age.
 512	 */
 513	if (aging_tail == INVALID_TAIL_PTR &&
 514	    (aged_tail == INVALID_TAIL_PTR ||
 515	     OA_TAKEN(hw_tail, aged_tail) >= report_size)) {
 516		struct i915_vma *vma = dev_priv->perf.oa.oa_buffer.vma;
 517		u32 gtt_offset = i915_ggtt_offset(vma);
 518
 519		/* Be paranoid and do a bounds check on the pointer read back
 520		 * from hardware, just in case some spurious hardware condition
 521		 * could put the tail out of bounds...
 522		 */
 523		if (hw_tail >= gtt_offset &&
 524		    hw_tail < (gtt_offset + OA_BUFFER_SIZE)) {
 525			dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset =
 526				aging_tail = hw_tail;
 527			dev_priv->perf.oa.oa_buffer.aging_timestamp = now;
 528		} else {
 529			DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %u\n",
 530				  hw_tail);
 531		}
 532	}
 533
 534	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 535
 536	return aged_tail == INVALID_TAIL_PTR ?
 537		false : OA_TAKEN(aged_tail, head) >= report_size;
 538}
 539
 540/**
 541 * append_oa_status - Appends a status record to a userspace read() buffer.
 542 * @stream: An i915-perf stream opened for OA metrics
 543 * @buf: destination buffer given by userspace
 544 * @count: the number of bytes userspace wants to read
 545 * @offset: (inout): the current position for writing into @buf
 546 * @type: The kind of status to report to userspace
 547 *
 548 * Writes a status record (such as `DRM_I915_PERF_RECORD_OA_REPORT_LOST`)
 549 * into the userspace read() buffer.
 550 *
 551 * The @buf @offset will only be updated on success.
 552 *
 553 * Returns: 0 on success, negative error code on failure.
 554 */
 555static int append_oa_status(struct i915_perf_stream *stream,
 556			    char __user *buf,
 557			    size_t count,
 558			    size_t *offset,
 559			    enum drm_i915_perf_record_type type)
 560{
 561	struct drm_i915_perf_record_header header = { type, 0, sizeof(header) };
 562
 563	if ((count - *offset) < header.size)
 564		return -ENOSPC;
 565
 566	if (copy_to_user(buf + *offset, &header, sizeof(header)))
 567		return -EFAULT;
 568
 569	(*offset) += header.size;
 570
 571	return 0;
 572}
 573
 574/**
 575 * append_oa_sample - Copies single OA report into userspace read() buffer.
 576 * @stream: An i915-perf stream opened for OA metrics
 577 * @buf: destination buffer given by userspace
 578 * @count: the number of bytes userspace wants to read
 579 * @offset: (inout): the current position for writing into @buf
 580 * @report: A single OA report to (optionally) include as part of the sample
 581 *
 582 * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
 583 * properties when opening a stream, tracked as `stream->sample_flags`. This
 584 * function copies the requested components of a single sample to the given
 585 * read() @buf.
 586 *
 587 * The @buf @offset will only be updated on success.
 588 *
 589 * Returns: 0 on success, negative error code on failure.
 590 */
 591static int append_oa_sample(struct i915_perf_stream *stream,
 592			    char __user *buf,
 593			    size_t count,
 594			    size_t *offset,
 595			    const u8 *report)
 596{
 597	struct drm_i915_private *dev_priv = stream->dev_priv;
 598	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 599	struct drm_i915_perf_record_header header;
 600	u32 sample_flags = stream->sample_flags;
 601
 602	header.type = DRM_I915_PERF_RECORD_SAMPLE;
 603	header.pad = 0;
 604	header.size = stream->sample_size;
 605
 606	if ((count - *offset) < header.size)
 607		return -ENOSPC;
 608
 609	buf += *offset;
 610	if (copy_to_user(buf, &header, sizeof(header)))
 611		return -EFAULT;
 612	buf += sizeof(header);
 613
 614	if (sample_flags & SAMPLE_OA_REPORT) {
 615		if (copy_to_user(buf, report, report_size))
 616			return -EFAULT;
 617	}
 618
 619	(*offset) += header.size;
 620
 621	return 0;
 622}
 623
 624/**
 625 * gen8_append_oa_reports - Copies all buffered OA reports into userspace read() buffer.
 626 * @stream: An i915-perf stream opened for OA metrics
 627 * @buf: destination buffer given by userspace
 628 * @count: the number of bytes userspace wants to read
 629 * @offset: (inout): the current position for writing into @buf
 630 *
 631 * Notably any error condition resulting in a short read (-%ENOSPC or
 632 * -%EFAULT) will be returned even though one or more records may
 633 * have been successfully copied. In this case it's up to the caller
 634 * to decide if the error should be squashed before returning to
 635 * userspace.
 636 *
 637 * Note: reports are consumed from the head, and appended to the
 638 * tail, so the tail chases the head?... If you think that's mad
 639 * and back-to-front you're not alone, but this follows the
 640 * Gen PRM naming convention.
 641 *
 642 * Returns: 0 on success, negative error code on failure.
 643 */
 644static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 645				  char __user *buf,
 646				  size_t count,
 647				  size_t *offset)
 648{
 649	struct drm_i915_private *dev_priv = stream->dev_priv;
 650	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 651	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 652	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
 653	u32 mask = (OA_BUFFER_SIZE - 1);
 654	size_t start_offset = *offset;
 655	unsigned long flags;
 656	unsigned int aged_tail_idx;
 657	u32 head, tail;
 658	u32 taken;
 659	int ret = 0;
 660
 661	if (WARN_ON(!stream->enabled))
 662		return -EIO;
 663
 664	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 665
 666	head = dev_priv->perf.oa.oa_buffer.head;
 667	aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
 668	tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
 669
 670	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 671
 672	/*
 673	 * An invalid tail pointer here means we're still waiting for the poll
 674	 * hrtimer callback to give us a pointer
 675	 */
 676	if (tail == INVALID_TAIL_PTR)
 677		return -EAGAIN;
 678
 679	/*
 680	 * NB: oa_buffer.head/tail include the gtt_offset which we don't want
 681	 * while indexing relative to oa_buf_base.
 682	 */
 683	head -= gtt_offset;
 684	tail -= gtt_offset;
 685
 686	/*
 687	 * An out of bounds or misaligned head or tail pointer implies a driver
 688	 * bug since we validate + align the tail pointers we read from the
 689	 * hardware and we are in full control of the head pointer which should
 690	 * only be incremented by multiples of the report size (notably also
 691	 * all a power of two).
 692	 */
 693	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
 694		      tail > OA_BUFFER_SIZE || tail % report_size,
 695		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 696		      head, tail))
 697		return -EIO;
 698
 699
 700	for (/* none */;
 701	     (taken = OA_TAKEN(tail, head));
 702	     head = (head + report_size) & mask) {
 703		u8 *report = oa_buf_base + head;
 704		u32 *report32 = (void *)report;
 705		u32 ctx_id;
 706		u32 reason;
 707
 708		/*
 709		 * All the report sizes factor neatly into the buffer
 710		 * size so we never expect to see a report split
 711		 * between the beginning and end of the buffer.
 712		 *
 713		 * Given the initial alignment check a misalignment
 714		 * here would imply a driver bug that would result
 715		 * in an overrun.
 716		 */
 717		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
 718			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
 719			break;
 720		}
 721
 722		/*
 723		 * The reason field includes flags identifying what
 724		 * triggered this specific report (mostly timer
 725		 * triggered or e.g. due to a context switch).
 726		 *
 727		 * This field is never expected to be zero so we can
 728		 * check that the report isn't invalid before copying
 729		 * it to userspace...
 730		 */
 731		reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
 732			  OAREPORT_REASON_MASK);
 733		if (reason == 0) {
 734			if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
 735				DRM_NOTE("Skipping spurious, invalid OA report\n");
 736			continue;
 737		}
 738
 739		/*
 740		 * XXX: Just keep the lower 21 bits for now since I'm not
 741		 * entirely sure if the HW touches any of the higher bits in
 742		 * this field
 743		 */
 744		ctx_id = report32[2] & 0x1fffff;
 745
 746		/*
 747		 * Squash whatever is in the CTX_ID field if it's marked as
 748		 * invalid to be sure we avoid false-positive, single-context
 749		 * filtering below...
 750		 *
 751		 * Note: that we don't clear the valid_ctx_bit so userspace can
 752		 * understand that the ID has been squashed by the kernel.
 753		 */
 754		if (!(report32[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
 755			ctx_id = report32[2] = INVALID_CTX_ID;
 756
 757		/*
 758		 * NB: For Gen 8 the OA unit no longer supports clock gating
 759		 * off for a specific context and the kernel can't securely
 760		 * stop the counters from updating as system-wide / global
 761		 * values.
 762		 *
 763		 * Automatic reports now include a context ID so reports can be
 764		 * filtered on the cpu but it's not worth trying to
 765		 * automatically subtract/hide counter progress for other
 766		 * contexts while filtering since we can't stop userspace
 767		 * issuing MI_REPORT_PERF_COUNT commands which would still
 768		 * provide a side-band view of the real values.
 769		 *
 770		 * To allow userspace (such as Mesa/GL_INTEL_performance_query)
 771		 * to normalize counters for a single filtered context then it
 772		 * needs be forwarded bookend context-switch reports so that it
 773		 * can track switches in between MI_REPORT_PERF_COUNT commands
 774		 * and can itself subtract/ignore the progress of counters
 775		 * associated with other contexts. Note that the hardware
 776		 * automatically triggers reports when switching to a new
 777		 * context which are tagged with the ID of the newly active
 778		 * context. To avoid the complexity (and likely fragility) of
 779		 * reading ahead while parsing reports to try and minimize
 780		 * forwarding redundant context switch reports (i.e. between
 781		 * other, unrelated contexts) we simply elect to forward them
 782		 * all.
 783		 *
 784		 * We don't rely solely on the reason field to identify context
 785		 * switches since it's not-uncommon for periodic samples to
 786		 * identify a switch before any 'context switch' report.
 787		 */
 788		if (!dev_priv->perf.oa.exclusive_stream->ctx ||
 789		    dev_priv->perf.oa.specific_ctx_id == ctx_id ||
 790		    (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
 791		     dev_priv->perf.oa.specific_ctx_id) ||
 792		    reason & OAREPORT_REASON_CTX_SWITCH) {
 793
 794			/*
 795			 * While filtering for a single context we avoid
 796			 * leaking the IDs of other contexts.
 797			 */
 798			if (dev_priv->perf.oa.exclusive_stream->ctx &&
 799			    dev_priv->perf.oa.specific_ctx_id != ctx_id) {
 800				report32[2] = INVALID_CTX_ID;
 801			}
 802
 803			ret = append_oa_sample(stream, buf, count, offset,
 804					       report);
 805			if (ret)
 806				break;
 807
 808			dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
 809		}
 810
 811		/*
 812		 * The above reason field sanity check is based on
 813		 * the assumption that the OA buffer is initially
 814		 * zeroed and we reset the field after copying so the
 815		 * check is still meaningful once old reports start
 816		 * being overwritten.
 817		 */
 818		report32[0] = 0;
 819	}
 820
 821	if (start_offset != *offset) {
 822		spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 823
 824		/*
 825		 * We removed the gtt_offset for the copy loop above, indexing
 826		 * relative to oa_buf_base so put back here...
 827		 */
 828		head += gtt_offset;
 829
 830		I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
 831		dev_priv->perf.oa.oa_buffer.head = head;
 832
 833		spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 834	}
 835
 836	return ret;
 837}
 838
 839/**
 840 * gen8_oa_read - copy status records then buffered OA reports
 841 * @stream: An i915-perf stream opened for OA metrics
 842 * @buf: destination buffer given by userspace
 843 * @count: the number of bytes userspace wants to read
 844 * @offset: (inout): the current position for writing into @buf
 845 *
 846 * Checks OA unit status registers and if necessary appends corresponding
 847 * status records for userspace (such as for a buffer full condition) and then
 848 * initiate appending any buffered OA reports.
 849 *
 850 * Updates @offset according to the number of bytes successfully copied into
 851 * the userspace buffer.
 852 *
 853 * NB: some data may be successfully copied to the userspace buffer
 854 * even if an error is returned, and this is reflected in the
 855 * updated @offset.
 856 *
 857 * Returns: zero on success or a negative error code
 858 */
 859static int gen8_oa_read(struct i915_perf_stream *stream,
 860			char __user *buf,
 861			size_t count,
 862			size_t *offset)
 863{
 864	struct drm_i915_private *dev_priv = stream->dev_priv;
 865	u32 oastatus;
 866	int ret;
 867
 868	if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
 869		return -EIO;
 870
 871	oastatus = I915_READ(GEN8_OASTATUS);
 872
 873	/*
 874	 * We treat OABUFFER_OVERFLOW as a significant error:
 875	 *
 876	 * Although theoretically we could handle this more gracefully
 877	 * sometimes, some Gens don't correctly suppress certain
 878	 * automatically triggered reports in this condition and so we
 879	 * have to assume that old reports are now being trampled
 880	 * over.
 881	 *
 882	 * Considering how we don't currently give userspace control
 883	 * over the OA buffer size and always configure a large 16MB
 884	 * buffer, then a buffer overflow does anyway likely indicate
 885	 * that something has gone quite badly wrong.
 886	 */
 887	if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
 888		ret = append_oa_status(stream, buf, count, offset,
 889				       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
 890		if (ret)
 891			return ret;
 892
 893		DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
 894			  dev_priv->perf.oa.period_exponent);
 895
 896		dev_priv->perf.oa.ops.oa_disable(dev_priv);
 897		dev_priv->perf.oa.ops.oa_enable(dev_priv);
 898
 899		/*
 900		 * Note: .oa_enable() is expected to re-init the oabuffer and
 901		 * reset GEN8_OASTATUS for us
 902		 */
 903		oastatus = I915_READ(GEN8_OASTATUS);
 904	}
 905
 906	if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
 907		ret = append_oa_status(stream, buf, count, offset,
 908				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
 909		if (ret)
 910			return ret;
 911		I915_WRITE(GEN8_OASTATUS,
 912			   oastatus & ~GEN8_OASTATUS_REPORT_LOST);
 913	}
 914
 915	return gen8_append_oa_reports(stream, buf, count, offset);
 916}
 917
 918/**
 919 * Copies all buffered OA reports into userspace read() buffer.
 920 * @stream: An i915-perf stream opened for OA metrics
 921 * @buf: destination buffer given by userspace
 922 * @count: the number of bytes userspace wants to read
 923 * @offset: (inout): the current position for writing into @buf
 924 *
 925 * Notably any error condition resulting in a short read (-%ENOSPC or
 926 * -%EFAULT) will be returned even though one or more records may
 927 * have been successfully copied. In this case it's up to the caller
 928 * to decide if the error should be squashed before returning to
 929 * userspace.
 930 *
 931 * Note: reports are consumed from the head, and appended to the
 932 * tail, so the tail chases the head?... If you think that's mad
 933 * and back-to-front you're not alone, but this follows the
 934 * Gen PRM naming convention.
 935 *
 936 * Returns: 0 on success, negative error code on failure.
 937 */
 938static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 939				  char __user *buf,
 940				  size_t count,
 941				  size_t *offset)
 942{
 943	struct drm_i915_private *dev_priv = stream->dev_priv;
 944	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 945	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
 946	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
 947	u32 mask = (OA_BUFFER_SIZE - 1);
 948	size_t start_offset = *offset;
 949	unsigned long flags;
 950	unsigned int aged_tail_idx;
 951	u32 head, tail;
 952	u32 taken;
 953	int ret = 0;
 954
 955	if (WARN_ON(!stream->enabled))
 956		return -EIO;
 957
 958	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 959
 960	head = dev_priv->perf.oa.oa_buffer.head;
 961	aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
 962	tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
 963
 964	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
 965
 966	/* An invalid tail pointer here means we're still waiting for the poll
 967	 * hrtimer callback to give us a pointer
 968	 */
 969	if (tail == INVALID_TAIL_PTR)
 970		return -EAGAIN;
 971
 972	/* NB: oa_buffer.head/tail include the gtt_offset which we don't want
 973	 * while indexing relative to oa_buf_base.
 974	 */
 975	head -= gtt_offset;
 976	tail -= gtt_offset;
 977
 978	/* An out of bounds or misaligned head or tail pointer implies a driver
 979	 * bug since we validate + align the tail pointers we read from the
 980	 * hardware and we are in full control of the head pointer which should
 981	 * only be incremented by multiples of the report size (notably also
 982	 * all a power of two).
 983	 */
 984	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
 985		      tail > OA_BUFFER_SIZE || tail % report_size,
 986		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 987		      head, tail))
 988		return -EIO;
 989
 990
 991	for (/* none */;
 992	     (taken = OA_TAKEN(tail, head));
 993	     head = (head + report_size) & mask) {
 994		u8 *report = oa_buf_base + head;
 995		u32 *report32 = (void *)report;
 996
 997		/* All the report sizes factor neatly into the buffer
 998		 * size so we never expect to see a report split
 999		 * between the beginning and end of the buffer.
1000		 *
1001		 * Given the initial alignment check a misalignment
1002		 * here would imply a driver bug that would result
1003		 * in an overrun.
1004		 */
1005		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
1006			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
1007			break;
1008		}
1009
1010		/* The report-ID field for periodic samples includes
1011		 * some undocumented flags related to what triggered
1012		 * the report and is never expected to be zero so we
1013		 * can check that the report isn't invalid before
1014		 * copying it to userspace...
1015		 */
1016		if (report32[0] == 0) {
1017			if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
1018				DRM_NOTE("Skipping spurious, invalid OA report\n");
1019			continue;
1020		}
1021
1022		ret = append_oa_sample(stream, buf, count, offset, report);
1023		if (ret)
1024			break;
1025
1026		/* The above report-id field sanity check is based on
1027		 * the assumption that the OA buffer is initially
1028		 * zeroed and we reset the field after copying so the
1029		 * check is still meaningful once old reports start
1030		 * being overwritten.
1031		 */
1032		report32[0] = 0;
1033	}
1034
1035	if (start_offset != *offset) {
1036		spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1037
1038		/* We removed the gtt_offset for the copy loop above, indexing
1039		 * relative to oa_buf_base so put back here...
1040		 */
1041		head += gtt_offset;
1042
1043		I915_WRITE(GEN7_OASTATUS2,
1044			   ((head & GEN7_OASTATUS2_HEAD_MASK) |
1045			    OA_MEM_SELECT_GGTT));
1046		dev_priv->perf.oa.oa_buffer.head = head;
1047
1048		spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1049	}
1050
1051	return ret;
1052}
1053
1054/**
1055 * gen7_oa_read - copy status records then buffered OA reports
1056 * @stream: An i915-perf stream opened for OA metrics
1057 * @buf: destination buffer given by userspace
1058 * @count: the number of bytes userspace wants to read
1059 * @offset: (inout): the current position for writing into @buf
1060 *
1061 * Checks Gen 7 specific OA unit status registers and if necessary appends
1062 * corresponding status records for userspace (such as for a buffer full
1063 * condition) and then initiate appending any buffered OA reports.
1064 *
1065 * Updates @offset according to the number of bytes successfully copied into
1066 * the userspace buffer.
1067 *
1068 * Returns: zero on success or a negative error code
1069 */
1070static int gen7_oa_read(struct i915_perf_stream *stream,
1071			char __user *buf,
1072			size_t count,
1073			size_t *offset)
1074{
1075	struct drm_i915_private *dev_priv = stream->dev_priv;
1076	u32 oastatus1;
1077	int ret;
1078
1079	if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
1080		return -EIO;
1081
1082	oastatus1 = I915_READ(GEN7_OASTATUS1);
1083
1084	/* XXX: On Haswell we don't have a safe way to clear oastatus1
1085	 * bits while the OA unit is enabled (while the tail pointer
1086	 * may be updated asynchronously) so we ignore status bits
1087	 * that have already been reported to userspace.
1088	 */
1089	oastatus1 &= ~dev_priv->perf.oa.gen7_latched_oastatus1;
1090
1091	/* We treat OABUFFER_OVERFLOW as a significant error:
1092	 *
1093	 * - The status can be interpreted to mean that the buffer is
1094	 *   currently full (with a higher precedence than OA_TAKEN()
1095	 *   which will start to report a near-empty buffer after an
1096	 *   overflow) but it's awkward that we can't clear the status
1097	 *   on Haswell, so without a reset we won't be able to catch
1098	 *   the state again.
1099	 *
1100	 * - Since it also implies the HW has started overwriting old
1101	 *   reports it may also affect our sanity checks for invalid
1102	 *   reports when copying to userspace that assume new reports
1103	 *   are being written to cleared memory.
1104	 *
1105	 * - In the future we may want to introduce a flight recorder
1106	 *   mode where the driver will automatically maintain a safe
1107	 *   guard band between head/tail, avoiding this overflow
1108	 *   condition, but we avoid the added driver complexity for
1109	 *   now.
1110	 */
1111	if (unlikely(oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW)) {
1112		ret = append_oa_status(stream, buf, count, offset,
1113				       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
1114		if (ret)
1115			return ret;
1116
1117		DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
1118			  dev_priv->perf.oa.period_exponent);
1119
1120		dev_priv->perf.oa.ops.oa_disable(dev_priv);
1121		dev_priv->perf.oa.ops.oa_enable(dev_priv);
1122
1123		oastatus1 = I915_READ(GEN7_OASTATUS1);
1124	}
1125
1126	if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
1127		ret = append_oa_status(stream, buf, count, offset,
1128				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
1129		if (ret)
1130			return ret;
1131		dev_priv->perf.oa.gen7_latched_oastatus1 |=
1132			GEN7_OASTATUS1_REPORT_LOST;
1133	}
1134
1135	return gen7_append_oa_reports(stream, buf, count, offset);
1136}
1137
1138/**
1139 * i915_oa_wait_unlocked - handles blocking IO until OA data available
1140 * @stream: An i915-perf stream opened for OA metrics
1141 *
1142 * Called when userspace tries to read() from a blocking stream FD opened
1143 * for OA metrics. It waits until the hrtimer callback finds a non-empty
1144 * OA buffer and wakes us.
1145 *
1146 * Note: it's acceptable to have this return with some false positives
1147 * since any subsequent read handling will return -EAGAIN if there isn't
1148 * really data ready for userspace yet.
1149 *
1150 * Returns: zero on success or a negative error code
1151 */
1152static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
1153{
1154	struct drm_i915_private *dev_priv = stream->dev_priv;
1155
1156	/* We would wait indefinitely if periodic sampling is not enabled */
1157	if (!dev_priv->perf.oa.periodic)
1158		return -EIO;
1159
1160	return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
1161					oa_buffer_check_unlocked(dev_priv));
1162}
1163
1164/**
1165 * i915_oa_poll_wait - call poll_wait() for an OA stream poll()
1166 * @stream: An i915-perf stream opened for OA metrics
1167 * @file: An i915 perf stream file
1168 * @wait: poll() state table
1169 *
1170 * For handling userspace polling on an i915 perf stream opened for OA metrics,
1171 * this starts a poll_wait with the wait queue that our hrtimer callback wakes
1172 * when it sees data ready to read in the circular OA buffer.
1173 */
1174static void i915_oa_poll_wait(struct i915_perf_stream *stream,
1175			      struct file *file,
1176			      poll_table *wait)
1177{
1178	struct drm_i915_private *dev_priv = stream->dev_priv;
1179
1180	poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
1181}
1182
1183/**
1184 * i915_oa_read - just calls through to &i915_oa_ops->read
1185 * @stream: An i915-perf stream opened for OA metrics
1186 * @buf: destination buffer given by userspace
1187 * @count: the number of bytes userspace wants to read
1188 * @offset: (inout): the current position for writing into @buf
1189 *
1190 * Updates @offset according to the number of bytes successfully copied into
1191 * the userspace buffer.
1192 *
1193 * Returns: zero on success or a negative error code
1194 */
1195static int i915_oa_read(struct i915_perf_stream *stream,
1196			char __user *buf,
1197			size_t count,
1198			size_t *offset)
1199{
1200	struct drm_i915_private *dev_priv = stream->dev_priv;
1201
1202	return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
1203}
1204
1205/**
1206 * oa_get_render_ctx_id - determine and hold ctx hw id
1207 * @stream: An i915-perf stream opened for OA metrics
1208 *
1209 * Determine the render context hw id, and ensure it remains fixed for the
1210 * lifetime of the stream. This ensures that we don't have to worry about
1211 * updating the context ID in OACONTROL on the fly.
1212 *
1213 * Returns: zero on success or a negative error code
1214 */
1215static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
1216{
1217	struct drm_i915_private *dev_priv = stream->dev_priv;
1218
1219	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
1220		dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
1221	} else {
1222		struct intel_engine_cs *engine = dev_priv->engine[RCS];
1223		struct intel_ring *ring;
1224		int ret;
1225
1226		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
1227		if (ret)
1228			return ret;
1229
1230		/*
1231		 * As the ID is the gtt offset of the context's vma we
1232		 * pin the vma to ensure the ID remains fixed.
1233		 *
1234		 * NB: implied RCS engine...
1235		 */
1236		ring = engine->context_pin(engine, stream->ctx);
1237		mutex_unlock(&dev_priv->drm.struct_mutex);
1238		if (IS_ERR(ring))
1239			return PTR_ERR(ring);
1240
1241
1242		/*
1243		 * Explicitly track the ID (instead of calling
1244		 * i915_ggtt_offset() on the fly) considering the difference
1245		 * with gen8+ and execlists
1246		 */
1247		dev_priv->perf.oa.specific_ctx_id =
1248			i915_ggtt_offset(stream->ctx->engine[engine->id].state);
1249	}
1250
1251	return 0;
1252}
1253
1254/**
1255 * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_id releases hold
1256 * @stream: An i915-perf stream opened for OA metrics
1257 *
1258 * In case anything needed doing to ensure the context HW ID would remain valid
1259 * for the lifetime of the stream, then that can be undone here.
1260 */
1261static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
1262{
1263	struct drm_i915_private *dev_priv = stream->dev_priv;
1264
1265	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
1266		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
1267	} else {
1268		struct intel_engine_cs *engine = dev_priv->engine[RCS];
1269
1270		mutex_lock(&dev_priv->drm.struct_mutex);
1271
1272		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
1273		engine->context_unpin(engine, stream->ctx);
1274
1275		mutex_unlock(&dev_priv->drm.struct_mutex);
1276	}
1277}
1278
1279static void
1280free_oa_buffer(struct drm_i915_private *i915)
1281{
1282	mutex_lock(&i915->drm.struct_mutex);
1283
1284	i915_gem_object_unpin_map(i915->perf.oa.oa_buffer.vma->obj);
1285	i915_vma_unpin(i915->perf.oa.oa_buffer.vma);
1286	i915_gem_object_put(i915->perf.oa.oa_buffer.vma->obj);
1287
1288	i915->perf.oa.oa_buffer.vma = NULL;
1289	i915->perf.oa.oa_buffer.vaddr = NULL;
1290
1291	mutex_unlock(&i915->drm.struct_mutex);
1292}
1293
1294static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
1295{
1296	struct drm_i915_private *dev_priv = stream->dev_priv;
1297
1298	BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
1299
1300	/*
1301	 * Unset exclusive_stream first, it will be checked while disabling
1302	 * the metric set on gen8+.
1303	 */
1304	mutex_lock(&dev_priv->drm.struct_mutex);
1305	dev_priv->perf.oa.exclusive_stream = NULL;
1306	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
1307	mutex_unlock(&dev_priv->drm.struct_mutex);
1308
1309	free_oa_buffer(dev_priv);
1310
1311	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
1312	intel_runtime_pm_put(dev_priv);
1313
1314	if (stream->ctx)
1315		oa_put_render_ctx_id(stream);
1316
1317	put_oa_config(dev_priv, stream->oa_config);
1318
1319	if (dev_priv->perf.oa.spurious_report_rs.missed) {
1320		DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
1321			 dev_priv->perf.oa.spurious_report_rs.missed);
1322	}
1323}
1324
1325static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
1326{
1327	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
1328	unsigned long flags;
1329
1330	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1331
1332	/* Pre-DevBDW: OABUFFER must be set with counters off,
1333	 * before OASTATUS1, but after OASTATUS2
1334	 */
1335	I915_WRITE(GEN7_OASTATUS2, gtt_offset | OA_MEM_SELECT_GGTT); /* head */
1336	dev_priv->perf.oa.oa_buffer.head = gtt_offset;
1337
1338	I915_WRITE(GEN7_OABUFFER, gtt_offset);
1339
1340	I915_WRITE(GEN7_OASTATUS1, gtt_offset | OABUFFER_SIZE_16M); /* tail */
1341
1342	/* Mark that we need updated tail pointers to read from... */
1343	dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
1344	dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
1345
1346	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1347
1348	/* On Haswell we have to track which OASTATUS1 flags we've
1349	 * already seen since they can't be cleared while periodic
1350	 * sampling is enabled.
1351	 */
1352	dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
1353
1354	/* NB: although the OA buffer will initially be allocated
1355	 * zeroed via shmfs (and so this memset is redundant when
1356	 * first allocating), we may re-init the OA buffer, either
1357	 * when re-enabling a stream or in error/reset paths.
1358	 *
1359	 * The reason we clear the buffer for each re-init is for the
1360	 * sanity check in gen7_append_oa_reports() that looks at the
1361	 * report-id field to make sure it's non-zero which relies on
1362	 * the assumption that new reports are being written to zeroed
1363	 * memory...
1364	 */
1365	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
1366
1367	/* Maybe make ->pollin per-stream state if we support multiple
1368	 * concurrent streams in the future.
1369	 */
1370	dev_priv->perf.oa.pollin = false;
1371}
1372
1373static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
1374{
1375	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
1376	unsigned long flags;
1377
1378	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1379
1380	I915_WRITE(GEN8_OASTATUS, 0);
1381	I915_WRITE(GEN8_OAHEADPTR, gtt_offset);
1382	dev_priv->perf.oa.oa_buffer.head = gtt_offset;
1383
1384	I915_WRITE(GEN8_OABUFFER_UDW, 0);
1385
1386	/*
1387	 * PRM says:
1388	 *
1389	 *  "This MMIO must be set before the OATAILPTR
1390	 *  register and after the OAHEADPTR register. This is
1391	 *  to enable proper functionality of the overflow
1392	 *  bit."
1393	 */
1394	I915_WRITE(GEN8_OABUFFER, gtt_offset |
1395		   OABUFFER_SIZE_16M | OA_MEM_SELECT_GGTT);
1396	I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
1397
1398	/* Mark that we need updated tail pointers to read from... */
1399	dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
1400	dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
1401
1402	/*
1403	 * Reset state used to recognise context switches, affecting which
1404	 * reports we will forward to userspace while filtering for a single
1405	 * context.
1406	 */
1407	dev_priv->perf.oa.oa_buffer.last_ctx_id = INVALID_CTX_ID;
1408
1409	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1410
1411	/*
1412	 * NB: although the OA buffer will initially be allocated
1413	 * zeroed via shmfs (and so this memset is redundant when
1414	 * first allocating), we may re-init the OA buffer, either
1415	 * when re-enabling a stream or in error/reset paths.
1416	 *
1417	 * The reason we clear the buffer for each re-init is for the
1418	 * sanity check in gen8_append_oa_reports() that looks at the
1419	 * reason field to make sure it's non-zero which relies on
1420	 * the assumption that new reports are being written to zeroed
1421	 * memory...
1422	 */
1423	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
1424
1425	/*
1426	 * Maybe make ->pollin per-stream state if we support multiple
1427	 * concurrent streams in the future.
1428	 */
1429	dev_priv->perf.oa.pollin = false;
1430}
1431
1432static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
1433{
1434	struct drm_i915_gem_object *bo;
1435	struct i915_vma *vma;
1436	int ret;
1437
1438	if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
1439		return -ENODEV;
1440
1441	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
1442	if (ret)
1443		return ret;
1444
1445	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
1446	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
1447
1448	bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
1449	if (IS_ERR(bo)) {
1450		DRM_ERROR("Failed to allocate OA buffer\n");
1451		ret = PTR_ERR(bo);
1452		goto unlock;
1453	}
1454
1455	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
1456	if (ret)
1457		goto err_unref;
1458
1459	/* PreHSW required 512K alignment, HSW requires 16M */
1460	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
1461	if (IS_ERR(vma)) {
1462		ret = PTR_ERR(vma);
1463		goto err_unref;
1464	}
1465	dev_priv->perf.oa.oa_buffer.vma = vma;
1466
1467	dev_priv->perf.oa.oa_buffer.vaddr =
1468		i915_gem_object_pin_map(bo, I915_MAP_WB);
1469	if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
1470		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
1471		goto err_unpin;
1472	}
1473
1474	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
1475
1476	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
1477			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
1478			 dev_priv->perf.oa.oa_buffer.vaddr);
1479
1480	goto unlock;
1481
1482err_unpin:
1483	__i915_vma_unpin(vma);
1484
1485err_unref:
1486	i915_gem_object_put(bo);
1487
1488	dev_priv->perf.oa.oa_buffer.vaddr = NULL;
1489	dev_priv->perf.oa.oa_buffer.vma = NULL;
1490
1491unlock:
1492	mutex_unlock(&dev_priv->drm.struct_mutex);
1493	return ret;
1494}
1495
1496static void config_oa_regs(struct drm_i915_private *dev_priv,
1497			   const struct i915_oa_reg *regs,
1498			   u32 n_regs)
1499{
1500	u32 i;
1501
1502	for (i = 0; i < n_regs; i++) {
1503		const struct i915_oa_reg *reg = regs + i;
1504
1505		I915_WRITE(reg->addr, reg->value);
1506	}
1507}
1508
1509static int hsw_enable_metric_set(struct drm_i915_private *dev_priv,
1510				 const struct i915_oa_config *oa_config)
1511{
1512	/* PRM:
1513	 *
1514	 * OA unit is using “crclk” for its functionality. When trunk
1515	 * level clock gating takes place, OA clock would be gated,
1516	 * unable to count the events from non-render clock domain.
1517	 * Render clock gating must be disabled when OA is enabled to
1518	 * count the events from non-render domain. Unit level clock
1519	 * gating for RCS should also be disabled.
1520	 */
1521	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
1522				    ~GEN7_DOP_CLOCK_GATE_ENABLE));
1523	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
1524				  GEN6_CSUNIT_CLOCK_GATE_DISABLE));
1525
1526	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
1527
1528	/* It apparently takes a fairly long time for a new MUX
1529	 * configuration to be be applied after these register writes.
1530	 * This delay duration was derived empirically based on the
1531	 * render_basic config but hopefully it covers the maximum
1532	 * configuration latency.
1533	 *
1534	 * As a fallback, the checks in _append_oa_reports() to skip
1535	 * invalid OA reports do also seem to work to discard reports
1536	 * generated before this config has completed - albeit not
1537	 * silently.
1538	 *
1539	 * Unfortunately this is essentially a magic number, since we
1540	 * don't currently know of a reliable mechanism for predicting
1541	 * how long the MUX config will take to apply and besides
1542	 * seeing invalid reports we don't know of a reliable way to
1543	 * explicitly check that the MUX config has landed.
1544	 *
1545	 * It's even possible we've miss characterized the underlying
1546	 * problem - it just seems like the simplest explanation why
1547	 * a delay at this location would mitigate any invalid reports.
1548	 */
1549	usleep_range(15000, 20000);
1550
1551	config_oa_regs(dev_priv, oa_config->b_counter_regs,
1552		       oa_config->b_counter_regs_len);
1553
1554	return 0;
1555}
1556
1557static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
1558{
1559	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
1560				  ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
1561	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
1562				    GEN7_DOP_CLOCK_GATE_ENABLE));
1563
1564	I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
1565				      ~GT_NOA_ENABLE));
1566}
1567
1568/*
1569 * NB: It must always remain pointer safe to run this even if the OA unit
1570 * has been disabled.
1571 *
1572 * It's fine to put out-of-date values into these per-context registers
1573 * in the case that the OA unit has been disabled.
1574 */
1575static void gen8_update_reg_state_unlocked(struct i915_gem_context *ctx,
1576					   u32 *reg_state,
1577					   const struct i915_oa_config *oa_config)
1578{
1579	struct drm_i915_private *dev_priv = ctx->i915;
1580	u32 ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_offset;
1581	u32 ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_offset;
1582	/* The MMIO offsets for Flex EU registers aren't contiguous */
1583	u32 flex_mmio[] = {
1584		i915_mmio_reg_offset(EU_PERF_CNTL0),
1585		i915_mmio_reg_offset(EU_PERF_CNTL1),
1586		i915_mmio_reg_offset(EU_PERF_CNTL2),
1587		i915_mmio_reg_offset(EU_PERF_CNTL3),
1588		i915_mmio_reg_offset(EU_PERF_CNTL4),
1589		i915_mmio_reg_offset(EU_PERF_CNTL5),
1590		i915_mmio_reg_offset(EU_PERF_CNTL6),
1591	};
1592	int i;
1593
1594	reg_state[ctx_oactxctrl] = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
1595	reg_state[ctx_oactxctrl+1] = (dev_priv->perf.oa.period_exponent <<
1596				      GEN8_OA_TIMER_PERIOD_SHIFT) |
1597				     (dev_priv->perf.oa.periodic ?
1598				      GEN8_OA_TIMER_ENABLE : 0) |
1599				     GEN8_OA_COUNTER_RESUME;
1600
1601	for (i = 0; i < ARRAY_SIZE(flex_mmio); i++) {
1602		u32 state_offset = ctx_flexeu0 + i * 2;
1603		u32 mmio = flex_mmio[i];
1604
1605		/*
1606		 * This arbitrary default will select the 'EU FPU0 Pipeline
1607		 * Active' event. In the future it's anticipated that there
1608		 * will be an explicit 'No Event' we can select, but not yet...
1609		 */
1610		u32 value = 0;
1611
1612		if (oa_config) {
1613			u32 j;
1614
1615			for (j = 0; j < oa_config->flex_regs_len; j++) {
1616				if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) {
1617					value = oa_config->flex_regs[j].value;
1618					break;
1619				}
1620			}
1621		}
1622
1623		reg_state[state_offset] = mmio;
1624		reg_state[state_offset+1] = value;
1625	}
1626}
1627
1628/*
1629 * Same as gen8_update_reg_state_unlocked only through the batchbuffer. This
1630 * is only used by the kernel context.
1631 */
1632static int gen8_emit_oa_config(struct i915_request *rq,
1633			       const struct i915_oa_config *oa_config)
1634{
1635	struct drm_i915_private *dev_priv = rq->i915;
1636	/* The MMIO offsets for Flex EU registers aren't contiguous */
1637	u32 flex_mmio[] = {
1638		i915_mmio_reg_offset(EU_PERF_CNTL0),
1639		i915_mmio_reg_offset(EU_PERF_CNTL1),
1640		i915_mmio_reg_offset(EU_PERF_CNTL2),
1641		i915_mmio_reg_offset(EU_PERF_CNTL3),
1642		i915_mmio_reg_offset(EU_PERF_CNTL4),
1643		i915_mmio_reg_offset(EU_PERF_CNTL5),
1644		i915_mmio_reg_offset(EU_PERF_CNTL6),
1645	};
1646	u32 *cs;
1647	int i;
1648
1649	cs = intel_ring_begin(rq, ARRAY_SIZE(flex_mmio) * 2 + 4);
1650	if (IS_ERR(cs))
1651		return PTR_ERR(cs);
1652
1653	*cs++ = MI_LOAD_REGISTER_IMM(ARRAY_SIZE(flex_mmio) + 1);
1654
1655	*cs++ = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
1656	*cs++ = (dev_priv->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
1657		(dev_priv->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
1658		GEN8_OA_COUNTER_RESUME;
1659
1660	for (i = 0; i < ARRAY_SIZE(flex_mmio); i++) {
1661		u32 mmio = flex_mmio[i];
1662
1663		/*
1664		 * This arbitrary default will select the 'EU FPU0 Pipeline
1665		 * Active' event. In the future it's anticipated that there
1666		 * will be an explicit 'No Event' we can select, but not
1667		 * yet...
1668		 */
1669		u32 value = 0;
1670
1671		if (oa_config) {
1672			u32 j;
1673
1674			for (j = 0; j < oa_config->flex_regs_len; j++) {
1675				if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) {
1676					value = oa_config->flex_regs[j].value;
1677					break;
1678				}
1679			}
1680		}
1681
1682		*cs++ = mmio;
1683		*cs++ = value;
1684	}
1685
1686	*cs++ = MI_NOOP;
1687	intel_ring_advance(rq, cs);
1688
1689	return 0;
1690}
1691
1692static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
1693						 const struct i915_oa_config *oa_config)
1694{
1695	struct intel_engine_cs *engine = dev_priv->engine[RCS];
1696	struct i915_gem_timeline *timeline;
1697	struct i915_request *rq;
1698	int ret;
1699
1700	lockdep_assert_held(&dev_priv->drm.struct_mutex);
1701
1702	i915_retire_requests(dev_priv);
1703
1704	rq = i915_request_alloc(engine, dev_priv->kernel_context);
1705	if (IS_ERR(rq))
1706		return PTR_ERR(rq);
1707
1708	ret = gen8_emit_oa_config(rq, oa_config);
1709	if (ret) {
1710		i915_request_add(rq);
1711		return ret;
1712	}
1713
1714	/* Queue this switch after all other activity */
1715	list_for_each_entry(timeline, &dev_priv->gt.timelines, link) {
1716		struct i915_request *prev;
1717		struct intel_timeline *tl;
1718
1719		tl = &timeline->engine[engine->id];
1720		prev = i915_gem_active_raw(&tl->last_request,
1721					   &dev_priv->drm.struct_mutex);
1722		if (prev)
1723			i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1724							 &prev->submit,
1725							 GFP_KERNEL);
1726	}
1727
1728	i915_request_add(rq);
1729
1730	return 0;
1731}
1732
1733/*
1734 * Manages updating the per-context aspects of the OA stream
1735 * configuration across all contexts.
1736 *
1737 * The awkward consideration here is that OACTXCONTROL controls the
1738 * exponent for periodic sampling which is primarily used for system
1739 * wide profiling where we'd like a consistent sampling period even in
1740 * the face of context switches.
1741 *
1742 * Our approach of updating the register state context (as opposed to
1743 * say using a workaround batch buffer) ensures that the hardware
1744 * won't automatically reload an out-of-date timer exponent even
1745 * transiently before a WA BB could be parsed.
1746 *
1747 * This function needs to:
1748 * - Ensure the currently running context's per-context OA state is
1749 *   updated
1750 * - Ensure that all existing contexts will have the correct per-context
1751 *   OA state if they are scheduled for use.
1752 * - Ensure any new contexts will be initialized with the correct
1753 *   per-context OA state.
1754 *
1755 * Note: it's only the RCS/Render context that has any OA state.
1756 */
1757static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
1758				       const struct i915_oa_config *oa_config)
1759{
1760	struct i915_gem_context *ctx;
1761	int ret;
1762	unsigned int wait_flags = I915_WAIT_LOCKED;
1763
1764	lockdep_assert_held(&dev_priv->drm.struct_mutex);
1765
1766	/* Switch away from any user context. */
1767	ret = gen8_switch_to_updated_kernel_context(dev_priv, oa_config);
1768	if (ret)
1769		goto out;
1770
1771	/*
1772	 * The OA register config is setup through the context image. This image
1773	 * might be written to by the GPU on context switch (in particular on
1774	 * lite-restore). This means we can't safely update a context's image,
1775	 * if this context is scheduled/submitted to run on the GPU.
1776	 *
1777	 * We could emit the OA register config through the batch buffer but
1778	 * this might leave small interval of time where the OA unit is
1779	 * configured at an invalid sampling period.
1780	 *
1781	 * So far the best way to work around this issue seems to be draining
1782	 * the GPU from any submitted work.
1783	 */
1784	ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
1785	if (ret)
1786		goto out;
1787
1788	/* Update all contexts now that we've stalled the submission. */
1789	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
1790		struct intel_context *ce = &ctx->engine[RCS];
1791		u32 *regs;
1792
1793		/* OA settings will be set upon first use */
1794		if (!ce->state)
1795			continue;
1796
1797		regs = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
1798		if (IS_ERR(regs)) {
1799			ret = PTR_ERR(regs);
1800			goto out;
1801		}
1802
1803		ce->state->obj->mm.dirty = true;
1804		regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
1805
1806		gen8_update_reg_state_unlocked(ctx, regs, oa_config);
1807
1808		i915_gem_object_unpin_map(ce->state->obj);
1809	}
1810
1811 out:
1812	return ret;
1813}
1814
1815static int gen8_enable_metric_set(struct drm_i915_private *dev_priv,
1816				  const struct i915_oa_config *oa_config)
1817{
1818	int ret;
1819
1820	/*
1821	 * We disable slice/unslice clock ratio change reports on SKL since
1822	 * they are too noisy. The HW generates a lot of redundant reports
1823	 * where the ratio hasn't really changed causing a lot of redundant
1824	 * work to processes and increasing the chances we'll hit buffer
1825	 * overruns.
1826	 *
1827	 * Although we don't currently use the 'disable overrun' OABUFFER
1828	 * feature it's worth noting that clock ratio reports have to be
1829	 * disabled before considering to use that feature since the HW doesn't
1830	 * correctly block these reports.
1831	 *
1832	 * Currently none of the high-level metrics we have depend on knowing
1833	 * this ratio to normalize.
1834	 *
1835	 * Note: This register is not power context saved and restored, but
1836	 * that's OK considering that we disable RC6 while the OA unit is
1837	 * enabled.
1838	 *
1839	 * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
1840	 * be read back from automatically triggered reports, as part of the
1841	 * RPT_ID field.
1842	 */
1843	if (IS_GEN9(dev_priv) || IS_GEN10(dev_priv)) {
1844		I915_WRITE(GEN8_OA_DEBUG,
1845			   _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
1846					      GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
1847	}
1848
1849	/*
1850	 * Update all contexts prior writing the mux configurations as we need
1851	 * to make sure all slices/subslices are ON before writing to NOA
1852	 * registers.
1853	 */
1854	ret = gen8_configure_all_contexts(dev_priv, oa_config);
1855	if (ret)
1856		return ret;
1857
1858	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
1859
1860	config_oa_regs(dev_priv, oa_config->b_counter_regs,
1861		       oa_config->b_counter_regs_len);
1862
1863	return 0;
1864}
1865
1866static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
1867{
1868	/* Reset all contexts' slices/subslices configurations. */
1869	gen8_configure_all_contexts(dev_priv, NULL);
1870
1871	I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
1872				      ~GT_NOA_ENABLE));
1873
1874}
1875
1876static void gen10_disable_metric_set(struct drm_i915_private *dev_priv)
1877{
1878	/* Reset all contexts' slices/subslices configurations. */
1879	gen8_configure_all_contexts(dev_priv, NULL);
1880
1881	/* Make sure we disable noa to save power. */
1882	I915_WRITE(RPM_CONFIG1,
1883		   I915_READ(RPM_CONFIG1) & ~GEN10_GT_NOA_ENABLE);
1884}
1885
1886static void gen7_oa_enable(struct drm_i915_private *dev_priv)
1887{
1888	/*
1889	 * Reset buf pointers so we don't forward reports from before now.
1890	 *
1891	 * Think carefully if considering trying to avoid this, since it
1892	 * also ensures status flags and the buffer itself are cleared
1893	 * in error paths, and we have checks for invalid reports based
1894	 * on the assumption that certain fields are written to zeroed
1895	 * memory which this helps maintains.
1896	 */
1897	gen7_init_oa_buffer(dev_priv);
1898
1899	if (dev_priv->perf.oa.exclusive_stream->enabled) {
1900		struct i915_gem_context *ctx =
1901			dev_priv->perf.oa.exclusive_stream->ctx;
1902		u32 ctx_id = dev_priv->perf.oa.specific_ctx_id;
1903
1904		bool periodic = dev_priv->perf.oa.periodic;
1905		u32 period_exponent = dev_priv->perf.oa.period_exponent;
1906		u32 report_format = dev_priv->perf.oa.oa_buffer.format;
1907
1908		I915_WRITE(GEN7_OACONTROL,
1909			   (ctx_id & GEN7_OACONTROL_CTX_MASK) |
1910			   (period_exponent <<
1911			    GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
1912			   (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
1913			   (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
1914			   (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
1915			   GEN7_OACONTROL_ENABLE);
1916	} else
1917		I915_WRITE(GEN7_OACONTROL, 0);
1918}
1919
1920static void gen8_oa_enable(struct drm_i915_private *dev_priv)
1921{
1922	u32 report_format = dev_priv->perf.oa.oa_buffer.format;
1923
1924	/*
1925	 * Reset buf pointers so we don't forward reports from before now.
1926	 *
1927	 * Think carefully if considering trying to avoid this, since it
1928	 * also ensures status flags and the buffer itself are cleared
1929	 * in error paths, and we have checks for invalid reports based
1930	 * on the assumption that certain fields are written to zeroed
1931	 * memory which this helps maintains.
1932	 */
1933	gen8_init_oa_buffer(dev_priv);
1934
1935	/*
1936	 * Note: we don't rely on the hardware to perform single context
1937	 * filtering and instead filter on the cpu based on the context-id
1938	 * field of reports
1939	 */
1940	I915_WRITE(GEN8_OACONTROL, (report_format <<
1941				    GEN8_OA_REPORT_FORMAT_SHIFT) |
1942				   GEN8_OA_COUNTER_ENABLE);
1943}
1944
1945/**
1946 * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
1947 * @stream: An i915 perf stream opened for OA metrics
1948 *
1949 * [Re]enables hardware periodic sampling according to the period configured
1950 * when opening the stream. This also starts a hrtimer that will periodically
1951 * check for data in the circular OA buffer for notifying userspace (e.g.
1952 * during a read() or poll()).
1953 */
1954static void i915_oa_stream_enable(struct i915_perf_stream *stream)
1955{
1956	struct drm_i915_private *dev_priv = stream->dev_priv;
1957
1958	dev_priv->perf.oa.ops.oa_enable(dev_priv);
1959
1960	if (dev_priv->perf.oa.periodic)
1961		hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
1962			      ns_to_ktime(POLL_PERIOD),
1963			      HRTIMER_MODE_REL_PINNED);
1964}
1965
/*
 * Clear GEN7_OACONTROL entirely, which includes the GEN7_OACONTROL_ENABLE
 * bit that gen7_oa_enable() sets.
 */
static void gen7_oa_disable(struct drm_i915_private *dev_priv)
{
	I915_WRITE(GEN7_OACONTROL, 0);
}
1970
/*
 * Clear GEN8_OACONTROL entirely, which includes the GEN8_OA_COUNTER_ENABLE
 * bit that gen8_oa_enable() sets.
 */
static void gen8_oa_disable(struct drm_i915_private *dev_priv)
{
	I915_WRITE(GEN8_OACONTROL, 0);
}
1975
1976/**
1977 * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
1978 * @stream: An i915 perf stream opened for OA metrics
1979 *
1980 * Stops the OA unit from periodically writing counter reports into the
1981 * circular OA buffer. This also stops the hrtimer that periodically checks for
1982 * data in the circular OA buffer, for notifying userspace.
1983 */
1984static void i915_oa_stream_disable(struct i915_perf_stream *stream)
1985{
1986	struct drm_i915_private *dev_priv = stream->dev_priv;
1987
1988	dev_priv->perf.oa.ops.oa_disable(dev_priv);
1989
1990	if (dev_priv->perf.oa.periodic)
1991		hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
1992}
1993
1994static const struct i915_perf_stream_ops i915_oa_stream_ops = {
1995	.destroy = i915_oa_stream_destroy,
1996	.enable = i915_oa_stream_enable,
1997	.disable = i915_oa_stream_disable,
1998	.wait_unlocked = i915_oa_wait_unlocked,
1999	.poll_wait = i915_oa_poll_wait,
2000	.read = i915_oa_read,
2001};
2002
/**
 * i915_oa_stream_init - validate combined props for OA stream and init
 * @stream: An i915 perf stream
 * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
 * @props: The property state that configures stream (individually validated)
 *
 * While read_properties_unlocked() validates properties in isolation it
 * doesn't ensure that the combination necessarily makes sense.
 *
 * At this point it has been determined that userspace wants a stream of
 * OA metrics, but still we need to further validate the combined
 * properties are OK.
 *
 * If the configuration makes sense then we can allocate memory for
 * a circular OA buffer and apply the requested metric set configuration.
 *
 * On success the stream is recorded as the device's exclusive OA stream,
 * @stream->ops points at the OA stream ops, and the runtime PM and
 * forcewake references taken here are still held.
 *
 * Returns: zero on success or a negative error code. The errors generated
 * directly by this function are:
 * -EINVAL if the sysfs metrics directory is missing, OA report sampling was
 * not requested, no OA format was given or the format has a zero size;
 * -ENODEV if the platform has no init_oa_buffer hook;
 * -EBUSY if another stream already owns the OA unit.
 * Errors from the helpers called here are passed through unchanged.
 */
static int i915_oa_stream_init(struct i915_perf_stream *stream,
			       struct drm_i915_perf_open_param *param,
			       struct perf_open_properties *props)
{
	struct drm_i915_private *dev_priv = stream->dev_priv;
	int format_size;
	int ret;

	/* If the sysfs metrics/ directory wasn't registered for some
	 * reason then don't let userspace try their luck with config
	 * IDs
	 */
	if (!dev_priv->perf.metrics_kobj) {
		DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
		return -EINVAL;
	}

	if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
		DRM_DEBUG("Only OA report sampling supported\n");
		return -EINVAL;
	}

	/* A missing init_oa_buffer hook is used as "no OA support here" */
	if (!dev_priv->perf.oa.ops.init_oa_buffer) {
		DRM_DEBUG("OA unit not supported\n");
		return -ENODEV;
	}

	/* To avoid the complexity of having to accurately filter
	 * counter reports and marshal to the appropriate client
	 * we currently only allow exclusive access
	 */
	if (dev_priv->perf.oa.exclusive_stream) {
		DRM_DEBUG("OA unit already in use\n");
		return -EBUSY;
	}

	if (!props->oa_format) {
		DRM_DEBUG("OA report format not specified\n");
		return -EINVAL;
	}

	/* We set up some ratelimit state to potentially throttle any _NOTES
	 * about spurious, invalid OA reports which we don't forward to
	 * userspace.
	 *
	 * The initialization is associated with opening the stream (not driver
	 * init) considering we print a _NOTE about any throttling when closing
	 * the stream instead of waiting until driver _fini which no one would
	 * ever see.
	 *
	 * Using the same limiting factors as printk_ratelimit()
	 */
	ratelimit_state_init(&dev_priv->perf.oa.spurious_report_rs,
			     5 * HZ, 10);
	/* Since we use a DRM_NOTE for spurious reports it would be
	 * inconsistent to let __ratelimit() automatically print a warning for
	 * throttling.
	 */
	ratelimit_set_flags(&dev_priv->perf.oa.spurious_report_rs,
			    RATELIMIT_MSG_ON_RELEASE);

	/* Each sample is a record header followed by one OA report */
	stream->sample_size = sizeof(struct drm_i915_perf_record_header);

	format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;

	stream->sample_flags |= SAMPLE_OA_REPORT;
	stream->sample_size += format_size;

	dev_priv->perf.oa.oa_buffer.format_size = format_size;
	if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
		return -EINVAL;

	dev_priv->perf.oa.oa_buffer.format =
		dev_priv->perf.oa.oa_formats[props->oa_format].format;

	/* The stored exponent is only refreshed for periodic streams */
	dev_priv->perf.oa.periodic = props->oa_periodic;
	if (dev_priv->perf.oa.periodic)
		dev_priv->perf.oa.period_exponent = props->oa_period_exponent;

	/* Nothing has been acquired yet, so a plain return is enough here */
	if (stream->ctx) {
		ret = oa_get_render_ctx_id(stream);
		if (ret)
			return ret;
	}

	ret = get_oa_config(dev_priv, props->metrics_set, &stream->oa_config);
	if (ret)
		goto err_config;

	/* PRM - observability performance counters:
	 *
	 *   OACONTROL, performance counter enable, note:
	 *
	 *   "When this bit is set, in order to have coherent counts,
	 *   RC6 power state and trunk clock gating must be disabled.
	 *   This can be achieved by programming MMIO registers as
	 *   0xA094=0 and 0xA090[31]=1"
	 *
	 *   In our case we are expecting that taking pm + FORCEWAKE
	 *   references will effectively disable RC6.
	 */
	intel_runtime_pm_get(dev_priv);
	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

	ret = alloc_oa_buffer(dev_priv);
	if (ret)
		goto err_oa_buf_alloc;

	/* The metric set is applied, and the stream published, under struct_mutex */
	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
	if (ret)
		goto err_lock;

	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
						      stream->oa_config);
	if (ret)
		goto err_enable;

	stream->ops = &i915_oa_stream_ops;

	dev_priv->perf.oa.exclusive_stream = stream;

	mutex_unlock(&dev_priv->drm.struct_mutex);

	return 0;

	/*
	 * Error unwinding: each label releases what was acquired before the
	 * failing step and then falls through to the labels below it.
	 */
err_enable:
	/* struct_mutex is still held here; undo any partial configuration */
	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
	mutex_unlock(&dev_priv->drm.struct_mutex);

err_lock:
	free_oa_buffer(dev_priv);

err_oa_buf_alloc:
	put_oa_config(dev_priv, stream->oa_config);

	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
	intel_runtime_pm_put(dev_priv);

err_config:
	if (stream->ctx)
		oa_put_render_ctx_id(stream);

	return ret;
}
2165
2166void i915_oa_init_reg_state(struct intel_engine_cs *engine,
2167			    struct i915_gem_context *ctx,
2168			    u32 *reg_state)
2169{
2170	struct i915_perf_stream *stream;
2171
2172	if (engine->id != RCS)
2173		return;
2174
2175	stream = engine->i915->perf.oa.exclusive_stream;
2176	if (stream)
2177		gen8_update_reg_state_unlocked(ctx, reg_state, stream->oa_config);
2178}
2179
2180/**
2181 * i915_perf_read_locked - &i915_perf_stream_ops->read with error normalisation
2182 * @stream: An i915 perf stream
2183 * @file: An i915 perf stream file
2184 * @buf: destination buffer given by userspace
2185 * @count: the number of bytes userspace wants to read
2186 * @ppos: (inout) file seek position (unused)
2187 *
2188 * Besides wrapping &i915_perf_stream_ops->read this provides a common place to
2189 * ensure that if we've successfully copied any data then reporting that takes
2190 * precedence over any internal error status, so the data isn't lost.
2191 *
2192 * For example ret will be -ENOSPC whenever there is more buffered data than
2193 * can be copied to userspace, but that's only interesting if we weren't able
2194 * to copy some data because it implies the userspace buffer is too small to
2195 * receive a single record (and we never split records).
2196 *
2197 * Another case with ret == -EFAULT is more of a grey area since it would seem
2198 * like bad form for userspace to ask us to overrun its buffer, but the user
2199 * knows best:
2200 *
2201 *   http://yarchive.net/comp/linux/partial_reads_writes.html
2202 *
2203 * Returns: The number of bytes copied or a negative error code on failure.
2204 */
2205static ssize_t i915_perf_read_locked(struct i915_perf_stream *stream,
2206				     struct file *file,
2207				     char __user *buf,
2208				     size_t count,
2209				     loff_t *ppos)
2210{
2211	/* Note we keep the offset (aka bytes read) separate from any
2212	 * error status so that the final check for whether we return
2213	 * the bytes read with a higher precedence than any error (see
2214	 * comment below) doesn't need to be handled/duplicated in
2215	 * stream->ops->read() implementations.
2216	 */
2217	size_t offset = 0;
2218	int ret = stream->ops->read(stream, buf, count, &offset);
2219
2220	return offset ?: (ret ?: -EAGAIN);
2221}
2222
2223/**
2224 * i915_perf_read - handles read() FOP for i915 perf stream FDs
2225 * @file: An i915 perf stream file
2226 * @buf: destination buffer given by userspace
2227 * @count: the number of bytes userspace wants to read
2228 * @ppos: (inout) file seek position (unused)
2229 *
2230 * The entry point for handling a read() on a stream file descriptor from
2231 * userspace. Most of the work is left to the i915_perf_read_locked() and
2232 * &i915_perf_stream_ops->read but to save having stream implementations (of
2233 * which we might have multiple later) we handle blocking read here.
2234 *
2235 * We can also consistently treat trying to read from a disabled stream
2236 * as an IO error so implementations can assume the stream is enabled
2237 * while reading.
2238 *
2239 * Returns: The number of bytes copied or a negative error code on failure.
2240 */
2241static ssize_t i915_perf_read(struct file *file,
2242			      char __user *buf,
2243			      size_t count,
2244			      loff_t *ppos)
2245{
2246	struct i915_perf_stream *stream = file->private_data;
2247	struct drm_i915_private *dev_priv = stream->dev_priv;
2248	ssize_t ret;
2249
2250	/* To ensure it's handled consistently we simply treat all reads of a
2251	 * disabled stream as an error. In particular it might otherwise lead
2252	 * to a deadlock for blocking file descriptors...
2253	 */
2254	if (!stream->enabled)
2255		return -EIO;
2256
2257	if (!(file->f_flags & O_NONBLOCK)) {
2258		/* There's the small chance of false positives from
2259		 * stream->ops->wait_unlocked.
2260		 *
2261		 * E.g. with single context filtering since we only wait until
2262		 * oabuffer has >= 1 report we don't immediately know whether
2263		 * any reports really belong to the current context
2264		 */
2265		do {
2266			ret = stream->ops->wait_unlocked(stream);
2267			if (ret)
2268				return ret;
2269
2270			mutex_lock(&dev_priv->perf.lock);
2271			ret = i915_perf_read_locked(stream, file,
2272						    buf, count, ppos);
2273			mutex_unlock(&dev_priv->perf.lock);
2274		} while (ret == -EAGAIN);
2275	} else {
2276		mutex_lock(&dev_priv->perf.lock);
2277		ret = i915_perf_read_locked(stream, file, buf, count, ppos);
2278		mutex_unlock(&dev_priv->perf.lock);
2279	}
2280
2281	/* We allow the poll checking to sometimes report false positive EPOLLIN
2282	 * events where we might actually report EAGAIN on read() if there's
2283	 * not really any data available. In this situation though we don't
2284	 * want to enter a busy loop between poll() reporting a EPOLLIN event
2285	 * and read() returning -EAGAIN. Clearing the oa.pollin state here
2286	 * effectively ensures we back off until the next hrtimer callback
2287	 * before reporting another EPOLLIN event.
2288	 */
2289	if (ret >= 0 || ret == -EAGAIN) {
2290		/* Maybe make ->pollin per-stream state if we support multiple
2291		 * concurrent streams in the future.
2292		 */
2293		dev_priv->perf.oa.pollin = false;
2294	}
2295
2296	return ret;
2297}
2298
2299static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
2300{
2301	struct drm_i915_private *dev_priv =
2302		container_of(hrtimer, typeof(*dev_priv),
2303			     perf.oa.poll_check_timer);
2304
2305	if (oa_buffer_check_unlocked(dev_priv)) {
2306		dev_priv->perf.oa.pollin = true;
2307		wake_up(&dev_priv->perf.oa.poll_wq);
2308	}
2309
2310	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
2311
2312	return HRTIMER_RESTART;
2313}
2314
2315/**
2316 * i915_perf_poll_locked - poll_wait() with a suitable wait queue for stream
2317 * @dev_priv: i915 device instance
2318 * @stream: An i915 perf stream
2319 * @file: An i915 perf stream file
2320 * @wait: poll() state table
2321 *
2322 * For handling userspace polling on an i915 perf stream, this calls through to
2323 * &i915_perf_stream_ops->poll_wait to call poll_wait() with a wait queue that
2324 * will be woken for new stream data.
2325 *
2326 * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
2327 * with any non-file-operation driver hooks.
2328 *
2329 * Returns: any poll events that are ready without sleeping
2330 */
2331static __poll_t i915_perf_poll_locked(struct drm_i915_private *dev_priv,
2332					  struct i915_perf_stream *stream,
2333					  struct file *file,
2334					  poll_table *wait)
2335{
2336	__poll_t events = 0;
2337
2338	stream->ops->poll_wait(stream, file, wait);
2339
2340	/* Note: we don't explicitly check whether there's something to read
2341	 * here since this path may be very hot depending on what else
2342	 * userspace is polling, or on the timeout in use. We rely solely on
2343	 * the hrtimer/oa_poll_check_timer_cb to notify us when there are
2344	 * samples to read.
2345	 */
2346	if (dev_priv->perf.oa.pollin)
2347		events |= EPOLLIN;
2348
2349	return events;
2350}
2351
2352/**
2353 * i915_perf_poll - call poll_wait() with a suitable wait queue for stream
2354 * @file: An i915 perf stream file
2355 * @wait: poll() state table
2356 *
2357 * For handling userspace polling on an i915 perf stream, this ensures
2358 * poll_wait() gets called with a wait queue that will be woken for new stream
2359 * data.
2360 *
2361 * Note: Implementation deferred to i915_perf_poll_locked()
2362 *
2363 * Returns: any poll events that are ready without sleeping
2364 */
2365static __poll_t i915_perf_poll(struct file *file, poll_table *wait)
2366{
2367	struct i915_perf_stream *stream = file->private_data;
2368	struct drm_i915_private *dev_priv = stream->dev_priv;
2369	__poll_t ret;
2370
2371	mutex_lock(&dev_priv->perf.lock);
2372	ret = i915_perf_poll_locked(dev_priv, stream, file, wait);
2373	mutex_unlock(&dev_priv->perf.lock);
2374
2375	return ret;
2376}
2377
2378/**
2379 * i915_perf_enable_locked - handle `I915_PERF_IOCTL_ENABLE` ioctl
2380 * @stream: A disabled i915 perf stream
2381 *
2382 * [Re]enables the associated capture of data for this stream.
2383 *
2384 * If a stream was previously enabled then there's currently no intention
2385 * to provide userspace any guarantee about the preservation of previously
2386 * buffered data.
2387 */
2388static void i915_perf_enable_locked(struct i915_perf_stream *stream)
2389{
2390	if (stream->enabled)
2391		return;
2392
2393	/* Allow stream->ops->enable() to refer to this */
2394	stream->enabled = true;
2395
2396	if (stream->ops->enable)
2397		stream->ops->enable(stream);
2398}
2399
2400/**
2401 * i915_perf_disable_locked - handle `I915_PERF_IOCTL_DISABLE` ioctl
2402 * @stream: An enabled i915 perf stream
2403 *
2404 * Disables the associated capture of data for this stream.
2405 *
2406 * The intention is that disabling an re-enabling a stream will ideally be
2407 * cheaper than destroying and re-opening a stream with the same configuration,
2408 * though there are no formal guarantees about what state or buffered data
2409 * must be retained between disabling and re-enabling a stream.
2410 *
2411 * Note: while a stream is disabled it's considered an error for userspace
2412 * to attempt to read from the stream (-EIO).
2413 */
2414static void i915_perf_disable_locked(struct i915_perf_stream *stream)
2415{
2416	if (!stream->enabled)
2417		return;
2418
2419	/* Allow stream->ops->disable() to refer to this */
2420	stream->enabled = false;
2421
2422	if (stream->ops->disable)
2423		stream->ops->disable(stream);
2424}
2425
2426/**
2427 * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
2428 * @stream: An i915 perf stream
2429 * @cmd: the ioctl request
2430 * @arg: the ioctl data
2431 *
2432 * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
2433 * with any non-file-operation driver hooks.
2434 *
2435 * Returns: zero on success or a negative error code. Returns -EINVAL for
2436 * an unknown ioctl request.
2437 */
2438static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
2439				   unsigned int cmd,
2440				   unsigned long arg)
2441{
2442	switch (cmd) {
2443	case I915_PERF_IOCTL_ENABLE:
2444		i915_perf_enable_locked(stream);
2445		return 0;
2446	case I915_PERF_IOCTL_DISABLE:
2447		i915_perf_disable_locked(stream);
2448		return 0;
2449	}
2450
2451	return -EINVAL;
2452}
2453
2454/**
2455 * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
2456 * @file: An i915 perf stream file
2457 * @cmd: the ioctl request
2458 * @arg: the ioctl data
2459 *
2460 * Implementation deferred to i915_perf_ioctl_locked().
2461 *
2462 * Returns: zero on success or a negative error code. Returns -EINVAL for
2463 * an unknown ioctl request.
2464 */
2465static long i915_perf_ioctl(struct file *file,
2466			    unsigned int cmd,
2467			    unsigned long arg)
2468{
2469	struct i915_perf_stream *stream = file->private_data;
2470	struct drm_i915_private *dev_priv = stream->dev_priv;
2471	long ret;
2472
2473	mutex_lock(&dev_priv->perf.lock);
2474	ret = i915_perf_ioctl_locked(stream, cmd, arg);
2475	mutex_unlock(&dev_priv->perf.lock);
2476
2477	return ret;
2478}
2479
2480/**
2481 * i915_perf_destroy_locked - destroy an i915 perf stream
2482 * @stream: An i915 perf stream
2483 *
2484 * Frees all resources associated with the given i915 perf @stream, disabling
2485 * any associated data capture in the process.
2486 *
2487 * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
2488 * with any non-file-operation driver hooks.
2489 */
2490static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
2491{
2492	if (stream->enabled)
2493		i915_perf_disable_locked(stream);
2494
2495	if (stream->ops->destroy)
2496		stream->ops->destroy(stream);
2497
2498	list_del(&stream->link);
2499
2500	if (stream->ctx)
2501		i915_gem_context_put(stream->ctx);
2502
2503	kfree(stream);
2504}
2505
2506/**
2507 * i915_perf_release - handles userspace close() of a stream file
2508 * @inode: anonymous inode associated with file
2509 * @file: An i915 perf stream file
2510 *
2511 * Cleans up any resources associated with an open i915 perf stream file.
2512 *
2513 * NB: close() can't really fail from the userspace point of view.
2514 *
2515 * Returns: zero on success or a negative error code.
2516 */
2517static int i915_perf_release(struct inode *inode, struct file *file)
2518{
2519	struct i915_perf_stream *stream = file->private_data;
2520	struct drm_i915_private *dev_priv = stream->dev_priv;
2521
2522	mutex_lock(&dev_priv->perf.lock);
2523	i915_perf_destroy_locked(stream);
2524	mutex_unlock(&dev_priv->perf.lock);
2525
2526	return 0;
2527}
2528
2529
2530static const struct file_operations fops = {
2531	.owner		= THIS_MODULE,
2532	.llseek		= no_llseek,
2533	.release	= i915_perf_release,
2534	.poll		= i915_perf_poll,
2535	.read		= i915_perf_read,
2536	.unlocked_ioctl	= i915_perf_ioctl,
2537	/* Our ioctl have no arguments, so it's safe to use the same function
2538	 * to handle 32bits compatibility.
2539	 */
2540	.compat_ioctl   = i915_perf_ioctl,
2541};
2542
2543
2544/**
2545 * i915_perf_open_ioctl_locked - DRM ioctl() for userspace to open a stream FD
2546 * @dev_priv: i915 device instance
 * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
2548 * @props: individually validated u64 property value pairs
2549 * @file: drm file
2550 *
 * See i915_perf_open_ioctl() for interface details.
2552 *
2553 * Implements further stream config validation and stream initialization on
2554 * behalf of i915_perf_open_ioctl() with the &drm_i915_private->perf.lock mutex
2555 * taken to serialize with any non-file-operation driver hooks.
2556 *
2557 * Note: at this point the @props have only been validated in isolation and
2558 * it's still necessary to validate that the combination of properties makes
2559 * sense.
2560 *
2561 * In the case where userspace is interested in OA unit metrics then further
2562 * config validation and stream initialization details will be handled by
2563 * i915_oa_stream_init(). The code here should only validate config state that
2564 * will be relevant to all stream types / backends.
2565 *
2566 * Returns: zero on success or a negative error code.
2567 */
2568static int
2569i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
2570			    struct drm_i915_perf_open_param *param,
2571			    struct perf_open_properties *props,
2572			    struct drm_file *file)
2573{
2574	struct i915_gem_context *specific_ctx = NULL;
2575	struct i915_perf_stream *stream = NULL;
2576	unsigned long f_flags = 0;
2577	bool privileged_op = true;
2578	int stream_fd;
2579	int ret;
2580
2581	if (props->single_context) {
2582		u32 ctx_handle = props->ctx_handle;
2583		struct drm_i915_file_private *file_priv = file->driver_priv;
2584
2585		specific_ctx = i915_gem_context_lookup(file_priv, ctx_handle);
2586		if (!specific_ctx) {
2587			DRM_DEBUG("Failed to look up context with ID %u for opening perf stream\n",
2588				  ctx_handle);
2589			ret = -ENOENT;
2590			goto err;
2591		}
2592	}
2593
2594	/*
2595	 * On Haswell the OA unit supports clock gating off for a specific
2596	 * context and in this mode there's no visibility of metrics for the
2597	 * rest of the system, which we consider acceptable for a
2598	 * non-privileged client.
2599	 *
2600	 * For Gen8+ the OA unit no longer supports clock gating off for a
2601	 * specific context and the kernel can't securely stop the counters
2602	 * from updating as system-wide / global values. Even though we can
2603	 * filter reports based on the included context ID we can't block
2604	 * clients from seeing the raw / global counter values via
2605	 * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
2606	 * enable the OA unit by default.
2607	 */
2608	if (IS_HASWELL(dev_priv) && specific_ctx)
2609		privileged_op = false;
2610
2611	/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
2612	 * we check a dev.i915.perf_stream_paranoid sysctl option
2613	 * to determine if it's ok to access system wide OA counters
2614	 * without CAP_SYS_ADMIN privileges.
2615	 */
2616	if (privileged_op &&
2617	    i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
2618		DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n");
2619		ret = -EACCES;
2620		goto err_ctx;
2621	}
2622
2623	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
2624	if (!stream) {
2625		ret = -ENOMEM;
2626		goto err_ctx;
2627	}
2628
2629	stream->dev_priv = dev_priv;
2630	stream->ctx = specific_ctx;
2631
2632	ret = i915_oa_stream_init(stream, param, props);
2633	if (ret)
2634		goto err_alloc;
2635
2636	/* we avoid simply assigning stream->sample_flags = props->sample_flags
2637	 * to have _stream_init check the combination of sample flags more
2638	 * thoroughly, but still this is the expected result at this point.
2639	 */
2640	if (WARN_ON(stream->sample_flags != props->sample_flags)) {
2641		ret = -ENODEV;
2642		goto err_flags;
2643	}
2644
2645	list_add(&stream->link, &dev_priv->perf.streams);
2646
2647	if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
2648		f_flags |= O_CLOEXEC;
2649	if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
2650		f_flags |= O_NONBLOCK;
2651
2652	stream_fd = anon_inode_getfd("[i915_perf]", &fops, stream, f_flags);
2653	if (stream_fd < 0) {
2654		ret = stream_fd;
2655		goto err_open;
2656	}
2657
2658	if (!(param->flags & I915_PERF_FLAG_DISABLED))
2659		i915_perf_enable_locked(stream);
2660
2661	return stream_fd;
2662
2663err_open:
2664	list_del(&stream->link);
2665err_flags:
2666	if (stream->ops->destroy)
2667		stream->ops->destroy(stream);
2668err_alloc:
2669	kfree(stream);
2670err_ctx:
2671	if (specific_ctx)
2672		i915_gem_context_put(specific_ctx);
2673err:
2674	return ret;
2675}
2676
2677static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
2678{
2679	return div64_u64(1000000000ULL * (2ULL << exponent),
2680			 1000ULL * INTEL_INFO(dev_priv)->cs_timestamp_frequency_khz);
2681}
2682
2683/**
2684 * read_properties_unlocked - validate + copy userspace stream open properties
2685 * @dev_priv: i915 device instance
2686 * @uprops: The array of u64 key value pairs given by userspace
2687 * @n_props: The number of key value pairs expected in @uprops
2688 * @props: The stream configuration built up while validating properties
2689 *
2690 * Note this function only validates properties in isolation it doesn't
2691 * validate that the combination of properties makes sense or that all
2692 * properties necessary for a particular kind of stream have been set.
2693 *
2694 * Note that there currently aren't any ordering requirements for properties so
2695 * we shouldn't validate or assume anything about ordering here. This doesn't
2696 * rule out defining new properties with ordering requirements in the future.
2697 */
2698static int read_properties_unlocked(struct drm_i915_private *dev_priv,
2699				    u64 __user *uprops,
2700				    u32 n_props,
2701				    struct perf_open_properties *props)
2702{
2703	u64 __user *uprop = uprops;
2704	u32 i;
2705
2706	memset(props, 0, sizeof(struct perf_open_properties));
2707
2708	if (!n_props) {
2709		DRM_DEBUG("No i915 perf properties given\n");
2710		return -EINVAL;
2711	}
2712
2713	/* Considering that ID = 0 is reserved and assuming that we don't
2714	 * (currently) expect any configurations to ever specify duplicate
2715	 * values for a particular property ID then the last _PROP_MAX value is
2716	 * one greater than the maximum number of properties we expect to get
2717	 * from userspace.
2718	 */
2719	if (n_props >= DRM_I915_PERF_PROP_MAX) {
2720		DRM_DEBUG("More i915 perf properties specified than exist\n");
2721		return -EINVAL;
2722	}
2723
2724	for (i = 0; i < n_props; i++) {
2725		u64 oa_period, oa_freq_hz;
2726		u64 id, value;
2727		int ret;
2728
2729		ret = get_user(id, uprop);
2730		if (ret)
2731			return ret;
2732
2733		ret = get_user(value, uprop + 1);
2734		if (ret)
2735			return ret;
2736
2737		if (id == 0 || id >= DRM_I915_PERF_PROP_MAX) {
2738			DRM_DEBUG("Unknown i915 perf property ID\n");
2739			return -EINVAL;
2740		}
2741
2742		switch ((enum drm_i915_perf_property_id)id) {
2743		case DRM_I915_PERF_PROP_CTX_HANDLE:
2744			props->single_context = 1;
2745			props->ctx_handle = value;
2746			break;
2747		case DRM_I915_PERF_PROP_SAMPLE_OA:
2748			props->sample_flags |= SAMPLE_OA_REPORT;
2749			break;
2750		case DRM_I915_PERF_PROP_OA_METRICS_SET:
2751			if (value == 0) {
2752				DRM_DEBUG("Unknown OA metric set ID\n");
2753				return -EINVAL;
2754			}
2755			props->metrics_set = value;
2756			break;
2757		case DRM_I915_PERF_PROP_OA_FORMAT:
2758			if (value == 0 || value >= I915_OA_FORMAT_MAX) {
2759				DRM_DEBUG("Out-of-range OA report format %llu\n",
2760					  value);
2761				return -EINVAL;
2762			}
2763			if (!dev_priv->perf.oa.oa_formats[value].size) {
2764				DRM_DEBUG("Unsupported OA report format %llu\n",
2765					  value);
2766				return -EINVAL;
2767			}
2768			props->oa_format = value;
2769			break;
2770		case DRM_I915_PERF_PROP_OA_EXPONENT:
2771			if (value > OA_EXPONENT_MAX) {
2772				DRM_DEBUG("OA timer exponent too high (> %u)\n",
2773					 OA_EXPONENT_MAX);
2774				return -EINVAL;
2775			}
2776
2777			/* Theoretically we can program the OA unit to sample
2778			 * e.g. every 160ns for HSW, 167ns for BDW/SKL or 104ns
2779			 * for BXT. We don't allow such high sampling
2780			 * frequencies by default unless root.
2781			 */
2782
2783			BUILD_BUG_ON(sizeof(oa_period) != 8);
2784			oa_period = oa_exponent_to_ns(dev_priv, value);
2785
2786			/* This check is primarily to ensure that oa_period <=
2787			 * UINT32_MAX (before passing to do_div which only
2788			 * accepts a u32 denominator), but we can also skip
2789			 * checking anything < 1Hz which implicitly can't be
2790			 * limited via an integer oa_max_sample_rate.
2791			 */
2792			if (oa_period <= NSEC_PER_SEC) {
2793				u64 tmp = NSEC_PER_SEC;
2794				do_div(tmp, oa_period);
2795				oa_freq_hz = tmp;
2796			} else
2797				oa_freq_hz = 0;
2798
2799			if (oa_freq_hz > i915_oa_max_sample_rate &&
2800			    !capable(CAP_SYS_ADMIN)) {
2801				DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n",
2802					  i915_oa_max_sample_rate);
2803				return -EACCES;
2804			}
2805
2806			props->oa_periodic = true;
2807			props->oa_period_exponent = value;
2808			break;
2809		case DRM_I915_PERF_PROP_MAX:
2810			MISSING_CASE(id);
2811			return -EINVAL;
2812		}
2813
2814		uprop += 2;
2815	}
2816
2817	return 0;
2818}
2819
2820/**
2821 * i915_perf_open_ioctl - DRM ioctl() for userspace to open a stream FD
2822 * @dev: drm device
2823 * @data: ioctl data copied from userspace (unvalidated)
2824 * @file: drm file
2825 *
2826 * Validates the stream open parameters given by userspace including flags
2827 * and an array of u64 key, value pair properties.
2828 *
2829 * Very little is assumed up front about the nature of the stream being
2830 * opened (for instance we don't assume it's for periodic OA unit metrics). An
2831 * i915-perf stream is expected to be a suitable interface for other forms of
2832 * buffered data written by the GPU besides periodic OA metrics.
2833 *
2834 * Note we copy the properties from userspace outside of the i915 perf
2835 * mutex to avoid an awkward lockdep with mmap_sem.
2836 *
2837 * Most of the implementation details are handled by
2838 * i915_perf_open_ioctl_locked() after taking the &drm_i915_private->perf.lock
2839 * mutex for serializing with any non-file-operation driver hooks.
2840 *
2841 * Return: A newly opened i915 Perf stream file descriptor or negative
2842 * error code on failure.
2843 */
2844int i915_perf_open_ioctl(struct drm_device *dev, void *data,
2845			 struct drm_file *file)
2846{
2847	struct drm_i915_private *dev_priv = dev->dev_private;
2848	struct drm_i915_perf_open_param *param = data;
2849	struct perf_open_properties props;
2850	u32 known_open_flags;
2851	int ret;
2852
2853	if (!dev_priv->perf.initialized) {
2854		DRM_DEBUG("i915 perf interface not available for this system\n");
2855		return -ENOTSUPP;
2856	}
2857
2858	known_open_flags = I915_PERF_FLAG_FD_CLOEXEC |
2859			   I915_PERF_FLAG_FD_NONBLOCK |
2860			   I915_PERF_FLAG_DISABLED;
2861	if (param->flags & ~known_open_flags) {
2862		DRM_DEBUG("Unknown drm_i915_perf_open_param flag\n");
2863		return -EINVAL;
2864	}
2865
2866	ret = read_properties_unlocked(dev_priv,
2867				       u64_to_user_ptr(param->properties_ptr),
2868				       param->num_properties,
2869				       &props);
2870	if (ret)
2871		return ret;
2872
2873	mutex_lock(&dev_priv->perf.lock);
2874	ret = i915_perf_open_ioctl_locked(dev_priv, param, &props, file);
2875	mutex_unlock(&dev_priv->perf.lock);
2876
2877	return ret;
2878}
2879
2880/**
2881 * i915_perf_register - exposes i915-perf to userspace
2882 * @dev_priv: i915 device instance
2883 *
2884 * In particular OA metric sets are advertised under a sysfs metrics/
2885 * directory allowing userspace to enumerate valid IDs that can be
2886 * used to open an i915-perf stream.
2887 */
2888void i915_perf_register(struct drm_i915_private *dev_priv)
2889{
2890	int ret;
2891
2892	if (!dev_priv->perf.initialized)
2893		return;
2894
2895	/* To be sure we're synchronized with an attempted
2896	 * i915_perf_open_ioctl(); considering that we register after
2897	 * being exposed to userspace.
2898	 */
2899	mutex_lock(&dev_priv->perf.lock);
2900
2901	dev_priv->perf.metrics_kobj =
2902		kobject_create_and_add("metrics",
2903				       &dev_priv->drm.primary->kdev->kobj);
2904	if (!dev_priv->perf.metrics_kobj)
2905		goto exit;
2906
2907	sysfs_attr_init(&dev_priv->perf.oa.test_config.sysfs_metric_id.attr);
2908
2909	if (IS_HASWELL(dev_priv)) {
2910		i915_perf_load_test_config_hsw(dev_priv);
2911	} else if (IS_BROADWELL(dev_priv)) {
2912		i915_perf_load_test_config_bdw(dev_priv);
2913	} else if (IS_CHERRYVIEW(dev_priv)) {
2914		i915_perf_load_test_config_chv(dev_priv);
2915	} else if (IS_SKYLAKE(dev_priv)) {
2916		if (IS_SKL_GT2(dev_priv))
2917			i915_perf_load_test_config_sklgt2(dev_priv);
2918		else if (IS_SKL_GT3(dev_priv))
2919			i915_perf_load_test_config_sklgt3(dev_priv);
2920		else if (IS_SKL_GT4(dev_priv))
2921			i915_perf_load_test_config_sklgt4(dev_priv);
2922	} else if (IS_BROXTON(dev_priv)) {
2923		i915_perf_load_test_config_bxt(dev_priv);
2924	} else if (IS_KABYLAKE(dev_priv)) {
2925		if (IS_KBL_GT2(dev_priv))
2926			i915_perf_load_test_config_kblgt2(dev_priv);
2927		else if (IS_KBL_GT3(dev_priv))
2928			i915_perf_load_test_config_kblgt3(dev_priv);
2929	} else if (IS_GEMINILAKE(dev_priv)) {
2930		i915_perf_load_test_config_glk(dev_priv);
2931	} else if (IS_COFFEELAKE(dev_priv)) {
2932		if (IS_CFL_GT2(dev_priv))
2933			i915_perf_load_test_config_cflgt2(dev_priv);
2934		if (IS_CFL_GT3(dev_priv))
2935			i915_perf_load_test_config_cflgt3(dev_priv);
2936	} else if (IS_CANNONLAKE(dev_priv)) {
2937		i915_perf_load_test_config_cnl(dev_priv);
2938	}
2939
2940	if (dev_priv->perf.oa.test_config.id == 0)
2941		goto sysfs_error;
2942
2943	ret = sysfs_create_group(dev_priv->perf.metrics_kobj,
2944				 &dev_priv->perf.oa.test_config.sysfs_metric);
2945	if (ret)
2946		goto sysfs_error;
2947
2948	atomic_set(&dev_priv->perf.oa.test_config.ref_count, 1);
2949
2950	goto exit;
2951
2952sysfs_error:
2953	kobject_put(dev_priv->perf.metrics_kobj);
2954	dev_priv->perf.metrics_kobj = NULL;
2955
2956exit:
2957	mutex_unlock(&dev_priv->perf.lock);
2958}
2959
2960/**
2961 * i915_perf_unregister - hide i915-perf from userspace
2962 * @dev_priv: i915 device instance
2963 *
2964 * i915-perf state cleanup is split up into an 'unregister' and
2965 * 'deinit' phase where the interface is first hidden from
2966 * userspace by i915_perf_unregister() before cleaning up
2967 * remaining state in i915_perf_fini().
2968 */
2969void i915_perf_unregister(struct drm_i915_private *dev_priv)
2970{
2971	if (!dev_priv->perf.metrics_kobj)
2972		return;
2973
2974	sysfs_remove_group(dev_priv->perf.metrics_kobj,
2975			   &dev_priv->perf.oa.test_config.sysfs_metric);
2976
2977	kobject_put(dev_priv->perf.metrics_kobj);
2978	dev_priv->perf.metrics_kobj = NULL;
2979}
2980
2981static bool gen8_is_valid_flex_addr(struct drm_i915_private *dev_priv, u32 addr)
2982{
2983	static const i915_reg_t flex_eu_regs[] = {
2984		EU_PERF_CNTL0,
2985		EU_PERF_CNTL1,
2986		EU_PERF_CNTL2,
2987		EU_PERF_CNTL3,
2988		EU_PERF_CNTL4,
2989		EU_PERF_CNTL5,
2990		EU_PERF_CNTL6,
2991	};
2992	int i;
2993
2994	for (i = 0; i < ARRAY_SIZE(flex_eu_regs); i++) {
2995		if (i915_mmio_reg_offset(flex_eu_regs[i]) == addr)
2996			return true;
2997	}
2998	return false;
2999}
3000
3001static bool gen7_is_valid_b_counter_addr(struct drm_i915_private *dev_priv, u32 addr)
3002{
3003	return (addr >= i915_mmio_reg_offset(OASTARTTRIG1) &&
3004		addr <= i915_mmio_reg_offset(OASTARTTRIG8)) ||
3005		(addr >= i915_mmio_reg_offset(OAREPORTTRIG1) &&
3006		 addr <= i915_mmio_reg_offset(OAREPORTTRIG8)) ||
3007		(addr >= i915_mmio_reg_offset(OACEC0_0) &&
3008		 addr <= i915_mmio_reg_offset(OACEC7_1));
3009}
3010
3011static bool gen7_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3012{
3013	return addr == i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) ||
3014		(addr >= i915_mmio_reg_offset(MICRO_BP0_0) &&
3015		 addr <= i915_mmio_reg_offset(NOA_WRITE)) ||
3016		(addr >= i915_mmio_reg_offset(OA_PERFCNT1_LO) &&
3017		 addr <= i915_mmio_reg_offset(OA_PERFCNT2_HI)) ||
3018		(addr >= i915_mmio_reg_offset(OA_PERFMATRIX_LO) &&
3019		 addr <= i915_mmio_reg_offset(OA_PERFMATRIX_HI));
3020}
3021
3022static bool gen8_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3023{
3024	return gen7_is_valid_mux_addr(dev_priv, addr) ||
3025		addr == i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) ||
3026		(addr >= i915_mmio_reg_offset(RPM_CONFIG0) &&
3027		 addr <= i915_mmio_reg_offset(NOA_CONFIG(8)));
3028}
3029
3030static bool gen10_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3031{
3032	return gen8_is_valid_mux_addr(dev_priv, addr) ||
3033		(addr >= i915_mmio_reg_offset(OA_PERFCNT3_LO) &&
3034		 addr <= i915_mmio_reg_offset(OA_PERFCNT4_HI));
3035}
3036
3037static bool hsw_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3038{
3039	return gen7_is_valid_mux_addr(dev_priv, addr) ||
3040		(addr >= 0x25100 && addr <= 0x2FF90) ||
3041		(addr >= i915_mmio_reg_offset(HSW_MBVID2_NOA0) &&
3042		 addr <= i915_mmio_reg_offset(HSW_MBVID2_NOA9)) ||
3043		addr == i915_mmio_reg_offset(HSW_MBVID2_MISR0);
3044}
3045
3046static bool chv_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3047{
3048	return gen7_is_valid_mux_addr(dev_priv, addr) ||
3049		(addr >= 0x182300 && addr <= 0x1823A4);
3050}
3051
3052static uint32_t mask_reg_value(u32 reg, u32 val)
3053{
3054	/* HALF_SLICE_CHICKEN2 is programmed with a the
3055	 * WaDisableSTUnitPowerOptimization workaround. Make sure the value
3056	 * programmed by userspace doesn't change this.
3057	 */
3058	if (i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) == reg)
3059		val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);
3060
3061	/* WAIT_FOR_RC6_EXIT has only one bit fullfilling the function
3062	 * indicated by its name and a bunch of selection fields used by OA
3063	 * configs.
3064	 */
3065	if (i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) == reg)
3066		val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE);
3067
3068	return val;
3069}
3070
3071static struct i915_oa_reg *alloc_oa_regs(struct drm_i915_private *dev_priv,
3072					 bool (*is_valid)(struct drm_i915_private *dev_priv, u32 addr),
3073					 u32 __user *regs,
3074					 u32 n_regs)
3075{
3076	struct i915_oa_reg *oa_regs;
3077	int err;
3078	u32 i;
3079
3080	if (!n_regs)
3081		return NULL;
3082
3083	if (!access_ok(VERIFY_READ, regs, n_regs * sizeof(u32) * 2))
3084		return ERR_PTR(-EFAULT);
3085
3086	/* No is_valid function means we're not allowing any register to be programmed. */
3087	GEM_BUG_ON(!is_valid);
3088	if (!is_valid)
3089		return ERR_PTR(-EINVAL);
3090
3091	oa_regs = kmalloc_array(n_regs, sizeof(*oa_regs), GFP_KERNEL);
3092	if (!oa_regs)
3093		return ERR_PTR(-ENOMEM);
3094
3095	for (i = 0; i < n_regs; i++) {
3096		u32 addr, value;
3097
3098		err = get_user(addr, regs);
3099		if (err)
3100			goto addr_err;
3101
3102		if (!is_valid(dev_priv, addr)) {
3103			DRM_DEBUG("Invalid oa_reg address: %X\n", addr);
3104			err = -EINVAL;
3105			goto addr_err;
3106		}
3107
3108		err = get_user(value, regs + 1);
3109		if (err)
3110			goto addr_err;
3111
3112		oa_regs[i].addr = _MMIO(addr);
3113		oa_regs[i].value = mask_reg_value(addr, value);
3114
3115		regs += 2;
3116	}
3117
3118	return oa_regs;
3119
3120addr_err:
3121	kfree(oa_regs);
3122	return ERR_PTR(err);
3123}
3124
3125static ssize_t show_dynamic_id(struct device *dev,
3126			       struct device_attribute *attr,
3127			       char *buf)
3128{
3129	struct i915_oa_config *oa_config =
3130		container_of(attr, typeof(*oa_config), sysfs_metric_id);
3131
3132	return sprintf(buf, "%d\n", oa_config->id);
3133}
3134
3135static int create_dynamic_oa_sysfs_entry(struct drm_i915_private *dev_priv,
3136					 struct i915_oa_config *oa_config)
3137{
3138	sysfs_attr_init(&oa_config->sysfs_metric_id.attr);
3139	oa_config->sysfs_metric_id.attr.name = "id";
3140	oa_config->sysfs_metric_id.attr.mode = S_IRUGO;
3141	oa_config->sysfs_metric_id.show = show_dynamic_id;
3142	oa_config->sysfs_metric_id.store = NULL;
3143
3144	oa_config->attrs[0] = &oa_config->sysfs_metric_id.attr;
3145	oa_config->attrs[1] = NULL;
3146
3147	oa_config->sysfs_metric.name = oa_config->uuid;
3148	oa_config->sysfs_metric.attrs = oa_config->attrs;
3149
3150	return sysfs_create_group(dev_priv->perf.metrics_kobj,
3151				  &oa_config->sysfs_metric);
3152}
3153
3154/**
3155 * i915_perf_add_config_ioctl - DRM ioctl() for userspace to add a new OA config
3156 * @dev: drm device
3157 * @data: ioctl data (pointer to struct drm_i915_perf_oa_config) copied from
3158 *        userspace (unvalidated)
3159 * @file: drm file
3160 *
3161 * Validates the submitted OA register to be saved into a new OA config that
3162 * can then be used for programming the OA unit and its NOA network.
3163 *
3164 * Returns: A new allocated config number to be used with the perf open ioctl
3165 * or a negative error code on failure.
3166 */
3167int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
3168			       struct drm_file *file)
3169{
3170	struct drm_i915_private *dev_priv = dev->dev_private;
3171	struct drm_i915_perf_oa_config *args = data;
3172	struct i915_oa_config *oa_config, *tmp;
3173	int err, id;
3174
3175	if (!dev_priv->perf.initialized) {
3176		DRM_DEBUG("i915 perf interface not available for this system\n");
3177		return -ENOTSUPP;
3178	}
3179
3180	if (!dev_priv->perf.metrics_kobj) {
3181		DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
3182		return -EINVAL;
3183	}
3184
3185	if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
3186		DRM_DEBUG("Insufficient privileges to add i915 OA config\n");
3187		return -EACCES;
3188	}
3189
3190	if ((!args->mux_regs_ptr || !args->n_mux_regs) &&
3191	    (!args->boolean_regs_ptr || !args->n_boolean_regs) &&
3192	    (!args->flex_regs_ptr || !args->n_flex_regs)) {
3193		DRM_DEBUG("No OA registers given\n");
3194		return -EINVAL;
3195	}
3196
3197	oa_config = kzalloc(sizeof(*oa_config), GFP_KERNEL);
3198	if (!oa_config) {
3199		DRM_DEBUG("Failed to allocate memory for the OA config\n");
3200		return -ENOMEM;
3201	}
3202
3203	atomic_set(&oa_config->ref_count, 1);
3204
3205	if (!uuid_is_valid(args->uuid)) {
3206		DRM_DEBUG("Invalid uuid format for OA config\n");
3207		err = -EINVAL;
3208		goto reg_err;
3209	}
3210
3211	/* Last character in oa_config->uuid will be 0 because oa_config is
3212	 * kzalloc.
3213	 */
3214	memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid));
3215
3216	oa_config->mux_regs_len = args->n_mux_regs;
3217	oa_config->mux_regs =
3218		alloc_oa_regs(dev_priv,
3219			      dev_priv->perf.oa.ops.is_valid_mux_reg,
3220			      u64_to_user_ptr(args->mux_regs_ptr),
3221			      args->n_mux_regs);
3222
3223	if (IS_ERR(oa_config->mux_regs)) {
3224		DRM_DEBUG("Failed to create OA config for mux_regs\n");
3225		err = PTR_ERR(oa_config->mux_regs);
3226		goto reg_err;
3227	}
3228
3229	oa_config->b_counter_regs_len = args->n_boolean_regs;
3230	oa_config->b_counter_regs =
3231		alloc_oa_regs(dev_priv,
3232			      dev_priv->perf.oa.ops.is_valid_b_counter_reg,
3233			      u64_to_user_ptr(args->boolean_regs_ptr),
3234			      args->n_boolean_regs);
3235
3236	if (IS_ERR(oa_config->b_counter_regs)) {
3237		DRM_DEBUG("Failed to create OA config for b_counter_regs\n");
3238		err = PTR_ERR(oa_config->b_counter_regs);
3239		goto reg_err;
3240	}
3241
3242	if (INTEL_GEN(dev_priv) < 8) {
3243		if (args->n_flex_regs != 0) {
3244			err = -EINVAL;
3245			goto reg_err;
3246		}
3247	} else {
3248		oa_config->flex_regs_len = args->n_flex_regs;
3249		oa_config->flex_regs =
3250			alloc_oa_regs(dev_priv,
3251				      dev_priv->perf.oa.ops.is_valid_flex_reg,
3252				      u64_to_user_ptr(args->flex_regs_ptr),
3253				      args->n_flex_regs);
3254
3255		if (IS_ERR(oa_config->flex_regs)) {
3256			DRM_DEBUG("Failed to create OA config for flex_regs\n");
3257			err = PTR_ERR(oa_config->flex_regs);
3258			goto reg_err;
3259		}
3260	}
3261
3262	err = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
3263	if (err)
3264		goto reg_err;
3265
3266	/* We shouldn't have too many configs, so this iteration shouldn't be
3267	 * too costly.
3268	 */
3269	idr_for_each_entry(&dev_priv->perf.metrics_idr, tmp, id) {
3270		if (!strcmp(tmp->uuid, oa_config->uuid)) {
3271			DRM_DEBUG("OA config already exists with this uuid\n");
3272			err = -EADDRINUSE;
3273			goto sysfs_err;
3274		}
3275	}
3276
3277	err = create_dynamic_oa_sysfs_entry(dev_priv, oa_config);
3278	if (err) {
3279		DRM_DEBUG("Failed to create sysfs entry for OA config\n");
3280		goto sysfs_err;
3281	}
3282
3283	/* Config id 0 is invalid, id 1 for kernel stored test config. */
3284	oa_config->id = idr_alloc(&dev_priv->perf.metrics_idr,
3285				  oa_config, 2,
3286				  0, GFP_KERNEL);
3287	if (oa_config->id < 0) {
3288		DRM_DEBUG("Failed to create sysfs entry for OA config\n");
3289		err = oa_config->id;
3290		goto sysfs_err;
3291	}
3292
3293	mutex_unlock(&dev_priv->perf.metrics_lock);
3294
3295	return oa_config->id;
3296
3297sysfs_err:
3298	mutex_unlock(&dev_priv->perf.metrics_lock);
3299reg_err:
3300	put_oa_config(dev_priv, oa_config);
3301	DRM_DEBUG("Failed to add new OA config\n");
3302	return err;
3303}
3304
3305/**
3306 * i915_perf_remove_config_ioctl - DRM ioctl() for userspace to remove an OA config
3307 * @dev: drm device
3308 * @data: ioctl data (pointer to u64 integer) copied from userspace
3309 * @file: drm file
3310 *
3311 * Configs can be removed while being used, the will stop appearing in sysfs
3312 * and their content will be freed when the stream using the config is closed.
3313 *
3314 * Returns: 0 on success or a negative error code on failure.
3315 */
3316int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
3317				  struct drm_file *file)
3318{
3319	struct drm_i915_private *dev_priv = dev->dev_private;
3320	u64 *arg = data;
3321	struct i915_oa_config *oa_config;
3322	int ret;
3323
3324	if (!dev_priv->perf.initialized) {
3325		DRM_DEBUG("i915 perf interface not available for this system\n");
3326		return -ENOTSUPP;
3327	}
3328
3329	if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
3330		DRM_DEBUG("Insufficient privileges to remove i915 OA config\n");
3331		return -EACCES;
3332	}
3333
3334	ret = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
3335	if (ret)
3336		goto lock_err;
3337
3338	oa_config = idr_find(&dev_priv->perf.metrics_idr, *arg);
3339	if (!oa_config) {
3340		DRM_DEBUG("Failed to remove unknown OA config\n");
3341		ret = -ENOENT;
3342		goto config_err;
3343	}
3344
3345	GEM_BUG_ON(*arg != oa_config->id);
3346
3347	sysfs_remove_group(dev_priv->perf.metrics_kobj,
3348			   &oa_config->sysfs_metric);
3349
3350	idr_remove(&dev_priv->perf.metrics_idr, *arg);
3351	put_oa_config(dev_priv, oa_config);
3352
3353config_err:
3354	mutex_unlock(&dev_priv->perf.metrics_lock);
3355lock_err:
3356	return ret;
3357}
3358
3359static struct ctl_table oa_table[] = {
3360	{
3361	 .procname = "perf_stream_paranoid",
3362	 .data = &i915_perf_stream_paranoid,
3363	 .maxlen = sizeof(i915_perf_stream_paranoid),
3364	 .mode = 0644,
3365	 .proc_handler = proc_dointvec_minmax,
3366	 .extra1 = &zero,
3367	 .extra2 = &one,
3368	 },
3369	{
3370	 .procname = "oa_max_sample_rate",
3371	 .data = &i915_oa_max_sample_rate,
3372	 .maxlen = sizeof(i915_oa_max_sample_rate),
3373	 .mode = 0644,
3374	 .proc_handler = proc_dointvec_minmax,
3375	 .extra1 = &zero,
3376	 .extra2 = &oa_sample_rate_hard_limit,
3377	 },
3378	{}
3379};
3380
3381static struct ctl_table i915_root[] = {
3382	{
3383	 .procname = "i915",
3384	 .maxlen = 0,
3385	 .mode = 0555,
3386	 .child = oa_table,
3387	 },
3388	{}
3389};
3390
3391static struct ctl_table dev_root[] = {
3392	{
3393	 .procname = "dev",
3394	 .maxlen = 0,
3395	 .mode = 0555,
3396	 .child = i915_root,
3397	 },
3398	{}
3399};
3400
3401/**
3402 * i915_perf_init - initialize i915-perf state on module load
3403 * @dev_priv: i915 device instance
3404 *
3405 * Initializes i915-perf state without exposing anything to userspace.
3406 *
3407 * Note: i915-perf initialization is split into an 'init' and 'register'
3408 * phase with the i915_perf_register() exposing state to userspace.
3409 */
3410void i915_perf_init(struct drm_i915_private *dev_priv)
3411{
3412	if (IS_HASWELL(dev_priv)) {
3413		dev_priv->perf.oa.ops.is_valid_b_counter_reg =
3414			gen7_is_valid_b_counter_addr;
3415		dev_priv->perf.oa.ops.is_valid_mux_reg =
3416			hsw_is_valid_mux_addr;
3417		dev_priv->perf.oa.ops.is_valid_flex_reg = NULL;
3418		dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
3419		dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
3420		dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
3421		dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
3422		dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
3423		dev_priv->perf.oa.ops.read = gen7_oa_read;
3424		dev_priv->perf.oa.ops.oa_hw_tail_read =
3425			gen7_oa_hw_tail_read;
3426
3427		dev_priv->perf.oa.oa_formats = hsw_oa_formats;
3428	} else if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
3429		/* Note: that although we could theoretically also support the
3430		 * legacy ringbuffer mode on BDW (and earlier iterations of
3431		 * this driver, before upstreaming did this) it didn't seem
3432		 * worth the complexity to maintain now that BDW+ enable
3433		 * execlist mode by default.
3434		 */
3435		dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
3436
3437		dev_priv->perf.oa.ops.init_oa_buffer = gen8_init_oa_buffer;
3438		dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
3439		dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
3440		dev_priv->perf.oa.ops.read = gen8_oa_read;
3441		dev_priv->perf.oa.ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
3442
3443		if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv)) {
3444			dev_priv->perf.oa.ops.is_valid_b_counter_reg =
3445				gen7_is_valid_b_counter_addr;
3446			dev_priv->perf.oa.ops.is_valid_mux_reg =
3447				gen8_is_valid_mux_addr;
3448			dev_priv->perf.oa.ops.is_valid_flex_reg =
3449				gen8_is_valid_flex_addr;
3450
3451			if (IS_CHERRYVIEW(dev_priv)) {
3452				dev_priv->perf.oa.ops.is_valid_mux_reg =
3453					chv_is_valid_mux_addr;
3454			}
3455
3456			dev_priv->perf.oa.ops.enable_metric_set = gen8_enable_metric_set;
3457			dev_priv->perf.oa.ops.disable_metric_set = gen8_disable_metric_set;
3458
3459			if (IS_GEN8(dev_priv)) {
3460				dev_priv->perf.oa.ctx_oactxctrl_offset = 0x120;
3461				dev_priv->perf.oa.ctx_flexeu0_offset = 0x2ce;
3462
3463				dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<25);
3464			} else {
3465				dev_priv->perf.oa.ctx_oactxctrl_offset = 0x128;
3466				dev_priv->perf.oa.ctx_flexeu0_offset = 0x3de;
3467
3468				dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
3469			}
3470		} else if (IS_GEN10(dev_priv)) {
3471			dev_priv->perf.oa.ops.is_valid_b_counter_reg =
3472				gen7_is_valid_b_counter_addr;
3473			dev_priv->perf.oa.ops.is_valid_mux_reg =
3474				gen10_is_valid_mux_addr;
3475			dev_priv->perf.oa.ops.is_valid_flex_reg =
3476				gen8_is_valid_flex_addr;
3477
3478			dev_priv->perf.oa.ops.enable_metric_set = gen8_enable_metric_set;
3479			dev_priv->perf.oa.ops.disable_metric_set = gen10_disable_metric_set;
3480
3481			dev_priv->perf.oa.ctx_oactxctrl_offset = 0x128;
3482			dev_priv->perf.oa.ctx_flexeu0_offset = 0x3de;
3483
3484			dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
3485		}
3486	}
3487
3488	if (dev_priv->perf.oa.ops.enable_metric_set) {
3489		hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
3490				CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3491		dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
3492		init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
3493
3494		INIT_LIST_HEAD(&dev_priv->perf.streams);
3495		mutex_init(&dev_priv->perf.lock);
3496		spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
3497
3498		oa_sample_rate_hard_limit = 1000 *
3499			(INTEL_INFO(dev_priv)->cs_timestamp_frequency_khz / 2);
3500		dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
3501
3502		mutex_init(&dev_priv->perf.metrics_lock);
3503		idr_init(&dev_priv->perf.metrics_idr);
3504
3505		dev_priv->perf.initialized = true;
3506	}
3507}
3508
/*
 * idr_for_each() callback: @p is the stored OA config and @data the i915
 * device. Drops one reference on the config and returns 0 so that the
 * iteration carries on.
 */
static int destroy_config(int id, void *p, void *data)
{
	put_oa_config(data, p);

	return 0;
}
3518
3519/**
3520 * i915_perf_fini - Counter part to i915_perf_init()
3521 * @dev_priv: i915 device instance
3522 */
3523void i915_perf_fini(struct drm_i915_private *dev_priv)
3524{
3525	if (!dev_priv->perf.initialized)
3526		return;
3527
3528	idr_for_each(&dev_priv->perf.metrics_idr, destroy_config, dev_priv);
3529	idr_destroy(&dev_priv->perf.metrics_idr);
3530
3531	unregister_sysctl_table(dev_priv->perf.sysctl_header);
3532
3533	memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
3534
3535	dev_priv->perf.initialized = false;
3536}