// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that this restores progress.
 */

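/*
 * Compute the delay until the next pulse and rearm the delayed worker.
 * Returns false, without rearming, when the heartbeat interval is 0,
 * i.e. heartbeats are disabled. For the final, barrier-priority attempt
 * the delay is stretched to at least twice the preemption timeout (only
 * while the default interval is in use, see the FIXME below) so that a
 * preemption has a chance to run before the engine is declared hung.
 */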
static bool next_heartbeat(struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);

	rq = engine->heartbeat.systole;

	/*
	 * FIXME: The final period extension is disabled if the period has been
	 * modified from the default. This is to prevent issues with certain
	 * selftests which override the value and expect specific behaviour.
	 * Once the selftests have been updated to either cope with variable
	 * heartbeat periods (or to override the pre-emption timeout as well,
	 * or just to add a selftest specific override of the extension), the
	 * generic override can be removed.
	 */
	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
	    delay == engine->defaults.heartbeat_interval_ms) {
		long longer;

		/*
		 * The final try is at the highest priority possible. Up until now
		 * a pre-emption might not even have been attempted. So make sure
		 * this last attempt allows enough time for a pre-emption to occur.
		 */
		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
		if (longer > delay)
			delay = longer;
	}

	if (!delay)
		return false;

	delay = msecs_to_jiffies_timeout(delay);
	if (delay >= HZ)
		delay = round_jiffies_up_relative(delay);
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

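/*
 * Construct a request on @ce (the engine's kernel context); the context
 * is held active only for the duration of the request construction.
 */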
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

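/*
 * Account for the pulse in engine->wakeref_serial so that the pulse
 * itself is not mistaken for fresh user work, move any pending idle
 * barriers onto the request, and retain it as the engine's current
 * systole while heartbeats are enabled.
 */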
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

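/*
 * Fold the idle_pulse() bookkeeping into @rq, then commit and queue it
 * with the given scheduling attributes. Callers hold ce->timeline->mutex.
 */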
static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p = drm_debug_printer("heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

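/*
 * The heartbeat has stopped ticking: dump the engine state for debugging
 * (under CONFIG_DRM_I915_DEBUG_GEM), ask the GuC to identify the hung
 * context when GuC submission is in use, and escalate to error handling
 * and an engine reset via intel_gt_handle_error().
 */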
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

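/*
 * The heartbeat worker. If the previous pulse has completed and new work
 * has been submitted since, a fresh minimum-priority pulse is emitted on
 * the kernel context. If the previous pulse is still pending, its
 * priority is raised one step per tick (0, then HEARTBEAT, then BARRIER);
 * once even a barrier-priority pulse fails to complete within an
 * interval, the engine is declared hung and reset.
 */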
static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = 0;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			reset_engine(engine, rq);
		}

		rq->emitted_jiffies = jiffies;
		goto out;
	}

	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}

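/*
 * Start the heartbeat when the engine is powered up; a no-op if the
 * heartbeat interval is configured to 0 at build time.
 */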
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		return;

	next_heartbeat(engine);
}

void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

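/*
 * Emit a single barrier-priority pulse on the kernel context so that it
 * preempts whatever is currently executing on the engine. Callers hold
 * ce->timeline->mutex and an awake engine pm reference, as asserted below.
 */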
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	return 0;
}

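/*
 * Atomically swap in the new interval and park or unpark the worker to
 * match, returning the previous value so callers can roll back on error.
 */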
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

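/*
 * Apply a new heartbeat interval. Disabling the heartbeat (delay == 0)
 * requires preempt-to-reset support. Non-default intervals shorter than
 * twice the preemption timeout are allowed but trigger the notices
 * below. When the interval changes, a pulse is sent to re-evaluate
 * whatever is currently executing; if that fails, the previous interval
 * is restored.
 */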
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
	if (delay != engine->defaults.heartbeat_interval_ms &&
	    delay < 2 * engine->props.preempt_timeout_ms) {
		if (intel_engine_uses_guc(engine))
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
				   engine->name);
		else
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
				   engine->name);
	}

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

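/*
 * Send one barrier-priority pulse and flush submission. Returns -ENODEV
 * if the engine cannot preempt, 0 without doing anything if the engine
 * is parked, and -EINTR if interrupted while waiting for the kernel
 * timeline mutex.
 */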
int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

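/*
 * If idle barrier tasks are pending on the engine, emit a minimum-priority
 * kernel-context request to carry them so that they can be flushed and
 * retired. Does nothing if the list is empty or the engine is parked.
 */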
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif