   1// SPDX-License-Identifier: GPL-2.0
   2/* Copyright (c) 2018-2019 The Linux Foundation. All rights reserved. */
   3
   4#include <linux/ascii85.h>
   5#include "msm_gem.h"
   6#include "a6xx_gpu.h"
   7#include "a6xx_gmu.h"
   8#include "a6xx_gpu_state.h"
   9#include "a6xx_gmu.xml.h"
  10
  11struct a6xx_gpu_state_obj {
  12	const void *handle;
  13	u32 *data;
  14};
  15
  16struct a6xx_gpu_state {
  17	struct msm_gpu_state base;
  18
  19	struct a6xx_gpu_state_obj *gmu_registers;
  20	int nr_gmu_registers;
  21
  22	struct a6xx_gpu_state_obj *registers;
  23	int nr_registers;
  24
  25	struct a6xx_gpu_state_obj *shaders;
  26	int nr_shaders;
  27
  28	struct a6xx_gpu_state_obj *clusters;
  29	int nr_clusters;
  30
  31	struct a6xx_gpu_state_obj *dbgahb_clusters;
  32	int nr_dbgahb_clusters;
  33
  34	struct a6xx_gpu_state_obj *indexed_regs;
  35	int nr_indexed_regs;
  36
  37	struct a6xx_gpu_state_obj *debugbus;
  38	int nr_debugbus;
  39
  40	struct a6xx_gpu_state_obj *vbif_debugbus;
  41
  42	struct a6xx_gpu_state_obj *cx_debugbus;
  43	int nr_cx_debugbus;
  44
  45	struct msm_gpu_state_bo *gmu_log;
  46	struct msm_gpu_state_bo *gmu_hfi;
  47	struct msm_gpu_state_bo *gmu_debug;
  48
  49	s32 hfi_queue_history[2][HFI_HISTORY_SZ];
  50
  51	struct list_head objs;
  52
  53	bool gpu_initialized;
  54};
  55
  56static inline int CRASHDUMP_WRITE(u64 *in, u32 reg, u32 val)
  57{
  58	in[0] = val;
  59	in[1] = (((u64) reg) << 44 | (1 << 21) | 1);
  60
  61	return 2;
  62}
  63
  64static inline int CRASHDUMP_READ(u64 *in, u32 reg, u32 dwords, u64 target)
  65{
  66	in[0] = target;
  67	in[1] = (((u64) reg) << 44 | dwords);
  68
  69	return 2;
  70}
  71
  72static inline int CRASHDUMP_FINI(u64 *in)
  73{
  74	in[0] = 0;
  75	in[1] = 0;
  76
  77	return 2;
  78}
  79
  80struct a6xx_crashdumper {
  81	void *ptr;
  82	struct drm_gem_object *bo;
  83	u64 iova;
  84};
  85
  86struct a6xx_state_memobj {
  87	struct list_head node;
  88	unsigned long long data[];
  89};
  90
  91static void *state_kcalloc(struct a6xx_gpu_state *a6xx_state, int nr, size_t objsize)
  92{
  93	struct a6xx_state_memobj *obj =
  94		kvzalloc((nr * objsize) + sizeof(*obj), GFP_KERNEL);
  95
  96	if (!obj)
  97		return NULL;
  98
  99	list_add_tail(&obj->node, &a6xx_state->objs);
 100	return &obj->data;
 101}
 102
 103static void *state_kmemdup(struct a6xx_gpu_state *a6xx_state, void *src,
 104		size_t size)
 105{
 106	void *dst = state_kcalloc(a6xx_state, 1, size);
 107
 108	if (dst)
 109		memcpy(dst, src, size);
 110	return dst;
 111}
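/*
 * Note: every allocation made through state_kcalloc()/state_kmemdup() above
 * is linked into a6xx_state->objs, so a6xx_gpu_state_destroy() further down
 * can release them all with a single list walk.
 */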
 112
 113/*
 114 * Allocate 1MB for the crashdumper scratch region - 8k for the script and
 115 * the rest for the data
 116 */
 117#define A6XX_CD_DATA_OFFSET 8192
 118#define A6XX_CD_DATA_SIZE  (SZ_1M - 8192)
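/*
 * Illustrative sketch (not part of the driver): how a crashdumper script is
 * typically assembled from the CRASHDUMP_*() helpers above.  The script sits
 * in the first 8k of the scratch BO and every entry is a pair of qwords; the
 * register numbers used here are made-up placeholders.
 */
static int example_build_script(u64 *in, u64 bo_iova)
{
	u64 *ptr = in;

	/* program a (hypothetical) selector register ... */
	ptr += CRASHDUMP_WRITE(ptr, 0x0800, 0x1);

	/* ... then dump 16 dwords starting at a (hypothetical) register
	 * 0x1000 into the data area of the scratch BO */
	ptr += CRASHDUMP_READ(ptr, 0x1000, 16, bo_iova + A6XX_CD_DATA_OFFSET);

	/* terminate the script */
	ptr += CRASHDUMP_FINI(ptr);

	/* number of u64 slots consumed from the 8k script area */
	return ptr - in;
}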
 119
 120static int a6xx_crashdumper_init(struct msm_gpu *gpu,
 121		struct a6xx_crashdumper *dumper)
 122{
 123	dumper->ptr = msm_gem_kernel_new(gpu->dev,
 124		SZ_1M, MSM_BO_WC, gpu->aspace,
 125		&dumper->bo, &dumper->iova);
 126
 127	if (!IS_ERR(dumper->ptr))
 128		msm_gem_object_set_name(dumper->bo, "crashdump");
 129
 130	return PTR_ERR_OR_ZERO(dumper->ptr);
 131}
 132
 133static int a6xx_crashdumper_run(struct msm_gpu *gpu,
 134		struct a6xx_crashdumper *dumper)
 135{
 136	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 137	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
 138	u32 val;
 139	int ret;
 140
 141	if (IS_ERR_OR_NULL(dumper->ptr))
 142		return -EINVAL;
 143
 144	if (!a6xx_gmu_sptprac_is_on(&a6xx_gpu->gmu))
 145		return -EINVAL;
 146
 147	/* Make sure all pending memory writes are posted */
 148	wmb();
 149
 150	gpu_write64(gpu, REG_A6XX_CP_CRASH_SCRIPT_BASE, dumper->iova);
 151
 152	gpu_write(gpu, REG_A6XX_CP_CRASH_DUMP_CNTL, 1);
 153
 154	ret = gpu_poll_timeout(gpu, REG_A6XX_CP_CRASH_DUMP_STATUS, val,
 155		val & 0x02, 100, 10000);
 156
 157	gpu_write(gpu, REG_A6XX_CP_CRASH_DUMP_CNTL, 0);
 158
 159	return ret;
 160}
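/*
 * Illustrative sketch (not part of the driver): the capture sequence the two
 * helpers above are built for - allocate the scratch BO, write a script into
 * it, kick the crashdumper and read the results out of the data area before
 * releasing the BO.
 */
static void example_capture(struct msm_gpu *gpu)
{
	struct a6xx_crashdumper dumper = { 0 };

	if (a6xx_crashdumper_init(gpu, &dumper))
		return;

	/* ... build a script at dumper.ptr with the CRASHDUMP_*() helpers ... */

	if (!a6xx_crashdumper_run(gpu, &dumper)) {
		/* dumped registers now live at dumper.ptr + A6XX_CD_DATA_OFFSET */
	}

	msm_gem_kernel_put(dumper.bo, gpu->aspace);
}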
 161
 162/* read a value from the GX debug bus */
 163static int debugbus_read(struct msm_gpu *gpu, u32 block, u32 offset,
 164		u32 *data)
 165{
 166	u32 reg = A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_INDEX(offset) |
 167		A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_BLK_SEL(block);
 168
 169	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_A, reg);
 170	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_B, reg);
 171	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_C, reg);
 172	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_D, reg);
 173
 174	/* Wait 1 us to make sure the data is flowing */
 175	udelay(1);
 176
 177	data[0] = gpu_read(gpu, REG_A6XX_DBGC_CFG_DBGBUS_TRACE_BUF2);
 178	data[1] = gpu_read(gpu, REG_A6XX_DBGC_CFG_DBGBUS_TRACE_BUF1);
 179
 180	return 2;
 181}
 182
 183#define cxdbg_write(ptr, offset, val) \
 184	msm_writel((val), (ptr) + ((offset) << 2))
 185
 186#define cxdbg_read(ptr, offset) \
 187	msm_readl((ptr) + ((offset) << 2))
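/*
 * Unlike the GX side, the CX DBGC block is not reachable through gpu_read()/
 * gpu_write(); it is accessed via a temporary ioremap (see a6xx_get_debugbus()
 * below), and the (offset) << 2 above converts a dword register offset into
 * the byte offset expected by msm_readl()/msm_writel().
 */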
 188
 189/* read a value from the CX debug bus */
 190static int cx_debugbus_read(void __iomem *cxdbg, u32 block, u32 offset,
 191		u32 *data)
 192{
 193	u32 reg = A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_INDEX(offset) |
 194		A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block);
 195
 196	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_A, reg);
 197	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_B, reg);
 198	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_C, reg);
 199	cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_D, reg);
 200
 201	/* Wait 1 us to make sure the data is flowing */
 202	udelay(1);
 203
 204	data[0] = cxdbg_read(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_TRACE_BUF2);
 205	data[1] = cxdbg_read(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_TRACE_BUF1);
 206
 207	return 2;
 208}
 209
 210/* Read a chunk of data from the VBIF debug bus */
 211static int vbif_debugbus_read(struct msm_gpu *gpu, u32 ctrl0, u32 ctrl1,
 212		u32 reg, int count, u32 *data)
 213{
 214	int i;
 215
 216	gpu_write(gpu, ctrl0, reg);
 217
 218	for (i = 0; i < count; i++) {
 219		gpu_write(gpu, ctrl1, i);
 220		data[i] = gpu_read(gpu, REG_A6XX_VBIF_TEST_BUS_OUT);
 221	}
 222
 223	return count;
 224}
 225
 226#define AXI_ARB_BLOCKS 2
 227#define XIN_AXI_BLOCKS 5
 228#define XIN_CORE_BLOCKS 4
 229
 230#define VBIF_DEBUGBUS_BLOCK_SIZE \
 231	((16 * AXI_ARB_BLOCKS) + \
 232	 (18 * XIN_AXI_BLOCKS) + \
 233	 (12 * XIN_CORE_BLOCKS))
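/*
 * With the block counts above this works out to (16 * 2) + (18 * 5) +
 * (12 * 4) = 170 dwords captured per VBIF debugbus snapshot.
 */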
 234
 235static void a6xx_get_vbif_debugbus_block(struct msm_gpu *gpu,
 236		struct a6xx_gpu_state *a6xx_state,
 237		struct a6xx_gpu_state_obj *obj)
 238{
 239	u32 clk, *ptr;
 240	int i;
 241
 242	obj->data = state_kcalloc(a6xx_state, VBIF_DEBUGBUS_BLOCK_SIZE,
 243		sizeof(u32));
 244	if (!obj->data)
 245		return;
 246
 247	obj->handle = NULL;
 248
 249	/* Get the current clock setting */
 250	clk = gpu_read(gpu, REG_A6XX_VBIF_CLKON);
 251
 252	/* Force on the bus so we can read it */
 253	gpu_write(gpu, REG_A6XX_VBIF_CLKON,
 254		clk | A6XX_VBIF_CLKON_FORCE_ON_TESTBUS);
 255
 256	/* We will read from BUS2 first, so disable BUS1 */
 257	gpu_write(gpu, REG_A6XX_VBIF_TEST_BUS1_CTRL0, 0);
 258
 259	/* Enable the VBIF bus for reading */
 260	gpu_write(gpu, REG_A6XX_VBIF_TEST_BUS_OUT_CTRL, 1);
 261
 262	ptr = obj->data;
 263
 264	for (i = 0; i < AXI_ARB_BLOCKS; i++)
 265		ptr += vbif_debugbus_read(gpu,
 266			REG_A6XX_VBIF_TEST_BUS2_CTRL0,
 267			REG_A6XX_VBIF_TEST_BUS2_CTRL1,
 268			1 << (i + 16), 16, ptr);
 269
 270	for (i = 0; i < XIN_AXI_BLOCKS; i++)
 271		ptr += vbif_debugbus_read(gpu,
 272			REG_A6XX_VBIF_TEST_BUS2_CTRL0,
 273			REG_A6XX_VBIF_TEST_BUS2_CTRL1,
 274			1 << i, 18, ptr);
 275
 276	/* Stop BUS2 so we can turn on BUS1 */
 277	gpu_write(gpu, REG_A6XX_VBIF_TEST_BUS2_CTRL0, 0);
 278
 279	for (i = 0; i < XIN_CORE_BLOCKS; i++)
 280		ptr += vbif_debugbus_read(gpu,
 281			REG_A6XX_VBIF_TEST_BUS1_CTRL0,
 282			REG_A6XX_VBIF_TEST_BUS1_CTRL1,
 283			1 << i, 12, ptr);
 284
 285	/* Restore the VBIF clock setting */
 286	gpu_write(gpu, REG_A6XX_VBIF_CLKON, clk);
 287}
 288
 289static void a6xx_get_debugbus_block(struct msm_gpu *gpu,
 290		struct a6xx_gpu_state *a6xx_state,
 291		const struct a6xx_debugbus_block *block,
 292		struct a6xx_gpu_state_obj *obj)
 293{
 294	int i;
 295	u32 *ptr;
 296
 297	obj->data = state_kcalloc(a6xx_state, block->count, sizeof(u64));
 298	if (!obj->data)
 299		return;
 300
 301	obj->handle = block;
 302
 303	for (ptr = obj->data, i = 0; i < block->count; i++)
 304		ptr += debugbus_read(gpu, block->id, i, ptr);
 305}
 306
 307static void a6xx_get_cx_debugbus_block(void __iomem *cxdbg,
 308		struct a6xx_gpu_state *a6xx_state,
 309		const struct a6xx_debugbus_block *block,
 310		struct a6xx_gpu_state_obj *obj)
 311{
 312	int i;
 313	u32 *ptr;
 314
 315	obj->data = state_kcalloc(a6xx_state, block->count, sizeof(u64));
 316	if (!obj->data)
 317		return;
 318
 319	obj->handle = block;
 320
 321	for (ptr = obj->data, i = 0; i < block->count; i++)
 322		ptr += cx_debugbus_read(cxdbg, block->id, i, ptr);
 323}
 324
 325static void a6xx_get_debugbus(struct msm_gpu *gpu,
 326		struct a6xx_gpu_state *a6xx_state)
 327{
 328	struct resource *res;
 329	void __iomem *cxdbg = NULL;
 330	int nr_debugbus_blocks;
 331
 332	/* Set up the GX debug bus */
 333
 334	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_CNTLT,
 335		A6XX_DBGC_CFG_DBGBUS_CNTLT_SEGT(0xf));
 336
 337	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_CNTLM,
 338		A6XX_DBGC_CFG_DBGBUS_CNTLM_ENABLE(0xf));
 339
 340	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_0, 0);
 341	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_1, 0);
 342	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_2, 0);
 343	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_IVTL_3, 0);
 344
 345	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_BYTEL_0, 0x76543210);
 346	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_BYTEL_1, 0xFEDCBA98);
 347
 348	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_0, 0);
 349	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_1, 0);
 350	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_2, 0);
 351	gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_MASKL_3, 0);
 352
 353	/* Set up the CX debug bus - it lives elsewhere in the system so do a
 354	 * temporary ioremap for the registers
 355	 */
 356	res = platform_get_resource_byname(gpu->pdev, IORESOURCE_MEM,
 357			"cx_dbgc");
 358
 359	if (res)
 360		cxdbg = ioremap(res->start, resource_size(res));
 361
 362	if (cxdbg) {
 363		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_CNTLT,
 364			A6XX_DBGC_CFG_DBGBUS_CNTLT_SEGT(0xf));
 365
 366		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_CNTLM,
 367			A6XX_DBGC_CFG_DBGBUS_CNTLM_ENABLE(0xf));
 368
 369		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_0, 0);
 370		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_1, 0);
 371		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_2, 0);
 372		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_IVTL_3, 0);
 373
 374		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_BYTEL_0,
 375			0x76543210);
 376		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_BYTEL_1,
 377			0xFEDCBA98);
 378
 379		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_0, 0);
 380		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_1, 0);
 381		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_2, 0);
 382		cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_MASKL_3, 0);
 383	}
 384
 385	nr_debugbus_blocks = ARRAY_SIZE(a6xx_debugbus_blocks) +
 386		(a6xx_has_gbif(to_adreno_gpu(gpu)) ? 1 : 0);
 387
 388	if (adreno_is_a650_family(to_adreno_gpu(gpu)))
 389		nr_debugbus_blocks += ARRAY_SIZE(a650_debugbus_blocks);
 390
 391	a6xx_state->debugbus = state_kcalloc(a6xx_state, nr_debugbus_blocks,
 392			sizeof(*a6xx_state->debugbus));
 393
 394	if (a6xx_state->debugbus) {
 395		int i;
 396
 397		for (i = 0; i < ARRAY_SIZE(a6xx_debugbus_blocks); i++)
 398			a6xx_get_debugbus_block(gpu,
 399				a6xx_state,
 400				&a6xx_debugbus_blocks[i],
 401				&a6xx_state->debugbus[i]);
 402
 403		a6xx_state->nr_debugbus = ARRAY_SIZE(a6xx_debugbus_blocks);
 404
  405		/*
  406		 * GBIF has the same debugbus interface as the other GPU blocks,
  407		 * so GPUs with GBIF fall back to the default path here; note
  408		 * that GBIF reuses exactly the same block ID as VBIF.
  409		 */
 410		if (a6xx_has_gbif(to_adreno_gpu(gpu))) {
 411			a6xx_get_debugbus_block(gpu, a6xx_state,
 412				&a6xx_gbif_debugbus_block,
 413				&a6xx_state->debugbus[i]);
 414
 415			a6xx_state->nr_debugbus += 1;
 416		}
 417
 418
  419		if (adreno_is_a650_family(to_adreno_gpu(gpu))) {
  420			for (i = 0; i < ARRAY_SIZE(a650_debugbus_blocks); i++)
  421				a6xx_get_debugbus_block(gpu, a6xx_state,
  422					&a650_debugbus_blocks[i],
  423					&a6xx_state->debugbus[a6xx_state->nr_debugbus + i]);
  424			a6xx_state->nr_debugbus += ARRAY_SIZE(a650_debugbus_blocks);
  425		}
 426	}
 427
 428	/*  Dump the VBIF debugbus on applicable targets */
 429	if (!a6xx_has_gbif(to_adreno_gpu(gpu))) {
 430		a6xx_state->vbif_debugbus =
 431			state_kcalloc(a6xx_state, 1,
 432					sizeof(*a6xx_state->vbif_debugbus));
 433
 434		if (a6xx_state->vbif_debugbus)
 435			a6xx_get_vbif_debugbus_block(gpu, a6xx_state,
 436					a6xx_state->vbif_debugbus);
 437	}
 438
 439	if (cxdbg) {
 440		a6xx_state->cx_debugbus =
 441			state_kcalloc(a6xx_state,
 442			ARRAY_SIZE(a6xx_cx_debugbus_blocks),
 443			sizeof(*a6xx_state->cx_debugbus));
 444
 445		if (a6xx_state->cx_debugbus) {
 446			int i;
 447
 448			for (i = 0; i < ARRAY_SIZE(a6xx_cx_debugbus_blocks); i++)
 449				a6xx_get_cx_debugbus_block(cxdbg,
 450					a6xx_state,
 451					&a6xx_cx_debugbus_blocks[i],
 452					&a6xx_state->cx_debugbus[i]);
 453
 454			a6xx_state->nr_cx_debugbus =
 455				ARRAY_SIZE(a6xx_cx_debugbus_blocks);
 456		}
 457
 458		iounmap(cxdbg);
 459	}
 460}
 461
 462#define RANGE(reg, a) ((reg)[(a) + 1] - (reg)[(a)] + 1)
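/*
 * The register lists in this file are stored as { first, last } pairs, so
 * RANGE() yields the inclusive length of a span: e.g. a pair of
 * { 0x0800, 0x0803 } describes four consecutive registers.
 */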
 463
 464/* Read a data cluster from behind the AHB aperture */
 465static void a6xx_get_dbgahb_cluster(struct msm_gpu *gpu,
 466		struct a6xx_gpu_state *a6xx_state,
 467		const struct a6xx_dbgahb_cluster *dbgahb,
 468		struct a6xx_gpu_state_obj *obj,
 469		struct a6xx_crashdumper *dumper)
 470{
 471	u64 *in = dumper->ptr;
 472	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
 473	size_t datasize;
 474	int i, regcount = 0;
 475
 476	for (i = 0; i < A6XX_NUM_CONTEXTS; i++) {
 477		int j;
 478
 479		in += CRASHDUMP_WRITE(in, REG_A6XX_HLSQ_DBG_READ_SEL,
 480			(dbgahb->statetype + i * 2) << 8);
 481
 482		for (j = 0; j < dbgahb->count; j += 2) {
 483			int count = RANGE(dbgahb->registers, j);
 484			u32 offset = REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE +
 485				dbgahb->registers[j] - (dbgahb->base >> 2);
 486
 487			in += CRASHDUMP_READ(in, offset, count, out);
 488
 489			out += count * sizeof(u32);
 490
 491			if (i == 0)
 492				regcount += count;
 493		}
 494	}
 495
 496	CRASHDUMP_FINI(in);
 497
 498	datasize = regcount * A6XX_NUM_CONTEXTS * sizeof(u32);
 499
 500	if (WARN_ON(datasize > A6XX_CD_DATA_SIZE))
 501		return;
 502
 503	if (a6xx_crashdumper_run(gpu, dumper))
 504		return;
 505
 506	obj->handle = dbgahb;
 507	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
 508		datasize);
 509}
 510
 511static void a6xx_get_dbgahb_clusters(struct msm_gpu *gpu,
 512		struct a6xx_gpu_state *a6xx_state,
 513		struct a6xx_crashdumper *dumper)
 514{
 515	int i;
 516
 517	a6xx_state->dbgahb_clusters = state_kcalloc(a6xx_state,
 518		ARRAY_SIZE(a6xx_dbgahb_clusters),
 519		sizeof(*a6xx_state->dbgahb_clusters));
 520
 521	if (!a6xx_state->dbgahb_clusters)
 522		return;
 523
 524	a6xx_state->nr_dbgahb_clusters = ARRAY_SIZE(a6xx_dbgahb_clusters);
 525
 526	for (i = 0; i < ARRAY_SIZE(a6xx_dbgahb_clusters); i++)
 527		a6xx_get_dbgahb_cluster(gpu, a6xx_state,
 528			&a6xx_dbgahb_clusters[i],
 529			&a6xx_state->dbgahb_clusters[i], dumper);
 530}
 531
 532/* Read a data cluster from the CP aperture with the crashdumper */
 533static void a6xx_get_cluster(struct msm_gpu *gpu,
 534		struct a6xx_gpu_state *a6xx_state,
 535		const struct a6xx_cluster *cluster,
 536		struct a6xx_gpu_state_obj *obj,
 537		struct a6xx_crashdumper *dumper)
 538{
 539	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 540	u64 *in = dumper->ptr;
 541	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
 542	size_t datasize;
 543	int i, regcount = 0;
 544	u32 id = cluster->id;
 545
  546	/* Skip registers that are not present on older generations */
 547	if (!adreno_is_a660_family(adreno_gpu) &&
 548			cluster->registers == a660_fe_cluster)
 549		return;
 550
 551	if (adreno_is_a650_family(adreno_gpu) &&
 552			cluster->registers == a6xx_ps_cluster)
 553		id = CLUSTER_VPC_PS;
 554
 555	/* Some clusters need a selector register to be programmed too */
 556	if (cluster->sel_reg)
 557		in += CRASHDUMP_WRITE(in, cluster->sel_reg, cluster->sel_val);
 558
 559	for (i = 0; i < A6XX_NUM_CONTEXTS; i++) {
 560		int j;
 561
 562		in += CRASHDUMP_WRITE(in, REG_A6XX_CP_APERTURE_CNTL_CD,
 563			(id << 8) | (i << 4) | i);
 564
 565		for (j = 0; j < cluster->count; j += 2) {
 566			int count = RANGE(cluster->registers, j);
 567
 568			in += CRASHDUMP_READ(in, cluster->registers[j],
 569				count, out);
 570
 571			out += count * sizeof(u32);
 572
 573			if (i == 0)
 574				regcount += count;
 575		}
 576	}
 577
 578	CRASHDUMP_FINI(in);
 579
 580	datasize = regcount * A6XX_NUM_CONTEXTS * sizeof(u32);
 581
 582	if (WARN_ON(datasize > A6XX_CD_DATA_SIZE))
 583		return;
 584
 585	if (a6xx_crashdumper_run(gpu, dumper))
 586		return;
 587
 588	obj->handle = cluster;
 589	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
 590		datasize);
 591}
 592
 593static void a6xx_get_clusters(struct msm_gpu *gpu,
 594		struct a6xx_gpu_state *a6xx_state,
 595		struct a6xx_crashdumper *dumper)
 596{
 597	int i;
 598
 599	a6xx_state->clusters = state_kcalloc(a6xx_state,
 600		ARRAY_SIZE(a6xx_clusters), sizeof(*a6xx_state->clusters));
 601
 602	if (!a6xx_state->clusters)
 603		return;
 604
 605	a6xx_state->nr_clusters = ARRAY_SIZE(a6xx_clusters);
 606
 607	for (i = 0; i < ARRAY_SIZE(a6xx_clusters); i++)
 608		a6xx_get_cluster(gpu, a6xx_state, &a6xx_clusters[i],
 609			&a6xx_state->clusters[i], dumper);
 610}
 611
 612/* Read a shader / debug block from the HLSQ aperture with the crashdumper */
 613static void a6xx_get_shader_block(struct msm_gpu *gpu,
 614		struct a6xx_gpu_state *a6xx_state,
 615		const struct a6xx_shader_block *block,
 616		struct a6xx_gpu_state_obj *obj,
 617		struct a6xx_crashdumper *dumper)
 618{
 619	u64 *in = dumper->ptr;
 620	size_t datasize = block->size * A6XX_NUM_SHADER_BANKS * sizeof(u32);
 621	int i;
 622
 623	if (WARN_ON(datasize > A6XX_CD_DATA_SIZE))
 624		return;
 625
 626	for (i = 0; i < A6XX_NUM_SHADER_BANKS; i++) {
 627		in += CRASHDUMP_WRITE(in, REG_A6XX_HLSQ_DBG_READ_SEL,
 628			(block->type << 8) | i);
 629
 630		in += CRASHDUMP_READ(in, REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE,
 631			block->size, dumper->iova + A6XX_CD_DATA_OFFSET);
 632	}
 633
 634	CRASHDUMP_FINI(in);
 635
 636	if (a6xx_crashdumper_run(gpu, dumper))
 637		return;
 638
 639	obj->handle = block;
 640	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
 641		datasize);
 642}
 643
 644static void a6xx_get_shaders(struct msm_gpu *gpu,
 645		struct a6xx_gpu_state *a6xx_state,
 646		struct a6xx_crashdumper *dumper)
 647{
 648	int i;
 649
 650	a6xx_state->shaders = state_kcalloc(a6xx_state,
 651		ARRAY_SIZE(a6xx_shader_blocks), sizeof(*a6xx_state->shaders));
 652
 653	if (!a6xx_state->shaders)
 654		return;
 655
 656	a6xx_state->nr_shaders = ARRAY_SIZE(a6xx_shader_blocks);
 657
 658	for (i = 0; i < ARRAY_SIZE(a6xx_shader_blocks); i++)
 659		a6xx_get_shader_block(gpu, a6xx_state, &a6xx_shader_blocks[i],
 660			&a6xx_state->shaders[i], dumper);
 661}
 662
 663/* Read registers from behind the HLSQ aperture with the crashdumper */
 664static void a6xx_get_crashdumper_hlsq_registers(struct msm_gpu *gpu,
 665		struct a6xx_gpu_state *a6xx_state,
 666		const struct a6xx_registers *regs,
 667		struct a6xx_gpu_state_obj *obj,
 668		struct a6xx_crashdumper *dumper)
 669
 670{
 671	u64 *in = dumper->ptr;
 672	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
 673	int i, regcount = 0;
 674
 675	in += CRASHDUMP_WRITE(in, REG_A6XX_HLSQ_DBG_READ_SEL, regs->val1);
 676
 677	for (i = 0; i < regs->count; i += 2) {
 678		u32 count = RANGE(regs->registers, i);
 679		u32 offset = REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE +
 680			regs->registers[i] - (regs->val0 >> 2);
 681
 682		in += CRASHDUMP_READ(in, offset, count, out);
 683
 684		out += count * sizeof(u32);
 685		regcount += count;
 686	}
 687
 688	CRASHDUMP_FINI(in);
 689
 690	if (WARN_ON((regcount * sizeof(u32)) > A6XX_CD_DATA_SIZE))
 691		return;
 692
 693	if (a6xx_crashdumper_run(gpu, dumper))
 694		return;
 695
 696	obj->handle = regs;
 697	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
 698		regcount * sizeof(u32));
 699}
 700
 701/* Read a block of registers using the crashdumper */
 702static void a6xx_get_crashdumper_registers(struct msm_gpu *gpu,
 703		struct a6xx_gpu_state *a6xx_state,
 704		const struct a6xx_registers *regs,
 705		struct a6xx_gpu_state_obj *obj,
 706		struct a6xx_crashdumper *dumper)
 707
 708{
 709	u64 *in = dumper->ptr;
 710	u64 out = dumper->iova + A6XX_CD_DATA_OFFSET;
 711	int i, regcount = 0;
 712
 713	/* Skip unsupported registers on older generations */
 714	if (!adreno_is_a660_family(to_adreno_gpu(gpu)) &&
 715			(regs->registers == a660_registers))
 716		return;
 717
 718	/* Some blocks might need to program a selector register first */
 719	if (regs->val0)
 720		in += CRASHDUMP_WRITE(in, regs->val0, regs->val1);
 721
 722	for (i = 0; i < regs->count; i += 2) {
 723		u32 count = RANGE(regs->registers, i);
 724
 725		in += CRASHDUMP_READ(in, regs->registers[i], count, out);
 726
 727		out += count * sizeof(u32);
 728		regcount += count;
 729	}
 730
 731	CRASHDUMP_FINI(in);
 732
 733	if (WARN_ON((regcount * sizeof(u32)) > A6XX_CD_DATA_SIZE))
 734		return;
 735
 736	if (a6xx_crashdumper_run(gpu, dumper))
 737		return;
 738
 739	obj->handle = regs;
 740	obj->data = state_kmemdup(a6xx_state, dumper->ptr + A6XX_CD_DATA_OFFSET,
 741		regcount * sizeof(u32));
 742}
 743
 744/* Read a block of registers via AHB */
 745static void a6xx_get_ahb_gpu_registers(struct msm_gpu *gpu,
 746		struct a6xx_gpu_state *a6xx_state,
 747		const struct a6xx_registers *regs,
 748		struct a6xx_gpu_state_obj *obj)
 749{
 750	int i, regcount = 0, index = 0;
 751
 752	/* Skip unsupported registers on older generations */
 753	if (!adreno_is_a660_family(to_adreno_gpu(gpu)) &&
 754			(regs->registers == a660_registers))
 755		return;
 756
 757	for (i = 0; i < regs->count; i += 2)
 758		regcount += RANGE(regs->registers, i);
 759
 760	obj->handle = (const void *) regs;
 761	obj->data = state_kcalloc(a6xx_state, regcount, sizeof(u32));
 762	if (!obj->data)
 763		return;
 764
 765	for (i = 0; i < regs->count; i += 2) {
 766		u32 count = RANGE(regs->registers, i);
 767		int j;
 768
 769		for (j = 0; j < count; j++)
 770			obj->data[index++] = gpu_read(gpu,
 771				regs->registers[i] + j);
 772	}
 773}
 774
 775/* Read a block of GMU registers */
 776static void _a6xx_get_gmu_registers(struct msm_gpu *gpu,
 777		struct a6xx_gpu_state *a6xx_state,
 778		const struct a6xx_registers *regs,
 779		struct a6xx_gpu_state_obj *obj,
 780		bool rscc)
 781{
 782	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 783	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
 784	struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
 785	int i, regcount = 0, index = 0;
 786
 787	for (i = 0; i < regs->count; i += 2)
 788		regcount += RANGE(regs->registers, i);
 789
 790	obj->handle = (const void *) regs;
 791	obj->data = state_kcalloc(a6xx_state, regcount, sizeof(u32));
 792	if (!obj->data)
 793		return;
 794
 795	for (i = 0; i < regs->count; i += 2) {
 796		u32 count = RANGE(regs->registers, i);
 797		int j;
 798
 799		for (j = 0; j < count; j++) {
 800			u32 offset = regs->registers[i] + j;
 801			u32 val;
 802
 803			if (rscc)
 804				val = gmu_read_rscc(gmu, offset);
 805			else
 806				val = gmu_read(gmu, offset);
 807
 808			obj->data[index++] = val;
 809		}
 810	}
 811}
 812
 813static void a6xx_get_gmu_registers(struct msm_gpu *gpu,
 814		struct a6xx_gpu_state *a6xx_state)
 815{
 816	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 817	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
 818
 819	a6xx_state->gmu_registers = state_kcalloc(a6xx_state,
 820		3, sizeof(*a6xx_state->gmu_registers));
 821
 822	if (!a6xx_state->gmu_registers)
 823		return;
 824
 825	a6xx_state->nr_gmu_registers = 3;
 826
 827	/* Get the CX GMU registers from AHB */
 828	_a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[0],
 829		&a6xx_state->gmu_registers[0], false);
 830	_a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[1],
 831		&a6xx_state->gmu_registers[1], true);
 832
 833	if (!a6xx_gmu_gx_is_on(&a6xx_gpu->gmu))
 834		return;
 835
 836	/* Set the fence to ALLOW mode so we can access the registers */
 837	gpu_write(gpu, REG_A6XX_GMU_AO_AHB_FENCE_CTRL, 0);
 838
 839	_a6xx_get_gmu_registers(gpu, a6xx_state, &a6xx_gmu_reglist[2],
 840		&a6xx_state->gmu_registers[2], false);
 841}
 842
 843static struct msm_gpu_state_bo *a6xx_snapshot_gmu_bo(
 844		struct a6xx_gpu_state *a6xx_state, struct a6xx_gmu_bo *bo)
 845{
 846	struct msm_gpu_state_bo *snapshot;
 847
 848	if (!bo->size)
 849		return NULL;
 850
 851	snapshot = state_kcalloc(a6xx_state, 1, sizeof(*snapshot));
 852	if (!snapshot)
 853		return NULL;
 854
 855	snapshot->iova = bo->iova;
 856	snapshot->size = bo->size;
 857	snapshot->data = kvzalloc(snapshot->size, GFP_KERNEL);
 858	if (!snapshot->data)
 859		return NULL;
 860
 861	memcpy(snapshot->data, bo->virt, bo->size);
 862
 863	return snapshot;
 864}
 865
 866static void a6xx_snapshot_gmu_hfi_history(struct msm_gpu *gpu,
 867					  struct a6xx_gpu_state *a6xx_state)
 868{
 869	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 870	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
 871	struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
 872	unsigned i, j;
 873
 874	BUILD_BUG_ON(ARRAY_SIZE(gmu->queues) != ARRAY_SIZE(a6xx_state->hfi_queue_history));
 875
 876	for (i = 0; i < ARRAY_SIZE(gmu->queues); i++) {
 877		struct a6xx_hfi_queue *queue = &gmu->queues[i];
 878		for (j = 0; j < HFI_HISTORY_SZ; j++) {
 879			unsigned idx = (j + queue->history_idx) % HFI_HISTORY_SZ;
 880			a6xx_state->hfi_queue_history[i][j] = queue->history[idx];
 881		}
 882	}
 883}
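/*
 * Note: each queue's history[] is a ring buffer; the copy above starts at
 * history_idx and wraps modulo HFI_HISTORY_SZ, so (assuming history_idx is
 * the next write position) the snapshot records each queue's HFI message
 * history from oldest to newest.
 */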
 884
 885#define A6XX_REGLIST_SIZE        1
 886#define A6XX_GBIF_REGLIST_SIZE   1
 887static void a6xx_get_registers(struct msm_gpu *gpu,
 888		struct a6xx_gpu_state *a6xx_state,
 889		struct a6xx_crashdumper *dumper)
 890{
 891	int i, count = A6XX_REGLIST_SIZE +
 892		ARRAY_SIZE(a6xx_reglist) +
 893		ARRAY_SIZE(a6xx_hlsq_reglist) + A6XX_GBIF_REGLIST_SIZE;
 894	int index = 0;
 895	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 896
 897	a6xx_state->registers = state_kcalloc(a6xx_state,
 898		count, sizeof(*a6xx_state->registers));
 899
 900	if (!a6xx_state->registers)
 901		return;
 902
 903	a6xx_state->nr_registers = count;
 904
 905	if (adreno_is_a7xx(adreno_gpu))
 906		a6xx_get_ahb_gpu_registers(gpu,
 907			a6xx_state, &a7xx_ahb_reglist,
 908			&a6xx_state->registers[index++]);
 909	else
 910		a6xx_get_ahb_gpu_registers(gpu,
 911			a6xx_state, &a6xx_ahb_reglist,
 912			&a6xx_state->registers[index++]);
 913
 914	if (adreno_is_a7xx(adreno_gpu))
 915		a6xx_get_ahb_gpu_registers(gpu,
 916				a6xx_state, &a7xx_gbif_reglist,
 917				&a6xx_state->registers[index++]);
 918	else if (a6xx_has_gbif(adreno_gpu))
 919		a6xx_get_ahb_gpu_registers(gpu,
 920				a6xx_state, &a6xx_gbif_reglist,
 921				&a6xx_state->registers[index++]);
 922	else
 923		a6xx_get_ahb_gpu_registers(gpu,
 924				a6xx_state, &a6xx_vbif_reglist,
 925				&a6xx_state->registers[index++]);
 926	if (!dumper) {
 927		/*
 928		 * We can't use the crashdumper when the SMMU is stalled,
 929		 * because the GPU has no memory access until we resume
 930		 * translation (but we don't want to do that until after
 931		 * we have captured as much useful GPU state as possible).
 932		 * So instead collect registers via the CPU:
 933		 */
 934		for (i = 0; i < ARRAY_SIZE(a6xx_reglist); i++)
 935			a6xx_get_ahb_gpu_registers(gpu,
 936				a6xx_state, &a6xx_reglist[i],
 937				&a6xx_state->registers[index++]);
 938		return;
 939	}
 940
 941	for (i = 0; i < ARRAY_SIZE(a6xx_reglist); i++)
 942		a6xx_get_crashdumper_registers(gpu,
 943			a6xx_state, &a6xx_reglist[i],
 944			&a6xx_state->registers[index++],
 945			dumper);
 946
 947	for (i = 0; i < ARRAY_SIZE(a6xx_hlsq_reglist); i++)
 948		a6xx_get_crashdumper_hlsq_registers(gpu,
 949			a6xx_state, &a6xx_hlsq_reglist[i],
 950			&a6xx_state->registers[index++],
 951			dumper);
 952}
 953
 954static u32 a6xx_get_cp_roq_size(struct msm_gpu *gpu)
 955{
 956	/* The value at [16:31] is in 4dword units. Convert it to dwords */
 957	return gpu_read(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2) >> 14;
 958}
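/*
 * The single shift by 14 combines the field extraction (>> 16) with the
 * multiply-by-4 (<< 2) needed to turn the 4-dword units into a dword count;
 * it effectively relies on bits [14:15] of the register reading back as zero.
 */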
 959
 960static u32 a7xx_get_cp_roq_size(struct msm_gpu *gpu)
 961{
 962	/*
 963	 * The value at CP_ROQ_THRESHOLDS_2[20:31] is in 4dword units.
 964	 * That register however is not directly accessible from APSS on A7xx.
 965	 * Program the SQE_UCODE_DBG_ADDR with offset=0x70d3 and read the value.
 966	 */
 967	gpu_write(gpu, REG_A6XX_CP_SQE_UCODE_DBG_ADDR, 0x70d3);
 968
 969	return 4 * (gpu_read(gpu, REG_A6XX_CP_SQE_UCODE_DBG_DATA) >> 20);
 970}
 971
 972/* Read a block of data from an indexed register pair */
 973static void a6xx_get_indexed_regs(struct msm_gpu *gpu,
 974		struct a6xx_gpu_state *a6xx_state,
 975		struct a6xx_indexed_registers *indexed,
 976		struct a6xx_gpu_state_obj *obj)
 977{
 978	int i;
 979
 980	obj->handle = (const void *) indexed;
 981	if (indexed->count_fn)
 982		indexed->count = indexed->count_fn(gpu);
 983
 984	obj->data = state_kcalloc(a6xx_state, indexed->count, sizeof(u32));
 985	if (!obj->data)
 986		return;
 987
 988	/* All the indexed banks start at address 0 */
 989	gpu_write(gpu, indexed->addr, 0);
 990
 991	/* Read the data - each read increments the internal address by 1 */
 992	for (i = 0; i < indexed->count; i++)
 993		obj->data[i] = gpu_read(gpu, indexed->data);
 994}
 995
 996static void a6xx_get_indexed_registers(struct msm_gpu *gpu,
 997		struct a6xx_gpu_state *a6xx_state)
 998{
 999	u32 mempool_size;
1000	int count = ARRAY_SIZE(a6xx_indexed_reglist) + 1;
1001	int i;
1002
1003	a6xx_state->indexed_regs = state_kcalloc(a6xx_state, count,
1004		sizeof(*a6xx_state->indexed_regs));
1005	if (!a6xx_state->indexed_regs)
1006		return;
1007
1008	for (i = 0; i < ARRAY_SIZE(a6xx_indexed_reglist); i++)
1009		a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_indexed_reglist[i],
1010			&a6xx_state->indexed_regs[i]);
1011
1012	if (adreno_is_a650_family(to_adreno_gpu(gpu))) {
1013		u32 val;
1014
1015		val = gpu_read(gpu, REG_A6XX_CP_CHICKEN_DBG);
1016		gpu_write(gpu, REG_A6XX_CP_CHICKEN_DBG, val | 4);
1017
1018		/* Get the contents of the CP mempool */
1019		a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_cp_mempool_indexed,
1020			&a6xx_state->indexed_regs[i]);
1021
1022		gpu_write(gpu, REG_A6XX_CP_CHICKEN_DBG, val);
1023		a6xx_state->nr_indexed_regs = count;
1024		return;
1025	}
1026
1027	/* Set the CP mempool size to 0 to stabilize it while dumping */
1028	mempool_size = gpu_read(gpu, REG_A6XX_CP_MEM_POOL_SIZE);
1029	gpu_write(gpu, REG_A6XX_CP_MEM_POOL_SIZE, 0);
1030
1031	/* Get the contents of the CP mempool */
1032	a6xx_get_indexed_regs(gpu, a6xx_state, &a6xx_cp_mempool_indexed,
1033		&a6xx_state->indexed_regs[i]);
1034
1035	/*
1036	 * Offset 0x2000 in the mempool is the size - copy the saved size over
1037	 * so the data is consistent
1038	 */
1039	a6xx_state->indexed_regs[i].data[0x2000] = mempool_size;
 1040	a6xx_state->nr_indexed_regs = count;
 1041	/* Restore the size in the hardware */
 1042	gpu_write(gpu, REG_A6XX_CP_MEM_POOL_SIZE, mempool_size);
 1043}
1044
1045static void a7xx_get_indexed_registers(struct msm_gpu *gpu,
1046		struct a6xx_gpu_state *a6xx_state)
1047{
1048	int i, indexed_count, mempool_count;
1049
1050	indexed_count = ARRAY_SIZE(a7xx_indexed_reglist);
1051	mempool_count = ARRAY_SIZE(a7xx_cp_bv_mempool_indexed);
1052
1053	a6xx_state->indexed_regs = state_kcalloc(a6xx_state,
1054					indexed_count + mempool_count,
1055					sizeof(*a6xx_state->indexed_regs));
1056	if (!a6xx_state->indexed_regs)
1057		return;
1058
1059	a6xx_state->nr_indexed_regs = indexed_count + mempool_count;
1060
1061	/* First read the common regs */
1062	for (i = 0; i < indexed_count; i++)
1063		a6xx_get_indexed_regs(gpu, a6xx_state, &a7xx_indexed_reglist[i],
1064			&a6xx_state->indexed_regs[i]);
1065
1066	gpu_rmw(gpu, REG_A6XX_CP_CHICKEN_DBG, 0, BIT(2));
1067	gpu_rmw(gpu, REG_A7XX_CP_BV_CHICKEN_DBG, 0, BIT(2));
1068
1069	/* Get the contents of the CP_BV mempool */
1070	for (i = 0; i < mempool_count; i++)
 1071		a6xx_get_indexed_regs(gpu, a6xx_state, &a7xx_cp_bv_mempool_indexed[i],
 1072			&a6xx_state->indexed_regs[indexed_count + i]);
1073
1074	gpu_rmw(gpu, REG_A6XX_CP_CHICKEN_DBG, BIT(2), 0);
1075	gpu_rmw(gpu, REG_A7XX_CP_BV_CHICKEN_DBG, BIT(2), 0);
1076	return;
1077}
1078
1079struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
1080{
1081	struct a6xx_crashdumper _dumper = { 0 }, *dumper = NULL;
1082	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
1083	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
1084	struct a6xx_gpu_state *a6xx_state = kzalloc(sizeof(*a6xx_state),
1085		GFP_KERNEL);
1086	bool stalled = !!(gpu_read(gpu, REG_A6XX_RBBM_STATUS3) &
1087			A6XX_RBBM_STATUS3_SMMU_STALLED_ON_FAULT);
1088
1089	if (!a6xx_state)
1090		return ERR_PTR(-ENOMEM);
1091
1092	INIT_LIST_HEAD(&a6xx_state->objs);
1093
1094	/* Get the generic state from the adreno core */
1095	adreno_gpu_state_get(gpu, &a6xx_state->base);
1096
1097	if (!adreno_has_gmu_wrapper(adreno_gpu)) {
1098		a6xx_get_gmu_registers(gpu, a6xx_state);
1099
1100		a6xx_state->gmu_log = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.log);
1101		a6xx_state->gmu_hfi = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.hfi);
1102		a6xx_state->gmu_debug = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.debug);
1103
1104		a6xx_snapshot_gmu_hfi_history(gpu, a6xx_state);
1105	}
1106
 1107	/* If GX isn't on, the rest of the data isn't going to be accessible */
1108	if (!adreno_has_gmu_wrapper(adreno_gpu) && !a6xx_gmu_gx_is_on(&a6xx_gpu->gmu))
1109		return &a6xx_state->base;
1110
1111	/* Get the banks of indexed registers */
1112	if (adreno_is_a7xx(adreno_gpu)) {
1113		a7xx_get_indexed_registers(gpu, a6xx_state);
1114		/* Further codeflow is untested on A7xx. */
1115		return &a6xx_state->base;
1116	}
1117
1118	a6xx_get_indexed_registers(gpu, a6xx_state);
1119
1120	/*
1121	 * Try to initialize the crashdumper, if we are not dumping state
1122	 * with the SMMU stalled.  The crashdumper needs memory access to
1123	 * write out GPU state, so we need to skip this when the SMMU is
1124	 * stalled in response to an iova fault
1125	 */
1126	if (!stalled && !gpu->needs_hw_init &&
1127	    !a6xx_crashdumper_init(gpu, &_dumper)) {
1128		dumper = &_dumper;
1129	}
1130
1131	a6xx_get_registers(gpu, a6xx_state, dumper);
1132
1133	if (dumper) {
1134		a6xx_get_shaders(gpu, a6xx_state, dumper);
1135		a6xx_get_clusters(gpu, a6xx_state, dumper);
1136		a6xx_get_dbgahb_clusters(gpu, a6xx_state, dumper);
1137
1138		msm_gem_kernel_put(dumper->bo, gpu->aspace);
1139	}
1140
1141	if (snapshot_debugbus)
1142		a6xx_get_debugbus(gpu, a6xx_state);
1143
1144	a6xx_state->gpu_initialized = !gpu->needs_hw_init;
1145
 1146	return &a6xx_state->base;
1147}
1148
1149static void a6xx_gpu_state_destroy(struct kref *kref)
1150{
1151	struct a6xx_state_memobj *obj, *tmp;
1152	struct msm_gpu_state *state = container_of(kref,
1153			struct msm_gpu_state, ref);
1154	struct a6xx_gpu_state *a6xx_state = container_of(state,
1155			struct a6xx_gpu_state, base);
1156
1157	if (a6xx_state->gmu_log)
1158		kvfree(a6xx_state->gmu_log->data);
1159
1160	if (a6xx_state->gmu_hfi)
1161		kvfree(a6xx_state->gmu_hfi->data);
1162
1163	if (a6xx_state->gmu_debug)
1164		kvfree(a6xx_state->gmu_debug->data);
1165
1166	list_for_each_entry_safe(obj, tmp, &a6xx_state->objs, node) {
1167		list_del(&obj->node);
1168		kvfree(obj);
1169	}
1170
1171	adreno_gpu_state_destroy(state);
1172	kfree(a6xx_state);
1173}
1174
1175int a6xx_gpu_state_put(struct msm_gpu_state *state)
1176{
1177	if (IS_ERR_OR_NULL(state))
1178		return 1;
1179
1180	return kref_put(&state->ref, a6xx_gpu_state_destroy);
1181}
1182
1183static void a6xx_show_registers(const u32 *registers, u32 *data, size_t count,
1184		struct drm_printer *p)
1185{
1186	int i, index = 0;
1187
1188	if (!data)
1189		return;
1190
1191	for (i = 0; i < count; i += 2) {
1192		u32 count = RANGE(registers, i);
1193		u32 offset = registers[i];
1194		int j;
1195
1196		for (j = 0; j < count; index++, offset++, j++) {
1197			if (data[index] == 0xdeafbead)
1198				continue;
1199
1200			drm_printf(p, "  - { offset: 0x%06x, value: 0x%08x }\n",
1201				offset << 2, data[index]);
1202		}
1203	}
1204}
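/*
 * For reference, each register that survives the 0xdeafbead filter above
 * (presumably the crashdumper's untouched fill pattern) becomes one
 * YAML-style line in the dumped state; a hypothetical excerpt, with made-up
 * values, would look like:
 *
 *   - { offset: 0x000540, value: 0x00000001 }
 *   - { offset: 0x000544, value: 0x80000000 }
 */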
1205
1206static void print_ascii85(struct drm_printer *p, size_t len, u32 *data)
1207{
1208	char out[ASCII85_BUFSZ];
1209	long i, l, datalen = 0;
1210
1211	for (i = 0; i < len >> 2; i++) {
1212		if (data[i])
1213			datalen = (i + 1) << 2;
1214	}
1215
1216	if (datalen == 0)
1217		return;
1218
1219	drm_puts(p, "    data: !!ascii85 |\n");
1220	drm_puts(p, "      ");
1221
1222
1223	l = ascii85_encode_len(datalen);
1224
1225	for (i = 0; i < l; i++)
1226		drm_puts(p, ascii85_encode(data[i], out));
1227
1228	drm_puts(p, "\n");
1229}
1230
1231static void print_name(struct drm_printer *p, const char *fmt, const char *name)
1232{
1233	drm_puts(p, fmt);
1234	drm_puts(p, name);
1235	drm_puts(p, "\n");
1236}
1237
1238static void a6xx_show_shader(struct a6xx_gpu_state_obj *obj,
1239		struct drm_printer *p)
1240{
1241	const struct a6xx_shader_block *block = obj->handle;
1242	int i;
1243
1244	if (!obj->handle)
1245		return;
1246
1247	print_name(p, "  - type: ", block->name);
1248
1249	for (i = 0; i < A6XX_NUM_SHADER_BANKS; i++) {
1250		drm_printf(p, "    - bank: %d\n", i);
1251		drm_printf(p, "      size: %d\n", block->size);
1252
1253		if (!obj->data)
1254			continue;
1255
1256		print_ascii85(p, block->size << 2,
1257			obj->data + (block->size * i));
1258	}
1259}
1260
1261static void a6xx_show_cluster_data(const u32 *registers, int size, u32 *data,
1262		struct drm_printer *p)
1263{
1264	int ctx, index = 0;
1265
1266	for (ctx = 0; ctx < A6XX_NUM_CONTEXTS; ctx++) {
1267		int j;
1268
1269		drm_printf(p, "    - context: %d\n", ctx);
1270
1271		for (j = 0; j < size; j += 2) {
1272			u32 count = RANGE(registers, j);
1273			u32 offset = registers[j];
1274			int k;
1275
1276			for (k = 0; k < count; index++, offset++, k++) {
1277				if (data[index] == 0xdeafbead)
1278					continue;
1279
1280				drm_printf(p, "      - { offset: 0x%06x, value: 0x%08x }\n",
1281					offset << 2, data[index]);
1282			}
1283		}
1284	}
1285}
1286
1287static void a6xx_show_dbgahb_cluster(struct a6xx_gpu_state_obj *obj,
1288		struct drm_printer *p)
1289{
1290	const struct a6xx_dbgahb_cluster *dbgahb = obj->handle;
1291
1292	if (dbgahb) {
1293		print_name(p, "  - cluster-name: ", dbgahb->name);
1294		a6xx_show_cluster_data(dbgahb->registers, dbgahb->count,
1295			obj->data, p);
1296	}
1297}
1298
1299static void a6xx_show_cluster(struct a6xx_gpu_state_obj *obj,
1300		struct drm_printer *p)
1301{
1302	const struct a6xx_cluster *cluster = obj->handle;
1303
1304	if (cluster) {
1305		print_name(p, "  - cluster-name: ", cluster->name);
1306		a6xx_show_cluster_data(cluster->registers, cluster->count,
1307			obj->data, p);
1308	}
1309}
1310
1311static void a6xx_show_indexed_regs(struct a6xx_gpu_state_obj *obj,
1312		struct drm_printer *p)
1313{
1314	const struct a6xx_indexed_registers *indexed = obj->handle;
1315
1316	if (!indexed)
1317		return;
1318
1319	print_name(p, "  - regs-name: ", indexed->name);
1320	drm_printf(p, "    dwords: %d\n", indexed->count);
1321
1322	print_ascii85(p, indexed->count << 2, obj->data);
1323}
1324
1325static void a6xx_show_debugbus_block(const struct a6xx_debugbus_block *block,
1326		u32 *data, struct drm_printer *p)
1327{
1328	if (block) {
1329		print_name(p, "  - debugbus-block: ", block->name);
1330
1331		/*
1332		 * count for regular debugbus data is in quadwords,
1333		 * but print the size in dwords for consistency
1334		 */
1335		drm_printf(p, "    count: %d\n", block->count << 1);
1336
1337		print_ascii85(p, block->count << 3, data);
1338	}
1339}
1340
1341static void a6xx_show_debugbus(struct a6xx_gpu_state *a6xx_state,
1342		struct drm_printer *p)
1343{
1344	int i;
1345
1346	for (i = 0; i < a6xx_state->nr_debugbus; i++) {
1347		struct a6xx_gpu_state_obj *obj = &a6xx_state->debugbus[i];
1348
1349		a6xx_show_debugbus_block(obj->handle, obj->data, p);
1350	}
1351
1352	if (a6xx_state->vbif_debugbus) {
1353		struct a6xx_gpu_state_obj *obj = a6xx_state->vbif_debugbus;
1354
1355		drm_puts(p, "  - debugbus-block: A6XX_DBGBUS_VBIF\n");
1356		drm_printf(p, "    count: %d\n", VBIF_DEBUGBUS_BLOCK_SIZE);
1357
1358		/* vbif debugbus data is in dwords.  Confusing, huh? */
1359		print_ascii85(p, VBIF_DEBUGBUS_BLOCK_SIZE << 2, obj->data);
1360	}
1361
1362	for (i = 0; i < a6xx_state->nr_cx_debugbus; i++) {
1363		struct a6xx_gpu_state_obj *obj = &a6xx_state->cx_debugbus[i];
1364
1365		a6xx_show_debugbus_block(obj->handle, obj->data, p);
1366	}
1367}
1368
1369void a6xx_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
1370		struct drm_printer *p)
1371{
1372	struct a6xx_gpu_state *a6xx_state = container_of(state,
1373			struct a6xx_gpu_state, base);
1374	int i;
1375
1376	if (IS_ERR_OR_NULL(state))
1377		return;
1378
1379	drm_printf(p, "gpu-initialized: %d\n", a6xx_state->gpu_initialized);
1380
1381	adreno_show(gpu, state, p);
1382
1383	drm_puts(p, "gmu-log:\n");
1384	if (a6xx_state->gmu_log) {
1385		struct msm_gpu_state_bo *gmu_log = a6xx_state->gmu_log;
1386
1387		drm_printf(p, "    iova: 0x%016llx\n", gmu_log->iova);
1388		drm_printf(p, "    size: %zu\n", gmu_log->size);
1389		adreno_show_object(p, &gmu_log->data, gmu_log->size,
1390				&gmu_log->encoded);
1391	}
1392
1393	drm_puts(p, "gmu-hfi:\n");
1394	if (a6xx_state->gmu_hfi) {
1395		struct msm_gpu_state_bo *gmu_hfi = a6xx_state->gmu_hfi;
1396		unsigned i, j;
1397
1398		drm_printf(p, "    iova: 0x%016llx\n", gmu_hfi->iova);
1399		drm_printf(p, "    size: %zu\n", gmu_hfi->size);
1400		for (i = 0; i < ARRAY_SIZE(a6xx_state->hfi_queue_history); i++) {
1401			drm_printf(p, "    queue-history[%u]:", i);
1402			for (j = 0; j < HFI_HISTORY_SZ; j++) {
1403				drm_printf(p, " %d", a6xx_state->hfi_queue_history[i][j]);
1404			}
1405			drm_printf(p, "\n");
1406		}
1407		adreno_show_object(p, &gmu_hfi->data, gmu_hfi->size,
1408				&gmu_hfi->encoded);
1409	}
1410
1411	drm_puts(p, "gmu-debug:\n");
1412	if (a6xx_state->gmu_debug) {
1413		struct msm_gpu_state_bo *gmu_debug = a6xx_state->gmu_debug;
1414
1415		drm_printf(p, "    iova: 0x%016llx\n", gmu_debug->iova);
1416		drm_printf(p, "    size: %zu\n", gmu_debug->size);
1417		adreno_show_object(p, &gmu_debug->data, gmu_debug->size,
1418				&gmu_debug->encoded);
1419	}
1420
1421	drm_puts(p, "registers:\n");
1422	for (i = 0; i < a6xx_state->nr_registers; i++) {
1423		struct a6xx_gpu_state_obj *obj = &a6xx_state->registers[i];
1424		const struct a6xx_registers *regs = obj->handle;
1425
1426		if (!obj->handle)
1427			continue;
1428
1429		a6xx_show_registers(regs->registers, obj->data, regs->count, p);
1430	}
1431
1432	drm_puts(p, "registers-gmu:\n");
1433	for (i = 0; i < a6xx_state->nr_gmu_registers; i++) {
1434		struct a6xx_gpu_state_obj *obj = &a6xx_state->gmu_registers[i];
1435		const struct a6xx_registers *regs = obj->handle;
1436
1437		if (!obj->handle)
1438			continue;
1439
1440		a6xx_show_registers(regs->registers, obj->data, regs->count, p);
1441	}
1442
1443	drm_puts(p, "indexed-registers:\n");
1444	for (i = 0; i < a6xx_state->nr_indexed_regs; i++)
1445		a6xx_show_indexed_regs(&a6xx_state->indexed_regs[i], p);
1446
1447	drm_puts(p, "shader-blocks:\n");
1448	for (i = 0; i < a6xx_state->nr_shaders; i++)
1449		a6xx_show_shader(&a6xx_state->shaders[i], p);
1450
1451	drm_puts(p, "clusters:\n");
1452	for (i = 0; i < a6xx_state->nr_clusters; i++)
1453		a6xx_show_cluster(&a6xx_state->clusters[i], p);
1454
1455	for (i = 0; i < a6xx_state->nr_dbgahb_clusters; i++)
1456		a6xx_show_dbgahb_cluster(&a6xx_state->dbgahb_clusters[i], p);
1457
1458	drm_puts(p, "debugbus:\n");
1459	a6xx_show_debugbus(a6xx_state, p);
1460}