Linux Audio

Check our new training course

In-person Linux kernel drivers training

Jun 16-20, 2025
Register
Loading...
Note: File does not exist in v6.2.
  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/*
  3 * AMD Address Translation Library
  4 *
  5 * umc.c : Unified Memory Controller (UMC) topology helpers
  6 *
  7 * Copyright (c) 2023, Advanced Micro Devices, Inc.
  8 * All Rights Reserved.
  9 *
 10 * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
 11 */
 12
 13#include "internal.h"
 14
 15/*
 16 * MI300 has a fixed, model-specific mapping between a UMC instance and
 17 * its related Data Fabric Coherent Station instance.
 18 *
 19 * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the
 20 * UMC instance within a Node. Use this to find the appropriate Coherent
 21 * Station ID.
 22 *
 23 * Redundant bits were removed from the map below.
 24 */
 25static const u16 umc_coh_st_map[32] = {
 26	0x393, 0x293, 0x193, 0x093,
 27	0x392, 0x292, 0x192, 0x092,
 28	0x391, 0x291, 0x191, 0x091,
 29	0x390, 0x290, 0x190, 0x090,
 30	0x793, 0x693, 0x593, 0x493,
 31	0x792, 0x692, 0x592, 0x492,
 32	0x791, 0x691, 0x591, 0x491,
 33	0x790, 0x690, 0x590, 0x490,
 34};
 35
 36#define UMC_ID_MI300 GENMASK(23, 12)
 37static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
 38{
 39	u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid);
 40	u8 i;
 41
 42	for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) {
 43		if (umc_id == umc_coh_st_map[i])
 44			break;
 45	}
 46
 47	WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map));
 48
 49	return i;
 50}
 51
 52/* XOR the bits in @val. */
 53static u16 bitwise_xor_bits(u16 val)
 54{
 55	u16 tmp = 0;
 56	u8 i;
 57
 58	for (i = 0; i < 16; i++)
 59		tmp ^= (val >> i) & 0x1;
 60
 61	return tmp;
 62}
 63
 64struct xor_bits {
 65	bool	xor_enable;
 66	u16	col_xor;
 67	u32	row_xor;
 68};
 69
 70#define NUM_BANK_BITS	4
 71
 72static struct {
 73	/* UMC::CH::AddrHashBank */
 74	struct xor_bits	bank[NUM_BANK_BITS];
 75
 76	/* UMC::CH::AddrHashPC */
 77	struct xor_bits	pc;
 78
 79	/* UMC::CH::AddrHashPC2 */
 80	u8		bank_xor;
 81} addr_hash;
 82
 83#define MI300_UMC_CH_BASE	0x90000
 84#define MI300_ADDR_HASH_BANK0	(MI300_UMC_CH_BASE + 0xC8)
 85#define MI300_ADDR_HASH_PC	(MI300_UMC_CH_BASE + 0xE0)
 86#define MI300_ADDR_HASH_PC2	(MI300_UMC_CH_BASE + 0xE4)
 87
 88#define ADDR_HASH_XOR_EN	BIT(0)
 89#define ADDR_HASH_COL_XOR	GENMASK(13, 1)
 90#define ADDR_HASH_ROW_XOR	GENMASK(31, 14)
 91#define ADDR_HASH_BANK_XOR	GENMASK(5, 0)
 92
 93/*
 94 * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used
 95 * for hashing. Do this during module init, since the values will not
 96 * change during run time.
 97 *
 98 * These registers are instantiated for each UMC across each AMD Node.
 99 * However, they should be identically programmed due to the fixed hardware
100 * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a
101 * single global structure for simplicity.
102 */
103int get_addr_hash_mi300(void)
104{
105	u32 temp;
106	int ret;
107	u8 i;
108
109	for (i = 0; i < NUM_BANK_BITS; i++) {
110		ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp);
111		if (ret)
112			return ret;
113
114		addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
115		addr_hash.bank[i].col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
116		addr_hash.bank[i].row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
117	}
118
119	ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp);
120	if (ret)
121		return ret;
122
123	addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
124	addr_hash.pc.col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
125	addr_hash.pc.row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
126
127	ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp);
128	if (ret)
129		return ret;
130
131	addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp);
132
133	return 0;
134}
135
136/*
137 * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must
138 * be converted to the intermediate normalized address (NA) before translating to a
139 * system physical address.
140 *
141 * The DRAM address includes bank, row, and column. Also included are bits for
142 * pseudochannel (PC) and stack ID (SID).
143 *
144 * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero
145 *
146 * The MCA address format is as follows:
147 *	MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]}
148 *
149 * The normalized address format is fixed in hardware and is as follows:
150 *	NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]}
151 *
152 * Additionally, the PC and Bank bits may be hashed. This must be accounted for before
153 * reconstructing the normalized address.
154 */
155#define MI300_UMC_MCA_COL	GENMASK(5, 1)
156#define MI300_UMC_MCA_BANK	GENMASK(9, 6)
157#define MI300_UMC_MCA_ROW	GENMASK(24, 10)
158#define MI300_UMC_MCA_PC	BIT(25)
159#define MI300_UMC_MCA_SID	GENMASK(27, 26)
160
161#define MI300_NA_COL_1_0	GENMASK(6, 5)
162#define MI300_NA_PC		BIT(7)
163#define MI300_NA_COL_3_2	GENMASK(9, 8)
164#define MI300_NA_BANK_3_2	GENMASK(11, 10)
165#define MI300_NA_BANK_1_0	GENMASK(13, 12)
166#define MI300_NA_COL_4		BIT(14)
167#define MI300_NA_ROW		GENMASK(28, 15)
168#define MI300_NA_SID		GENMASK(30, 29)
169
170static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
171{
172	u16 i, col, row, bank, pc, sid, temp;
173
174	col  = FIELD_GET(MI300_UMC_MCA_COL,  addr);
175	bank = FIELD_GET(MI300_UMC_MCA_BANK, addr);
176	row  = FIELD_GET(MI300_UMC_MCA_ROW,  addr);
177	pc   = FIELD_GET(MI300_UMC_MCA_PC,   addr);
178	sid  = FIELD_GET(MI300_UMC_MCA_SID,  addr);
179
180	/* Calculate hash for each Bank bit. */
181	for (i = 0; i < NUM_BANK_BITS; i++) {
182		if (!addr_hash.bank[i].xor_enable)
183			continue;
184
185		temp  = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
186		temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
187		bank ^= temp << i;
188	}
189
190	/* Calculate hash for PC bit. */
191	if (addr_hash.pc.xor_enable) {
192		/* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
193		bank |= sid << 5;
194
195		temp  = bitwise_xor_bits(col  & addr_hash.pc.col_xor);
196		temp ^= bitwise_xor_bits(row  & addr_hash.pc.row_xor);
197		temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor);
198		pc   ^= temp;
199
200		/* Drop SID bits for the sake of debug printing later. */
201		bank &= 0x1F;
202	}
203
204	/* Reconstruct the normalized address starting with NA[4:0] = 0 */
205	addr  = 0;
206
207	/* NA[6:5] = Column[1:0] */
208	temp  = col & 0x3;
209	addr |= FIELD_PREP(MI300_NA_COL_1_0, temp);
210
211	/* NA[7] = PC */
212	addr |= FIELD_PREP(MI300_NA_PC, pc);
213
214	/* NA[9:8] = Column[3:2] */
215	temp  = (col >> 2) & 0x3;
216	addr |= FIELD_PREP(MI300_NA_COL_3_2, temp);
217
218	/* NA[11:10] = Bank[3:2] */
219	temp  = (bank >> 2) & 0x3;
220	addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp);
221
222	/* NA[13:12] = Bank[1:0] */
223	temp  = bank & 0x3;
224	addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp);
225
226	/* NA[14] = Column[4] */
227	temp  = (col >> 4) & 0x1;
228	addr |= FIELD_PREP(MI300_NA_COL_4, temp);
229
230	/* NA[28:15] = Row[13:0] */
231	addr |= FIELD_PREP(MI300_NA_ROW, row);
232
233	/* NA[30:29] = SID[1:0] */
234	addr |= FIELD_PREP(MI300_NA_SID, sid);
235
236	pr_debug("Addr=0x%016lx", addr);
237	pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid);
238
239	return addr;
240}
241
242/*
243 * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
244 * all memory within that DRAM row. This applies to the memory with a DRAM
245 * bank.
246 *
247 * To find the memory addresses, loop through permutations of the DRAM column
248 * bits and find the System Physical address of each. The column bits are used
249 * to calculate the intermediate Normalized address, so all permutations should
250 * be checked.
251 *
252 * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
253 */
254#define MI300_NUM_COL		BIT(HWEIGHT(MI300_UMC_MCA_COL))
255static void retire_row_mi300(struct atl_err *a_err)
256{
257	unsigned long addr;
258	struct page *p;
259	u8 col;
260
261	for (col = 0; col < MI300_NUM_COL; col++) {
262		a_err->addr &= ~MI300_UMC_MCA_COL;
263		a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
264
265		addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
266		if (IS_ERR_VALUE(addr))
267			continue;
268
269		addr = PHYS_PFN(addr);
270
271		/*
272		 * Skip invalid or already poisoned pages to avoid unnecessary
273		 * error messages from memory_failure().
274		 */
275		p = pfn_to_online_page(addr);
276		if (!p)
277			continue;
278
279		if (PageHWPoison(p))
280			continue;
281
282		memory_failure(addr, 0);
283	}
284}
285
286void amd_retire_dram_row(struct atl_err *a_err)
287{
288	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
289		return retire_row_mi300(a_err);
290}
291EXPORT_SYMBOL_GPL(amd_retire_dram_row);
292
293static unsigned long get_addr(unsigned long addr)
294{
295	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
296		return convert_dram_to_norm_addr_mi300(addr);
297
298	return addr;
299}
300
301#define MCA_IPID_INST_ID_HI	GENMASK_ULL(47, 44)
302static u8 get_die_id(struct atl_err *err)
303{
304	/*
305	 * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this
306	 * needs to be divided by 4 to get the internal Die ID.
307	 */
308	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) {
309		u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid);
310
311		return node_id >> 2;
312	}
313
314	/*
315	 * For CPUs, this is the AMD Node ID modulo the number
316	 * of AMD Nodes per socket.
317	 */
318	return topology_amd_node_id(err->cpu) % topology_amd_nodes_per_pkg();
319}
320
321#define UMC_CHANNEL_NUM	GENMASK(31, 20)
322static u8 get_coh_st_inst_id(struct atl_err *err)
323{
324	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
325		return get_coh_st_inst_id_mi300(err);
326
327	return FIELD_GET(UMC_CHANNEL_NUM, err->ipid);
328}
329
330unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
331{
332	u8 socket_id = topology_physical_package_id(err->cpu);
333	u8 coh_st_inst_id = get_coh_st_inst_id(err);
334	unsigned long addr = get_addr(err->addr);
335	u8 die_id = get_die_id(err);
336
337	pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx",
338		 socket_id, die_id, coh_st_inst_id, addr);
339
340	return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr);
341}