Loading...
Note: File does not exist in v3.1.
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Ampere Computing SoC's SMpro Error Monitoring Driver
4 *
5 * Copyright (c) 2022, Ampere Computing LLC
6 *
7 */
8
9#include <linux/i2c.h>
10#include <linux/mod_devicetable.h>
11#include <linux/module.h>
12#include <linux/platform_device.h>
13#include <linux/regmap.h>
14
/* GPI RAS Error Register */
#define GPI_RAS_ERR			0x7E

/* Core and L2C Error Registers */
#define CORE_CE_ERR_CNT			0x80
#define CORE_CE_ERR_LEN			0x81
#define CORE_CE_ERR_DATA		0x82
#define CORE_UE_ERR_CNT			0x83
#define CORE_UE_ERR_LEN			0x84
#define CORE_UE_ERR_DATA		0x85

/* Memory Error Registers */
#define MEM_CE_ERR_CNT			0x90
#define MEM_CE_ERR_LEN			0x91
#define MEM_CE_ERR_DATA			0x92
#define MEM_UE_ERR_CNT			0x93
#define MEM_UE_ERR_LEN			0x94
#define MEM_UE_ERR_DATA			0x95

/* RAS Error/Warning Registers, listed in ascending address order */
#define ERR_SMPRO_TYPE			0xA0
#define ERR_PMPRO_TYPE			0xA1
#define ERR_SMPRO_INFO_LO		0xA2
#define ERR_SMPRO_INFO_HI		0xA3
#define ERR_SMPRO_DATA_LO		0xA4
#define ERR_SMPRO_DATA_HI		0xA5
#define ERR_PMPRO_INFO_LO		0xA6
#define ERR_PMPRO_INFO_HI		0xA7
#define ERR_PMPRO_DATA_LO		0xA8
#define ERR_PMPRO_DATA_HI		0xA9
#define WARN_SMPRO_INFO_LO		0xAA
#define WARN_SMPRO_INFO_HI		0xAB
#define WARN_PMPRO_INFO_LO		0xAC
#define WARN_PMPRO_INFO_HI		0xAD

/* PCIE Error Registers */
#define PCIE_CE_ERR_CNT			0xC0
#define PCIE_CE_ERR_LEN			0xC1
#define PCIE_CE_ERR_DATA		0xC2
#define PCIE_UE_ERR_CNT			0xC3
#define PCIE_UE_ERR_LEN			0xC4
#define PCIE_UE_ERR_DATA		0xC5

/* Other Error Registers */
#define OTHER_CE_ERR_CNT		0xD0
#define OTHER_CE_ERR_LEN		0xD1
#define OTHER_CE_ERR_DATA		0xD2
#define OTHER_UE_ERR_CNT		0xD8
#define OTHER_UE_ERR_LEN		0xD9
#define OTHER_UE_ERR_DATA		0xDA

/* Event Data Registers */
#define VRD_WARN_FAULT_EVENT_DATA	0x78
#define VRD_HOT_EVENT_DATA		0x79
#define DIMM_HOT_EVENT_DATA		0x7A

/* Size in bytes of the buffer used for one error data record */
#define MAX_READ_BLOCK_LENGTH		48

/* Channel numbers selecting the SMpro or PMpro internal error registers */
#define RAS_SMPRO_ERR			0
#define RAS_PMPRO_ERR			1
75
/*
 * Error classes reported through a count/length/data register triple.
 * The values are used as indices into smpro_error_table[].
 */
enum RAS_48BYTES_ERR_TYPES {
	CORE_CE_ERR = 0,
	CORE_UE_ERR,
	MEM_CE_ERR,
	MEM_UE_ERR,
	PCIE_CE_ERR,
	PCIE_UE_ERR,
	OTHER_CE_ERR,
	OTHER_UE_ERR,
	NUM_48BYTES_ERR_TYPE,	/* number of classes above, not a class itself */
};
87
88struct smpro_error_hdr {
89 u8 count; /* Number of the RAS errors */
90 u8 len; /* Number of data bytes */
91 u8 data; /* Start of 48-byte data */
92 u8 max_cnt; /* Max num of errors */
93};
94
95/*
96 * Included Address of registers to get Count, Length of data and Data
97 * of the 48 bytes error data
98 */
99static struct smpro_error_hdr smpro_error_table[] = {
100 [CORE_CE_ERR] = {
101 .count = CORE_CE_ERR_CNT,
102 .len = CORE_CE_ERR_LEN,
103 .data = CORE_CE_ERR_DATA,
104 .max_cnt = 32
105 },
106 [CORE_UE_ERR] = {
107 .count = CORE_UE_ERR_CNT,
108 .len = CORE_UE_ERR_LEN,
109 .data = CORE_UE_ERR_DATA,
110 .max_cnt = 32
111 },
112 [MEM_CE_ERR] = {
113 .count = MEM_CE_ERR_CNT,
114 .len = MEM_CE_ERR_LEN,
115 .data = MEM_CE_ERR_DATA,
116 .max_cnt = 16
117 },
118 [MEM_UE_ERR] = {
119 .count = MEM_UE_ERR_CNT,
120 .len = MEM_UE_ERR_LEN,
121 .data = MEM_UE_ERR_DATA,
122 .max_cnt = 16
123 },
124 [PCIE_CE_ERR] = {
125 .count = PCIE_CE_ERR_CNT,
126 .len = PCIE_CE_ERR_LEN,
127 .data = PCIE_CE_ERR_DATA,
128 .max_cnt = 96
129 },
130 [PCIE_UE_ERR] = {
131 .count = PCIE_UE_ERR_CNT,
132 .len = PCIE_UE_ERR_LEN,
133 .data = PCIE_UE_ERR_DATA,
134 .max_cnt = 96
135 },
136 [OTHER_CE_ERR] = {
137 .count = OTHER_CE_ERR_CNT,
138 .len = OTHER_CE_ERR_LEN,
139 .data = OTHER_CE_ERR_DATA,
140 .max_cnt = 8
141 },
142 [OTHER_UE_ERR] = {
143 .count = OTHER_UE_ERR_CNT,
144 .len = OTHER_UE_ERR_LEN,
145 .data = OTHER_UE_ERR_DATA,
146 .max_cnt = 8
147 },
148};
149
150/*
151 * List of SCP registers which are used to get
152 * one type of RAS Internal errors.
153 */
154struct smpro_int_error_hdr {
155 u8 type;
156 u8 info_l;
157 u8 info_h;
158 u8 data_l;
159 u8 data_h;
160 u8 warn_l;
161 u8 warn_h;
162};
163
164static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
165 [RAS_SMPRO_ERR] = {
166 .type = ERR_SMPRO_TYPE,
167 .info_l = ERR_SMPRO_INFO_LO,
168 .info_h = ERR_SMPRO_INFO_HI,
169 .data_l = ERR_SMPRO_DATA_LO,
170 .data_h = ERR_SMPRO_DATA_HI,
171 .warn_l = WARN_SMPRO_INFO_LO,
172 .warn_h = WARN_SMPRO_INFO_HI,
173 },
174 [RAS_PMPRO_ERR] = {
175 .type = ERR_PMPRO_TYPE,
176 .info_l = ERR_PMPRO_INFO_LO,
177 .info_h = ERR_PMPRO_INFO_HI,
178 .data_l = ERR_PMPRO_DATA_LO,
179 .data_h = ERR_PMPRO_DATA_HI,
180 .warn_l = WARN_PMPRO_INFO_LO,
181 .warn_h = WARN_PMPRO_INFO_HI,
182 },
183};
184
/**
 * struct smpro_errmon - driver data of the error monitor device
 * @regmap: register map used for every register access in this driver;
 *          taken from the parent device at probe time
 */
struct smpro_errmon {
	struct regmap *regmap;
};
188
/* Event kinds; the values are used as indices into smpro_event_table[] */
enum EVENT_TYPES {
	VRD_WARN_FAULT_EVENT = 0,
	VRD_HOT_EVENT,
	DIMM_HOT_EVENT,
	NUM_EVENTS_TYPE,	/* number of event kinds above */
};
195
196/* Included Address of event source and data registers */
197static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
198 VRD_WARN_FAULT_EVENT_DATA,
199 VRD_HOT_EVENT_DATA,
200 DIMM_HOT_EVENT_DATA,
201};
202
203static ssize_t smpro_event_data_read(struct device *dev,
204 struct device_attribute *da, char *buf,
205 int channel)
206{
207 struct smpro_errmon *errmon = dev_get_drvdata(dev);
208 s32 event_data;
209 int ret;
210
211 ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
212 if (ret)
213 return ret;
214 /* Clear event after read */
215 if (event_data != 0)
216 regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
217
218 return sysfs_emit(buf, "%04x\n", event_data);
219}
220
221static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
222 char *buf, int channel)
223{
224 struct smpro_errmon *errmon = dev_get_drvdata(dev);
225 struct smpro_error_hdr *err_info;
226 s32 err_count;
227 int ret;
228
229 err_info = &smpro_error_table[channel];
230
231 ret = regmap_read(errmon->regmap, err_info->count, &err_count);
232 if (ret)
233 return ret;
234
235 /* Bit 8 indicates the overflow status */
236 return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
237}
238
239static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da,
240 char *buf, int channel)
241{
242 struct smpro_errmon *errmon = dev_get_drvdata(dev);
243 unsigned char err_data[MAX_READ_BLOCK_LENGTH];
244 struct smpro_error_hdr *err_info;
245 s32 err_count, err_length;
246 int ret;
247
248 err_info = &smpro_error_table[channel];
249
250 ret = regmap_read(errmon->regmap, err_info->count, &err_count);
251 /* Error count is the low byte */
252 err_count &= 0xff;
253 if (ret || !err_count || err_count > err_info->max_cnt)
254 return ret;
255
256 ret = regmap_read(errmon->regmap, err_info->len, &err_length);
257 if (ret || err_length <= 0)
258 return ret;
259
260 if (err_length > MAX_READ_BLOCK_LENGTH)
261 err_length = MAX_READ_BLOCK_LENGTH;
262
263 memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH);
264 ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length);
265 if (ret < 0)
266 return ret;
267
268 /* clear the error */
269 ret = regmap_write(errmon->regmap, err_info->count, 0x100);
270 if (ret)
271 return ret;
272 /*
273 * The output of Core/Memory/PCIe/Others UE/CE errors follows the format
274 * specified in section 5.8.1 CE/UE Error Data record in
275 * Altra SOC BMC Interface specification.
276 */
277 return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data);
278}
279
280/*
281 * Output format:
282 * <4-byte hex value of error info><4-byte hex value of error extensive data>
283 * Where:
284 * + error info : The error information
285 * + error data : Extensive data (32 bits)
286 * Reference to section 5.10 RAS Internal Error Register Definition in
287 * Altra SOC BMC Interface specification
288 */
289static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da,
290 char *buf, int channel)
291{
292 struct smpro_errmon *errmon = dev_get_drvdata(dev);
293 struct smpro_int_error_hdr *err_info;
294 unsigned int err[4] = { 0 };
295 unsigned int err_type;
296 unsigned int val;
297 int ret;
298
299 /* read error status */
300 ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
301 if (ret)
302 return ret;
303
304 if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
305 (channel == RAS_PMPRO_ERR && !(val & BIT(1))))
306 return 0;
307
308 err_info = &list_smpro_int_error_hdr[channel];
309 ret = regmap_read(errmon->regmap, err_info->type, &val);
310 if (ret)
311 return ret;
312
313 err_type = (val & BIT(1)) ? BIT(1) :
314 (val & BIT(2)) ? BIT(2) : 0;
315
316 if (!err_type)
317 return 0;
318
319 ret = regmap_read(errmon->regmap, err_info->info_l, err + 1);
320 if (ret)
321 return ret;
322
323 ret = regmap_read(errmon->regmap, err_info->info_h, err);
324 if (ret)
325 return ret;
326
327 if (err_type & BIT(2)) {
328 /* Error with data type */
329 ret = regmap_read(errmon->regmap, err_info->data_l, err + 3);
330 if (ret)
331 return ret;
332
333 ret = regmap_read(errmon->regmap, err_info->data_h, err + 2);
334 if (ret)
335 return ret;
336 }
337
338 /* clear the read errors */
339 ret = regmap_write(errmon->regmap, err_info->type, err_type);
340 if (ret)
341 return ret;
342
343 return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err);
344}
345
346/*
347 * Output format:
348 * <4-byte hex value of warining info>
349 * Reference to section 5.10 RAS Internal Error Register Definition in
350 * Altra SOC BMC Interface specification
351 */
352static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da,
353 char *buf, int channel)
354{
355 struct smpro_errmon *errmon = dev_get_drvdata(dev);
356 struct smpro_int_error_hdr *err_info;
357 unsigned int warn[2] = { 0 };
358 unsigned int val;
359 int ret;
360
361 /* read error status */
362 ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
363 if (ret)
364 return ret;
365
366 if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
367 (channel == RAS_PMPRO_ERR && !(val & BIT(1))))
368 return 0;
369
370 err_info = &list_smpro_int_error_hdr[channel];
371 ret = regmap_read(errmon->regmap, err_info->type, &val);
372 if (ret)
373 return ret;
374
375 if (!(val & BIT(0)))
376 return 0;
377
378 ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1);
379 if (ret)
380 return ret;
381
382 ret = regmap_read(errmon->regmap, err_info->warn_h, warn);
383 if (ret)
384 return ret;
385
386 /* clear the warning */
387 ret = regmap_write(errmon->regmap, err_info->type, BIT(0));
388 if (ret)
389 return ret;
390
391 return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn);
392}
393
394#define ERROR_OVERFLOW_RO(_error, _index) \
395 static ssize_t overflow_##_error##_show(struct device *dev, \
396 struct device_attribute *da, \
397 char *buf) \
398 { \
399 return smpro_overflow_data_read(dev, da, buf, _index); \
400 } \
401 static DEVICE_ATTR_RO(overflow_##_error)
402
403ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR);
404ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR);
405ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR);
406ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR);
407ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR);
408ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR);
409ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR);
410ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR);
411
412#define ERROR_RO(_error, _index) \
413 static ssize_t error_##_error##_show(struct device *dev, \
414 struct device_attribute *da, \
415 char *buf) \
416 { \
417 return smpro_error_data_read(dev, da, buf, _index); \
418 } \
419 static DEVICE_ATTR_RO(error_##_error)
420
421ERROR_RO(core_ce, CORE_CE_ERR);
422ERROR_RO(core_ue, CORE_UE_ERR);
423ERROR_RO(mem_ce, MEM_CE_ERR);
424ERROR_RO(mem_ue, MEM_UE_ERR);
425ERROR_RO(pcie_ce, PCIE_CE_ERR);
426ERROR_RO(pcie_ue, PCIE_UE_ERR);
427ERROR_RO(other_ce, OTHER_CE_ERR);
428ERROR_RO(other_ue, OTHER_UE_ERR);
429
430static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
431{
432 return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR);
433}
434static DEVICE_ATTR_RO(error_smpro);
435
436static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
437{
438 return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR);
439}
440static DEVICE_ATTR_RO(error_pmpro);
441
442static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
443{
444 return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR);
445}
446static DEVICE_ATTR_RO(warn_smpro);
447
448static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
449{
450 return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR);
451}
452static DEVICE_ATTR_RO(warn_pmpro);
453
454#define EVENT_RO(_event, _index) \
455 static ssize_t event_##_event##_show(struct device *dev, \
456 struct device_attribute *da, \
457 char *buf) \
458 { \
459 return smpro_event_data_read(dev, da, buf, _index); \
460 } \
461 static DEVICE_ATTR_RO(event_##_event)
462
463EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT);
464EVENT_RO(vrd_hot, VRD_HOT_EVENT);
465EVENT_RO(dimm_hot, DIMM_HOT_EVENT);
466
467static struct attribute *smpro_errmon_attrs[] = {
468 &dev_attr_overflow_core_ce.attr,
469 &dev_attr_overflow_core_ue.attr,
470 &dev_attr_overflow_mem_ce.attr,
471 &dev_attr_overflow_mem_ue.attr,
472 &dev_attr_overflow_pcie_ce.attr,
473 &dev_attr_overflow_pcie_ue.attr,
474 &dev_attr_overflow_other_ce.attr,
475 &dev_attr_overflow_other_ue.attr,
476 &dev_attr_error_core_ce.attr,
477 &dev_attr_error_core_ue.attr,
478 &dev_attr_error_mem_ce.attr,
479 &dev_attr_error_mem_ue.attr,
480 &dev_attr_error_pcie_ce.attr,
481 &dev_attr_error_pcie_ue.attr,
482 &dev_attr_error_other_ce.attr,
483 &dev_attr_error_other_ue.attr,
484 &dev_attr_error_smpro.attr,
485 &dev_attr_error_pmpro.attr,
486 &dev_attr_warn_smpro.attr,
487 &dev_attr_warn_pmpro.attr,
488 &dev_attr_event_vrd_warn_fault.attr,
489 &dev_attr_event_vrd_hot.attr,
490 &dev_attr_event_dimm_hot.attr,
491 NULL
492};
493
494ATTRIBUTE_GROUPS(smpro_errmon);
495
496static int smpro_errmon_probe(struct platform_device *pdev)
497{
498 struct smpro_errmon *errmon;
499
500 errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL);
501 if (!errmon)
502 return -ENOMEM;
503
504 platform_set_drvdata(pdev, errmon);
505
506 errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL);
507 if (!errmon->regmap)
508 return -ENODEV;
509
510 return 0;
511}
512
513static struct platform_driver smpro_errmon_driver = {
514 .probe = smpro_errmon_probe,
515 .driver = {
516 .name = "smpro-errmon",
517 .dev_groups = smpro_errmon_groups,
518 },
519};
520
521module_platform_driver(smpro_errmon_driver);
522
523MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>");
524MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>");
525MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>");
526MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>");
527MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>");
528MODULE_DESCRIPTION("Ampere Altra SMpro driver");
529MODULE_LICENSE("GPL");