Loading...
Note: File does not exist in v3.1.
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright(c) 2022 Intel Corporation. */
3
4#include <linux/cpu.h>
5#include <linux/delay.h>
6#include <linux/fs.h>
7#include <linux/nmi.h>
8#include <linux/slab.h>
9#include <linux/stop_machine.h>
10
11#include "ifs.h"
12
/*
 * Note: all code and data in this file are protected by
 * ifs_sem. On HT systems all threads on a core will
 * execute together, but only the first thread on the
 * core will update the results of the test.
 */
19
20#define CREATE_TRACE_POINTS
21#include <trace/events/intel_ifs.h>
22
/*
 * Upper bound on consecutive scan attempts that report the same
 * starting chunk (i.e. make no forward progress) before giving up.
 */
#define MAX_IFS_RETRIES		5

/*
 * Number of TSC cycles that a logical CPU will wait for the other
 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN).
 */
#define IFS_THREAD_WAIT		100000
31
/*
 * Values of the error_code field of the scan status. The same values
 * are used as indices into scan_test_status[].
 */
enum ifs_status_err_code {
	IFS_NO_ERROR				= 0x0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN		= 0x1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS	= 0x2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN	= 0x3,
	IFS_INVALID_CHUNK_RANGE			= 0x4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS	= 0x5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY		= 0x6,
	IFS_UNASSIGNED_ERROR_CODE		= 0x7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT	= 0x8,
	IFS_INTERRUPTED_DURING_EXECUTION	= 0x9,
	IFS_UNASSIGNED_ERROR_CODE_0xA		= 0xA,
	IFS_CORRUPTED_CHUNK			= 0xB,
};
46
47static const char * const scan_test_status[] = {
48 [IFS_NO_ERROR] = "SCAN no error",
49 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
50 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
51 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
52 "Core Abort SCAN Response due to power management condition.",
53 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
54 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
55 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
56 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
57 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
58 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
59 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
60 [IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA",
61 [IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading",
62};
63
64static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
65{
66 if (status.error_code < ARRAY_SIZE(scan_test_status)) {
67 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
68 cpumask_pr_args(cpu_smt_mask(cpu)),
69 scan_test_status[status.error_code]);
70 } else if (status.error_code == IFS_SW_TIMEOUT) {
71 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
72 cpumask_pr_args(cpu_smt_mask(cpu)));
73 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
74 dev_info(dev, "CPU(s) %*pbl: %s\n",
75 cpumask_pr_args(cpu_smt_mask(cpu)),
76 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
77 } else {
78 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
79 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
80 }
81}
82
83static void message_fail(struct device *dev, int cpu, union ifs_status status)
84{
85 struct ifs_data *ifsd = ifs_get_data(dev);
86
87 /*
88 * control_error is set when the microcode runs into a problem
89 * loading the image from the reserved BIOS memory, or it has
90 * been corrupted. Reloading the image may fix this issue.
91 */
92 if (status.control_error) {
93 dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n",
94 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
95 }
96
97 /*
98 * signature_error is set when the output from the scan chains does not
99 * match the expected signature. This might be a transient problem (e.g.
100 * due to a bit flip from an alpha particle or neutron). If the problem
101 * repeats on a subsequent test, then it indicates an actual problem in
102 * the core being tested.
103 */
104 if (status.signature_error) {
105 dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
106 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
107 }
108}
109
110static bool can_restart(union ifs_status status)
111{
112 enum ifs_status_err_code err_code = status.error_code;
113
114 /* Signature for chunk is bad, or scan test failed */
115 if (status.signature_error || status.control_error)
116 return false;
117
118 switch (err_code) {
119 case IFS_NO_ERROR:
120 case IFS_OTHER_THREAD_COULD_NOT_JOIN:
121 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
122 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
123 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
124 case IFS_INTERRUPTED_DURING_EXECUTION:
125 return true;
126 case IFS_INVALID_CHUNK_RANGE:
127 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
128 case IFS_CORE_NOT_CAPABLE_CURRENTLY:
129 case IFS_UNASSIGNED_ERROR_CODE:
130 case IFS_UNASSIGNED_ERROR_CODE_0xA:
131 case IFS_CORRUPTED_CHUNK:
132 break;
133 }
134 return false;
135}
136
137/*
138 * Execute the scan. Called "simultaneously" on all threads of a core
139 * at high priority using the stop_cpus mechanism.
140 */
141static int doscan(void *data)
142{
143 int cpu = smp_processor_id();
144 u64 *msrs = data;
145 int first;
146
147 /* Only the first logical CPU on a core reports result */
148 first = cpumask_first(cpu_smt_mask(cpu));
149
150 /*
151 * This WRMSR will wait for other HT threads to also write
152 * to this MSR (at most for activate.delay cycles). Then it
153 * starts scan of each requested chunk. The core scan happens
154 * during the "execution" of the WRMSR. This instruction can
155 * take up to 200 milliseconds (in the case where all chunks
156 * are processed in a single pass) before it retires.
157 */
158 wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]);
159
160 if (cpu == first) {
161 /* Pass back the result of the scan */
162 rdmsrl(MSR_SCAN_STATUS, msrs[1]);
163 }
164
165 return 0;
166}
167
168/*
169 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
170 * on all threads of the core to be tested. Loop if necessary to complete
171 * run of all chunks. Include some defensive tests to make sure forward
172 * progress is made, and that the whole test completes in a reasonable time.
173 */
174static void ifs_test_core(int cpu, struct device *dev)
175{
176 union ifs_scan activate;
177 union ifs_status status;
178 unsigned long timeout;
179 struct ifs_data *ifsd;
180 int to_start, to_stop;
181 int status_chunk;
182 u64 msrvals[2];
183 int retries;
184
185 ifsd = ifs_get_data(dev);
186
187 activate.gen0.rsvd = 0;
188 activate.delay = IFS_THREAD_WAIT;
189 activate.sigmce = 0;
190 to_start = 0;
191 to_stop = ifsd->valid_chunks - 1;
192
193 if (ifsd->generation) {
194 activate.gen2.start = to_start;
195 activate.gen2.stop = to_stop;
196 } else {
197 activate.gen0.start = to_start;
198 activate.gen0.stop = to_stop;
199 }
200
201 timeout = jiffies + HZ / 2;
202 retries = MAX_IFS_RETRIES;
203
204 while (to_start <= to_stop) {
205 if (time_after(jiffies, timeout)) {
206 status.error_code = IFS_SW_TIMEOUT;
207 break;
208 }
209
210 msrvals[0] = activate.data;
211 stop_core_cpuslocked(cpu, doscan, msrvals);
212
213 status.data = msrvals[1];
214
215 trace_ifs_status(cpu, to_start, to_stop, status.data);
216
217 /* Some cases can be retried, give up for others */
218 if (!can_restart(status))
219 break;
220
221 status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num;
222 if (status_chunk == to_start) {
223 /* Check for forward progress */
224 if (--retries == 0) {
225 if (status.error_code == IFS_NO_ERROR)
226 status.error_code = IFS_SW_PARTIAL_COMPLETION;
227 break;
228 }
229 } else {
230 retries = MAX_IFS_RETRIES;
231 if (ifsd->generation)
232 activate.gen2.start = status_chunk;
233 else
234 activate.gen0.start = status_chunk;
235 to_start = status_chunk;
236 }
237 }
238
239 /* Update status for this core */
240 ifsd->scan_details = status.data;
241
242 if (status.control_error || status.signature_error) {
243 ifsd->status = SCAN_TEST_FAIL;
244 message_fail(dev, cpu, status);
245 } else if (status.error_code) {
246 ifsd->status = SCAN_NOT_TESTED;
247 message_not_tested(dev, cpu, status);
248 } else {
249 ifsd->status = SCAN_TEST_PASS;
250 }
251}
252
253#define SPINUNIT 100 /* 100 nsec */
254static atomic_t array_cpus_out;
255
256/*
257 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
258 */
259static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
260{
261 int cpu = smp_processor_id();
262 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
263 int all_cpus = cpumask_weight(smt_mask);
264
265 atomic_inc(t);
266 while (atomic_read(t) < all_cpus) {
267 if (timeout < SPINUNIT)
268 return;
269 ndelay(SPINUNIT);
270 timeout -= SPINUNIT;
271 touch_nmi_watchdog();
272 }
273}
274
275static int do_array_test(void *data)
276{
277 union ifs_array *command = data;
278 int cpu = smp_processor_id();
279 int first;
280
281 /*
282 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
283 */
284 first = cpumask_first(cpu_smt_mask(cpu));
285
286 if (cpu == first) {
287 wrmsrl(MSR_ARRAY_BIST, command->data);
288 /* Pass back the result of the test */
289 rdmsrl(MSR_ARRAY_BIST, command->data);
290 }
291
292 /* Tests complete faster if the sibling is spinning here */
293 wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC);
294
295 return 0;
296}
297
298static void ifs_array_test_core(int cpu, struct device *dev)
299{
300 union ifs_array command = {};
301 bool timed_out = false;
302 struct ifs_data *ifsd;
303 unsigned long timeout;
304
305 ifsd = ifs_get_data(dev);
306
307 command.array_bitmask = ~0U;
308 timeout = jiffies + HZ / 2;
309
310 do {
311 if (time_after(jiffies, timeout)) {
312 timed_out = true;
313 break;
314 }
315 atomic_set(&array_cpus_out, 0);
316 stop_core_cpuslocked(cpu, do_array_test, &command);
317
318 if (command.ctrl_result)
319 break;
320 } while (command.array_bitmask);
321
322 ifsd->scan_details = command.data;
323
324 if (command.ctrl_result)
325 ifsd->status = SCAN_TEST_FAIL;
326 else if (timed_out || command.array_bitmask)
327 ifsd->status = SCAN_NOT_TESTED;
328 else
329 ifsd->status = SCAN_TEST_PASS;
330}
331
332#define ARRAY_GEN1_TEST_ALL_ARRAYS 0x0ULL
333#define ARRAY_GEN1_STATUS_FAIL 0x1ULL
334
335static int do_array_test_gen1(void *status)
336{
337 int cpu = smp_processor_id();
338 int first;
339
340 first = cpumask_first(cpu_smt_mask(cpu));
341
342 if (cpu == first) {
343 wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS);
344 rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status));
345 }
346
347 return 0;
348}
349
350static void ifs_array_test_gen1(int cpu, struct device *dev)
351{
352 struct ifs_data *ifsd = ifs_get_data(dev);
353 u64 status = 0;
354
355 stop_core_cpuslocked(cpu, do_array_test_gen1, &status);
356 ifsd->scan_details = status;
357
358 if (status & ARRAY_GEN1_STATUS_FAIL)
359 ifsd->status = SCAN_TEST_FAIL;
360 else
361 ifsd->status = SCAN_TEST_PASS;
362}
363
364/*
365 * Initiate per core test. It wakes up work queue threads on the target cpu and
366 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
367 * wait for all sibling threads to finish the scan test.
368 */
369int do_core_test(int cpu, struct device *dev)
370{
371 const struct ifs_test_caps *test = ifs_get_test_caps(dev);
372 struct ifs_data *ifsd = ifs_get_data(dev);
373 int ret = 0;
374
375 /* Prevent CPUs from being taken offline during the scan test */
376 cpus_read_lock();
377
378 if (!cpu_online(cpu)) {
379 dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
380 ret = -EINVAL;
381 goto out;
382 }
383
384 switch (test->test_num) {
385 case IFS_TYPE_SAF:
386 if (!ifsd->loaded)
387 ret = -EPERM;
388 else
389 ifs_test_core(cpu, dev);
390 break;
391 case IFS_TYPE_ARRAY_BIST:
392 if (ifsd->array_gen == ARRAY_GEN0)
393 ifs_array_test_core(cpu, dev);
394 else
395 ifs_array_test_gen1(cpu, dev);
396 break;
397 default:
398 ret = -EINVAL;
399 }
400out:
401 cpus_read_unlock();
402 return ret;
403}