Loading...
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Performance event support - Processor Activity Instrumentation Extension
4 * Facility
5 *
6 * Copyright IBM Corp. 2022
7 * Author(s): Thomas Richter <tmricht@linux.ibm.com>
8 */
9#define KMSG_COMPONENT "pai_ext"
10#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
11
12#include <linux/kernel.h>
13#include <linux/kernel_stat.h>
14#include <linux/percpu.h>
15#include <linux/notifier.h>
16#include <linux/init.h>
17#include <linux/export.h>
18#include <linux/io.h>
19
20#include <asm/cpu_mcf.h>
21#include <asm/ctl_reg.h>
22#include <asm/pai.h>
23#include <asm/debug.h>
24
25#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */
26#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */
27
28static debug_info_t *paiext_dbg;
29static unsigned int paiext_cnt; /* Extracted with QPACI instruction */
30
31struct pai_userdata {
32 u16 num;
33 u64 value;
34} __packed;
35
36/* Create the PAI extension 1 control block area.
37 * The PAI extension control block 1 is pointed to by lowcore
38 * address 0x1508 for each CPU. This control block is 512 bytes in size
39 * and requires a 512 byte boundary alignment.
40 */
41struct paiext_cb { /* PAI extension 1 control block */
42 u64 header; /* Not used */
43 u64 reserved1;
44 u64 acc; /* Addr to analytics counter control block */
45 u8 reserved2[488];
46} __packed;
47
48struct paiext_map {
49 unsigned long *area; /* Area for CPU to store counters */
50 struct pai_userdata *save; /* Area to store non-zero counters */
51 enum paievt_mode mode; /* Type of event */
52 unsigned int active_events; /* # of PAI Extension users */
53 unsigned int refcnt;
54 struct perf_event *event; /* Perf event for sampling */
55 struct paiext_cb *paiext_cb; /* PAI extension control block area */
56};
57
/* Per CPU slot holding the pointer to that CPU's paiext_map (or NULL). */
struct paiext_mapptr {
	struct paiext_map *mapptr;
};
61
62static struct paiext_root { /* Anchor to per CPU data */
63 int refcnt; /* Overall active events */
64 struct paiext_mapptr __percpu *mapptr;
65} paiext_root;
66
67/* Free per CPU data when the last event is removed. */
68static void paiext_root_free(void)
69{
70 if (!--paiext_root.refcnt) {
71 free_percpu(paiext_root.mapptr);
72 paiext_root.mapptr = NULL;
73 }
74}
75
76/* On initialization of first event also allocate per CPU data dynamically.
77 * Start with an array of pointers, the array size is the maximum number of
78 * CPUs possible, which might be larger than the number of CPUs currently
79 * online.
80 */
81static int paiext_root_alloc(void)
82{
83 if (++paiext_root.refcnt == 1) {
84 /* The memory is already zeroed. */
85 paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
86 if (!paiext_root.mapptr) {
87 /* Returing without refcnt adjustment is ok. The
88 * error code is handled by paiext_alloc() which
89 * decrements refcnt when an event can not be
90 * created.
91 */
92 return -ENOMEM;
93 }
94 }
95 return 0;
96}
97
/* Serializes creation and destruction of events:
 * - protects the reference counters of the anchor and of the per CPU maps
 *   against concurrent modification,
 * - prohibits concurrent execution of counting and sampling events,
 * - ensures that the analytics counter block is deallocated only when no
 *   sampling and no counting event is left on that CPU.
 * For details see paiext_alloc().
 */
static DEFINE_MUTEX(paiext_reserve_mutex);
106
107/* Free all memory allocated for event counting/sampling setup */
108static void paiext_free(struct paiext_mapptr *mp)
109{
110 kfree(mp->mapptr->area);
111 kfree(mp->mapptr->paiext_cb);
112 kvfree(mp->mapptr->save);
113 kfree(mp->mapptr);
114 mp->mapptr = NULL;
115}
116
117/* Release the PMU if event is the last perf event */
118static void paiext_event_destroy(struct perf_event *event)
119{
120 struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
121 struct paiext_map *cpump = mp->mapptr;
122
123 mutex_lock(&paiext_reserve_mutex);
124 cpump->event = NULL;
125 if (!--cpump->refcnt) /* Last reference gone */
126 paiext_free(mp);
127 paiext_root_free();
128 mutex_unlock(&paiext_reserve_mutex);
129 debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
130 event->cpu, mp->mapptr);
131
132}
133
134/* Used to avoid races in checking concurrent access of counting and
135 * sampling for pai_extension events.
136 *
137 * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
138 * allowed and when this event is running, no counting event is allowed.
139 * Several counting events are allowed in parallel, but no sampling event
140 * is allowed while one (or more) counting events are running.
141 *
142 * This function is called in process context and it is safe to block.
143 * When the event initialization functions fails, no other call back will
144 * be invoked.
145 *
146 * Allocate the memory for the event.
147 */
148static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
149{
150 struct paiext_mapptr *mp;
151 struct paiext_map *cpump;
152 int rc;
153
154 mutex_lock(&paiext_reserve_mutex);
155
156 rc = paiext_root_alloc();
157 if (rc)
158 goto unlock;
159
160 mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
161 cpump = mp->mapptr;
162 if (!cpump) { /* Paiext_map allocated? */
163 rc = -ENOMEM;
164 cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
165 if (!cpump)
166 goto unlock;
167
168 /* Allocate memory for counter area and counter extraction.
169 * These are
170 * - a 512 byte block and requires 512 byte boundary alignment.
171 * - a 1KB byte block and requires 1KB boundary alignment.
172 * Only the first counting event has to allocate the area.
173 *
174 * Note: This works with commit 59bb47985c1d by default.
175 * Backporting this to kernels without this commit might
176 * need adjustment.
177 */
178 mp->mapptr = cpump;
179 cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
180 cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
181 cpump->save = kvmalloc_array(paiext_cnt + 1,
182 sizeof(struct pai_userdata),
183 GFP_KERNEL);
184 if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
185 paiext_free(mp);
186 goto unlock;
187 }
188 cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
189 : PAI_MODE_COUNTING;
190 } else {
191 /* Multiple invocation, check whats active.
192 * Supported are multiple counter events or only one sampling
193 * event concurrently at any one time.
194 */
195 if (cpump->mode == PAI_MODE_SAMPLING ||
196 (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
197 rc = -EBUSY;
198 goto unlock;
199 }
200 }
201
202 rc = 0;
203 cpump->event = event;
204 ++cpump->refcnt;
205
206unlock:
207 if (rc) {
208 /* Error in allocation of event, decrement anchor. Since
209 * the event in not created, its destroy() function is never
210 * invoked. Adjust the reference counter for the anchor.
211 */
212 paiext_root_free();
213 }
214 mutex_unlock(&paiext_reserve_mutex);
215 /* If rc is non-zero, no increment of counter/sampler was done. */
216 return rc;
217}
218
219/* The PAI extension 1 control block supports up to 128 entries. Return
220 * the index within PAIE1_CB given the event number. Also validate event
221 * number.
222 */
223static int paiext_event_valid(struct perf_event *event)
224{
225 u64 cfg = event->attr.config;
226
227 if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) {
228 /* Offset NNPA in paiext_cb */
229 event->hw.config_base = offsetof(struct paiext_cb, acc);
230 return 0;
231 }
232 return -EINVAL;
233}
234
235/* Might be called on different CPU than the one the event is intended for. */
236static int paiext_event_init(struct perf_event *event)
237{
238 struct perf_event_attr *a = &event->attr;
239 int rc;
240
241 /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
242 if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
243 return -ENOENT;
244 /* PAI extension event must be valid and in supported range */
245 rc = paiext_event_valid(event);
246 if (rc)
247 return rc;
248 /* Allow only CPU wide operation, no process context for now. */
249 if (event->hw.target || event->cpu == -1)
250 return -ENOENT;
251 /* Allow only event NNPA_ALL for sampling. */
252 if (a->sample_period && a->config != PAI_NNPA_BASE)
253 return -EINVAL;
254 /* Prohibit exclude_user event selection */
255 if (a->exclude_user)
256 return -EINVAL;
257
258 rc = paiext_alloc(a, event);
259 if (rc)
260 return rc;
261 event->hw.last_tag = 0;
262 event->destroy = paiext_event_destroy;
263
264 if (a->sample_period) {
265 a->sample_period = 1;
266 a->freq = 0;
267 /* Register for paicrypt_sched_task() to be called */
268 event->attach_state |= PERF_ATTACH_SCHED_CB;
269 /* Add raw data which are the memory mapped counters */
270 a->sample_type |= PERF_SAMPLE_RAW;
271 /* Turn off inheritance */
272 a->inherit = 0;
273 }
274
275 return 0;
276}
277
278static u64 paiext_getctr(struct paiext_map *cpump, int nr)
279{
280 return cpump->area[nr];
281}
282
283/* Read the counter values. Return value from location in buffer. For event
284 * NNPA_ALL sum up all events.
285 */
286static u64 paiext_getdata(struct perf_event *event)
287{
288 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
289 struct paiext_map *cpump = mp->mapptr;
290 u64 sum = 0;
291 int i;
292
293 if (event->attr.config != PAI_NNPA_BASE)
294 return paiext_getctr(cpump, event->attr.config - PAI_NNPA_BASE);
295
296 for (i = 1; i <= paiext_cnt; i++)
297 sum += paiext_getctr(cpump, i);
298
299 return sum;
300}
301
302static u64 paiext_getall(struct perf_event *event)
303{
304 return paiext_getdata(event);
305}
306
307static void paiext_read(struct perf_event *event)
308{
309 u64 prev, new, delta;
310
311 prev = local64_read(&event->hw.prev_count);
312 new = paiext_getall(event);
313 local64_set(&event->hw.prev_count, new);
314 delta = new - prev;
315 local64_add(delta, &event->count);
316}
317
318static void paiext_start(struct perf_event *event, int flags)
319{
320 u64 sum;
321
322 if (event->hw.last_tag)
323 return;
324 event->hw.last_tag = 1;
325 sum = paiext_getall(event); /* Get current value */
326 local64_set(&event->hw.prev_count, sum);
327 local64_set(&event->count, 0);
328}
329
330static int paiext_add(struct perf_event *event, int flags)
331{
332 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
333 struct paiext_map *cpump = mp->mapptr;
334 struct paiext_cb *pcb = cpump->paiext_cb;
335
336 if (++cpump->active_events == 1) {
337 S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
338 pcb->acc = virt_to_phys(cpump->area) | 0x1;
339 /* Enable CPU instruction lookup for PAIE1 control block */
340 __ctl_set_bit(0, 49);
341 debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
342 __func__, S390_lowcore.aicd, pcb->acc);
343 }
344 if (flags & PERF_EF_START && !event->attr.sample_period) {
345 /* Only counting needs initial counter value */
346 paiext_start(event, PERF_EF_RELOAD);
347 }
348 event->hw.state = 0;
349 if (event->attr.sample_period) {
350 cpump->event = event;
351 perf_sched_cb_inc(event->pmu);
352 }
353 return 0;
354}
355
356static void paiext_stop(struct perf_event *event, int flags)
357{
358 paiext_read(event);
359 event->hw.state = PERF_HES_STOPPED;
360}
361
362static void paiext_del(struct perf_event *event, int flags)
363{
364 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
365 struct paiext_map *cpump = mp->mapptr;
366 struct paiext_cb *pcb = cpump->paiext_cb;
367
368 if (event->attr.sample_period)
369 perf_sched_cb_dec(event->pmu);
370 if (!event->attr.sample_period) {
371 /* Only counting needs to read counter */
372 paiext_stop(event, PERF_EF_UPDATE);
373 }
374 if (--cpump->active_events == 0) {
375 /* Disable CPU instruction lookup for PAIE1 control block */
376 __ctl_clear_bit(0, 49);
377 pcb->acc = 0;
378 S390_lowcore.aicd = 0;
379 debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
380 __func__, S390_lowcore.aicd, pcb->acc);
381 }
382}
383
384/* Create raw data and save it in buffer. Returns number of bytes copied.
385 * Saves only positive counter entries of the form
386 * 2 bytes: Number of counter
387 * 8 bytes: Value of counter
388 */
389static size_t paiext_copy(struct paiext_map *cpump)
390{
391 struct pai_userdata *userdata = cpump->save;
392 int i, outidx = 0;
393
394 for (i = 1; i <= paiext_cnt; i++) {
395 u64 val = paiext_getctr(cpump, i);
396
397 if (val) {
398 userdata[outidx].num = i;
399 userdata[outidx].value = val;
400 outidx++;
401 }
402 }
403 return outidx * sizeof(*userdata);
404}
405
406/* Write sample when one or more counters values are nonzero.
407 *
408 * Note: The function paiext_sched_task() and paiext_push_sample() are not
409 * invoked after function paiext_del() has been called because of function
410 * perf_sched_cb_dec().
411 * The function paiext_sched_task() and paiext_push_sample() are only
412 * called when sampling is active. Function perf_sched_cb_inc()
413 * has been invoked to install function paiext_sched_task() as call back
414 * to run at context switch time (see paiext_add()).
415 *
416 * This causes function perf_event_context_sched_out() and
417 * perf_event_context_sched_in() to check whether the PMU has installed an
418 * sched_task() callback. That callback is not active after paiext_del()
419 * returns and has deleted the event on that CPU.
420 */
421static int paiext_push_sample(void)
422{
423 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
424 struct paiext_map *cpump = mp->mapptr;
425 struct perf_event *event = cpump->event;
426 struct perf_sample_data data;
427 struct perf_raw_record raw;
428 struct pt_regs regs;
429 size_t rawsize;
430 int overflow;
431
432 rawsize = paiext_copy(cpump);
433 if (!rawsize) /* No incremented counters */
434 return 0;
435
436 /* Setup perf sample */
437 memset(®s, 0, sizeof(regs));
438 memset(&raw, 0, sizeof(raw));
439 memset(&data, 0, sizeof(data));
440 perf_sample_data_init(&data, 0, event->hw.last_period);
441 if (event->attr.sample_type & PERF_SAMPLE_TID) {
442 data.tid_entry.pid = task_tgid_nr(current);
443 data.tid_entry.tid = task_pid_nr(current);
444 }
445 if (event->attr.sample_type & PERF_SAMPLE_TIME)
446 data.time = event->clock();
447 if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
448 data.id = event->id;
449 if (event->attr.sample_type & PERF_SAMPLE_CPU)
450 data.cpu_entry.cpu = smp_processor_id();
451 if (event->attr.sample_type & PERF_SAMPLE_RAW) {
452 raw.frag.size = rawsize;
453 raw.frag.data = cpump->save;
454 raw.size = raw.frag.size;
455 data.raw = &raw;
456 data.sample_flags |= PERF_SAMPLE_RAW;
457 }
458
459 overflow = perf_event_overflow(event, &data, ®s);
460 perf_event_update_userpage(event);
461 /* Clear lowcore area after read */
462 memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
463 return overflow;
464}
465
466/* Called on schedule-in and schedule-out. No access to event structure,
467 * but for sampling only event NNPA_ALL is allowed.
468 */
469static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
470{
471 /* We started with a clean page on event installation. So read out
472 * results on schedule_out and if page was dirty, clear values.
473 */
474 if (!sched_in)
475 paiext_push_sample();
476}
477
478/* Attribute definitions for pai extension1 interface. As with other CPU
479 * Measurement Facilities, there is one attribute per mapped counter.
480 * The number of mapped counters may vary per machine generation. Use
481 * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
482 * to determine the number of mapped counters. The instructions returns
483 * a positive number, which is the highest number of supported counters.
484 * All counters less than this number are also supported, there are no
485 * holes. A returned number of zero means no support for mapped counters.
486 *
487 * The identification of the counter is a unique number. The chosen range
488 * is 0x1800 + offset in mapped kernel page.
489 * All CPU Measurement Facility counters identifiers must be unique and
490 * the numbers from 0 to 496 are already used for the CPU Measurement
491 * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography
492 * counters.
493 * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
494 * used for the CPU Measurement Sampling facility.
495 */
496PMU_FORMAT_ATTR(event, "config:0-63");
497
498static struct attribute *paiext_format_attr[] = {
499 &format_attr_event.attr,
500 NULL,
501};
502
503static struct attribute_group paiext_events_group = {
504 .name = "events",
505 .attrs = NULL, /* Filled in attr_event_init() */
506};
507
508static struct attribute_group paiext_format_group = {
509 .name = "format",
510 .attrs = paiext_format_attr,
511};
512
513static const struct attribute_group *paiext_attr_groups[] = {
514 &paiext_events_group,
515 &paiext_format_group,
516 NULL,
517};
518
519/* Performance monitoring unit for mapped counters */
520static struct pmu paiext = {
521 .task_ctx_nr = perf_invalid_context,
522 .event_init = paiext_event_init,
523 .add = paiext_add,
524 .del = paiext_del,
525 .start = paiext_start,
526 .stop = paiext_stop,
527 .read = paiext_read,
528 .sched_task = paiext_sched_task,
529 .attr_groups = paiext_attr_groups,
530};
531
/* List of symbolic PAI extension 1 NNPA counter names. The array index is
 * the counter number, that is the distance of the event number to
 * PAI_NNPA_BASE. Index 0 names the event covering all counters.
 */
static const char * const paiext_ctrnames[] = {
	[0]  = "NNPA_ALL",
	[1]  = "NNPA_ADD",
	[2]  = "NNPA_SUB",
	[3]  = "NNPA_MUL",
	[4]  = "NNPA_DIV",
	[5]  = "NNPA_MIN",
	[6]  = "NNPA_MAX",
	[7]  = "NNPA_LOG",
	[8]  = "NNPA_EXP",
	[9]  = "NNPA_IBM_RESERVED_9",
	[10] = "NNPA_RELU",
	[11] = "NNPA_TANH",
	[12] = "NNPA_SIGMOID",
	[13] = "NNPA_SOFTMAX",
	[14] = "NNPA_BATCHNORM",
	[15] = "NNPA_MAXPOOL2D",
	[16] = "NNPA_AVGPOOL2D",
	[17] = "NNPA_LSTMACT",
	[18] = "NNPA_GRUACT",
	[19] = "NNPA_CONVOLUTION",
	[20] = "NNPA_MATMUL_OP",
	[21] = "NNPA_MATMUL_OP_BCAST23",
	[22] = "NNPA_SMALLBATCH",
	[23] = "NNPA_LARGEDIM",
	[24] = "NNPA_SMALLTENSOR",
	[25] = "NNPA_1MFRAME",
	[26] = "NNPA_2GFRAME",
	[27] = "NNPA_ACCESSEXCEPT",
};
563
564static void __init attr_event_free(struct attribute **attrs, int num)
565{
566 struct perf_pmu_events_attr *pa;
567 struct device_attribute *dap;
568 int i;
569
570 for (i = 0; i < num; i++) {
571 dap = container_of(attrs[i], struct device_attribute, attr);
572 pa = container_of(dap, struct perf_pmu_events_attr, attr);
573 kfree(pa);
574 }
575 kfree(attrs);
576}
577
578static int __init attr_event_init_one(struct attribute **attrs, int num)
579{
580 struct perf_pmu_events_attr *pa;
581
582 pa = kzalloc(sizeof(*pa), GFP_KERNEL);
583 if (!pa)
584 return -ENOMEM;
585
586 sysfs_attr_init(&pa->attr.attr);
587 pa->id = PAI_NNPA_BASE + num;
588 pa->attr.attr.name = paiext_ctrnames[num];
589 pa->attr.attr.mode = 0444;
590 pa->attr.show = cpumf_events_sysfs_show;
591 pa->attr.store = NULL;
592 attrs[num] = &pa->attr.attr;
593 return 0;
594}
595
596/* Create PMU sysfs event attributes on the fly. */
597static int __init attr_event_init(void)
598{
599 struct attribute **attrs;
600 int ret, i;
601
602 attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs),
603 GFP_KERNEL);
604 if (!attrs)
605 return -ENOMEM;
606 for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) {
607 ret = attr_event_init_one(attrs, i);
608 if (ret) {
609 attr_event_free(attrs, i - 1);
610 return ret;
611 }
612 }
613 attrs[i] = NULL;
614 paiext_events_group.attrs = attrs;
615 return 0;
616}
617
618static int __init paiext_init(void)
619{
620 struct qpaci_info_block ib;
621 int rc = -ENOMEM;
622
623 if (!test_facility(197))
624 return 0;
625
626 qpaci(&ib);
627 paiext_cnt = ib.num_nnpa;
628 if (paiext_cnt >= PAI_NNPA_MAXCTR)
629 paiext_cnt = PAI_NNPA_MAXCTR;
630 if (!paiext_cnt)
631 return 0;
632
633 rc = attr_event_init();
634 if (rc) {
635 pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n");
636 return rc;
637 }
638
639 /* Setup s390dbf facility */
640 paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
641 if (!paiext_dbg) {
642 pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n");
643 rc = -ENOMEM;
644 goto out_init;
645 }
646 debug_register_view(paiext_dbg, &debug_sprintf_view);
647
648 rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1);
649 if (rc) {
650 pr_err("Registration of " KMSG_COMPONENT " PMU failed with "
651 "rc=%i\n", rc);
652 goto out_pmu;
653 }
654
655 return 0;
656
657out_pmu:
658 debug_unregister_view(paiext_dbg, &debug_sprintf_view);
659 debug_unregister(paiext_dbg);
660out_init:
661 attr_event_free(paiext_events_group.attrs,
662 ARRAY_SIZE(paiext_ctrnames) + 1);
663 return rc;
664}
665
666device_initcall(paiext_init);
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Performance event support - Processor Activity Instrumentation Extension
4 * Facility
5 *
6 * Copyright IBM Corp. 2022
7 * Author(s): Thomas Richter <tmricht@linux.ibm.com>
8 */
9#define KMSG_COMPONENT "pai_ext"
10#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
11
12#include <linux/kernel.h>
13#include <linux/kernel_stat.h>
14#include <linux/percpu.h>
15#include <linux/notifier.h>
16#include <linux/init.h>
17#include <linux/export.h>
18#include <linux/io.h>
19#include <linux/perf_event.h>
20#include <asm/ctlreg.h>
21#include <asm/pai.h>
22#include <asm/debug.h>
23
24#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */
25#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */
26
27static debug_info_t *paiext_dbg;
28static unsigned int paiext_cnt; /* Extracted with QPACI instruction */
29
30struct pai_userdata {
31 u16 num;
32 u64 value;
33} __packed;
34
35/* Create the PAI extension 1 control block area.
36 * The PAI extension control block 1 is pointed to by lowcore
37 * address 0x1508 for each CPU. This control block is 512 bytes in size
38 * and requires a 512 byte boundary alignment.
39 */
40struct paiext_cb { /* PAI extension 1 control block */
41 u64 header; /* Not used */
42 u64 reserved1;
43 u64 acc; /* Addr to analytics counter control block */
44 u8 reserved2[488];
45} __packed;
46
47struct paiext_map {
48 unsigned long *area; /* Area for CPU to store counters */
49 struct pai_userdata *save; /* Area to store non-zero counters */
50 enum paievt_mode mode; /* Type of event */
51 unsigned int active_events; /* # of PAI Extension users */
52 refcount_t refcnt;
53 struct perf_event *event; /* Perf event for sampling */
54 struct paiext_cb *paiext_cb; /* PAI extension control block area */
55};
56
/* Per CPU slot holding the pointer to that CPU's paiext_map (or NULL). */
struct paiext_mapptr {
	struct paiext_map *mapptr;
};
60
61static struct paiext_root { /* Anchor to per CPU data */
62 refcount_t refcnt; /* Overall active events */
63 struct paiext_mapptr __percpu *mapptr;
64} paiext_root;
65
66/* Free per CPU data when the last event is removed. */
67static void paiext_root_free(void)
68{
69 if (refcount_dec_and_test(&paiext_root.refcnt)) {
70 free_percpu(paiext_root.mapptr);
71 paiext_root.mapptr = NULL;
72 }
73}
74
75/* On initialization of first event also allocate per CPU data dynamically.
76 * Start with an array of pointers, the array size is the maximum number of
77 * CPUs possible, which might be larger than the number of CPUs currently
78 * online.
79 */
80static int paiext_root_alloc(void)
81{
82 if (!refcount_inc_not_zero(&paiext_root.refcnt)) {
83 /* The memory is already zeroed. */
84 paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
85 if (!paiext_root.mapptr) {
86 /* Returning without refcnt adjustment is ok. The
87 * error code is handled by paiext_alloc() which
88 * decrements refcnt when an event can not be
89 * created.
90 */
91 return -ENOMEM;
92 }
93 refcount_set(&paiext_root.refcnt, 1);
94 }
95 return 0;
96}
97
/* Serializes creation and destruction of events:
 * - protects the reference counters of the anchor and of the per CPU maps
 *   against concurrent modification,
 * - prohibits concurrent execution of counting and sampling events,
 * - ensures that the analytics counter block is deallocated only when no
 *   sampling and no counting event is left on that CPU.
 * For details see paiext_alloc().
 */
static DEFINE_MUTEX(paiext_reserve_mutex);
106
107/* Free all memory allocated for event counting/sampling setup */
108static void paiext_free(struct paiext_mapptr *mp)
109{
110 kfree(mp->mapptr->area);
111 kfree(mp->mapptr->paiext_cb);
112 kvfree(mp->mapptr->save);
113 kfree(mp->mapptr);
114 mp->mapptr = NULL;
115}
116
117/* Release the PMU if event is the last perf event */
118static void paiext_event_destroy(struct perf_event *event)
119{
120 struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
121 struct paiext_map *cpump = mp->mapptr;
122
123 mutex_lock(&paiext_reserve_mutex);
124 cpump->event = NULL;
125 if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */
126 paiext_free(mp);
127 paiext_root_free();
128 mutex_unlock(&paiext_reserve_mutex);
129 debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
130 event->cpu, mp->mapptr);
131
132}
133
134/* Used to avoid races in checking concurrent access of counting and
135 * sampling for pai_extension events.
136 *
137 * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
138 * allowed and when this event is running, no counting event is allowed.
139 * Several counting events are allowed in parallel, but no sampling event
140 * is allowed while one (or more) counting events are running.
141 *
142 * This function is called in process context and it is safe to block.
143 * When the event initialization functions fails, no other call back will
144 * be invoked.
145 *
146 * Allocate the memory for the event.
147 */
148static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
149{
150 struct paiext_mapptr *mp;
151 struct paiext_map *cpump;
152 int rc;
153
154 mutex_lock(&paiext_reserve_mutex);
155
156 rc = paiext_root_alloc();
157 if (rc)
158 goto unlock;
159
160 mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
161 cpump = mp->mapptr;
162 if (!cpump) { /* Paiext_map allocated? */
163 rc = -ENOMEM;
164 cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
165 if (!cpump)
166 goto undo;
167
168 /* Allocate memory for counter area and counter extraction.
169 * These are
170 * - a 512 byte block and requires 512 byte boundary alignment.
171 * - a 1KB byte block and requires 1KB boundary alignment.
172 * Only the first counting event has to allocate the area.
173 *
174 * Note: This works with commit 59bb47985c1d by default.
175 * Backporting this to kernels without this commit might
176 * need adjustment.
177 */
178 mp->mapptr = cpump;
179 cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
180 cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
181 cpump->save = kvmalloc_array(paiext_cnt + 1,
182 sizeof(struct pai_userdata),
183 GFP_KERNEL);
184 if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
185 paiext_free(mp);
186 goto undo;
187 }
188 refcount_set(&cpump->refcnt, 1);
189 cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
190 : PAI_MODE_COUNTING;
191 } else {
192 /* Multiple invocation, check what is active.
193 * Supported are multiple counter events or only one sampling
194 * event concurrently at any one time.
195 */
196 if (cpump->mode == PAI_MODE_SAMPLING ||
197 (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
198 rc = -EBUSY;
199 goto undo;
200 }
201 refcount_inc(&cpump->refcnt);
202 }
203
204 rc = 0;
205 cpump->event = event;
206
207undo:
208 if (rc) {
209 /* Error in allocation of event, decrement anchor. Since
210 * the event in not created, its destroy() function is never
211 * invoked. Adjust the reference counter for the anchor.
212 */
213 paiext_root_free();
214 }
215unlock:
216 mutex_unlock(&paiext_reserve_mutex);
217 /* If rc is non-zero, no increment of counter/sampler was done. */
218 return rc;
219}
220
221/* The PAI extension 1 control block supports up to 128 entries. Return
222 * the index within PAIE1_CB given the event number. Also validate event
223 * number.
224 */
225static int paiext_event_valid(struct perf_event *event)
226{
227 u64 cfg = event->attr.config;
228
229 if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) {
230 /* Offset NNPA in paiext_cb */
231 event->hw.config_base = offsetof(struct paiext_cb, acc);
232 return 0;
233 }
234 return -EINVAL;
235}
236
237/* Might be called on different CPU than the one the event is intended for. */
238static int paiext_event_init(struct perf_event *event)
239{
240 struct perf_event_attr *a = &event->attr;
241 int rc;
242
243 /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
244 if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
245 return -ENOENT;
246 /* PAI extension event must be valid and in supported range */
247 rc = paiext_event_valid(event);
248 if (rc)
249 return rc;
250 /* Allow only CPU wide operation, no process context for now. */
251 if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
252 return -ENOENT;
253 /* Allow only event NNPA_ALL for sampling. */
254 if (a->sample_period && a->config != PAI_NNPA_BASE)
255 return -EINVAL;
256 /* Prohibit exclude_user event selection */
257 if (a->exclude_user)
258 return -EINVAL;
259
260 rc = paiext_alloc(a, event);
261 if (rc)
262 return rc;
263 event->destroy = paiext_event_destroy;
264
265 if (a->sample_period) {
266 a->sample_period = 1;
267 a->freq = 0;
268 /* Register for paicrypt_sched_task() to be called */
269 event->attach_state |= PERF_ATTACH_SCHED_CB;
270 /* Add raw data which are the memory mapped counters */
271 a->sample_type |= PERF_SAMPLE_RAW;
272 /* Turn off inheritance */
273 a->inherit = 0;
274 }
275
276 return 0;
277}
278
279static u64 paiext_getctr(unsigned long *area, int nr)
280{
281 return area[nr];
282}
283
284/* Read the counter values. Return value from location in buffer. For event
285 * NNPA_ALL sum up all events.
286 */
287static u64 paiext_getdata(struct perf_event *event)
288{
289 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
290 struct paiext_map *cpump = mp->mapptr;
291 u64 sum = 0;
292 int i;
293
294 if (event->attr.config != PAI_NNPA_BASE)
295 return paiext_getctr(cpump->area,
296 event->attr.config - PAI_NNPA_BASE);
297
298 for (i = 1; i <= paiext_cnt; i++)
299 sum += paiext_getctr(cpump->area, i);
300
301 return sum;
302}
303
304static u64 paiext_getall(struct perf_event *event)
305{
306 return paiext_getdata(event);
307}
308
309static void paiext_read(struct perf_event *event)
310{
311 u64 prev, new, delta;
312
313 prev = local64_read(&event->hw.prev_count);
314 new = paiext_getall(event);
315 local64_set(&event->hw.prev_count, new);
316 delta = new - prev;
317 local64_add(delta, &event->count);
318}
319
320static void paiext_start(struct perf_event *event, int flags)
321{
322 u64 sum;
323
324 if (!event->attr.sample_period) { /* Counting */
325 if (!event->hw.last_tag) {
326 event->hw.last_tag = 1;
327 sum = paiext_getall(event); /* Get current value */
328 local64_set(&event->hw.prev_count, sum);
329 }
330 } else { /* Sampling */
331 perf_sched_cb_inc(event->pmu);
332 }
333}
334
335static int paiext_add(struct perf_event *event, int flags)
336{
337 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
338 struct paiext_map *cpump = mp->mapptr;
339 struct paiext_cb *pcb = cpump->paiext_cb;
340
341 if (++cpump->active_events == 1) {
342 S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
343 pcb->acc = virt_to_phys(cpump->area) | 0x1;
344 /* Enable CPU instruction lookup for PAIE1 control block */
345 local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
346 debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
347 __func__, S390_lowcore.aicd, pcb->acc);
348 }
349 cpump->event = event;
350 if (flags & PERF_EF_START)
351 paiext_start(event, PERF_EF_RELOAD);
352 event->hw.state = 0;
353 return 0;
354}
355
356static void paiext_stop(struct perf_event *event, int flags)
357{
358 if (!event->attr.sample_period) /* Counting */
359 paiext_read(event);
360 else /* Sampling */
361 perf_sched_cb_dec(event->pmu);
362 event->hw.state = PERF_HES_STOPPED;
363}
364
365static void paiext_del(struct perf_event *event, int flags)
366{
367 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
368 struct paiext_map *cpump = mp->mapptr;
369 struct paiext_cb *pcb = cpump->paiext_cb;
370
371 paiext_stop(event, PERF_EF_UPDATE);
372 if (--cpump->active_events == 0) {
373 /* Disable CPU instruction lookup for PAIE1 control block */
374 local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
375 pcb->acc = 0;
376 S390_lowcore.aicd = 0;
377 debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
378 __func__, S390_lowcore.aicd, pcb->acc);
379 }
380}
381
382/* Create raw data and save it in buffer. Returns number of bytes copied.
383 * Saves only positive counter entries of the form
384 * 2 bytes: Number of counter
385 * 8 bytes: Value of counter
386 */
387static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area)
388{
389 int i, outidx = 0;
390
391 for (i = 1; i <= paiext_cnt; i++) {
392 u64 val = paiext_getctr(area, i);
393
394 if (val) {
395 userdata[outidx].num = i;
396 userdata[outidx].value = val;
397 outidx++;
398 }
399 }
400 return outidx * sizeof(*userdata);
401}
402
403/* Write sample when one or more counters values are nonzero.
404 *
405 * Note: The function paiext_sched_task() and paiext_push_sample() are not
406 * invoked after function paiext_del() has been called because of function
407 * perf_sched_cb_dec().
408 * The function paiext_sched_task() and paiext_push_sample() are only
409 * called when sampling is active. Function perf_sched_cb_inc()
410 * has been invoked to install function paiext_sched_task() as call back
411 * to run at context switch time (see paiext_add()).
412 *
413 * This causes function perf_event_context_sched_out() and
414 * perf_event_context_sched_in() to check whether the PMU has installed an
415 * sched_task() callback. That callback is not active after paiext_del()
416 * returns and has deleted the event on that CPU.
417 */
418static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
419 struct perf_event *event)
420{
421 struct perf_sample_data data;
422 struct perf_raw_record raw;
423 struct pt_regs regs;
424 int overflow;
425
426 /* Setup perf sample */
427 memset(®s, 0, sizeof(regs));
428 memset(&raw, 0, sizeof(raw));
429 memset(&data, 0, sizeof(data));
430 perf_sample_data_init(&data, 0, event->hw.last_period);
431 if (event->attr.sample_type & PERF_SAMPLE_TID) {
432 data.tid_entry.pid = task_tgid_nr(current);
433 data.tid_entry.tid = task_pid_nr(current);
434 }
435 if (event->attr.sample_type & PERF_SAMPLE_TIME)
436 data.time = event->clock();
437 if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
438 data.id = event->id;
439 if (event->attr.sample_type & PERF_SAMPLE_CPU)
440 data.cpu_entry.cpu = smp_processor_id();
441 if (event->attr.sample_type & PERF_SAMPLE_RAW) {
442 raw.frag.size = rawsize;
443 raw.frag.data = cpump->save;
444 perf_sample_save_raw_data(&data, &raw);
445 }
446
447 overflow = perf_event_overflow(event, &data, ®s);
448 perf_event_update_userpage(event);
449 /* Clear lowcore area after read */
450 memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
451 return overflow;
452}
453
454/* Check if there is data to be saved on schedule out of a task. */
455static int paiext_have_sample(void)
456{
457 struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
458 struct paiext_map *cpump = mp->mapptr;
459 struct perf_event *event = cpump->event;
460 size_t rawsize;
461 int rc = 0;
462
463 if (!event)
464 return 0;
465 rawsize = paiext_copy(cpump->save, cpump->area);
466 if (rawsize) /* Incremented counters */
467 rc = paiext_push_sample(rawsize, cpump, event);
468 return rc;
469}
470
471/* Called on schedule-in and schedule-out. No access to event structure,
472 * but for sampling only event NNPA_ALL is allowed.
473 */
474static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
475{
476 /* We started with a clean page on event installation. So read out
477 * results on schedule_out and if page was dirty, clear values.
478 */
479 if (!sched_in)
480 paiext_have_sample();
481}
482
483/* Attribute definitions for pai extension1 interface. As with other CPU
484 * Measurement Facilities, there is one attribute per mapped counter.
485 * The number of mapped counters may vary per machine generation. Use
486 * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
487 * to determine the number of mapped counters. The instructions returns
488 * a positive number, which is the highest number of supported counters.
489 * All counters less than this number are also supported, there are no
490 * holes. A returned number of zero means no support for mapped counters.
491 *
492 * The identification of the counter is a unique number. The chosen range
493 * is 0x1800 + offset in mapped kernel page.
494 * All CPU Measurement Facility counters identifiers must be unique and
495 * the numbers from 0 to 496 are already used for the CPU Measurement
496 * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography
497 * counters.
498 * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
499 * used for the CPU Measurement Sampling facility.
500 */
501PMU_FORMAT_ATTR(event, "config:0-63");
502
503static struct attribute *paiext_format_attr[] = {
504 &format_attr_event.attr,
505 NULL,
506};
507
508static struct attribute_group paiext_events_group = {
509 .name = "events",
510 .attrs = NULL, /* Filled in attr_event_init() */
511};
512
513static struct attribute_group paiext_format_group = {
514 .name = "format",
515 .attrs = paiext_format_attr,
516};
517
518static const struct attribute_group *paiext_attr_groups[] = {
519 &paiext_events_group,
520 &paiext_format_group,
521 NULL,
522};
523
524/* Performance monitoring unit for mapped counters */
525static struct pmu paiext = {
526 .task_ctx_nr = perf_invalid_context,
527 .event_init = paiext_event_init,
528 .add = paiext_add,
529 .del = paiext_del,
530 .start = paiext_start,
531 .stop = paiext_stop,
532 .read = paiext_read,
533 .sched_task = paiext_sched_task,
534 .attr_groups = paiext_attr_groups,
535};
536
/* Symbolic names of the PAI extension 1 NNPA counters.
 * The array index is the offset of the counter, it is added to
 * PAI_NNPA_BASE to form the event number (see attr_event_init_one()).
 */
static const char * const paiext_ctrnames[] = {
	[0]  = "NNPA_ALL",
	[1]  = "NNPA_ADD",
	[2]  = "NNPA_SUB",
	[3]  = "NNPA_MUL",
	[4]  = "NNPA_DIV",
	[5]  = "NNPA_MIN",
	[6]  = "NNPA_MAX",
	[7]  = "NNPA_LOG",
	[8]  = "NNPA_EXP",
	[9]  = "NNPA_IBM_RESERVED_9",
	[10] = "NNPA_RELU",
	[11] = "NNPA_TANH",
	[12] = "NNPA_SIGMOID",
	[13] = "NNPA_SOFTMAX",
	[14] = "NNPA_BATCHNORM",
	[15] = "NNPA_MAXPOOL2D",
	[16] = "NNPA_AVGPOOL2D",
	[17] = "NNPA_LSTMACT",
	[18] = "NNPA_GRUACT",
	[19] = "NNPA_CONVOLUTION",
	[20] = "NNPA_MATMUL_OP",
	[21] = "NNPA_MATMUL_OP_BCAST23",
	[22] = "NNPA_SMALLBATCH",
	[23] = "NNPA_LARGEDIM",
	[24] = "NNPA_SMALLTENSOR",
	[25] = "NNPA_1MFRAME",
	[26] = "NNPA_2GFRAME",
	[27] = "NNPA_ACCESSEXCEPT",
};
568
569static void __init attr_event_free(struct attribute **attrs, int num)
570{
571 struct perf_pmu_events_attr *pa;
572 struct device_attribute *dap;
573 int i;
574
575 for (i = 0; i < num; i++) {
576 dap = container_of(attrs[i], struct device_attribute, attr);
577 pa = container_of(dap, struct perf_pmu_events_attr, attr);
578 kfree(pa);
579 }
580 kfree(attrs);
581}
582
583static int __init attr_event_init_one(struct attribute **attrs, int num)
584{
585 struct perf_pmu_events_attr *pa;
586
587 pa = kzalloc(sizeof(*pa), GFP_KERNEL);
588 if (!pa)
589 return -ENOMEM;
590
591 sysfs_attr_init(&pa->attr.attr);
592 pa->id = PAI_NNPA_BASE + num;
593 pa->attr.attr.name = paiext_ctrnames[num];
594 pa->attr.attr.mode = 0444;
595 pa->attr.show = cpumf_events_sysfs_show;
596 pa->attr.store = NULL;
597 attrs[num] = &pa->attr.attr;
598 return 0;
599}
600
601/* Create PMU sysfs event attributes on the fly. */
602static int __init attr_event_init(void)
603{
604 struct attribute **attrs;
605 int ret, i;
606
607 attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs),
608 GFP_KERNEL);
609 if (!attrs)
610 return -ENOMEM;
611 for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) {
612 ret = attr_event_init_one(attrs, i);
613 if (ret) {
614 attr_event_free(attrs, i - 1);
615 return ret;
616 }
617 }
618 attrs[i] = NULL;
619 paiext_events_group.attrs = attrs;
620 return 0;
621}
622
623static int __init paiext_init(void)
624{
625 struct qpaci_info_block ib;
626 int rc = -ENOMEM;
627
628 if (!test_facility(197))
629 return 0;
630
631 qpaci(&ib);
632 paiext_cnt = ib.num_nnpa;
633 if (paiext_cnt >= PAI_NNPA_MAXCTR)
634 paiext_cnt = PAI_NNPA_MAXCTR;
635 if (!paiext_cnt)
636 return 0;
637
638 rc = attr_event_init();
639 if (rc) {
640 pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n");
641 return rc;
642 }
643
644 /* Setup s390dbf facility */
645 paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
646 if (!paiext_dbg) {
647 pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n");
648 rc = -ENOMEM;
649 goto out_init;
650 }
651 debug_register_view(paiext_dbg, &debug_sprintf_view);
652
653 rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1);
654 if (rc) {
655 pr_err("Registration of " KMSG_COMPONENT " PMU failed with "
656 "rc=%i\n", rc);
657 goto out_pmu;
658 }
659
660 return 0;
661
662out_pmu:
663 debug_unregister_view(paiext_dbg, &debug_sprintf_view);
664 debug_unregister(paiext_dbg);
665out_init:
666 attr_event_free(paiext_events_group.attrs,
667 ARRAY_SIZE(paiext_ctrnames) + 1);
668 return rc;
669}
670
671device_initcall(paiext_init);