Linux Audio

Check our new training course

Loading...
v5.4
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright IBM Corp. 2018
   4 * Auxtrace support for s390 CPU-Measurement Sampling Facility
   5 *
   6 * Author(s):  Thomas Richter <tmricht@linux.ibm.com>
   7 *
   8 * Auxiliary traces are collected during 'perf record' using rbd000 event.
   9 * Several PERF_RECORD_XXX are generated during recording:
  10 *
  11 * PERF_RECORD_AUX:
  12 *	Records that new data landed in the AUX buffer part.
  13 * PERF_RECORD_AUXTRACE:
  14 *	Defines auxtrace data. Followed by the actual data. The contents of
  15 *	the auxtrace data is dependent on the event and the CPU.
  16 *	This record is generated by perf record command. For details
  17 *	see Documentation/perf.data-file-format.txt.
  18 * PERF_RECORD_AUXTRACE_INFO:
   19 *	Defines a table of contents for PERF_RECORD_AUXTRACE records. This
  20 *	record is generated during 'perf record' command. Each record contains
  21 *	up to 256 entries describing offset and size of the AUXTRACE data in the
  22 *	perf.data file.
  23 * PERF_RECORD_AUXTRACE_ERROR:
  24 *	Indicates an error during AUXTRACE collection such as buffer overflow.
  25 * PERF_RECORD_FINISHED_ROUND:
  26 *	Perf events are not necessarily in time stamp order, as they can be
  27 *	collected in parallel on different CPUs. If the events should be
  28 *	processed in time order they need to be sorted first.
  29 *	Perf report guarantees that there is no reordering over a
  30 *	PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a
  31 *	time stamp lower than this record are processed (and displayed) before
  32 *	the succeeding perf record are processed.
  33 *
  34 * These records are evaluated during perf report command.
  35 *
  36 * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for
  37 * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info()
  38 * below.
  39 * Auxiliary trace data is collected per CPU. To merge the data into the report
  40 * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace
  41 * data is in ascending order.
  42 *
  43 * Each queue has a double linked list of auxtrace_buffers. This list contains
  44 * the offset and size of a CPU's auxtrace data. During auxtrace processing
  45 * the data portion is mmap()'ed.
  46 *
  47 * To sort the queues in chronological order, all queue access is controlled
   48 * by the auxtrace_heap. This is basically a stack, each stack element has two
  49 * entries, the queue number and a time stamp. However the stack is sorted by
  50 * the time stamps. The highest time stamp is at the bottom the lowest
  51 * (nearest) time stamp is at the top. That sort order is maintained at all
  52 * times!
  53 *
  54 * After the auxtrace infrastructure has been setup, the auxtrace queues are
  55 * filled with data (offset/size pairs) and the auxtrace_heap is populated.
  56 *
  57 * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues.
  58 * Each record is handled by s390_cpumsf_process_event(). The time stamp of
  59 * the perf record is compared with the time stamp located on the auxtrace_heap
  60 * top element. If that time stamp is lower than the time stamp from the
  61 * record sample, the auxtrace queues will be processed. As auxtrace queues
  62 * control many auxtrace_buffers and each buffer can be quite large, the
  63 * auxtrace buffer might be processed only partially. In this case the
  64 * position in the auxtrace_buffer of that queue is remembered and the time
  65 * stamp of the last processed entry of the auxtrace_buffer replaces the
  66 * current auxtrace_heap top.
  67 *
   68 * 3. Auxtrace_queues might run out of data and are fed by the
  69 * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
  70 *
  71 * Event Generation
   72 * Each sampling-data entry in the auxiliary trace data generates a perf sample.
  73 * This sample is filled
  74 * with data from the auxtrace such as PID/TID, instruction address, CPU state,
  75 * etc. This sample is processed with perf_session__deliver_synth_event() to
  76 * be included into the GUI.
  77 *
   78 * 4. PERF_RECORD_FINISHED_ROUND event is used to process all the remaining
   79 * auxiliary trace entries until the time stamp of this record is reached by
   80 * the auxtrace_heap top. This is triggered by ordered_event->deliver().
  81 *
  82 *
  83 * Perf event processing.
  84 * Event processing of PERF_RECORD_XXX entries relies on time stamp entries.
  85 * This is the function call sequence:
  86 *
  87 * __cmd_report()
  88 * |
  89 * perf_session__process_events()
  90 * |
  91 * __perf_session__process_events()
  92 * |
  93 * perf_session__process_event()
   94 * |  This function splits the PERF_RECORD_XXX records.
  95 * |  - Those generated by perf record command (type number equal or higher
  96 * |    than PERF_RECORD_USER_TYPE_START) are handled by
  97 * |    perf_session__process_user_event(see below)
  98 * |  - Those generated by the kernel are handled by
  99 * |    perf_evlist__parse_sample_timestamp()
 100 * |
 101 * perf_evlist__parse_sample_timestamp()
 102 * |  Extract time stamp from sample data.
 103 * |
 104 * perf_session__queue_event()
 105 * |  If timestamp is positive the sample is entered into an ordered_event
 106 * |  list, sort order is the timestamp. The event processing is deferred until
 107 * |  later (see perf_session__process_user_event()).
 108 * |  Other timestamps (0 or -1) are handled immediately by
 109 * |  perf_session__deliver_event(). These are events generated at start up
 110 * |  of command perf record. They create PERF_RECORD_COMM and PERF_RECORD_MMAP*
 111 * |  records. They are needed to create a list of running processes and its
 112 * |  memory mappings and layout. They are needed at the beginning to enable
 113 * |  command perf report to create process trees and memory mappings.
 114 * |
 115 * perf_session__deliver_event()
 116 * |  Delivers a PERF_RECORD_XXX entry for handling.
 117 * |
 118 * auxtrace__process_event()
 119 * |  The timestamp of the PERF_RECORD_XXX entry is taken to correlate with
 120 * |  time stamps from the auxiliary trace buffers. This enables
 121 * |  synchronization between auxiliary trace data and the events on the
 122 * |  perf.data file.
 123 * |
 124 * machine__deliver_event()
 125 * |  Handles the PERF_RECORD_XXX event. This depends on the record type.
 126 *    It might update the process tree, update a process memory map or enter
 127 *    a sample with IP and call back chain data into GUI data pool.
 128 *
 129 *
 130 * Deferred processing determined by perf_session__process_user_event() is
 131 * finally processed when a PERF_RECORD_FINISHED_ROUND is encountered. These
 132 * are generated during command perf record.
 133 * The timestamp of PERF_RECORD_FINISHED_ROUND event is taken to process all
 134 * PERF_RECORD_XXX entries stored in the ordered_event list. This list was
 135 * built up while reading the perf.data file.
 136 * Each event is now processed by calling perf_session__deliver_event().
 137 * This enables time synchronization between the data in the perf.data file and
 138 * the data in the auxiliary trace buffers.
 139 */
 140
 141#include <endian.h>
 142#include <errno.h>
 143#include <byteswap.h>
 144#include <inttypes.h>
 145#include <linux/kernel.h>
 146#include <linux/types.h>
 147#include <linux/bitops.h>
 148#include <linux/log2.h>
 149#include <linux/zalloc.h>
 150
 151#include <sys/stat.h>
 152#include <sys/types.h>
 153
 154#include "color.h"
 155#include "evsel.h"
 156#include "evlist.h"
 157#include "machine.h"
 158#include "session.h"
 159#include "tool.h"
 160#include "debug.h"
 161#include "auxtrace.h"
 162#include "s390-cpumsf.h"
 163#include "s390-cpumsf-kernel.h"
 164#include "s390-cpumcf-kernel.h"
 165#include "config.h"
 
 166
 167struct s390_cpumsf {
 168	struct auxtrace		auxtrace;
 169	struct auxtrace_queues	queues;
 170	struct auxtrace_heap	heap;
 171	struct perf_session	*session;
 172	struct machine		*machine;
 173	u32			auxtrace_type;
 174	u32			pmu_type;
 175	u16			machine_type;
 176	bool			data_queued;
 177	bool			use_logfile;
 178	char			*logdir;
 179};
 180
 181struct s390_cpumsf_queue {
 182	struct s390_cpumsf	*sf;
 183	unsigned int		queue_nr;
 184	struct auxtrace_buffer	*buffer;
 185	int			cpu;
 186	FILE			*logfile;
 187	FILE			*logfile_ctr;
 188};
 189
 190/* Check if the raw data should be dumped to file. If this is the case and
 191 * the file to dump to has not been opened for writing, do so.
 192 *
 193 * Return 0 on success and greater zero on error so processing continues.
 194 */
 195static int s390_cpumcf_dumpctr(struct s390_cpumsf *sf,
 196			       struct perf_sample *sample)
 197{
 198	struct s390_cpumsf_queue *sfq;
 199	struct auxtrace_queue *q;
 200	int rc = 0;
 201
 202	if (!sf->use_logfile || sf->queues.nr_queues <= sample->cpu)
 203		return rc;
 204
 205	q = &sf->queues.queue_array[sample->cpu];
 206	sfq = q->priv;
 207	if (!sfq)		/* Queue not yet allocated */
 208		return rc;
 209
 210	if (!sfq->logfile_ctr) {
 211		char *name;
 212
 213		rc = (sf->logdir)
 214			? asprintf(&name, "%s/aux.ctr.%02x",
 215				 sf->logdir, sample->cpu)
 216			: asprintf(&name, "aux.ctr.%02x", sample->cpu);
 217		if (rc > 0)
 218			sfq->logfile_ctr = fopen(name, "w");
 219		if (sfq->logfile_ctr == NULL) {
 220			pr_err("Failed to open counter set log file %s, "
 221			       "continue...\n", name);
 222			rc = 1;
 223		}
 224		free(name);
 225	}
 226
 227	if (sfq->logfile_ctr) {
 228		/* See comment above for -4 */
 229		size_t n = fwrite(sample->raw_data, sample->raw_size - 4, 1,
 230				  sfq->logfile_ctr);
 231		if (n != 1) {
 232			pr_err("Failed to write counter set data\n");
 233			rc = 1;
 234		}
 235	}
 236	return rc;
 237}
 238
 239/* Display s390 CPU measurement facility basic-sampling data entry
 240 * Data written on s390 in big endian byte order and contains bit
 241 * fields across byte boundaries.
 242 */
 243static bool s390_cpumsf_basic_show(const char *color, size_t pos,
 244				   struct hws_basic_entry *basicp)
 245{
 246	struct hws_basic_entry *basic = basicp;
 247#if __BYTE_ORDER == __LITTLE_ENDIAN
 248	struct hws_basic_entry local;
 249	unsigned long long word = be64toh(*(unsigned long long *)basicp);
 250
 251	memset(&local, 0, sizeof(local));
 252	local.def = be16toh(basicp->def);
 253	local.prim_asn = word & 0xffff;
 254	local.CL = word >> 30 & 0x3;
 255	local.I = word >> 32 & 0x1;
 256	local.AS = word >> 33 & 0x3;
 257	local.P = word >> 35 & 0x1;
 258	local.W = word >> 36 & 0x1;
 259	local.T = word >> 37 & 0x1;
 260	local.U = word >> 40 & 0xf;
 261	local.ia = be64toh(basicp->ia);
 262	local.gpp = be64toh(basicp->gpp);
 263	local.hpp = be64toh(basicp->hpp);
 264	basic = &local;
 265#endif
 266	if (basic->def != 1) {
 267		pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos);
 268		return false;
 269	}
 270	color_fprintf(stdout, color, "    [%#08zx] Basic   Def:%04x Inst:%#04x"
 271		      " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n"
 272		      "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n",
 273		      pos, basic->def, basic->U,
 274		      basic->T ? 'T' : ' ',
 275		      basic->W ? 'W' : ' ',
 276		      basic->P ? 'P' : ' ',
 277		      basic->I ? 'I' : ' ',
 278		      basic->AS, basic->prim_asn, basic->ia, basic->CL,
 279		      basic->hpp, basic->gpp);
 280	return true;
 281}
 282
 283/* Display s390 CPU measurement facility diagnostic-sampling data entry.
 284 * Data written on s390 in big endian byte order and contains bit
 285 * fields across byte boundaries.
 286 */
 287static bool s390_cpumsf_diag_show(const char *color, size_t pos,
 288				  struct hws_diag_entry *diagp)
 289{
 290	struct hws_diag_entry *diag = diagp;
 291#if __BYTE_ORDER == __LITTLE_ENDIAN
 292	struct hws_diag_entry local;
 293	unsigned long long word = be64toh(*(unsigned long long *)diagp);
 294
 295	local.def = be16toh(diagp->def);
 296	local.I = word >> 32 & 0x1;
 297	diag = &local;
 298#endif
 299	if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) {
 300		pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos);
 301		return false;
 302	}
 303	color_fprintf(stdout, color, "    [%#08zx] Diag    Def:%04x %c\n",
 304		      pos, diag->def, diag->I ? 'I' : ' ');
 305	return true;
 306}
 307
 308/* Return TOD timestamp contained in an trailer entry */
 309static unsigned long long trailer_timestamp(struct hws_trailer_entry *te,
 310					    int idx)
 311{
 312	/* te->t set: TOD in STCKE format, bytes 8-15
 313	 * to->t not set: TOD in STCK format, bytes 0-7
 314	 */
 315	unsigned long long ts;
 316
 317	memcpy(&ts, &te->timestamp[idx], sizeof(ts));
 318	return be64toh(ts);
 319}
 320
 321/* Display s390 CPU measurement facility trailer entry */
 322static bool s390_cpumsf_trailer_show(const char *color, size_t pos,
 323				     struct hws_trailer_entry *te)
 324{
 325#if __BYTE_ORDER == __LITTLE_ENDIAN
 326	struct hws_trailer_entry local;
 327	const unsigned long long flags = be64toh(te->flags);
 328
 329	memset(&local, 0, sizeof(local));
 330	local.f = flags >> 63 & 0x1;
 331	local.a = flags >> 62 & 0x1;
 332	local.t = flags >> 61 & 0x1;
 333	local.bsdes = be16toh((flags >> 16 & 0xffff));
 334	local.dsdes = be16toh((flags & 0xffff));
 335	memcpy(&local.timestamp, te->timestamp, sizeof(te->timestamp));
 336	local.overflow = be64toh(te->overflow);
 337	local.clock_base = be64toh(te->progusage[0]) >> 63 & 1;
 338	local.progusage2 = be64toh(te->progusage2);
 339	te = &local;
 340#endif
 341	if (te->bsdes != sizeof(struct hws_basic_entry)) {
 342		pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos);
 343		return false;
 344	}
 345	color_fprintf(stdout, color, "    [%#08zx] Trailer %c%c%c bsdes:%d"
 346		      " dsdes:%d Overflow:%lld Time:%#llx\n"
 347		      "\t\tC:%d TOD:%#lx\n",
 348		      pos,
 349		      te->f ? 'F' : ' ',
 350		      te->a ? 'A' : ' ',
 351		      te->t ? 'T' : ' ',
 352		      te->bsdes, te->dsdes, te->overflow,
 353		      trailer_timestamp(te, te->clock_base),
 354		      te->clock_base, te->progusage2);
 355	return true;
 356}
 357
 358/* Test a sample data block. It must be 4KB or a multiple thereof in size and
 359 * 4KB page aligned. Each sample data page has a trailer entry at the
 360 * end which contains the sample entry data sizes.
 361 *
 362 * Return true if the sample data block passes the checks and set the
 363 * basic set entry size and diagnostic set entry size.
 364 *
 365 * Return false on failure.
 366 *
 367 * Note: Old hardware does not set the basic or diagnostic entry sizes
 368 * in the trailer entry. Use the type number instead.
 369 */
 370static bool s390_cpumsf_validate(int machine_type,
 371				 unsigned char *buf, size_t len,
 372				 unsigned short *bsdes,
 373				 unsigned short *dsdes)
 374{
 375	struct hws_basic_entry *basic = (struct hws_basic_entry *)buf;
 376	struct hws_trailer_entry *te;
 377
 378	*dsdes = *bsdes = 0;
 379	if (len & (S390_CPUMSF_PAGESZ - 1))	/* Illegal size */
 380		return false;
 381	if (be16toh(basic->def) != 1)	/* No basic set entry, must be first */
 382		return false;
 383	/* Check for trailer entry at end of SDB */
 384	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
 385					      - sizeof(*te));
 386	*bsdes = be16toh(te->bsdes);
 387	*dsdes = be16toh(te->dsdes);
 388	if (!te->bsdes && !te->dsdes) {
 389		/* Very old hardware, use CPUID */
 390		switch (machine_type) {
 391		case 2097:
 392		case 2098:
 393			*dsdes = 64;
 394			*bsdes = 32;
 395			break;
 396		case 2817:
 397		case 2818:
 398			*dsdes = 74;
 399			*bsdes = 32;
 400			break;
 401		case 2827:
 402		case 2828:
 403			*dsdes = 85;
 404			*bsdes = 32;
 405			break;
 406		case 2964:
 407		case 2965:
 408			*dsdes = 112;
 409			*bsdes = 32;
 410			break;
 411		default:
 412			/* Illegal trailer entry */
 413			return false;
 414		}
 415	}
 416	return true;
 417}
 418
 419/* Return true if there is room for another entry */
 420static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos)
 421{
 422	size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry);
 423
 424	if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz)
 425		return false;
 426	return true;
 427}
 428
 429/* Dump an auxiliary buffer. These buffers are multiple of
 430 * 4KB SDB pages.
 431 */
 432static void s390_cpumsf_dump(struct s390_cpumsf *sf,
 433			     unsigned char *buf, size_t len)
 434{
 435	const char *color = PERF_COLOR_BLUE;
 436	struct hws_basic_entry *basic;
 437	struct hws_diag_entry *diag;
 438	unsigned short bsdes, dsdes;
 439	size_t pos = 0;
 440
 441	color_fprintf(stdout, color,
 442		      ". ... s390 AUX data: size %zu bytes\n",
 443		      len);
 444
 445	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
 446				  &dsdes)) {
 447		pr_err("Invalid AUX trace data block size:%zu"
 448		       " (type:%d bsdes:%hd dsdes:%hd)\n",
 449		       len, sf->machine_type, bsdes, dsdes);
 450		return;
 451	}
 452
 453	/* s390 kernel always returns 4KB blocks fully occupied,
 454	 * no partially filled SDBs.
 455	 */
 456	while (pos < len) {
 457		/* Handle Basic entry */
 458		basic = (struct hws_basic_entry *)(buf + pos);
 459		if (s390_cpumsf_basic_show(color, pos, basic))
 460			pos += bsdes;
 461		else
 462			return;
 463
 464		/* Handle Diagnostic entry */
 465		diag = (struct hws_diag_entry *)(buf + pos);
 466		if (s390_cpumsf_diag_show(color, pos, diag))
 467			pos += dsdes;
 468		else
 469			return;
 470
 471		/* Check for trailer entry */
 472		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
 473			/* Show trailer entry */
 474			struct hws_trailer_entry te;
 475
 476			pos = (pos + S390_CPUMSF_PAGESZ)
 477			       & ~(S390_CPUMSF_PAGESZ - 1);
 478			pos -= sizeof(te);
 479			memcpy(&te, buf + pos, sizeof(te));
 480			/* Set descriptor sizes in case of old hardware
 481			 * where these values are not set.
 482			 */
 483			te.bsdes = bsdes;
 484			te.dsdes = dsdes;
 485			if (s390_cpumsf_trailer_show(color, pos, &te))
 486				pos += sizeof(te);
 487			else
 488				return;
 489		}
 490	}
 491}
 492
 493static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf,
 494				   size_t len)
 495{
 496	printf(".\n");
 497	s390_cpumsf_dump(sf, buf, len);
 498}
 499
 500#define	S390_LPP_PID_MASK	0xffffffff
 501
 502static bool s390_cpumsf_make_event(size_t pos,
 503				   struct hws_basic_entry *basic,
 504				   struct s390_cpumsf_queue *sfq)
 505{
 506	struct perf_sample sample = {
 507				.ip = basic->ia,
 508				.pid = basic->hpp & S390_LPP_PID_MASK,
 509				.tid = basic->hpp & S390_LPP_PID_MASK,
 510				.cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN,
 511				.cpu = sfq->cpu,
 512				.period = 1
 513			    };
 514	union perf_event event;
 515
 516	memset(&event, 0, sizeof(event));
 517	if (basic->CL == 1)	/* Native LPAR mode */
 518		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
 519					  : PERF_RECORD_MISC_KERNEL;
 520	else if (basic->CL == 2)	/* Guest kernel/user space */
 521		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
 522					  : PERF_RECORD_MISC_GUEST_KERNEL;
 523	else if (basic->gpp || basic->prim_asn != 0xffff)
 524		/* Use heuristics on old hardware */
 525		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
 526					  : PERF_RECORD_MISC_GUEST_KERNEL;
 527	else
 528		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
 529					  : PERF_RECORD_MISC_KERNEL;
 530
 531	event.sample.header.type = PERF_RECORD_SAMPLE;
 532	event.sample.header.misc = sample.cpumode;
 533	event.sample.header.size = sizeof(struct perf_event_header);
 534
 535	pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n",
 536		 __func__, pos, sample.ip, basic->P, basic->CL, sample.pid,
 537		 sample.tid, sample.cpumode, sample.cpu);
 538	if (perf_session__deliver_synth_event(sfq->sf->session, &event,
 539					      &sample)) {
 540		pr_err("s390 Auxiliary Trace: failed to deliver event\n");
 541		return false;
 542	}
 543	return true;
 544}
 545
 546static unsigned long long get_trailer_time(const unsigned char *buf)
 547{
 548	struct hws_trailer_entry *te;
 549	unsigned long long aux_time, progusage2;
 550	bool clock_base;
 551
 552	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
 553					      - sizeof(*te));
 554
 555#if __BYTE_ORDER == __LITTLE_ENDIAN
 556	clock_base = be64toh(te->progusage[0]) >> 63 & 0x1;
 557	progusage2 = be64toh(te->progusage[1]);
 558#else
 559	clock_base = te->clock_base;
 560	progusage2 = te->progusage2;
 561#endif
 562	if (!clock_base)	/* TOD_CLOCK_BASE value missing */
 563		return 0;
 564
 565	/* Correct calculation to convert time stamp in trailer entry to
 566	 * nano seconds (taken from arch/s390 function tod_to_ns()).
 567	 * TOD_CLOCK_BASE is stored in trailer entry member progusage2.
 568	 */
 569	aux_time = trailer_timestamp(te, clock_base) - progusage2;
 570	aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9);
 571	return aux_time;
 572}
 573
 574/* Process the data samples of a single queue. The first parameter is a
 575 * pointer to the queue, the second parameter is the time stamp. This
 576 * is the time stamp:
 577 * - of the event that triggered this processing.
 578 * - or the time stamp when the last proccesing of this queue stopped.
 579 *   In this case it stopped at a 4KB page boundary and record the
 580 *   position on where to continue processing on the next invocation
 581 *   (see buffer->use_data and buffer->use_size).
 582 *
 583 * When this function returns the second parameter is updated to
 584 * reflect the time stamp of the last processed auxiliary data entry
 585 * (taken from the trailer entry of that page). The caller uses this
 586 * returned time stamp to record the last processed entry in this
 587 * queue.
 588 *
 589 * The function returns:
 590 * 0:  Processing successful. The second parameter returns the
 591 *     time stamp from the trailer entry until which position
 592 *     processing took place. Subsequent calls resume from this
 593 *     position.
 594 * <0: An error occurred during processing. The second parameter
 595 *     returns the maximum time stamp.
 596 * >0: Done on this queue. The second parameter returns the
 597 *     maximum time stamp.
 598 */
 599static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
 600{
 601	struct s390_cpumsf *sf = sfq->sf;
 602	unsigned char *buf = sfq->buffer->use_data;
 603	size_t len = sfq->buffer->use_size;
 604	struct hws_basic_entry *basic;
 605	unsigned short bsdes, dsdes;
 606	size_t pos = 0;
 607	int err = 1;
 608	u64 aux_ts;
 609
 610	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
 611				  &dsdes)) {
 612		*ts = ~0ULL;
 613		return -1;
 614	}
 615
 616	/* Get trailer entry time stamp and check if entries in
 617	 * this auxiliary page are ready for processing. If the
 618	 * time stamp of the first entry is too high, whole buffer
 619	 * can be skipped. In this case return time stamp.
 620	 */
 621	aux_ts = get_trailer_time(buf);
 622	if (!aux_ts) {
 623		pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n",
 624		       (s64)sfq->buffer->data_offset);
 625		aux_ts = ~0ULL;
 626		goto out;
 627	}
 628	if (aux_ts > *ts) {
 629		*ts = aux_ts;
 630		return 0;
 631	}
 632
 633	while (pos < len) {
 634		/* Handle Basic entry */
 635		basic = (struct hws_basic_entry *)(buf + pos);
 636		if (s390_cpumsf_make_event(pos, basic, sfq))
 637			pos += bsdes;
 638		else {
 639			err = -EBADF;
 640			goto out;
 641		}
 642
 643		pos += dsdes;	/* Skip diagnositic entry */
 644
 645		/* Check for trailer entry */
 646		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
 647			pos = (pos + S390_CPUMSF_PAGESZ)
 648			       & ~(S390_CPUMSF_PAGESZ - 1);
 649			/* Check existence of next page */
 650			if (pos >= len)
 651				break;
 652			aux_ts = get_trailer_time(buf + pos);
 653			if (!aux_ts) {
 654				aux_ts = ~0ULL;
 655				goto out;
 656			}
 657			if (aux_ts > *ts) {
 658				*ts = aux_ts;
 659				sfq->buffer->use_data += pos;
 660				sfq->buffer->use_size -= pos;
 661				return 0;
 662			}
 663		}
 664	}
 665out:
 666	*ts = aux_ts;
 667	sfq->buffer->use_size = 0;
 668	sfq->buffer->use_data = NULL;
 669	return err;	/* Buffer completely scanned or error */
 670}
 671
 672/* Run the s390 auxiliary trace decoder.
 673 * Select the queue buffer to operate on, the caller already selected
 674 * the proper queue, depending on second parameter 'ts'.
 675 * This is the time stamp until which the auxiliary entries should
 676 * be processed. This value is updated by called functions and
 677 * returned to the caller.
 678 *
 679 * Resume processing in the current buffer. If there is no buffer
 680 * get a new buffer from the queue and setup start position for
 681 * processing.
 682 * When a buffer is completely processed remove it from the queue
 683 * before returning.
 684 *
 685 * This function returns
 686 * 1: When the queue is empty. Second parameter will be set to
 687 *    maximum time stamp.
 688 * 0: Normal processing done.
 689 * <0: Error during queue buffer setup. This causes the caller
 690 *     to stop processing completely.
 691 */
 692static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
 693				   u64 *ts)
 694{
 695
 696	struct auxtrace_buffer *buffer;
 697	struct auxtrace_queue *queue;
 698	int err;
 699
 700	queue = &sfq->sf->queues.queue_array[sfq->queue_nr];
 701
 702	/* Get buffer and last position in buffer to resume
 703	 * decoding the auxiliary entries. One buffer might be large
 704	 * and decoding might stop in between. This depends on the time
 705	 * stamp of the trailer entry in each page of the auxiliary
 706	 * data and the time stamp of the event triggering the decoding.
 707	 */
 708	if (sfq->buffer == NULL) {
 709		sfq->buffer = buffer = auxtrace_buffer__next(queue,
 710							     sfq->buffer);
 711		if (!buffer) {
 712			*ts = ~0ULL;
 713			return 1;	/* Processing done on this queue */
 714		}
 715		/* Start with a new buffer on this queue */
 716		if (buffer->data) {
 717			buffer->use_size = buffer->size;
 718			buffer->use_data = buffer->data;
 719		}
 720		if (sfq->logfile) {	/* Write into log file */
 721			size_t rc = fwrite(buffer->data, buffer->size, 1,
 722					   sfq->logfile);
 723			if (rc != 1)
 724				pr_err("Failed to write auxiliary data\n");
 725		}
 726	} else
 727		buffer = sfq->buffer;
 728
 729	if (!buffer->data) {
 730		int fd = perf_data__fd(sfq->sf->session->data);
 731
 732		buffer->data = auxtrace_buffer__get_data(buffer, fd);
 733		if (!buffer->data)
 734			return -ENOMEM;
 735		buffer->use_size = buffer->size;
 736		buffer->use_data = buffer->data;
 737
 738		if (sfq->logfile) {	/* Write into log file */
 739			size_t rc = fwrite(buffer->data, buffer->size, 1,
 740					   sfq->logfile);
 741			if (rc != 1)
 742				pr_err("Failed to write auxiliary data\n");
 743		}
 744	}
 745	pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
 746		  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
 747		  buffer->size, buffer->use_size);
 748	err = s390_cpumsf_samples(sfq, ts);
 749
 750	/* If non-zero, there is either an error (err < 0) or the buffer is
 751	 * completely done (err > 0). The error is unrecoverable, usually
 752	 * some descriptors could not be read successfully, so continue with
 753	 * the next buffer.
 754	 * In both cases the parameter 'ts' has been updated.
 755	 */
 756	if (err) {
 757		sfq->buffer = NULL;
 758		list_del_init(&buffer->list);
 759		auxtrace_buffer__free(buffer);
 760		if (err > 0)		/* Buffer done, no error */
 761			err = 0;
 762	}
 763	return err;
 764}
 765
 766static struct s390_cpumsf_queue *
 767s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
 768{
 769	struct s390_cpumsf_queue *sfq;
 770
 771	sfq = zalloc(sizeof(struct s390_cpumsf_queue));
 772	if (sfq == NULL)
 773		return NULL;
 774
 775	sfq->sf = sf;
 776	sfq->queue_nr = queue_nr;
 777	sfq->cpu = -1;
 778	if (sf->use_logfile) {
 779		char *name;
 780		int rc;
 781
 782		rc = (sf->logdir)
 783			? asprintf(&name, "%s/aux.smp.%02x",
 784				 sf->logdir, queue_nr)
 785			: asprintf(&name, "aux.smp.%02x", queue_nr);
 786		if (rc > 0)
 787			sfq->logfile = fopen(name, "w");
 788		if (sfq->logfile == NULL) {
 789			pr_err("Failed to open auxiliary log file %s,"
 790			       "continue...\n", name);
 791			sf->use_logfile = false;
 792		}
 793		free(name);
 794	}
 795	return sfq;
 796}
 797
 798static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf,
 799				   struct auxtrace_queue *queue,
 800				   unsigned int queue_nr, u64 ts)
 801{
 802	struct s390_cpumsf_queue *sfq = queue->priv;
 803
 804	if (list_empty(&queue->head))
 805		return 0;
 806
 807	if (sfq == NULL) {
 808		sfq = s390_cpumsf_alloc_queue(sf, queue_nr);
 809		if (!sfq)
 810			return -ENOMEM;
 811		queue->priv = sfq;
 812
 813		if (queue->cpu != -1)
 814			sfq->cpu = queue->cpu;
 815	}
 816	return auxtrace_heap__add(&sf->heap, queue_nr, ts);
 817}
 818
 819static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts)
 820{
 821	unsigned int i;
 822	int ret = 0;
 823
 824	for (i = 0; i < sf->queues.nr_queues; i++) {
 825		ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i],
 826					      i, ts);
 827		if (ret)
 828			break;
 829	}
 830	return ret;
 831}
 832
 833static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts)
 834{
 835	if (!sf->queues.new_data)
 836		return 0;
 837
 838	sf->queues.new_data = false;
 839	return s390_cpumsf_setup_queues(sf, ts);
 840}
 841
 842static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp)
 843{
 844	unsigned int queue_nr;
 845	u64 ts;
 846	int ret;
 847
 848	while (1) {
 849		struct auxtrace_queue *queue;
 850		struct s390_cpumsf_queue *sfq;
 851
 852		if (!sf->heap.heap_cnt)
 853			return 0;
 854
 855		if (sf->heap.heap_array[0].ordinal >= timestamp)
 856			return 0;
 857
 858		queue_nr = sf->heap.heap_array[0].queue_nr;
 859		queue = &sf->queues.queue_array[queue_nr];
 860		sfq = queue->priv;
 861
 862		auxtrace_heap__pop(&sf->heap);
 863		if (sf->heap.heap_cnt) {
 864			ts = sf->heap.heap_array[0].ordinal + 1;
 865			if (ts > timestamp)
 866				ts = timestamp;
 867		} else {
 868			ts = timestamp;
 869		}
 870
 871		ret = s390_cpumsf_run_decoder(sfq, &ts);
 872		if (ret < 0) {
 873			auxtrace_heap__add(&sf->heap, queue_nr, ts);
 874			return ret;
 875		}
 876		if (!ret) {
 877			ret = auxtrace_heap__add(&sf->heap, queue_nr, ts);
 878			if (ret < 0)
 879				return ret;
 880		}
 881	}
 882	return 0;
 883}
 884
 885static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu,
 886				   pid_t pid, pid_t tid, u64 ip, u64 timestamp)
 887{
 888	char msg[MAX_AUXTRACE_ERROR_MSG];
 889	union perf_event event;
 890	int err;
 891
 892	strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1);
 893	auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
 894			     code, cpu, pid, tid, ip, msg, timestamp);
 895
 896	err = perf_session__deliver_synth_event(sf->session, &event, NULL);
 897	if (err)
 898		pr_err("s390 Auxiliary Trace: failed to deliver error event,"
 899			"error %d\n", err);
 900	return err;
 901}
 902
 903static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample)
 904{
 905	return s390_cpumsf_synth_error(sf, 1, sample->cpu,
 906				       sample->pid, sample->tid, 0,
 907				       sample->time);
 908}
 909
 910static int
 911s390_cpumsf_process_event(struct perf_session *session,
 912			  union perf_event *event,
 913			  struct perf_sample *sample,
 914			  struct perf_tool *tool)
 915{
 916	struct s390_cpumsf *sf = container_of(session->auxtrace,
 917					      struct s390_cpumsf,
 918					      auxtrace);
 919	u64 timestamp = sample->time;
 920	struct evsel *ev_bc000;
 921
 922	int err = 0;
 923
 924	if (dump_trace)
 925		return 0;
 926
 927	if (!tool->ordered_events) {
 928		pr_err("s390 Auxiliary Trace requires ordered events\n");
 929		return -EINVAL;
 930	}
 931
 932	if (event->header.type == PERF_RECORD_SAMPLE &&
 933	    sample->raw_size) {
 934		/* Handle event with raw data */
 935		ev_bc000 = perf_evlist__event2evsel(session->evlist, event);
 936		if (ev_bc000 &&
 937		    ev_bc000->core.attr.config == PERF_EVENT_CPUM_CF_DIAG)
 938			err = s390_cpumcf_dumpctr(sf, sample);
 939		return err;
 940	}
 941
 942	if (event->header.type == PERF_RECORD_AUX &&
 943	    event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
 944		return s390_cpumsf_lost(sf, sample);
 945
 946	if (timestamp) {
 947		err = s390_cpumsf_update_queues(sf, timestamp);
 948		if (!err)
 949			err = s390_cpumsf_process_queues(sf, timestamp);
 950	}
 951	return err;
 952}
 953
/* Groups a perf_tool with a perf session pointer.
 * NOTE(review): no user of this structure is visible in this part of the
 * file; confirm whether it is still needed.
 */
 954struct s390_cpumsf_synth {
 955	struct perf_tool cpumsf_tool;
 956	struct perf_session *session;
 957};
 958
 959static int
 960s390_cpumsf_process_auxtrace_event(struct perf_session *session,
 961				   union perf_event *event __maybe_unused,
 962				   struct perf_tool *tool __maybe_unused)
 963{
 964	struct s390_cpumsf *sf = container_of(session->auxtrace,
 965					      struct s390_cpumsf,
 966					      auxtrace);
 967
 968	int fd = perf_data__fd(session->data);
 969	struct auxtrace_buffer *buffer;
 970	off_t data_offset;
 971	int err;
 972
 973	if (sf->data_queued)
 974		return 0;
 975
 976	if (perf_data__is_pipe(session->data)) {
 977		data_offset = 0;
 978	} else {
 979		data_offset = lseek(fd, 0, SEEK_CUR);
 980		if (data_offset == -1)
 981			return -errno;
 982	}
 983
 984	err = auxtrace_queues__add_event(&sf->queues, session, event,
 985					 data_offset, &buffer);
 986	if (err)
 987		return err;
 988
 989	/* Dump here after copying piped trace out of the pipe */
 990	if (dump_trace) {
 991		if (auxtrace_buffer__get_data(buffer, fd)) {
 992			s390_cpumsf_dump_event(sf, buffer->data,
 993					       buffer->size);
 994			auxtrace_buffer__put_data(buffer);
 995		}
 996	}
 997	return 0;
 998}
 999
/* Auxtrace free_events callback. Intentionally empty: nothing is released
 * here.
 */
1000static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused)
1001{
1002}
1003
/* Auxtrace flush_events callback. Does no work and always returns 0. */
1004static int s390_cpumsf_flush(struct perf_session *session __maybe_unused,
1005			     struct perf_tool *tool __maybe_unused)
1006{
1007	return 0;
1008}
1009
1010static void s390_cpumsf_free_queues(struct perf_session *session)
1011{
1012	struct s390_cpumsf *sf = container_of(session->auxtrace,
1013					      struct s390_cpumsf,
1014					      auxtrace);
1015	struct auxtrace_queues *queues = &sf->queues;
1016	unsigned int i;
1017
1018	for (i = 0; i < queues->nr_queues; i++) {
1019		struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *)
1020						queues->queue_array[i].priv;
1021
1022		if (sfq != NULL) {
1023			if (sfq->logfile) {
1024				fclose(sfq->logfile);
1025				sfq->logfile = NULL;
1026			}
1027			if (sfq->logfile_ctr) {
1028				fclose(sfq->logfile_ctr);
1029				sfq->logfile_ctr = NULL;
1030			}
1031		}
1032		zfree(&queues->queue_array[i].priv);
1033	}
1034	auxtrace_queues__free(queues);
1035}
1036
1037static void s390_cpumsf_free(struct perf_session *session)
1038{
1039	struct s390_cpumsf *sf = container_of(session->auxtrace,
1040					      struct s390_cpumsf,
1041					      auxtrace);
1042
1043	auxtrace_heap__free(&sf->heap);
1044	s390_cpumsf_free_queues(session);
1045	session->auxtrace = NULL;
1046	zfree(&sf->logdir);
1047	free(sf);
1048}
1049
 
 
 
 
 
 
 
 
/* Extract the machine type from a CPU identification string.
 *
 * The string is expected to look like "<text>,<number>[,...]": everything
 * up to the first comma is skipped and the decimal number that follows is
 * returned.
 *
 * Returns the number found, or 0 if @cpuid is NULL or does not match.
 */
static int s390_cpumsf_get_type(const char *cpuid)
{
	/* %u requires a pointer to unsigned int */
	unsigned int family = 0;

	if (!cpuid)
		return 0;
	if (sscanf(cpuid, "%*[^,],%u", &family) != 1)
		return 0;
	return family;
}
1057
1058/* Check itrace options set on perf report command.
1059 * Return true, if none are set or all options specified can be
1060 * handled on s390 (currently only option 'd' for logging.
1061 * Return false otherwise.
1062 */
1063static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
1064{
1065	bool ison = false;
1066
1067	if (!itops || !itops->set)
1068		return true;
1069	ison = itops->inject || itops->instructions || itops->branches ||
1070		itops->transactions || itops->ptwrites ||
1071		itops->pwr_events || itops->errors ||
1072		itops->dont_decode || itops->calls || itops->returns ||
1073		itops->callchain || itops->thread_stack ||
1074		itops->last_branch;
 
1075	if (!ison)
1076		return true;
1077	pr_err("Unsupported --itrace options specified\n");
1078	return false;
1079}
1080
1081/* Check for AUXTRACE dump directory if it is needed.
1082 * On failure print an error message but continue.
1083 * Return 0 on wrong keyword in config file and 1 otherwise.
1084 */
1085static int s390_cpumsf__config(const char *var, const char *value, void *cb)
1086{
1087	struct s390_cpumsf *sf = cb;
1088	struct stat stbuf;
1089	int rc;
1090
1091	if (strcmp(var, "auxtrace.dumpdir"))
1092		return 0;
1093	sf->logdir = strdup(value);
1094	if (sf->logdir == NULL) {
1095		pr_err("Failed to find auxtrace log directory %s,"
1096		       " continue with current directory...\n", value);
1097		return 1;
1098	}
1099	rc = stat(sf->logdir, &stbuf);
1100	if (rc == -1 || !S_ISDIR(stbuf.st_mode)) {
1101		pr_err("Missing auxtrace log directory %s,"
1102		       " continue with current directory...\n", value);
1103		zfree(&sf->logdir);
1104	}
1105	return 1;
1106}
1107
1108int s390_cpumsf_process_auxtrace_info(union perf_event *event,
1109				      struct perf_session *session)
1110{
1111	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1112	struct s390_cpumsf *sf;
1113	int err;
1114
1115	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info))
1116		return -EINVAL;
1117
1118	sf = zalloc(sizeof(struct s390_cpumsf));
1119	if (sf == NULL)
1120		return -ENOMEM;
1121
1122	if (!check_auxtrace_itrace(session->itrace_synth_opts)) {
1123		err = -EINVAL;
1124		goto err_free;
1125	}
1126	sf->use_logfile = session->itrace_synth_opts->log;
1127	if (sf->use_logfile)
1128		perf_config(s390_cpumsf__config, sf);
1129
1130	err = auxtrace_queues__init(&sf->queues);
1131	if (err)
1132		goto err_free;
1133
1134	sf->session = session;
1135	sf->machine = &session->machines.host; /* No kvm support */
1136	sf->auxtrace_type = auxtrace_info->type;
1137	sf->pmu_type = PERF_TYPE_RAW;
1138	sf->machine_type = s390_cpumsf_get_type(session->evlist->env->cpuid);
1139
1140	sf->auxtrace.process_event = s390_cpumsf_process_event;
1141	sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event;
1142	sf->auxtrace.flush_events = s390_cpumsf_flush;
1143	sf->auxtrace.free_events = s390_cpumsf_free_events;
1144	sf->auxtrace.free = s390_cpumsf_free;
 
1145	session->auxtrace = &sf->auxtrace;
1146
1147	if (dump_trace)
1148		return 0;
1149
1150	err = auxtrace_queues__process_index(&sf->queues, session);
1151	if (err)
1152		goto err_free_queues;
1153
1154	if (sf->queues.populated)
1155		sf->data_queued = true;
1156
1157	return 0;
1158
1159err_free_queues:
1160	auxtrace_queues__free(&sf->queues);
1161	session->auxtrace = NULL;
1162err_free:
1163	zfree(&sf->logdir);
1164	free(sf);
1165	return err;
1166}
v6.8
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright IBM Corp. 2018
   4 * Auxtrace support for s390 CPU-Measurement Sampling Facility
   5 *
   6 * Author(s):  Thomas Richter <tmricht@linux.ibm.com>
   7 *
   8 * Auxiliary traces are collected during 'perf record' using rbd000 event.
   9 * Several PERF_RECORD_XXX are generated during recording:
  10 *
  11 * PERF_RECORD_AUX:
  12 *	Records that new data landed in the AUX buffer part.
  13 * PERF_RECORD_AUXTRACE:
  14 *	Defines auxtrace data. Followed by the actual data. The contents of
  15 *	the auxtrace data is dependent on the event and the CPU.
  16 *	This record is generated by perf record command. For details
  17 *	see Documentation/perf.data-file-format.txt.
  18 * PERF_RECORD_AUXTRACE_INFO:
  19 *	Defines a table of contents for PERF_RECORD_AUXTRACE records. This
  20 *	record is generated during 'perf record' command. Each record contains
  21 *	up to 256 entries describing offset and size of the AUXTRACE data in the
  22 *	perf.data file.
  23 * PERF_RECORD_AUXTRACE_ERROR:
  24 *	Indicates an error during AUXTRACE collection such as buffer overflow.
  25 * PERF_RECORD_FINISHED_ROUND:
  26 *	Perf events are not necessarily in time stamp order, as they can be
  27 *	collected in parallel on different CPUs. If the events should be
  28 *	processed in time order they need to be sorted first.
  29 *	Perf report guarantees that there is no reordering over a
  30 *	PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a
  31 *	time stamp lower than this record are processed (and displayed) before
  32 *	the succeeding perf record are processed.
  33 *
  34 * These records are evaluated during perf report command.
  35 *
  36 * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for
  37 * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info()
  38 * below.
  39 * Auxiliary trace data is collected per CPU. To merge the data into the report
  40 * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace
  41 * data is in ascending order.
  42 *
  43 * Each queue has a double linked list of auxtrace_buffers. This list contains
  44 * the offset and size of a CPU's auxtrace data. During auxtrace processing
  45 * the data portion is mmap()'ed.
  46 *
  47 * To sort the queues in chronological order, all queue access is controlled
  48 * by the auxtrace_heap. This is basically a stack, each stack element has two
  49 * entries, the queue number and a time stamp. However the stack is sorted by
  50 * the time stamps. The highest time stamp is at the bottom the lowest
  51 * (nearest) time stamp is at the top. That sort order is maintained at all
  52 * times!
  53 *
  54 * After the auxtrace infrastructure has been setup, the auxtrace queues are
  55 * filled with data (offset/size pairs) and the auxtrace_heap is populated.
  56 *
  57 * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues.
  58 * Each record is handled by s390_cpumsf_process_event(). The time stamp of
  59 * the perf record is compared with the time stamp located on the auxtrace_heap
  60 * top element. If that time stamp is lower than the time stamp from the
  61 * record sample, the auxtrace queues will be processed. As auxtrace queues
  62 * control many auxtrace_buffers and each buffer can be quite large, the
  63 * auxtrace buffer might be processed only partially. In this case the
  64 * position in the auxtrace_buffer of that queue is remembered and the time
  65 * stamp of the last processed entry of the auxtrace_buffer replaces the
  66 * current auxtrace_heap top.
  67 *
  68 * 3. Auxtrace_queues might run out of data and are fed by the
  69 * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
  70 *
  71 * Event Generation
  72 * Each sampling-data entry in the auxiliary trace data generates a perf sample.
  73 * This sample is filled
  74 * with data from the auxtrace such as PID/TID, instruction address, CPU state,
  75 * etc. This sample is processed with perf_session__deliver_synth_event() to
  76 * be included into the GUI.
  77 *
  78 * 4. PERF_RECORD_FINISHED_ROUND event is used to process all the remaining
  79 * auxiliary traces entries until the time stamp of this record is reached
  80 * auxtrace_heap top. This is triggered by ordered_event->deliver().
  81 *
  82 *
  83 * Perf event processing.
  84 * Event processing of PERF_RECORD_XXX entries relies on time stamp entries.
  85 * This is the function call sequence:
  86 *
  87 * __cmd_report()
  88 * |
  89 * perf_session__process_events()
  90 * |
  91 * __perf_session__process_events()
  92 * |
  93 * perf_session__process_event()
  94 * |  This function splits the PERF_RECORD_XXX records.
  95 * |  - Those generated by perf record command (type number equal or higher
  96 * |    than PERF_RECORD_USER_TYPE_START) are handled by
  97 * |    perf_session__process_user_event(see below)
  98 * |  - Those generated by the kernel are handled by
  99 * |    evlist__parse_sample_timestamp()
 100 * |
 101 * evlist__parse_sample_timestamp()
 102 * |  Extract time stamp from sample data.
 103 * |
 104 * perf_session__queue_event()
 105 * |  If timestamp is positive the sample is entered into an ordered_event
 106 * |  list, sort order is the timestamp. The event processing is deferred until
 107 * |  later (see perf_session__process_user_event()).
 108 * |  Other timestamps (0 or -1) are handled immediately by
 109 * |  perf_session__deliver_event(). These are events generated at start up
 110 * |  of command perf record. They create PERF_RECORD_COMM and PERF_RECORD_MMAP*
 111 * |  records. They are needed to create a list of running processes and its
 112 * |  memory mappings and layout. They are needed at the beginning to enable
 113 * |  command perf report to create process trees and memory mappings.
 114 * |
 115 * perf_session__deliver_event()
 116 * |  Delivers a PERF_RECORD_XXX entry for handling.
 117 * |
 118 * auxtrace__process_event()
 119 * |  The timestamp of the PERF_RECORD_XXX entry is taken to correlate with
 120 * |  time stamps from the auxiliary trace buffers. This enables
 121 * |  synchronization between auxiliary trace data and the events on the
 122 * |  perf.data file.
 123 * |
 124 * machine__deliver_event()
 125 * |  Handles the PERF_RECORD_XXX event. This depends on the record type.
 126 *    It might update the process tree, update a process memory map or enter
 127 *    a sample with IP and call back chain data into GUI data pool.
 128 *
 129 *
 130 * Deferred processing determined by perf_session__process_user_event() is
 131 * finally processed when a PERF_RECORD_FINISHED_ROUND is encountered. These
 132 * are generated during command perf record.
 133 * The timestamp of PERF_RECORD_FINISHED_ROUND event is taken to process all
 134 * PERF_RECORD_XXX entries stored in the ordered_event list. This list was
 135 * built up while reading the perf.data file.
 136 * Each event is now processed by calling perf_session__deliver_event().
 137 * This enables time synchronization between the data in the perf.data file and
 138 * the data in the auxiliary trace buffers.
 139 */
 140
 141#include <endian.h>
 142#include <errno.h>
 143#include <byteswap.h>
 144#include <inttypes.h>
 145#include <linux/kernel.h>
 146#include <linux/types.h>
 147#include <linux/bitops.h>
 148#include <linux/log2.h>
 149#include <linux/zalloc.h>
 150
 151#include <sys/stat.h>
 152#include <sys/types.h>
 153
 154#include "color.h"
 155#include "evsel.h"
 156#include "evlist.h"
 157#include "machine.h"
 158#include "session.h"
 159#include "tool.h"
 160#include "debug.h"
 161#include "auxtrace.h"
 162#include "s390-cpumsf.h"
 163#include "s390-cpumsf-kernel.h"
 164#include "s390-cpumcf-kernel.h"
 165#include "config.h"
 166#include "util/sample.h"
 167
/* Per session context of the s390 CPU-Measurement Sampling Facility
 * auxtrace decoder. The session only stores a pointer to the embedded
 * 'auxtrace' member; the context is recovered with container_of().
 */
 168struct s390_cpumsf {
 169	struct auxtrace		auxtrace;	/* Callbacks registered with the session */
 170	struct auxtrace_queues	queues;		/* Auxtrace queues with the trace buffers */
 171	struct auxtrace_heap	heap;		/* Queue numbers ordered by time stamp */
 172	struct perf_session	*session;	/* Session this context belongs to */
 173	struct machine		*machine;
 174	u32			auxtrace_type;
 175	u32			pmu_type;
 176	u16			machine_type;	/* Passed to s390_cpumsf_validate() */
 177	bool			data_queued;
 178	bool			use_logfile;	/* Write raw data to log files */
 179	char			*logdir;	/* Log file directory, NULL: current directory */
 180};
 181
/* Private data attached to one auxtrace queue (auxtrace_queue::priv). */
 182struct s390_cpumsf_queue {
 183	struct s390_cpumsf	*sf;		/* Back pointer to the decoder context */
 184	unsigned int		queue_nr;	/* Index into sf->queues.queue_array */
 185	struct auxtrace_buffer	*buffer;	/* Buffer currently being decoded or NULL */
 186	int			cpu;		/* CPU number put into generated samples */
 187	FILE			*logfile;	/* Log file for raw auxiliary data or NULL */
 188	FILE			*logfile_ctr;	/* Log file for counter set data or NULL */
 189};
 190
 191/* Check if the raw data should be dumped to file. If this is the case and
 192 * the file to dump to has not been opened for writing, do so.
 193 *
 194 * Return 0 on success and greater zero on error so processing continues.
 195 */
 196static int s390_cpumcf_dumpctr(struct s390_cpumsf *sf,
 197			       struct perf_sample *sample)
 198{
 199	struct s390_cpumsf_queue *sfq;
 200	struct auxtrace_queue *q;
 201	int rc = 0;
 202
 203	if (!sf->use_logfile || sf->queues.nr_queues <= sample->cpu)
 204		return rc;
 205
 206	q = &sf->queues.queue_array[sample->cpu];
 207	sfq = q->priv;
 208	if (!sfq)		/* Queue not yet allocated */
 209		return rc;
 210
 211	if (!sfq->logfile_ctr) {
 212		char *name;
 213
 214		rc = (sf->logdir)
 215			? asprintf(&name, "%s/aux.ctr.%02x",
 216				 sf->logdir, sample->cpu)
 217			: asprintf(&name, "aux.ctr.%02x", sample->cpu);
 218		if (rc > 0)
 219			sfq->logfile_ctr = fopen(name, "w");
 220		if (sfq->logfile_ctr == NULL) {
 221			pr_err("Failed to open counter set log file %s, "
 222			       "continue...\n", name);
 223			rc = 1;
 224		}
 225		free(name);
 226	}
 227
 228	if (sfq->logfile_ctr) {
 229		/* See comment above for -4 */
 230		size_t n = fwrite(sample->raw_data, sample->raw_size - 4, 1,
 231				  sfq->logfile_ctr);
 232		if (n != 1) {
 233			pr_err("Failed to write counter set data\n");
 234			rc = 1;
 235		}
 236	}
 237	return rc;
 238}
 239
 240/* Display s390 CPU measurement facility basic-sampling data entry
 241 * Data written on s390 in big endian byte order and contains bit
 242 * fields across byte boundaries.
 *
 * @color:  color handed to color_fprintf()
 * @pos:    offset of the entry, only used for the output
 * @basicp: basic-sampling data entry as found in the trace data
 *
 * Returns true if the entry was printed and false, after an error
 * message, if its def field is not 1.
 243 */
 244static bool s390_cpumsf_basic_show(const char *color, size_t pos,
 245				   struct hws_basic_entry *basicp)
 246{
 247	struct hws_basic_entry *basic = basicp;
 248#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	/* Little endian host: rebuild the entry in host byte order in a local
	 * copy. The bit fields are extracted by hand from the first 64-bit
	 * word of the entry.
	 */
 249	struct hws_basic_entry local;
 250	unsigned long long word = be64toh(*(unsigned long long *)basicp);
 251
 252	memset(&local, 0, sizeof(local));
 253	local.def = be16toh(basicp->def);
 254	local.prim_asn = word & 0xffff;
 255	local.CL = word >> 30 & 0x3;
 256	local.I = word >> 32 & 0x1;
 257	local.AS = word >> 33 & 0x3;
 258	local.P = word >> 35 & 0x1;
 259	local.W = word >> 36 & 0x1;
 260	local.T = word >> 37 & 0x1;
 261	local.U = word >> 40 & 0xf;
 262	local.ia = be64toh(basicp->ia);
 263	local.gpp = be64toh(basicp->gpp);
 264	local.hpp = be64toh(basicp->hpp);
 265	basic = &local;
 266#endif
	/* Only entries with def equal to 1 are accepted as basic entries */
 267	if (basic->def != 1) {
 268		pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos);
 269		return false;
 270	}
 271	color_fprintf(stdout, color, "    [%#08zx] Basic   Def:%04x Inst:%#04x"
 272		      " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n"
 273		      "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n",
 274		      pos, basic->def, basic->U,
 275		      basic->T ? 'T' : ' ',
 276		      basic->W ? 'W' : ' ',
 277		      basic->P ? 'P' : ' ',
 278		      basic->I ? 'I' : ' ',
 279		      basic->AS, basic->prim_asn, basic->ia, basic->CL,
 280		      basic->hpp, basic->gpp);
 281	return true;
 282}
 283
 284/* Display s390 CPU measurement facility diagnostic-sampling data entry.
 285 * Data written on s390 in big endian byte order and contains bit
 286 * fields across byte boundaries.
 *
 * @color: color handed to color_fprintf()
 * @pos:   offset of the entry, only used for the output
 * @diagp: diagnostic-sampling data entry as found in the trace data
 *
 * Returns true if the entry was printed and false, after an error
 * message, if its def field is below S390_CPUMSF_DIAG_DEF_FIRST.
 287 */
 288static bool s390_cpumsf_diag_show(const char *color, size_t pos,
 289				  struct hws_diag_entry *diagp)
 290{
 291	struct hws_diag_entry *diag = diagp;
 292#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	/* Little endian host: only the members def and I of the local copy
	 * are filled in, these are the only ones read below.
	 */
 293	struct hws_diag_entry local;
 294	unsigned long long word = be64toh(*(unsigned long long *)diagp);
 295
 296	local.def = be16toh(diagp->def);
 297	local.I = word >> 32 & 0x1;
 298	diag = &local;
 299#endif
 300	if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) {
 301		pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos);
 302		return false;
 303	}
 304	color_fprintf(stdout, color, "    [%#08zx] Diag    Def:%04x %c\n",
 305		      pos, diag->def, diag->I ? 'I' : ' ');
 306	return true;
 307}
 308
 309/* Return TOD timestamp contained in an trailer entry */
 310static unsigned long long trailer_timestamp(struct hws_trailer_entry *te,
 311					    int idx)
 312{
 313	/* te->t set: TOD in STCKE format, bytes 8-15
 314	 * to->t not set: TOD in STCK format, bytes 0-7
 315	 */
 316	unsigned long long ts;
 317
 318	memcpy(&ts, &te->timestamp[idx], sizeof(ts));
 319	return be64toh(ts);
 320}
 321
 322/* Display s390 CPU measurement facility trailer entry
 *
 * @color: color handed to color_fprintf()
 * @pos:   offset of the trailer entry, only used for the output
 * @te:    trailer entry to display
 *
 * Returns true if the entry was printed and false, after an error
 * message, if its bsdes member does not equal the size of a basic entry.
 */
 323static bool s390_cpumsf_trailer_show(const char *color, size_t pos,
 324				     struct hws_trailer_entry *te)
 325{
 326#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	/* Little endian host: decode the members from the big endian data
	 * into a local copy and continue with that copy.
	 */
 327	struct hws_trailer_entry local;
 328	const unsigned long long flags = be64toh(te->flags);
 329
 330	memset(&local, 0, sizeof(local));
 331	local.f = flags >> 63 & 0x1;
 332	local.a = flags >> 62 & 0x1;
 333	local.t = flags >> 61 & 0x1;
 334	local.bsdes = be16toh((flags >> 16 & 0xffff));
 335	local.dsdes = be16toh((flags & 0xffff));
	/* The time stamp stays in trace byte order, trailer_timestamp()
	 * converts it when it is read.
	 */
 336	memcpy(&local.timestamp, te->timestamp, sizeof(te->timestamp));
 337	local.overflow = be64toh(te->overflow);
 338	local.clock_base = be64toh(te->progusage[0]) >> 63 & 1;
 339	local.progusage2 = be64toh(te->progusage2);
 340	te = &local;
 341#endif
 342	if (te->bsdes != sizeof(struct hws_basic_entry)) {
 343		pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos);
 344		return false;
 345	}
 346	color_fprintf(stdout, color, "    [%#08zx] Trailer %c%c%c bsdes:%d"
 347		      " dsdes:%d Overflow:%lld Time:%#llx\n"
 348		      "\t\tC:%d TOD:%#lx\n",
 349		      pos,
 350		      te->f ? 'F' : ' ',
 351		      te->a ? 'A' : ' ',
 352		      te->t ? 'T' : ' ',
 353		      te->bsdes, te->dsdes, te->overflow,
 354		      trailer_timestamp(te, te->clock_base),
 355		      te->clock_base, te->progusage2);
 356	return true;
 357}
 358
 359/* Test a sample data block. It must be 4KB or a multiple thereof in size and
 360 * 4KB page aligned. Each sample data page has a trailer entry at the
 361 * end which contains the sample entry data sizes.
 362 *
 363 * Return true if the sample data block passes the checks and set the
 364 * basic set entry size and diagnostic set entry size.
 365 *
 366 * Return false on failure.
 367 *
 368 * Note: Old hardware does not set the basic or diagnostic entry sizes
 369 * in the trailer entry. Use the type number instead.
 *
 * @machine_type: machine type number, only used for the fallback
 * @buf, @len:    sample data block and its size in bytes
 * @bsdes:        returns the basic set entry size
 * @dsdes:        returns the diagnostic set entry size
 *
 * Only the trailer entry of the first page in @buf is inspected.
 370 */
 371static bool s390_cpumsf_validate(int machine_type,
 372				 unsigned char *buf, size_t len,
 373				 unsigned short *bsdes,
 374				 unsigned short *dsdes)
 375{
 376	struct hws_basic_entry *basic = (struct hws_basic_entry *)buf;
 377	struct hws_trailer_entry *te;
 378
 379	*dsdes = *bsdes = 0;
 380	if (len & (S390_CPUMSF_PAGESZ - 1))	/* Illegal size */
 381		return false;
 382	if (be16toh(basic->def) != 1)	/* No basic set entry, must be first */
 383		return false;
 384	/* Check for trailer entry at end of SDB */
 385	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
 386					      - sizeof(*te));
 387	*bsdes = be16toh(te->bsdes);
 388	*dsdes = be16toh(te->dsdes);
	/* A test for zero does not depend on the byte order */
 389	if (!te->bsdes && !te->dsdes) {
 390		/* Very old hardware, use CPUID */
 391		switch (machine_type) {
 392		case 2097:
 393		case 2098:
 394			*dsdes = 64;
 395			*bsdes = 32;
 396			break;
 397		case 2817:
 398		case 2818:
 399			*dsdes = 74;
 400			*bsdes = 32;
 401			break;
 402		case 2827:
 403		case 2828:
 404			*dsdes = 85;
 405			*bsdes = 32;
 406			break;
 407		case 2964:
 408		case 2965:
 409			*dsdes = 112;
 410			*bsdes = 32;
 411			break;
 412		default:
 413			/* Illegal trailer entry */
 414			return false;
 415		}
 416	}
 417	return true;
 418}
 419
 420/* Return true if there is room for another entry */
 421static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos)
 422{
 423	size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry);
 424
 425	if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz)
 426		return false;
 427	return true;
 428}
 429
 430/* Dump an auxiliary buffer. These buffers are multiple of
 431 * 4KB SDB pages.
 *
 * @sf:        decoder context, provides the machine type for validation
 * @buf, @len: auxiliary buffer and its size in bytes
 *
 * Pairs of basic and diagnostic entries are printed, followed by the
 * trailer entry at the end of each page. The dump stops at the first
 * entry that fails its check.
 432 */
 433static void s390_cpumsf_dump(struct s390_cpumsf *sf,
 434			     unsigned char *buf, size_t len)
 435{
 436	const char *color = PERF_COLOR_BLUE;
 437	struct hws_basic_entry *basic;
 438	struct hws_diag_entry *diag;
 439	unsigned short bsdes, dsdes;
 440	size_t pos = 0;
 441
 442	color_fprintf(stdout, color,
 443		      ". ... s390 AUX data: size %zu bytes\n",
 444		      len);
 445
 446	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
 447				  &dsdes)) {
 448		pr_err("Invalid AUX trace data block size:%zu"
 449		       " (type:%d bsdes:%hd dsdes:%hd)\n",
 450		       len, sf->machine_type, bsdes, dsdes);
 451		return;
 452	}
 453
 454	/* s390 kernel always returns 4KB blocks fully occupied,
 455	 * no partially filled SDBs.
 456	 */
 457	while (pos < len) {
 458		/* Handle Basic entry */
 459		basic = (struct hws_basic_entry *)(buf + pos);
 460		if (s390_cpumsf_basic_show(color, pos, basic))
 461			pos += bsdes;
 462		else
 463			return;
 464
 465		/* Handle Diagnostic entry */
 466		diag = (struct hws_diag_entry *)(buf + pos);
 467		if (s390_cpumsf_diag_show(color, pos, diag))
 468			pos += dsdes;
 469		else
 470			return;
 471
 472		/* Check for trailer entry */
		/* The helper returns false when no further basic/diagnostic
		 * pair fits in front of the trailer entry of this page.
		 */
 473		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
 474			/* Show trailer entry */
 475			struct hws_trailer_entry te;
 476
			/* Round up to the start of the next page, then step
			 * back to the trailer entry of the current page.
			 */
 477			pos = (pos + S390_CPUMSF_PAGESZ)
 478			       & ~(S390_CPUMSF_PAGESZ - 1);
 479			pos -= sizeof(te);
 480			memcpy(&te, buf + pos, sizeof(te));
 481			/* Set descriptor sizes in case of old hardware
 482			 * where these values are not set.
 483			 */
 484			te.bsdes = bsdes;
 485			te.dsdes = dsdes;
 486			if (s390_cpumsf_trailer_show(color, pos, &te))
 487				pos += sizeof(te);
 488			else
 489				return;
 490		}
 491	}
 492}
 493
/* Print a separator line (".") followed by the dump of the auxiliary
 * buffer @buf of @len bytes.
 */
 494static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf,
 495				   size_t len)
 496{
 497	printf(".\n");
 498	s390_cpumsf_dump(sf, buf, len);
 499}
 500
 501#define	S390_LPP_PID_MASK	0xffffffff
 502
/* Create a synthetic PERF_RECORD_SAMPLE from one basic-sampling data entry
 * and deliver it to the session.
 *
 * @pos:   offset of the entry, only used for the debug output
 * @basic: basic-sampling data entry providing address, PID and CPU state
 * @sfq:   queue the entry belongs to, provides CPU number and session
 *
 * Returns true if the sample was delivered and false, after an error
 * message, if delivery failed.
 */
 503static bool s390_cpumsf_make_event(size_t pos,
 504				   struct hws_basic_entry *basic,
 505				   struct s390_cpumsf_queue *sfq)
 506{
	/* PID and TID are both taken from the lower 32 bits of basic->hpp */
 507	struct perf_sample sample = {
 508				.ip = basic->ia,
 509				.pid = basic->hpp & S390_LPP_PID_MASK,
 510				.tid = basic->hpp & S390_LPP_PID_MASK,
 511				.cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN,
 512				.cpu = sfq->cpu,
 513				.period = 1
 514			    };
 515	union perf_event event;
 516
 517	memset(&event, 0, sizeof(event));
	/* The P bit selects the user variant of the cpumode, otherwise the
	 * kernel variant is used.
	 */
 518	if (basic->CL == 1)	/* Native LPAR mode */
 519		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
 520					  : PERF_RECORD_MISC_KERNEL;
 521	else if (basic->CL == 2)	/* Guest kernel/user space */
 522		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
 523					  : PERF_RECORD_MISC_GUEST_KERNEL;
 524	else if (basic->gpp || basic->prim_asn != 0xffff)
 525		/* Use heuristics on old hardware */
 526		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
 527					  : PERF_RECORD_MISC_GUEST_KERNEL;
 528	else
 529		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
 530					  : PERF_RECORD_MISC_KERNEL;
 531
	/* The event consists of the header only, the sample data is handed
	 * over separately in 'sample'.
	 */
 532	event.sample.header.type = PERF_RECORD_SAMPLE;
 533	event.sample.header.misc = sample.cpumode;
 534	event.sample.header.size = sizeof(struct perf_event_header);
 535
 536	pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n",
 537		 __func__, pos, sample.ip, basic->P, basic->CL, sample.pid,
 538		 sample.tid, sample.cpumode, sample.cpu);
 539	if (perf_session__deliver_synth_event(sfq->sf->session, &event,
 540					      &sample)) {
 541		pr_err("s390 Auxiliary Trace: failed to deliver event\n");
 542		return false;
 543	}
 544	return true;
 545}
 546
/* Return the time stamp, in nano seconds, of the trailer entry that ends
 * the page starting at @buf.
 *
 * Returns 0 if the clock base bit of the trailer entry is not set, in
 * which case no TOD clock base is available for the conversion.
 */
 547static unsigned long long get_trailer_time(const unsigned char *buf)
 548{
 549	struct hws_trailer_entry *te;
 550	unsigned long long aux_time, progusage2;
 551	bool clock_base;
 552
	/* The trailer entry occupies the last bytes of the page */
 553	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
 554					      - sizeof(*te));
 555
 556#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	/* Little endian host: the bit field cannot be read directly, take
	 * the clock base bit from the top of the first progusage word.
	 */
 557	clock_base = be64toh(te->progusage[0]) >> 63 & 0x1;
 558	progusage2 = be64toh(te->progusage[1]);
 559#else
 560	clock_base = te->clock_base;
 561	progusage2 = te->progusage2;
 562#endif
 563	if (!clock_base)	/* TOD_CLOCK_BASE value missing */
 564		return 0;
 565
 566	/* Correct calculation to convert time stamp in trailer entry to
 567	 * nano seconds (taken from arch/s390 function tod_to_ns()).
 568	 * TOD_CLOCK_BASE is stored in trailer entry member progusage2.
 569	 */
 570	aux_time = trailer_timestamp(te, clock_base) - progusage2;
 571	aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9);
 572	return aux_time;
 573}
 574
 575/* Process the data samples of a single queue. The first parameter is a
 576 * pointer to the queue, the second parameter is the time stamp. This
 577 * is the time stamp:
 578 * - of the event that triggered this processing.
 579 * - or the time stamp when the last processing of this queue stopped.
 580 *   In this case it stopped at a 4KB page boundary and record the
 581 *   position on where to continue processing on the next invocation
 582 *   (see buffer->use_data and buffer->use_size).
 583 *
 584 * When this function returns the second parameter is updated to
 585 * reflect the time stamp of the last processed auxiliary data entry
 586 * (taken from the trailer entry of that page). The caller uses this
 587 * returned time stamp to record the last processed entry in this
 588 * queue.
 589 *
 590 * The function returns:
 591 * 0:  Processing successful. The second parameter returns the
 592 *     time stamp from the trailer entry until which position
 593 *     processing took place. Subsequent calls resume from this
 594 *     position.
 595 * <0: An error occurred during processing. The second parameter
 596 *     returns the maximum time stamp.
 597 * >0: Done on this queue. The second parameter returns the
 598 *     maximum time stamp.
 599 */
 600static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
 601{
 602	struct s390_cpumsf *sf = sfq->sf;
 603	unsigned char *buf = sfq->buffer->use_data;
 604	size_t len = sfq->buffer->use_size;
 605	struct hws_basic_entry *basic;
 606	unsigned short bsdes, dsdes;
 607	size_t pos = 0;
	/* Preset to "buffer completely scanned", the result at label out
	 * unless an error replaces it.
	 */
 608	int err = 1;
 609	u64 aux_ts;
 610
 611	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
 612				  &dsdes)) {
 613		*ts = ~0ULL;
 614		return -1;
 615	}
 616
 617	/* Get trailer entry time stamp and check if entries in
 618	 * this auxiliary page are ready for processing. If the
 619	 * time stamp of the first entry is too high, whole buffer
 620	 * can be skipped. In this case return time stamp.
 621	 */
 622	aux_ts = get_trailer_time(buf);
 623	if (!aux_ts) {
		/* NOTE(review): a value cast to s64 is printed with PRIx64,
		 * which expects an unsigned argument.
		 */
 624		pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n",
 625		       (s64)sfq->buffer->data_offset);
 626		aux_ts = ~0ULL;
 627		goto out;
 628	}
 629	if (aux_ts > *ts) {
 630		*ts = aux_ts;
 631		return 0;
 632	}
 633
 634	while (pos < len) {
 635		/* Handle Basic entry */
 636		basic = (struct hws_basic_entry *)(buf + pos);
 637		if (s390_cpumsf_make_event(pos, basic, sfq))
 638			pos += bsdes;
 639		else {
 640			err = -EBADF;
 641			goto out;
 642		}
 643
 644		pos += dsdes;	/* Skip diagnostic entry */
 645
 646		/* Check for trailer entry */
 647		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
			/* No room for another pair, go to the next page */
 648			pos = (pos + S390_CPUMSF_PAGESZ)
 649			       & ~(S390_CPUMSF_PAGESZ - 1);
 650			/* Check existence of next page */
 651			if (pos >= len)
 652				break;
 653			aux_ts = get_trailer_time(buf + pos);
 654			if (!aux_ts) {
 655				aux_ts = ~0ULL;
 656				goto out;
 657			}
 658			if (aux_ts > *ts) {
				/* Next page is newer than requested: remember
				 * where to resume and stop for now.
				 */
 659				*ts = aux_ts;
 660				sfq->buffer->use_data += pos;
 661				sfq->buffer->use_size -= pos;
 662				return 0;
 663			}
 664		}
 665	}
 666out:
	/* Mark the buffer as used up */
 667	*ts = aux_ts;
 668	sfq->buffer->use_size = 0;
 669	sfq->buffer->use_data = NULL;
 670	return err;	/* Buffer completely scanned or error */
 671}
 672
 673/* Run the s390 auxiliary trace decoder.
 674 * Select the queue buffer to operate on, the caller already selected
 675 * the proper queue, depending on second parameter 'ts'.
 676 * This is the time stamp until which the auxiliary entries should
 677 * be processed. This value is updated by called functions and
 678 * returned to the caller.
 679 *
 680 * Resume processing in the current buffer. If there is no buffer
 681 * get a new buffer from the queue and setup start position for
 682 * processing.
 683 * When a buffer is completely processed remove it from the queue
 684 * before returning.
 685 *
 686 * This function returns
 687 * 1: When the queue is empty. Second parameter will be set to
 688 *    maximum time stamp.
 689 * 0: Normal processing done.
 690 * <0: Error during queue buffer setup. This causes the caller
 691 *     to stop processing completely.
 692 */
 693static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
 694				   u64 *ts)
 695{
 696
 697	struct auxtrace_buffer *buffer;
 698	struct auxtrace_queue *queue;
 699	int err;
 700
 701	queue = &sfq->sf->queues.queue_array[sfq->queue_nr];
 702
 703	/* Get buffer and last position in buffer to resume
 704	 * decoding the auxiliary entries. One buffer might be large
 705	 * and decoding might stop in between. This depends on the time
 706	 * stamp of the trailer entry in each page of the auxiliary
 707	 * data and the time stamp of the event triggering the decoding.
 708	 */
 709	if (sfq->buffer == NULL) {
 710		sfq->buffer = buffer = auxtrace_buffer__next(queue,
 711							     sfq->buffer);
 712		if (!buffer) {
 713			*ts = ~0ULL;
 714			return 1;	/* Processing done on this queue */
 715		}
 716		/* Start with a new buffer on this queue */
 717		if (buffer->data) {
 718			buffer->use_size = buffer->size;
 719			buffer->use_data = buffer->data;
 720		}
 721		if (sfq->logfile) {	/* Write into log file */
 722			size_t rc = fwrite(buffer->data, buffer->size, 1,
 723					   sfq->logfile);
 724			if (rc != 1)
 725				pr_err("Failed to write auxiliary data\n");
 726		}
 727	} else
 728		buffer = sfq->buffer;
 729
 730	if (!buffer->data) {
 731		int fd = perf_data__fd(sfq->sf->session->data);
 732
 733		buffer->data = auxtrace_buffer__get_data(buffer, fd);
 734		if (!buffer->data)
 735			return -ENOMEM;
 736		buffer->use_size = buffer->size;
 737		buffer->use_data = buffer->data;
 738
 739		if (sfq->logfile) {	/* Write into log file */
 740			size_t rc = fwrite(buffer->data, buffer->size, 1,
 741					   sfq->logfile);
 742			if (rc != 1)
 743				pr_err("Failed to write auxiliary data\n");
 744		}
 745	}
 746	pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
 747		  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
 748		  buffer->size, buffer->use_size);
 749	err = s390_cpumsf_samples(sfq, ts);
 750
 751	/* If non-zero, there is either an error (err < 0) or the buffer is
 752	 * completely done (err > 0). The error is unrecoverable, usually
 753	 * some descriptors could not be read successfully, so continue with
 754	 * the next buffer.
 755	 * In both cases the parameter 'ts' has been updated.
 756	 */
 757	if (err) {
 758		sfq->buffer = NULL;
 759		list_del_init(&buffer->list);
 760		auxtrace_buffer__free(buffer);
 761		if (err > 0)		/* Buffer done, no error */
 762			err = 0;
 763	}
 764	return err;
 765}
 766
 767static struct s390_cpumsf_queue *
 768s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
 769{
 770	struct s390_cpumsf_queue *sfq;
 771
 772	sfq = zalloc(sizeof(struct s390_cpumsf_queue));
 773	if (sfq == NULL)
 774		return NULL;
 775
 776	sfq->sf = sf;
 777	sfq->queue_nr = queue_nr;
 778	sfq->cpu = -1;
 779	if (sf->use_logfile) {
 780		char *name;
 781		int rc;
 782
 783		rc = (sf->logdir)
 784			? asprintf(&name, "%s/aux.smp.%02x",
 785				 sf->logdir, queue_nr)
 786			: asprintf(&name, "aux.smp.%02x", queue_nr);
 787		if (rc > 0)
 788			sfq->logfile = fopen(name, "w");
 789		if (sfq->logfile == NULL) {
 790			pr_err("Failed to open auxiliary log file %s,"
 791			       "continue...\n", name);
 792			sf->use_logfile = false;
 793		}
 794		free(name);
 795	}
 796	return sfq;
 797}
 798
 799static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf,
 800				   struct auxtrace_queue *queue,
 801				   unsigned int queue_nr, u64 ts)
 802{
 803	struct s390_cpumsf_queue *sfq = queue->priv;
 804
 805	if (list_empty(&queue->head))
 806		return 0;
 807
 808	if (sfq == NULL) {
 809		sfq = s390_cpumsf_alloc_queue(sf, queue_nr);
 810		if (!sfq)
 811			return -ENOMEM;
 812		queue->priv = sfq;
 813
 814		if (queue->cpu != -1)
 815			sfq->cpu = queue->cpu;
 816	}
 817	return auxtrace_heap__add(&sf->heap, queue_nr, ts);
 818}
 819
 820static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts)
 821{
 822	unsigned int i;
 823	int ret = 0;
 824
 825	for (i = 0; i < sf->queues.nr_queues; i++) {
 826		ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i],
 827					      i, ts);
 828		if (ret)
 829			break;
 830	}
 831	return ret;
 832}
 833
 834static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts)
 835{
 836	if (!sf->queues.new_data)
 837		return 0;
 838
 839	sf->queues.new_data = false;
 840	return s390_cpumsf_setup_queues(sf, ts);
 841}
 842
 843static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp)
 844{
 845	unsigned int queue_nr;
 846	u64 ts;
 847	int ret;
 848
 849	while (1) {
 850		struct auxtrace_queue *queue;
 851		struct s390_cpumsf_queue *sfq;
 852
 853		if (!sf->heap.heap_cnt)
 854			return 0;
 855
 856		if (sf->heap.heap_array[0].ordinal >= timestamp)
 857			return 0;
 858
 859		queue_nr = sf->heap.heap_array[0].queue_nr;
 860		queue = &sf->queues.queue_array[queue_nr];
 861		sfq = queue->priv;
 862
 863		auxtrace_heap__pop(&sf->heap);
 864		if (sf->heap.heap_cnt) {
 865			ts = sf->heap.heap_array[0].ordinal + 1;
 866			if (ts > timestamp)
 867				ts = timestamp;
 868		} else {
 869			ts = timestamp;
 870		}
 871
 872		ret = s390_cpumsf_run_decoder(sfq, &ts);
 873		if (ret < 0) {
 874			auxtrace_heap__add(&sf->heap, queue_nr, ts);
 875			return ret;
 876		}
 877		if (!ret) {
 878			ret = auxtrace_heap__add(&sf->heap, queue_nr, ts);
 879			if (ret < 0)
 880				return ret;
 881		}
 882	}
 883	return 0;
 884}
 885
 886static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu,
 887				   pid_t pid, pid_t tid, u64 ip, u64 timestamp)
 888{
 889	char msg[MAX_AUXTRACE_ERROR_MSG];
 890	union perf_event event;
 891	int err;
 892
 893	strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1);
 894	auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
 895			     code, cpu, pid, tid, ip, msg, timestamp);
 896
 897	err = perf_session__deliver_synth_event(sf->session, &event, NULL);
 898	if (err)
 899		pr_err("s390 Auxiliary Trace: failed to deliver error event,"
 900			"error %d\n", err);
 901	return err;
 902}
 903
 904static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample)
 905{
 906	return s390_cpumsf_synth_error(sf, 1, sample->cpu,
 907				       sample->pid, sample->tid, 0,
 908				       sample->time);
 909}
 910
 911static int
 912s390_cpumsf_process_event(struct perf_session *session,
 913			  union perf_event *event,
 914			  struct perf_sample *sample,
 915			  struct perf_tool *tool)
 916{
 917	struct s390_cpumsf *sf = container_of(session->auxtrace,
 918					      struct s390_cpumsf,
 919					      auxtrace);
 920	u64 timestamp = sample->time;
 921	struct evsel *ev_bc000;
 922
 923	int err = 0;
 924
 925	if (dump_trace)
 926		return 0;
 927
 928	if (!tool->ordered_events) {
 929		pr_err("s390 Auxiliary Trace requires ordered events\n");
 930		return -EINVAL;
 931	}
 932
 933	if (event->header.type == PERF_RECORD_SAMPLE &&
 934	    sample->raw_size) {
 935		/* Handle event with raw data */
 936		ev_bc000 = evlist__event2evsel(session->evlist, event);
 937		if (ev_bc000 &&
 938		    ev_bc000->core.attr.config == PERF_EVENT_CPUM_CF_DIAG)
 939			err = s390_cpumcf_dumpctr(sf, sample);
 940		return err;
 941	}
 942
 943	if (event->header.type == PERF_RECORD_AUX &&
 944	    event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
 945		return s390_cpumsf_lost(sf, sample);
 946
 947	if (timestamp) {
 948		err = s390_cpumsf_update_queues(sf, timestamp);
 949		if (!err)
 950			err = s390_cpumsf_process_queues(sf, timestamp);
 951	}
 952	return err;
 953}
 954
 955struct s390_cpumsf_synth {
 956	struct perf_tool cpumsf_tool;
 957	struct perf_session *session;
 958};
 959
 960static int
 961s390_cpumsf_process_auxtrace_event(struct perf_session *session,
 962				   union perf_event *event __maybe_unused,
 963				   struct perf_tool *tool __maybe_unused)
 964{
 965	struct s390_cpumsf *sf = container_of(session->auxtrace,
 966					      struct s390_cpumsf,
 967					      auxtrace);
 968
 969	int fd = perf_data__fd(session->data);
 970	struct auxtrace_buffer *buffer;
 971	off_t data_offset;
 972	int err;
 973
 974	if (sf->data_queued)
 975		return 0;
 976
 977	if (perf_data__is_pipe(session->data)) {
 978		data_offset = 0;
 979	} else {
 980		data_offset = lseek(fd, 0, SEEK_CUR);
 981		if (data_offset == -1)
 982			return -errno;
 983	}
 984
 985	err = auxtrace_queues__add_event(&sf->queues, session, event,
 986					 data_offset, &buffer);
 987	if (err)
 988		return err;
 989
 990	/* Dump here after copying piped trace out of the pipe */
 991	if (dump_trace) {
 992		if (auxtrace_buffer__get_data(buffer, fd)) {
 993			s390_cpumsf_dump_event(sf, buffer->data,
 994					       buffer->size);
 995			auxtrace_buffer__put_data(buffer);
 996		}
 997	}
 998	return 0;
 999}
1000
1001static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused)
1002{
1003}
1004
1005static int s390_cpumsf_flush(struct perf_session *session __maybe_unused,
1006			     struct perf_tool *tool __maybe_unused)
1007{
1008	return 0;
1009}
1010
1011static void s390_cpumsf_free_queues(struct perf_session *session)
1012{
1013	struct s390_cpumsf *sf = container_of(session->auxtrace,
1014					      struct s390_cpumsf,
1015					      auxtrace);
1016	struct auxtrace_queues *queues = &sf->queues;
1017	unsigned int i;
1018
1019	for (i = 0; i < queues->nr_queues; i++) {
1020		struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *)
1021						queues->queue_array[i].priv;
1022
1023		if (sfq != NULL) {
1024			if (sfq->logfile) {
1025				fclose(sfq->logfile);
1026				sfq->logfile = NULL;
1027			}
1028			if (sfq->logfile_ctr) {
1029				fclose(sfq->logfile_ctr);
1030				sfq->logfile_ctr = NULL;
1031			}
1032		}
1033		zfree(&queues->queue_array[i].priv);
1034	}
1035	auxtrace_queues__free(queues);
1036}
1037
1038static void s390_cpumsf_free(struct perf_session *session)
1039{
1040	struct s390_cpumsf *sf = container_of(session->auxtrace,
1041					      struct s390_cpumsf,
1042					      auxtrace);
1043
1044	auxtrace_heap__free(&sf->heap);
1045	s390_cpumsf_free_queues(session);
1046	session->auxtrace = NULL;
1047	zfree(&sf->logdir);
1048	free(sf);
1049}
1050
1051static bool
1052s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused,
1053			      struct evsel *evsel)
1054{
1055	return evsel->core.attr.type == PERF_TYPE_RAW &&
1056	       evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG;
1057}
1058
/* Extract the number following the first comma of @cpuid.
 *
 * The text up to the first comma is skipped; it must not be empty.
 * The unsigned decimal number right behind that comma is returned.
 *
 * Returns 0 when @cpuid is NULL or does not have this form.
 */
static int s390_cpumsf_get_type(const char *cpuid)
{
	unsigned int family = 0;	/* Matches conversion %u */

	if (cpuid == NULL)
		return 0;

	if (sscanf(cpuid, "%*[^,],%u", &family) != 1)
		return 0;
	return (int)family;
}
1066
1067/* Check itrace options set on perf report command.
1068 * Return true, if none are set or all options specified can be
1069 * handled on s390 (currently only option 'd' for logging.
1070 * Return false otherwise.
1071 */
1072static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
1073{
1074	bool ison = false;
1075
1076	if (!itops || !itops->set)
1077		return true;
1078	ison = itops->inject || itops->instructions || itops->branches ||
1079		itops->transactions || itops->ptwrites ||
1080		itops->pwr_events || itops->errors ||
1081		itops->dont_decode || itops->calls || itops->returns ||
1082		itops->callchain || itops->thread_stack ||
1083		itops->last_branch || itops->add_callchain ||
1084		itops->add_last_branch;
1085	if (!ison)
1086		return true;
1087	pr_err("Unsupported --itrace options specified\n");
1088	return false;
1089}
1090
1091/* Check for AUXTRACE dump directory if it is needed.
1092 * On failure print an error message but continue.
1093 * Return 0 on wrong keyword in config file and 1 otherwise.
1094 */
1095static int s390_cpumsf__config(const char *var, const char *value, void *cb)
1096{
1097	struct s390_cpumsf *sf = cb;
1098	struct stat stbuf;
1099	int rc;
1100
1101	if (strcmp(var, "auxtrace.dumpdir"))
1102		return 0;
1103	sf->logdir = strdup(value);
1104	if (sf->logdir == NULL) {
1105		pr_err("Failed to find auxtrace log directory %s,"
1106		       " continue with current directory...\n", value);
1107		return 1;
1108	}
1109	rc = stat(sf->logdir, &stbuf);
1110	if (rc == -1 || !S_ISDIR(stbuf.st_mode)) {
1111		pr_err("Missing auxtrace log directory %s,"
1112		       " continue with current directory...\n", value);
1113		zfree(&sf->logdir);
1114	}
1115	return 1;
1116}
1117
1118int s390_cpumsf_process_auxtrace_info(union perf_event *event,
1119				      struct perf_session *session)
1120{
1121	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
1122	struct s390_cpumsf *sf;
1123	int err;
1124
1125	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info))
1126		return -EINVAL;
1127
1128	sf = zalloc(sizeof(struct s390_cpumsf));
1129	if (sf == NULL)
1130		return -ENOMEM;
1131
1132	if (!check_auxtrace_itrace(session->itrace_synth_opts)) {
1133		err = -EINVAL;
1134		goto err_free;
1135	}
1136	sf->use_logfile = session->itrace_synth_opts->log;
1137	if (sf->use_logfile)
1138		perf_config(s390_cpumsf__config, sf);
1139
1140	err = auxtrace_queues__init(&sf->queues);
1141	if (err)
1142		goto err_free;
1143
1144	sf->session = session;
1145	sf->machine = &session->machines.host; /* No kvm support */
1146	sf->auxtrace_type = auxtrace_info->type;
1147	sf->pmu_type = PERF_TYPE_RAW;
1148	sf->machine_type = s390_cpumsf_get_type(session->evlist->env->cpuid);
1149
1150	sf->auxtrace.process_event = s390_cpumsf_process_event;
1151	sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event;
1152	sf->auxtrace.flush_events = s390_cpumsf_flush;
1153	sf->auxtrace.free_events = s390_cpumsf_free_events;
1154	sf->auxtrace.free = s390_cpumsf_free;
1155	sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace;
1156	session->auxtrace = &sf->auxtrace;
1157
1158	if (dump_trace)
1159		return 0;
1160
1161	err = auxtrace_queues__process_index(&sf->queues, session);
1162	if (err)
1163		goto err_free_queues;
1164
1165	if (sf->queues.populated)
1166		sf->data_queued = true;
1167
1168	return 0;
1169
1170err_free_queues:
1171	auxtrace_queues__free(&sf->queues);
1172	session->auxtrace = NULL;
1173err_free:
1174	zfree(&sf->logdir);
1175	free(sf);
1176	return err;
1177}