   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
  15 */
  16
  17#include "util/record.h"
  18#include <api/fs/tracing_path.h>
  19#ifdef HAVE_LIBBPF_SUPPORT
  20#include <bpf/bpf.h>
  21#include <bpf/libbpf.h>
  22#ifdef HAVE_BPF_SKEL
  23#include "bpf_skel/augmented_raw_syscalls.skel.h"
  24#endif
  25#endif
  26#include "util/bpf_map.h"
  27#include "util/rlimit.h"
  28#include "builtin.h"
  29#include "util/cgroup.h"
  30#include "util/color.h"
  31#include "util/config.h"
  32#include "util/debug.h"
  33#include "util/dso.h"
  34#include "util/env.h"
  35#include "util/event.h"
  36#include "util/evsel.h"
  37#include "util/evsel_fprintf.h"
  38#include "util/synthetic-events.h"
  39#include "util/evlist.h"
  40#include "util/evswitch.h"
  41#include "util/mmap.h"
  42#include <subcmd/pager.h>
  43#include <subcmd/exec-cmd.h>
  44#include "util/machine.h"
  45#include "util/map.h"
  46#include "util/symbol.h"
  47#include "util/path.h"
  48#include "util/session.h"
  49#include "util/thread.h"
  50#include <subcmd/parse-options.h>
  51#include "util/strlist.h"
  52#include "util/intlist.h"
  53#include "util/thread_map.h"
  54#include "util/stat.h"
  55#include "util/tool.h"
  56#include "util/util.h"
  57#include "trace/beauty/beauty.h"
  58#include "trace-event.h"
  59#include "util/parse-events.h"
  60#include "util/tracepoint.h"
  61#include "callchain.h"
  62#include "print_binary.h"
  63#include "string2.h"
  64#include "syscalltbl.h"
  65#include "rb_resort.h"
  66#include "../perf.h"
  67
  68#include <errno.h>
  69#include <inttypes.h>
  70#include <poll.h>
  71#include <signal.h>
  72#include <stdlib.h>
  73#include <string.h>
  74#include <linux/err.h>
  75#include <linux/filter.h>
  76#include <linux/kernel.h>
  77#include <linux/list_sort.h>
  78#include <linux/random.h>
  79#include <linux/stringify.h>
  80#include <linux/time64.h>
  81#include <linux/zalloc.h>
  82#include <fcntl.h>
  83#include <sys/sysmacros.h>
  84
  85#include <linux/ctype.h>
  86#include <perf/mmap.h>
  87
  88#ifdef HAVE_LIBTRACEEVENT
  89#include <traceevent/event-parse.h>
  90#endif
  91
  92#ifndef O_CLOEXEC
  93# define O_CLOEXEC		02000000
  94#endif
  95
  96#ifndef F_LINUX_SPECIFIC_BASE
  97# define F_LINUX_SPECIFIC_BASE	1024
  98#endif
  99
 100#define RAW_SYSCALL_ARGS_NUM	6
 101
 102/*
  103 * strtoul: Go from a string to a value, e.g. for msr: MSR_FS_BASE to 0xc0000100
 104 */
 105struct syscall_arg_fmt {
 106	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 107	bool	   (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val);
 108	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
 109	void	   *parm;
 110	const char *name;
 111	u16	   nr_entries; // for arrays
 112	bool	   show_zero;
 113};
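/*
 * For illustration, the two callbacks go in opposite directions. A minimal
 * sketch using the MSR beautifier as an example:
 *
 *	.scnprintf: 0xc0000100    -> "MSR_FS_BASE"  (render a value)
 *	.strtoul:   "MSR_FS_BASE" -> 0xc0000100     (parse a name, e.g. when
 *	            expanding a filter expression such as msr==MSR_FS_BASE)
 */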
 114
 115struct syscall_fmt {
 116	const char *name;
 117	const char *alias;
 118	struct {
 119		const char *sys_enter,
 120			   *sys_exit;
 121	}	   bpf_prog_name;
 122	struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM];
 123	u8	   nr_args;
 124	bool	   errpid;
 125	bool	   timeout;
 126	bool	   hexret;
 127};
 128
 129struct trace {
 130	struct perf_tool	tool;
 131	struct syscalltbl	*sctbl;
 132	struct {
 133		struct syscall  *table;
 134		struct {
 135			struct evsel *sys_enter,
 136				*sys_exit,
 137				*bpf_output;
 138		}		events;
 139	} syscalls;
 140#ifdef HAVE_BPF_SKEL
 141	struct augmented_raw_syscalls_bpf *skel;
 142#endif
 143	struct record_opts	opts;
 144	struct evlist	*evlist;
 145	struct machine		*host;
 146	struct thread		*current;
 147	struct cgroup		*cgroup;
 148	u64			base_time;
 149	FILE			*output;
 150	unsigned long		nr_events;
 151	unsigned long		nr_events_printed;
 152	unsigned long		max_events;
 153	struct evswitch		evswitch;
 154	struct strlist		*ev_qualifier;
 155	struct {
 156		size_t		nr;
 157		int		*entries;
 158	}			ev_qualifier_ids;
 159	struct {
 160		size_t		nr;
 161		pid_t		*entries;
 162		struct bpf_map  *map;
 163	}			filter_pids;
 164	double			duration_filter;
 165	double			runtime_ms;
 166	struct {
 167		u64		vfs_getname,
 168				proc_getname;
 169	} stats;
 170	unsigned int		max_stack;
 171	unsigned int		min_stack;
 172	int			raw_augmented_syscalls_args_size;
 173	bool			raw_augmented_syscalls;
 174	bool			fd_path_disabled;
 175	bool			sort_events;
 176	bool			not_ev_qualifier;
 177	bool			live;
 178	bool			full_time;
 179	bool			sched;
 180	bool			multiple_threads;
 181	bool			summary;
 182	bool			summary_only;
 183	bool			errno_summary;
 184	bool			failure_only;
 185	bool			show_comm;
 186	bool			print_sample;
 187	bool			show_tool_stats;
 188	bool			trace_syscalls;
 189	bool			libtraceevent_print;
 190	bool			kernel_syscallchains;
 191	s16			args_alignment;
 192	bool			show_tstamp;
 193	bool			show_duration;
 194	bool			show_zeros;
 195	bool			show_arg_names;
 196	bool			show_string_prefix;
 197	bool			force;
 198	bool			vfs_getname;
 199	int			trace_pgfaults;
 200	char			*perfconfig_events;
 201	struct {
 202		struct ordered_events	data;
 203		u64			last;
 204	} oe;
 205};
 206
 207struct tp_field {
 208	int offset;
 209	union {
 210		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
 211		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
 212	};
 213};
 214
 215#define TP_UINT_FIELD(bits) \
 216static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 217{ \
 218	u##bits value; \
 219	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 220	return value;  \
 221}
 222
 223TP_UINT_FIELD(8);
 224TP_UINT_FIELD(16);
 225TP_UINT_FIELD(32);
 226TP_UINT_FIELD(64);
 227
 228#define TP_UINT_FIELD__SWAPPED(bits) \
 229static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 230{ \
 231	u##bits value; \
 232	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 233	return bswap_##bits(value);\
 234}
 235
 236TP_UINT_FIELD__SWAPPED(16);
 237TP_UINT_FIELD__SWAPPED(32);
 238TP_UINT_FIELD__SWAPPED(64);
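/*
 * A sketch of what TP_UINT_FIELD(32) above expands to:
 *
 *	static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u32 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 *
 * The memcpy() avoids unaligned reads from the raw tracepoint payload; the
 * __SWAPPED variants additionally byte-swap the value when the recorded data
 * comes from a host of the opposite endianness.
 */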
 239
 240static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
 241{
 242	field->offset = offset;
 243
 244	switch (size) {
 245	case 1:
 246		field->integer = tp_field__u8;
 247		break;
 248	case 2:
 249		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 250		break;
 251	case 4:
 252		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 253		break;
 254	case 8:
 255		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 256		break;
 257	default:
 258		return -1;
 259	}
 260
 261	return 0;
 262}
 263
 264static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
 265{
 266	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
 267}
 268
 269static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 270{
 271	return sample->raw_data + field->offset;
 272}
 273
 274static int __tp_field__init_ptr(struct tp_field *field, int offset)
 275{
 276	field->offset = offset;
 277	field->pointer = tp_field__ptr;
 278	return 0;
 279}
 280
 281static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
 282{
 283	return __tp_field__init_ptr(field, format_field->offset);
 284}
 285
 286struct syscall_tp {
 287	struct tp_field id;
 288	union {
 289		struct tp_field args, ret;
 290	};
 291};
 292
 293/*
 294 * The evsel->priv as used by 'perf trace'
 295 * sc:	for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
 296 * fmt: for all the other tracepoints
 297 */
 298struct evsel_trace {
 299	struct syscall_tp	sc;
 300	struct syscall_arg_fmt  *fmt;
 301};
 302
 303static struct evsel_trace *evsel_trace__new(void)
 304{
 305	return zalloc(sizeof(struct evsel_trace));
 306}
 307
 308static void evsel_trace__delete(struct evsel_trace *et)
 309{
 310	if (et == NULL)
 311		return;
 312
 313	zfree(&et->fmt);
 314	free(et);
 315}
 316
 317/*
 318 * Used with raw_syscalls:sys_{enter,exit} and with the
 319 * syscalls:sys_{enter,exit}_SYSCALL tracepoints
 320 */
 321static inline struct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
 322{
 323	struct evsel_trace *et = evsel->priv;
 324
 325	return &et->sc;
 326}
 327
 328static struct syscall_tp *evsel__syscall_tp(struct evsel *evsel)
 329{
 330	if (evsel->priv == NULL) {
 331		evsel->priv = evsel_trace__new();
 332		if (evsel->priv == NULL)
 333			return NULL;
 334	}
 335
 336	return __evsel__syscall_tp(evsel);
 337}
 338
 339/*
 340 * Used with all the other tracepoints.
 341 */
 342static inline struct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
 343{
 344	struct evsel_trace *et = evsel->priv;
 345
 346	return et->fmt;
 347}
 348
 349static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel)
 350{
 351	struct evsel_trace *et = evsel->priv;
 352
 353	if (evsel->priv == NULL) {
 354		et = evsel->priv = evsel_trace__new();
 355
 356		if (et == NULL)
 357			return NULL;
 358	}
 359
 360	if (et->fmt == NULL) {
 361		et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt));
 362		if (et->fmt == NULL)
 363			goto out_delete;
 364	}
 365
 366	return __evsel__syscall_arg_fmt(evsel);
 367
 368out_delete:
 369	evsel_trace__delete(evsel->priv);
 370	evsel->priv = NULL;
 371	return NULL;
 372}
 373
 374static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name)
 375{
 376	struct tep_format_field *format_field = evsel__field(evsel, name);
 377
 378	if (format_field == NULL)
 379		return -1;
 380
 381	return tp_field__init_uint(field, format_field, evsel->needs_swap);
 382}
 383
 384#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
 385	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
 386	   evsel__init_tp_uint_field(evsel, &sc->name, #name); })
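/*
 * The #name stringification means that, e.g.:
 *
 *	perf_evsel__init_sc_tp_uint_field(evsel, id);
 *
 * initializes sc->id from the tracepoint format field literally named "id".
 */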
 387
 388static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name)
 389{
 390	struct tep_format_field *format_field = evsel__field(evsel, name);
 391
 392	if (format_field == NULL)
 393		return -1;
 394
 395	return tp_field__init_ptr(field, format_field);
 396}
 397
 398#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
 399	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
 400	   evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 401
 402static void evsel__delete_priv(struct evsel *evsel)
 403{
 404	zfree(&evsel->priv);
 405	evsel__delete(evsel);
 406}
 407
 408static int evsel__init_syscall_tp(struct evsel *evsel)
 409{
 410	struct syscall_tp *sc = evsel__syscall_tp(evsel);
 411
 412	if (sc != NULL) {
 413		if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
 414		    evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
 415			return -ENOENT;
 416
 417		return 0;
 418	}
 419
 420	return -ENOMEM;
 421}
 422
 423static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
 424{
 425	struct syscall_tp *sc = evsel__syscall_tp(evsel);
 426
 427	if (sc != NULL) {
 428		struct tep_format_field *syscall_id = evsel__field(tp, "id");
 429		if (syscall_id == NULL)
 430			syscall_id = evsel__field(tp, "__syscall_nr");
 431		if (syscall_id == NULL ||
 432		    __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
 433			return -EINVAL;
 434
 435		return 0;
 436	}
 437
 438	return -ENOMEM;
 439}
 440
 441static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
 442{
 443	struct syscall_tp *sc = __evsel__syscall_tp(evsel);
 444
 445	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
 446}
 447
 448static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
 449{
 450	struct syscall_tp *sc = __evsel__syscall_tp(evsel);
 451
 452	return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
 453}
 454
 455static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
 456{
 457	if (evsel__syscall_tp(evsel) != NULL) {
 458		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 459			return -ENOENT;
 460
 461		evsel->handler = handler;
 462		return 0;
 463	}
 464
 465	return -ENOMEM;
 466}
 467
 468static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
 469{
 470	struct evsel *evsel = evsel__newtp("raw_syscalls", direction);
 471
  472	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
 473	if (IS_ERR(evsel))
 474		evsel = evsel__newtp("syscalls", direction);
 475
 476	if (IS_ERR(evsel))
 477		return NULL;
 478
 479	if (evsel__init_raw_syscall_tp(evsel, handler))
 480		goto out_delete;
 481
 482	return evsel;
 483
 484out_delete:
 485	evsel__delete_priv(evsel);
 486	return NULL;
 487}
 488
 489#define perf_evsel__sc_tp_uint(evsel, name, sample) \
 490	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
 491	   fields->name.integer(&fields->name, sample); })
 492
 493#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
 494	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
 495	   fields->name.pointer(&fields->name, sample); })
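/*
 * Typical use in the sample handlers further down (a sketch):
 *
 *	int id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *	void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * i.e. 'name' selects a struct syscall_tp member, and its stored
 * integer()/pointer() callback decodes the value from sample->raw_data.
 */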
 496
 497size_t strarray__scnprintf_suffix(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_suffix, int val)
 498{
 499	int idx = val - sa->offset;
 500
 501	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
 502		size_t printed = scnprintf(bf, size, intfmt, val);
 503		if (show_suffix)
 504			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
 505		return printed;
 506	}
 507
 508	return scnprintf(bf, size, "%s%s", sa->entries[idx], show_suffix ? sa->prefix : "");
 509}
 510
 511size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 512{
 513	int idx = val - sa->offset;
 514
 515	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
 516		size_t printed = scnprintf(bf, size, intfmt, val);
 517		if (show_prefix)
 518			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
 519		return printed;
 520	}
 521
 522	return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
 523}
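/*
 * A worked example with strarray__itimers (defined below: entries
 * { "REAL", "VIRTUAL", "PROF" }, prefix "ITIMER_", offset 0):
 *
 *	val = 1, show_prefix = true   -> "ITIMER_VIRTUAL"
 *	val = 1, show_prefix = false  -> "VIRTUAL"
 *	val = 7 (out of range)        -> "7", plus an "ITIMER_???" annotation
 *	                                 when show_prefix is set
 */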
 524
 525static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 526						const char *intfmt,
 527					        struct syscall_arg *arg)
 528{
 529	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
 530}
 531
 532static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 533					      struct syscall_arg *arg)
 534{
 535	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 536}
 537
 538#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 539
 540bool syscall_arg__strtoul_strarray(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
 541{
 542	return strarray__strtoul(arg->parm, bf, size, ret);
 543}
 544
 545bool syscall_arg__strtoul_strarray_flags(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
 546{
 547	return strarray__strtoul_flags(arg->parm, bf, size, ret);
 548}
 549
 550bool syscall_arg__strtoul_strarrays(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
 551{
 552	return strarrays__strtoul(arg->parm, bf, size, ret);
 553}
 554
 555size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
 556{
 557	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
 558}
 559
 560size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 561{
 562	size_t printed;
 563	int i;
 564
 565	for (i = 0; i < sas->nr_entries; ++i) {
 566		struct strarray *sa = sas->entries[i];
 567		int idx = val - sa->offset;
 568
 569		if (idx >= 0 && idx < sa->nr_entries) {
 570			if (sa->entries[idx] == NULL)
 571				break;
 572			return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
 573		}
 574	}
 575
 576	printed = scnprintf(bf, size, intfmt, val);
 577	if (show_prefix)
 578		printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
 579	return printed;
 580}
 581
 582bool strarray__strtoul(struct strarray *sa, char *bf, size_t size, u64 *ret)
 583{
 584	int i;
 585
 586	for (i = 0; i < sa->nr_entries; ++i) {
 587		if (sa->entries[i] && strncmp(sa->entries[i], bf, size) == 0 && sa->entries[i][size] == '\0') {
 588			*ret = sa->offset + i;
 589			return true;
 590		}
 591	}
 592
 593	return false;
 594}
 595
 596bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *ret)
 597{
 598	u64 val = 0;
 599	char *tok = bf, *sep, *end;
 600
 601	*ret = 0;
 602
 603	while (size != 0) {
 604		int toklen = size;
 605
 606		sep = memchr(tok, '|', size);
 607		if (sep != NULL) {
 608			size -= sep - tok + 1;
 609
 610			end = sep - 1;
 611			while (end > tok && isspace(*end))
 612				--end;
 613
 614			toklen = end - tok + 1;
 615		}
 616
 617		while (isspace(*tok))
 618			++tok;
 619
 620		if (isalpha(*tok) || *tok == '_') {
 621			if (!strarray__strtoul(sa, tok, toklen, &val))
 622				return false;
 623		} else
 624			val = strtoul(tok, NULL, 0);
 625
 626		*ret |= (1 << (val - 1));
 627
 628		if (sep == NULL)
 629			break;
 630		tok = sep + 1;
 631	}
 632
 633	return true;
 634}
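/*
 * A worked example: parsing "CLOEXEC" against strarray__fsmount_flags
 * (defined below, where entry [1] is "CLOEXEC") yields val = 1, so
 * *ret |= 1 << 0, i.e. 0x1, matching FSMOUNT_CLOEXEC. Several names can be
 * OR'ed together with '|', and every token, named or numeric, is mapped
 * through 1 << (val - 1), i.e. values are treated as 1-based bit positions.
 */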
 635
 636bool strarrays__strtoul(struct strarrays *sas, char *bf, size_t size, u64 *ret)
 637{
 638	int i;
 639
 640	for (i = 0; i < sas->nr_entries; ++i) {
 641		struct strarray *sa = sas->entries[i];
 642
 643		if (strarray__strtoul(sa, bf, size, ret))
 644			return true;
 645	}
 646
 647	return false;
 648}
 649
 650size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 651					struct syscall_arg *arg)
 652{
 653	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
 654}
 655
 656#ifndef AT_FDCWD
 657#define AT_FDCWD	-100
 658#endif
 659
 660static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 661					   struct syscall_arg *arg)
 662{
 663	int fd = arg->val;
 664	const char *prefix = "AT_FD";
 665
 666	if (fd == AT_FDCWD)
 667		return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
 668
 669	return syscall_arg__scnprintf_fd(bf, size, arg);
 670}
 671
 672#define SCA_FDAT syscall_arg__scnprintf_fd_at
 673
 674static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 675					      struct syscall_arg *arg);
 676
 677#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 678
 679size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
 680{
 681	return scnprintf(bf, size, "%#lx", arg->val);
 682}
 683
 684size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
 685{
 686	if (arg->val == 0)
 687		return scnprintf(bf, size, "NULL");
 688	return syscall_arg__scnprintf_hex(bf, size, arg);
 689}
 690
 691size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 692{
 693	return scnprintf(bf, size, "%d", arg->val);
 694}
 695
 696size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
 697{
 698	return scnprintf(bf, size, "%ld", arg->val);
 699}
 700
 701static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
 702{
 703	// XXX Hey, maybe for sched:sched_switch prev/next comm fields we can
 704	//     fill missing comms using thread__set_comm()...
 705	//     here or in a special syscall_arg__scnprintf_pid_sched_tp...
 706	return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
 707}
 708
 709#define SCA_CHAR_ARRAY syscall_arg__scnprintf_char_array
 710
 711static const char *bpf_cmd[] = {
 712	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
 713	"MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH",
 714	"PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID",
 715	"PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD",
 716	"PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID",
 717	"TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE",
 718	"BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH",
 719	"MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE",
 720	"LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE",
 721	"LINK_DETACH", "PROG_BIND_MAP",
 722};
 723static DEFINE_STRARRAY(bpf_cmd, "BPF_");
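/*
 * DEFINE_STRARRAY() (from trace/beauty/beauty.h) presumably instantiates
 * something like:
 *
 *	struct strarray strarray__bpf_cmd = {
 *		.nr_entries = ARRAY_SIZE(bpf_cmd),
 *		.entries    = bpf_cmd,
 *		.prefix     = "BPF_",
 *	};
 *
 * so that cmd 0 prints as "BPF_MAP_CREATE" when string prefixes are shown.
 */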
 724
 725static const char *fsmount_flags[] = {
 726	[1] = "CLOEXEC",
 727};
 728static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");
 729
 730#include "trace/beauty/generated/fsconfig_arrays.c"
 731
 732static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");
 733
 734static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 735static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
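/*
 * The offset of 1 above accounts for EPOLL_CTL_ADD being 1 (there is no
 * opcode 0): strarray__scnprintf() indexes with idx = val - offset, so
 * val 1 -> "ADD", 2 -> "DEL", 3 -> "MOD".
 */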
 736
 737static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
 738static DEFINE_STRARRAY(itimers, "ITIMER_");
 739
 740static const char *keyctl_options[] = {
 741	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
 742	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
 743	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
 744	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
 745	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 746};
 747static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
 748
 749static const char *whences[] = { "SET", "CUR", "END",
 750#ifdef SEEK_DATA
 751"DATA",
 752#endif
 753#ifdef SEEK_HOLE
 754"HOLE",
 755#endif
 756};
 757static DEFINE_STRARRAY(whences, "SEEK_");
 758
 759static const char *fcntl_cmds[] = {
 760	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
 761	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
 762	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
 763	"GETOWNER_UIDS",
 764};
 765static DEFINE_STRARRAY(fcntl_cmds, "F_");
 766
 767static const char *fcntl_linux_specific_cmds[] = {
 768	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
 769	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
 770	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
 771};
 772
 773static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
 774
 775static struct strarray *fcntl_cmds_arrays[] = {
 776	&strarray__fcntl_cmds,
 777	&strarray__fcntl_linux_specific_cmds,
 778};
 779
 780static DEFINE_STRARRAYS(fcntl_cmds_arrays);
 781
 782static const char *rlimit_resources[] = {
 783	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 784	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
 785	"RTTIME",
 786};
 787static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
 788
 789static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
 790static DEFINE_STRARRAY(sighow, "SIG_");
 791
 792static const char *clockid[] = {
 793	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
 794	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
 795	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 796};
 797static DEFINE_STRARRAY(clockid, "CLOCK_");
 798
 799static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 800						 struct syscall_arg *arg)
 801{
 802	bool show_prefix = arg->show_string_prefix;
 803	const char *suffix = "_OK";
 804	size_t printed = 0;
 805	int mode = arg->val;
 806
 807	if (mode == F_OK) /* 0 */
 808		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
 809#define	P_MODE(n) \
 810	if (mode & n##_OK) { \
 811		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
 812		mode &= ~n##_OK; \
 813	}
 814
 815	P_MODE(R);
 816	P_MODE(W);
 817	P_MODE(X);
 818#undef P_MODE
 819
 820	if (mode)
 821		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 822
 823	return printed;
 824}
 825
 826#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
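/*
 * Example output: access(path, R_OK|W_OK) has its mode rendered as "RW"
 * (or "R_OKW_OK" with string prefixes on, since P_MODE() concatenates the
 * matches without a separator); leftover unknown bits are appended as
 * "|0x...", and F_OK (0) is special-cased as "F" / "F_OK".
 */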
 827
 828static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 829					      struct syscall_arg *arg);
 830
 831#define SCA_FILENAME syscall_arg__scnprintf_filename
 832
 833static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 834						struct syscall_arg *arg)
 835{
 836	bool show_prefix = arg->show_string_prefix;
 837	const char *prefix = "O_";
 838	int printed = 0, flags = arg->val;
 839
 840#define	P_FLAG(n) \
 841	if (flags & O_##n) { \
 842		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
 843		flags &= ~O_##n; \
 844	}
 845
 846	P_FLAG(CLOEXEC);
 847	P_FLAG(NONBLOCK);
 848#undef P_FLAG
 849
 850	if (flags)
 851		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 852
 853	return printed;
 854}
 855
 856#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 857
 858#ifndef GRND_NONBLOCK
 859#define GRND_NONBLOCK	0x0001
 860#endif
 861#ifndef GRND_RANDOM
 862#define GRND_RANDOM	0x0002
 863#endif
 864
 865static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 866						   struct syscall_arg *arg)
 867{
 868	bool show_prefix = arg->show_string_prefix;
 869	const char *prefix = "GRND_";
 870	int printed = 0, flags = arg->val;
 871
 872#define	P_FLAG(n) \
 873	if (flags & GRND_##n) { \
 874		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
 875		flags &= ~GRND_##n; \
 876	}
 877
 878	P_FLAG(RANDOM);
 879	P_FLAG(NONBLOCK);
 880#undef P_FLAG
 881
 882	if (flags)
 883		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 884
 885	return printed;
 886}
 887
 888#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 889
 890#define STRARRAY(name, array) \
 891	  { .scnprintf	= SCA_STRARRAY, \
 892	    .strtoul	= STUL_STRARRAY, \
 893	    .parm	= &strarray__##array, }
 894
 895#define STRARRAY_FLAGS(name, array) \
 896	  { .scnprintf	= SCA_STRARRAY_FLAGS, \
 897	    .strtoul	= STUL_STRARRAY_FLAGS, \
 898	    .parm	= &strarray__##array, }
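/*
 * These shorthands expand to the designated initializer for one argument
 * slot in syscall_fmts[] below, e.g. STRARRAY(cmd, bpf_cmd) becomes:
 *
 *	{ .scnprintf = SCA_STRARRAY, .strtoul = STUL_STRARRAY,
 *	  .parm = &strarray__bpf_cmd, }
 *
 * Note that the 'name' parameter is not used in the expansion; it only
 * documents the argument's name at the use site.
 */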
 899
 900#include "trace/beauty/arch_errno_names.c"
 901#include "trace/beauty/eventfd.c"
 902#include "trace/beauty/futex_op.c"
 903#include "trace/beauty/futex_val3.c"
 904#include "trace/beauty/mmap.c"
 905#include "trace/beauty/mode_t.c"
 906#include "trace/beauty/msg_flags.c"
 907#include "trace/beauty/open_flags.c"
 908#include "trace/beauty/perf_event_open.c"
 909#include "trace/beauty/pid.c"
 910#include "trace/beauty/sched_policy.c"
 911#include "trace/beauty/seccomp.c"
 912#include "trace/beauty/signum.c"
 913#include "trace/beauty/socket_type.c"
 914#include "trace/beauty/waitid_options.c"
 915
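/*
 * This table must be kept sorted by ->name: syscall_fmt__find() below looks
 * entries up with bsearch()/strcmp().
 */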
 916static const struct syscall_fmt syscall_fmts[] = {
 917	{ .name	    = "access",
 918	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
 919	{ .name	    = "arch_prctl",
 920	  .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
 921		   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
 922	{ .name	    = "bind",
 923	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
 924		   [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ },
 925		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
 926	{ .name	    = "bpf",
 927	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 928	{ .name	    = "brk",	    .hexret = true,
 929	  .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
 930	{ .name     = "clock_gettime",
 931	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
 932	{ .name	    = "clock_nanosleep",
 933	  .arg = { [2] = { .scnprintf = SCA_TIMESPEC,  /* rqtp */ }, }, },
 934	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
 935	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
 936		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
 937		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
 938		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
 939		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
 940	{ .name	    = "close",
 941	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
 942	{ .name	    = "connect",
 943	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
 944		   [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ },
 945		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
 946	{ .name	    = "epoll_ctl",
 947	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
 948	{ .name	    = "eventfd2",
 949	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
 950	{ .name	    = "fchmodat",
 951	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 952	{ .name	    = "fchownat",
 953	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 954	{ .name	    = "fcntl",
 955	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD,  /* cmd */
 956			   .strtoul   = STUL_STRARRAYS,
 957			   .parm      = &strarrays__fcntl_cmds_arrays,
 958			   .show_zero = true, },
 959		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
 960	{ .name	    = "flock",
 961	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
 962	{ .name     = "fsconfig",
 963	  .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
 964	{ .name     = "fsmount",
 965	  .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
 966		   [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
 967	{ .name     = "fspick",
 968	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dfd */ },
 969		   [1] = { .scnprintf = SCA_FILENAME,	  /* path */ },
 970		   [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
 971	{ .name	    = "fstat", .alias = "newfstat", },
 972	{ .name	    = "fstatat", .alias = "newfstatat", },
 973	{ .name	    = "futex",
 974	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
 975		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
 976	{ .name	    = "futimesat",
 977	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 978	{ .name	    = "getitimer",
 979	  .arg = { [0] = STRARRAY(which, itimers), }, },
 980	{ .name	    = "getpid",	    .errpid = true, },
 981	{ .name	    = "getpgid",    .errpid = true, },
 982	{ .name	    = "getppid",    .errpid = true, },
 983	{ .name	    = "getrandom",
 984	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
 985	{ .name	    = "getrlimit",
 986	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 987	{ .name	    = "getsockopt",
 988	  .arg = { [1] = STRARRAY(level, socket_level), }, },
 989	{ .name	    = "gettid",	    .errpid = true, },
 990	{ .name	    = "ioctl",
 991	  .arg = {
 992#if defined(__i386__) || defined(__x86_64__)
 993/*
 994 * FIXME: Make this available to all arches.
 995 */
 996		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
 997		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 998#else
 999		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
1000#endif
1001	{ .name	    = "kcmp",	    .nr_args = 5,
1002	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
1003		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
1004		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
1005		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
1006		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
1007	{ .name	    = "keyctl",
1008	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
1009	{ .name	    = "kill",
1010	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1011	{ .name	    = "linkat",
1012	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1013	{ .name	    = "lseek",
1014	  .arg = { [2] = STRARRAY(whence, whences), }, },
1015	{ .name	    = "lstat", .alias = "newlstat", },
1016	{ .name     = "madvise",
1017	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
1018		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
1019	{ .name	    = "mkdirat",
1020	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1021	{ .name	    = "mknodat",
1022	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1023	{ .name	    = "mmap",	    .hexret = true,
1024/* The standard mmap maps to old_mmap on s390x */
1025#if defined(__s390x__)
1026	.alias = "old_mmap",
1027#endif
1028	  .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
1029		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */
1030			   .strtoul   = STUL_STRARRAY_FLAGS,
1031			   .parm      = &strarray__mmap_flags, },
1032		   [5] = { .scnprintf = SCA_HEX,	/* offset */ }, }, },
1033	{ .name	    = "mount",
1034	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
1035		   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
1036			   .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
1037	{ .name	    = "move_mount",
1038	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* from_dfd */ },
1039		   [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ },
1040		   [2] = { .scnprintf = SCA_FDAT,	/* to_dfd */ },
1041		   [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ },
1042		   [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
1043	{ .name	    = "mprotect",
1044	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
1045		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
1046	{ .name	    = "mq_unlink",
1047	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
1048	{ .name	    = "mremap",	    .hexret = true,
1049	  .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
1050	{ .name	    = "name_to_handle_at",
1051	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1052	{ .name	    = "newfstatat",
1053	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1054	{ .name	    = "open",
1055	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1056	{ .name	    = "open_by_handle_at",
1057	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
1058		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1059	{ .name	    = "openat",
1060	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
1061		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1062	{ .name	    = "perf_event_open",
1063	  .arg = { [0] = { .scnprintf = SCA_PERF_ATTR,  /* attr */ },
1064		   [2] = { .scnprintf = SCA_INT,	/* cpu */ },
1065		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
1066		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
1067	{ .name	    = "pipe2",
1068	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
1069	{ .name	    = "pkey_alloc",
1070	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
1071	{ .name	    = "pkey_free",
1072	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
1073	{ .name	    = "pkey_mprotect",
1074	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
1075		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
1076		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
1077	{ .name	    = "poll", .timeout = true, },
1078	{ .name	    = "ppoll", .timeout = true, },
1079	{ .name	    = "prctl",
1080	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */
1081			   .strtoul   = STUL_STRARRAY,
1082			   .parm      = &strarray__prctl_options, },
1083		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
1084		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
1085	{ .name	    = "pread", .alias = "pread64", },
1086	{ .name	    = "preadv", .alias = "pread", },
1087	{ .name	    = "prlimit64",
1088	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
1089	{ .name	    = "pwrite", .alias = "pwrite64", },
1090	{ .name	    = "readlinkat",
1091	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1092	{ .name	    = "recvfrom",
1093	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1094	{ .name	    = "recvmmsg",
1095	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1096	{ .name	    = "recvmsg",
1097	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1098	{ .name	    = "renameat",
1099	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1100		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
1101	{ .name	    = "renameat2",
1102	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1103		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
1104		   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
1105	{ .name	    = "rt_sigaction",
1106	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1107	{ .name	    = "rt_sigprocmask",
1108	  .arg = { [0] = STRARRAY(how, sighow), }, },
1109	{ .name	    = "rt_sigqueueinfo",
1110	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1111	{ .name	    = "rt_tgsigqueueinfo",
1112	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1113	{ .name	    = "sched_setscheduler",
1114	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
1115	{ .name	    = "seccomp",
1116	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
1117		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
1118	{ .name	    = "select", .timeout = true, },
1119	{ .name	    = "sendfile", .alias = "sendfile64", },
1120	{ .name	    = "sendmmsg",
1121	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1122	{ .name	    = "sendmsg",
1123	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1124	{ .name	    = "sendto",
1125	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
1126		   [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
1127	{ .name	    = "set_tid_address", .errpid = true, },
1128	{ .name	    = "setitimer",
1129	  .arg = { [0] = STRARRAY(which, itimers), }, },
1130	{ .name	    = "setrlimit",
1131	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
1132	{ .name	    = "setsockopt",
1133	  .arg = { [1] = STRARRAY(level, socket_level), }, },
1134	{ .name	    = "socket",
1135	  .arg = { [0] = STRARRAY(family, socket_families),
1136		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1137		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1138	{ .name	    = "socketpair",
1139	  .arg = { [0] = STRARRAY(family, socket_families),
1140		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1141		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1142	{ .name	    = "stat", .alias = "newstat", },
1143	{ .name	    = "statx",
1144	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
 1145		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
1146		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
1147	{ .name	    = "swapoff",
1148	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
1149	{ .name	    = "swapon",
1150	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
1151	{ .name	    = "symlinkat",
1152	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1153	{ .name	    = "sync_file_range",
1154	  .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
1155	{ .name	    = "tgkill",
1156	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1157	{ .name	    = "tkill",
1158	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1159	{ .name     = "umount2", .alias = "umount",
1160	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
1161	{ .name	    = "uname", .alias = "newuname", },
1162	{ .name	    = "unlinkat",
1163	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1164	{ .name	    = "utimensat",
1165	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
1166	{ .name	    = "wait4",	    .errpid = true,
1167	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1168	{ .name	    = "waitid",	    .errpid = true,
1169	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1170};
1171
1172static int syscall_fmt__cmp(const void *name, const void *fmtp)
1173{
1174	const struct syscall_fmt *fmt = fmtp;
1175	return strcmp(name, fmt->name);
1176}
1177
1178static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts,
1179						     const int nmemb,
1180						     const char *name)
1181{
1182	return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1183}
1184
1185static const struct syscall_fmt *syscall_fmt__find(const char *name)
1186{
1187	const int nmemb = ARRAY_SIZE(syscall_fmts);
1188	return __syscall_fmt__find(syscall_fmts, nmemb, name);
1189}
1190
1191static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts,
1192							      const int nmemb, const char *alias)
1193{
1194	int i;
1195
1196	for (i = 0; i < nmemb; ++i) {
1197		if (fmts[i].alias && strcmp(fmts[i].alias, alias) == 0)
1198			return &fmts[i];
1199	}
1200
1201	return NULL;
1202}
1203
1204static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
1205{
1206	const int nmemb = ARRAY_SIZE(syscall_fmts);
1207	return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
1208}
1209
1210/*
1211 * is_exit: is this "exit" or "exit_group"?
1212 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
1213 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
1214 * nonexistent: Just a hole in the syscall table, syscall id not allocated
1215 */
1216struct syscall {
1217	struct tep_event    *tp_format;
1218	int		    nr_args;
1219	int		    args_size;
1220	struct {
1221		struct bpf_program *sys_enter,
1222				   *sys_exit;
1223	}		    bpf_prog;
1224	bool		    is_exit;
1225	bool		    is_open;
1226	bool		    nonexistent;
1227	struct tep_format_field *args;
1228	const char	    *name;
1229	const struct syscall_fmt  *fmt;
1230	struct syscall_arg_fmt *arg_fmt;
1231};
1232
1233/*
 1234 * We need this 'calculated' boolean because in some cases we really don't
 1235 * know the duration of a syscall, for instance, when we start a session and
 1236 * some threads are already waiting for a syscall to finish, say 'poll', in
 1237 * which case all we can do is to print "( ? )" for the duration and the
 1238 * start timestamp.
1239 */
1240static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
1241{
1242	double duration = (double)t / NSEC_PER_MSEC;
1243	size_t printed = fprintf(fp, "(");
1244
1245	if (!calculated)
1246		printed += fprintf(fp, "         ");
1247	else if (duration >= 1.0)
1248		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1249	else if (duration >= 0.01)
1250		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1251	else
1252		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1253	return printed + fprintf(fp, "): ");
1254}
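/*
 * The thresholds above color-code the latency: >= 1 ms red, >= 0.01 ms
 * yellow, anything faster in the normal color, e.g. "( 2.345 ms): "; an
 * uncalculated duration prints as blanks between the parentheses.
 */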
1255
1256/**
1257 * filename.ptr: The filename char pointer that will be vfs_getname'd
1258 * filename.entry_str_pos: Where to insert the string translated from
1259 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1260 * ret_scnprintf: syscall args may set this to a different syscall return
1261 *                formatter, for instance, fcntl may return fds, file flags, etc.
1262 */
1263struct thread_trace {
1264	u64		  entry_time;
1265	bool		  entry_pending;
1266	unsigned long	  nr_events;
1267	unsigned long	  pfmaj, pfmin;
1268	char		  *entry_str;
1269	double		  runtime_ms;
1270	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1271        struct {
1272		unsigned long ptr;
1273		short int     entry_str_pos;
1274		bool	      pending_open;
1275		unsigned int  namelen;
1276		char	      *name;
1277	} filename;
1278	struct {
1279		int	      max;
1280		struct file   *table;
1281	} files;
1282
1283	struct intlist *syscall_stats;
1284};
1285
1286static struct thread_trace *thread_trace__new(void)
1287{
1288	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1289
1290	if (ttrace) {
1291		ttrace->files.max = -1;
1292		ttrace->syscall_stats = intlist__new(NULL);
1293	}
1294
1295	return ttrace;
1296}
1297
1298static void thread_trace__free_files(struct thread_trace *ttrace);
1299
1300static void thread_trace__delete(void *pttrace)
1301{
1302	struct thread_trace *ttrace = pttrace;
1303
1304	if (!ttrace)
1305		return;
1306
1307	intlist__delete(ttrace->syscall_stats);
1308	ttrace->syscall_stats = NULL;
1309	thread_trace__free_files(ttrace);
1310	zfree(&ttrace->entry_str);
1311	free(ttrace);
1312}
1313
1314static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1315{
1316	struct thread_trace *ttrace;
1317
1318	if (thread == NULL)
1319		goto fail;
1320
1321	if (thread__priv(thread) == NULL)
1322		thread__set_priv(thread, thread_trace__new());
1323
1324	if (thread__priv(thread) == NULL)
1325		goto fail;
1326
1327	ttrace = thread__priv(thread);
1328	++ttrace->nr_events;
1329
1330	return ttrace;
1331fail:
1332	color_fprintf(fp, PERF_COLOR_RED,
1333		      "WARNING: not enough memory, dropping samples!\n");
1334	return NULL;
1335}
1336
1337
1338void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1339				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1340{
1341	struct thread_trace *ttrace = thread__priv(arg->thread);
1342
1343	ttrace->ret_scnprintf = ret_scnprintf;
1344}
1345
1346#define TRACE_PFMAJ		(1 << 0)
1347#define TRACE_PFMIN		(1 << 1)
1348
1349static const size_t trace__entry_str_size = 2048;
1350
1351static void thread_trace__free_files(struct thread_trace *ttrace)
1352{
1353	for (int i = 0; i < ttrace->files.max; ++i) {
1354		struct file *file = ttrace->files.table + i;
1355		zfree(&file->pathname);
1356	}
1357
1358	zfree(&ttrace->files.table);
1359	ttrace->files.max  = -1;
1360}
1361
1362static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1363{
1364	if (fd < 0)
1365		return NULL;
1366
1367	if (fd > ttrace->files.max) {
1368		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1369
1370		if (nfiles == NULL)
1371			return NULL;
1372
1373		if (ttrace->files.max != -1) {
1374			memset(nfiles + ttrace->files.max + 1, 0,
1375			       (fd - ttrace->files.max) * sizeof(struct file));
1376		} else {
1377			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1378		}
1379
1380		ttrace->files.table = nfiles;
1381		ttrace->files.max   = fd;
1382	}
1383
1384	return ttrace->files.table + fd;
1385}
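/*
 * Growth sketch for the fd -> struct file table above: starting from
 * files.max == -1, a lookup for fd 3 allocates entries [0..3] and zeroes
 * them all; a later lookup for fd 7 realloc()s to 8 entries and zeroes just
 * the new tail [4..7], preserving any pathnames already recorded.
 */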
1386
1387struct file *thread__files_entry(struct thread *thread, int fd)
1388{
1389	return thread_trace__files_entry(thread__priv(thread), fd);
1390}
1391
1392static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1393{
1394	struct thread_trace *ttrace = thread__priv(thread);
1395	struct file *file = thread_trace__files_entry(ttrace, fd);
1396
1397	if (file != NULL) {
1398		struct stat st;
1399		if (stat(pathname, &st) == 0)
1400			file->dev_maj = major(st.st_rdev);
1401		file->pathname = strdup(pathname);
1402		if (file->pathname)
1403			return 0;
1404	}
1405
1406	return -1;
1407}
1408
1409static int thread__read_fd_path(struct thread *thread, int fd)
1410{
1411	char linkname[PATH_MAX], pathname[PATH_MAX];
1412	struct stat st;
1413	int ret;
1414
1415	if (thread__pid(thread) == thread__tid(thread)) {
1416		scnprintf(linkname, sizeof(linkname),
1417			  "/proc/%d/fd/%d", thread__pid(thread), fd);
1418	} else {
1419		scnprintf(linkname, sizeof(linkname),
1420			  "/proc/%d/task/%d/fd/%d",
1421			  thread__pid(thread), thread__tid(thread), fd);
1422	}
1423
1424	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1425		return -1;
1426
1427	ret = readlink(linkname, pathname, sizeof(pathname));
1428
1429	if (ret < 0 || ret > st.st_size)
1430		return -1;
1431
1432	pathname[ret] = '\0';
1433	return trace__set_fd_pathname(thread, fd, pathname);
1434}
1435
1436static const char *thread__fd_path(struct thread *thread, int fd,
1437				   struct trace *trace)
1438{
1439	struct thread_trace *ttrace = thread__priv(thread);
1440
1441	if (ttrace == NULL || trace->fd_path_disabled)
1442		return NULL;
1443
1444	if (fd < 0)
1445		return NULL;
1446
1447	if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1448		if (!trace->live)
1449			return NULL;
1450		++trace->stats.proc_getname;
1451		if (thread__read_fd_path(thread, fd))
1452			return NULL;
1453	}
1454
1455	return ttrace->files.table[fd].pathname;
1456}
1457
1458size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1459{
1460	int fd = arg->val;
1461	size_t printed = scnprintf(bf, size, "%d", fd);
1462	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1463
1464	if (path)
1465		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1466
1467	return printed;
1468}
1469
1470size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1471{
1472        size_t printed = scnprintf(bf, size, "%d", fd);
1473	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1474
1475	if (thread) {
1476		const char *path = thread__fd_path(thread, fd, trace);
1477
1478		if (path)
1479			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1480
1481		thread__put(thread);
1482	}
1483
1484        return printed;
1485}
1486
1487static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1488					      struct syscall_arg *arg)
1489{
1490	int fd = arg->val;
1491	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1492	struct thread_trace *ttrace = thread__priv(arg->thread);
1493
1494	if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1495		zfree(&ttrace->files.table[fd].pathname);
1496
1497	return printed;
1498}
1499
1500static void thread__set_filename_pos(struct thread *thread, const char *bf,
1501				     unsigned long ptr)
1502{
1503	struct thread_trace *ttrace = thread__priv(thread);
1504
1505	ttrace->filename.ptr = ptr;
1506	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1507}
1508
1509static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1510{
1511	struct augmented_arg *augmented_arg = arg->augmented.args;
1512	size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1513	/*
1514	 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1515	 * we would have two strings, each prefixed by its size.
1516	 */
1517	int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1518
1519	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1520	arg->augmented.size -= consumed;
1521
1522	return printed;
1523}
1524
1525static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1526					      struct syscall_arg *arg)
1527{
1528	unsigned long ptr = arg->val;
1529
1530	if (arg->augmented.args)
1531		return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1532
1533	if (!arg->trace->vfs_getname)
1534		return scnprintf(bf, size, "%#x", ptr);
1535
1536	thread__set_filename_pos(arg->thread, bf, ptr);
1537	return 0;
1538}
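/*
 * When the pathname isn't in the augmented payload and vfs_getname is
 * available, nothing is printed here (return 0): thread__set_filename_pos()
 * only records where in ttrace->entry_str the string belongs, and the
 * vfs_getname handler fills it in once the probe event for this pointer
 * arrives.
 */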
1539
1540static bool trace__filter_duration(struct trace *trace, double t)
1541{
1542	return t < (trace->duration_filter * NSEC_PER_MSEC);
1543}
1544
1545static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1546{
1547	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1548
1549	return fprintf(fp, "%10.3f ", ts);
1550}
1551
1552/*
1553 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1554 * using ttrace->entry_time for a thread that receives a sys_exit without
1555 * first having received a sys_enter ("poll" issued before tracing session
 1556 * starts, a sys_enter lost due to ring buffer overflow).
1557 */
1558static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1559{
1560	if (tstamp > 0)
1561		return __trace__fprintf_tstamp(trace, tstamp, fp);
1562
1563	return fprintf(fp, "         ? ");
1564}
1565
1566static pid_t workload_pid = -1;
1567static volatile sig_atomic_t done = false;
1568static volatile sig_atomic_t interrupted = false;
1569
1570static void sighandler_interrupt(int sig __maybe_unused)
1571{
1572	done = interrupted = true;
1573}
1574
1575static void sighandler_chld(int sig __maybe_unused, siginfo_t *info,
1576			    void *context __maybe_unused)
1577{
1578	if (info->si_pid == workload_pid)
1579		done = true;
1580}
1581
1582static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1583{
1584	size_t printed = 0;
1585
1586	if (trace->multiple_threads) {
1587		if (trace->show_comm)
1588			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1589		printed += fprintf(fp, "%d ", thread__tid(thread));
1590	}
1591
1592	return printed;
1593}
1594
1595static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1596					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1597{
1598	size_t printed = 0;
1599
1600	if (trace->show_tstamp)
1601		printed = trace__fprintf_tstamp(trace, tstamp, fp);
1602	if (trace->show_duration)
1603		printed += fprintf_duration(duration, duration_calculated, fp);
1604	return printed + trace__fprintf_comm_tid(trace, thread, fp);
1605}
1606
1607static int trace__process_event(struct trace *trace, struct machine *machine,
1608				union perf_event *event, struct perf_sample *sample)
1609{
1610	int ret = 0;
1611
1612	switch (event->header.type) {
1613	case PERF_RECORD_LOST:
1614		color_fprintf(trace->output, PERF_COLOR_RED,
1615			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1616		ret = machine__process_lost_event(machine, event, sample);
1617		break;
1618	default:
1619		ret = machine__process_event(machine, event, sample);
1620		break;
1621	}
1622
1623	return ret;
1624}
1625
1626static int trace__tool_process(struct perf_tool *tool,
1627			       union perf_event *event,
1628			       struct perf_sample *sample,
1629			       struct machine *machine)
1630{
1631	struct trace *trace = container_of(tool, struct trace, tool);
1632	return trace__process_event(trace, machine, event, sample);
1633}
1634
1635static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1636{
1637	struct machine *machine = vmachine;
1638
1639	if (machine->kptr_restrict_warned)
1640		return NULL;
1641
1642	if (symbol_conf.kptr_restrict) {
1643		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1644			   "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1645			   "Kernel samples will not be resolved.\n");
1646		machine->kptr_restrict_warned = true;
1647		return NULL;
1648	}
1649
1650	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1651}
1652
1653static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
1654{
1655	int err = symbol__init(NULL);
1656
1657	if (err)
1658		return err;
1659
1660	trace->host = machine__new_host();
1661	if (trace->host == NULL)
1662		return -ENOMEM;
1663
1664	thread__set_priv_destructor(thread_trace__delete);
1665
1666	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1667	if (err < 0)
1668		goto out;
1669
1670	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1671					    evlist->core.threads, trace__tool_process,
1672					    true, false, 1);
1673out:
1674	if (err)
1675		symbol__exit();
1676
1677	return err;
1678}
1679
1680static void trace__symbols__exit(struct trace *trace)
1681{
1682	machine__exit(trace->host);
1683	trace->host = NULL;
1684
1685	symbol__exit();
1686}
1687
1688static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1689{
1690	int idx;
1691
1692	if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0)
1693		nr_args = sc->fmt->nr_args;
1694
1695	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1696	if (sc->arg_fmt == NULL)
1697		return -1;
1698
1699	for (idx = 0; idx < nr_args; ++idx) {
1700		if (sc->fmt)
1701			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1702	}
1703
1704	sc->nr_args = nr_args;
1705	return 0;
1706}
1707
1708static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = {
1709	{ .name = "msr",	.scnprintf = SCA_X86_MSR,	  .strtoul = STUL_X86_MSR,	   },
1710	{ .name = "vector",	.scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, },
1711};
1712
1713static int syscall_arg_fmt__cmp(const void *name, const void *fmtp)
1714{
1715	const struct syscall_arg_fmt *fmt = fmtp;
1716	return strcmp(name, fmt->name);
1717}
1718
1719static const struct syscall_arg_fmt *
1720__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb,
1721				const char *name)
1722{
1723	return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp);
1724}
1725
1726static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name)
1727{
1728	const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name);
1729	return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
1730}
1731
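/*
 * Pick a default beautifier for each tracepoint field by heuristics on its
 * type and name: "const char *" fields named *name or containing "path" get
 * SCA_FILENAME, pointers and *addr* fields SCA_PTR, then pid_t, umode_t,
 * char arrays and int/long fields ending in "fd", with a final lookup in
 * the by-name table above (e.g. "msr", "vector"). Fields that already have
 * a formatter are left alone.
 */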
1732static struct tep_format_field *
1733syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field)
1734{
1735	struct tep_format_field *last_field = NULL;
1736	int len;
1737
1738	for (; field; field = field->next, ++arg) {
1739		last_field = field;
1740
1741		if (arg->scnprintf)
1742			continue;
1743
1744		len = strlen(field->name);
1745
1746		if (strcmp(field->type, "const char *") == 0 &&
1747		    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
1748		     strstr(field->name, "path") != NULL))
1749			arg->scnprintf = SCA_FILENAME;
1750		else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
1751			arg->scnprintf = SCA_PTR;
1752		else if (strcmp(field->type, "pid_t") == 0)
1753			arg->scnprintf = SCA_PID;
1754		else if (strcmp(field->type, "umode_t") == 0)
1755			arg->scnprintf = SCA_MODE_T;
1756		else if ((field->flags & TEP_FIELD_IS_ARRAY) && strstr(field->type, "char")) {
1757			arg->scnprintf = SCA_CHAR_ARRAY;
1758			arg->nr_entries = field->arraylen;
1759		} else if ((strcmp(field->type, "int") == 0 ||
1760			  strcmp(field->type, "unsigned int") == 0 ||
1761			  strcmp(field->type, "long") == 0) &&
1762			 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
1763			/*
1764			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1765			 * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1766			 * 65 int
1767			 * 23 unsigned int
1768			 * 7 unsigned long
1769			 */
1770			arg->scnprintf = SCA_FD;
1771		} else {
1772			const struct syscall_arg_fmt *fmt =
1773				syscall_arg_fmt__find_by_name(field->name);
1774
1775			if (fmt) {
1776				arg->scnprintf = fmt->scnprintf;
1777				arg->strtoul   = fmt->strtoul;
1778			}
1779		}
1780	}
1781
1782	return last_field;
1783}
1784
1785static int syscall__set_arg_fmts(struct syscall *sc)
1786{
1787	struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args);
1788
1789	if (last_field)
1790		sc->args_size = last_field->offset + last_field->size;
1791
1792	return 0;
1793}
1794
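/*
 * Lazily fill the table entry for syscall 'id': resolve its name, read its
 * syscalls:sys_enter_* tracepoint format (trying the alias if the first
 * lookup fails) and set up the argument formatters. Entries whose
 * tracepoint can't be read are flagged nonexistent so we don't retry them.
 */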
1795static int trace__read_syscall_info(struct trace *trace, int id)
1796{
1797	char tp_name[128];
1798	struct syscall *sc;
1799	const char *name = syscalltbl__name(trace->sctbl, id);
1800
1801#ifdef HAVE_SYSCALL_TABLE_SUPPORT
1802	if (trace->syscalls.table == NULL) {
1803		trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
1804		if (trace->syscalls.table == NULL)
1805			return -ENOMEM;
1806	}
1807#else
1808	if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) {
1809		// When using libaudit we don't know beforehand what the max syscall id is
1810		struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1811
1812		if (table == NULL)
1813			return -ENOMEM;
1814
1815		// Zero the whole table if brand new, otherwise just the newly added entries
1816		if (trace->syscalls.table == NULL)
1817			memset(table, 0, (id + 1) * sizeof(*sc));
1818		else
1819			memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc));
1820
1821		trace->syscalls.table	      = table;
1822		trace->sctbl->syscalls.max_id = id;
1823	}
1824#endif
1825	sc = trace->syscalls.table + id;
1826	if (sc->nonexistent)
1827		return -EEXIST;
1828
1829	if (name == NULL) {
1830		sc->nonexistent = true;
1831		return -EEXIST;
1832	}
1833
1834	sc->name = name;
1835	sc->fmt  = syscall_fmt__find(sc->name);
1836
1837	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1838	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1839
1840	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1841		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1842		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1843	}
1844
1845	/*
1846	 * If we fail to read the tracepoint format via its sysfs node, the
1847	 * tracepoint doesn't exist, so mark it by setting the 'nonexistent' flag.
1848	 */
1849	if (IS_ERR(sc->tp_format)) {
1850		sc->nonexistent = true;
1851		return PTR_ERR(sc->tp_format);
1852	}
1853
1854	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
1855					RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields))
1856		return -ENOMEM;
1857
1858	sc->args = sc->tp_format->format.fields;
1859	/*
1860	 * The first field, '__syscall_nr' or 'nr', carries the syscall number,
1861	 * which is redundant here, so check for it and drop it. Note that it
1862	 * does not exist on older kernels, hence the check.
1863	 */
1864	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1865		sc->args = sc->args->next;
1866		--sc->nr_args;
1867	}
1868
1869	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1870	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1871
1872	return syscall__set_arg_fmts(sc);
1873}
1874
1875static int evsel__init_tp_arg_scnprintf(struct evsel *evsel)
1876{
1877	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
1878
1879	if (fmt != NULL) {
1880		syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields);
1881		return 0;
1882	}
1883
1884	return -ENOMEM;
1885}
1886
1887static int intcmp(const void *a, const void *b)
1888{
1889	const int *one = a, *another = b;
1890
1891	return *one - *another;
1892}
1893
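/*
 * Turn the list of syscall names given with -e (e.g. "perf trace -e
 * open*,close") into a sorted array of syscall ids, expanding glob patterns
 * and growing the array as matches are found; unknown names are only
 * reported at debug verbosity.
 */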
1894static int trace__validate_ev_qualifier(struct trace *trace)
1895{
1896	int err = 0;
1897	bool printed_invalid_prefix = false;
1898	struct str_node *pos;
1899	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
1900
1901	trace->ev_qualifier_ids.entries = malloc(nr_allocated *
1902						 sizeof(trace->ev_qualifier_ids.entries[0]));
1903
1904	if (trace->ev_qualifier_ids.entries == NULL) {
1905		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1906		       trace->output);
1907		err = -EINVAL;
1908		goto out;
1909	}
1910
1911	strlist__for_each_entry(pos, trace->ev_qualifier) {
1912		const char *sc = pos->s;
1913		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1914
1915		if (id < 0) {
1916			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1917			if (id >= 0)
1918				goto matches;
1919
1920			if (!printed_invalid_prefix) {
1921				pr_debug("Skipping unknown syscalls: ");
1922				printed_invalid_prefix = true;
1923			} else {
1924				pr_debug(", ");
1925			}
1926
1927			pr_debug("%s", sc);
1928			continue;
1929		}
1930matches:
1931		trace->ev_qualifier_ids.entries[nr_used++] = id;
1932		if (match_next == -1)
1933			continue;
1934
1935		while (1) {
1936			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1937			if (id < 0)
1938				break;
1939			if (nr_allocated == nr_used) {
1940				void *entries;
1941
1942				nr_allocated += 8;
1943				entries = realloc(trace->ev_qualifier_ids.entries,
1944						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1945				if (entries == NULL) {
1946					err = -ENOMEM;
1947					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1948					goto out_free;
1949				}
1950				trace->ev_qualifier_ids.entries = entries;
1951			}
1952			trace->ev_qualifier_ids.entries[nr_used++] = id;
1953		}
1954	}
1955
1956	trace->ev_qualifier_ids.nr = nr_used;
1957	qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
1958out:
1959	if (printed_invalid_prefix)
1960		pr_debug("\n");
1961	return err;
1962out_free:
1963	zfree(&trace->ev_qualifier_ids.entries);
1964	trace->ev_qualifier_ids.nr = 0;
1965	goto out;
1966}
1967
1968static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
1969{
1970	bool in_ev_qualifier;
1971
1972	if (trace->ev_qualifier_ids.nr == 0)
1973		return true;
1974
1975	in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
1976				  trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
1977
1978	if (in_ev_qualifier)
1979		return !trace->not_ev_qualifier;
1980
1981	return trace->not_ev_qualifier;
1982}
1983
1984/*
1985 * args is to be interpreted as a series of longs but we need to handle
1986 * 8-byte unaligned accesses. args points to raw_data within the event
1987 * and raw_data is guaranteed to be 8-byte misaligned because it is
1988 * preceded by raw_size, which is a u32. So we need to copy args to a temp
1989 * variable to read it. Most notably this avoids extended load instructions
1990 * on unaligned addresses.
1991 */
1992unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1993{
1994	unsigned long val;
1995	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1996
1997	memcpy(&val, p, sizeof(val));
1998	return val;
1999}
2000
2001static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
2002				      struct syscall_arg *arg)
2003{
2004	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
2005		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
2006
2007	return scnprintf(bf, size, "arg%d: ", arg->idx);
2008}
2009
2010/*
2011 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
2012 * as mount 'flags' argument that needs ignoring some magic flag, see comment
2013 * in tools/perf/trace/beauty/mount_flags.c
2014 */
2015static unsigned long syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsigned long val)
2016{
2017	if (fmt && fmt->mask_val)
2018		return fmt->mask_val(arg, val);
2019
2020	return val;
2021}
2022
2023static size_t syscall_arg_fmt__scnprintf_val(struct syscall_arg_fmt *fmt, char *bf, size_t size,
2024					     struct syscall_arg *arg, unsigned long val)
2025{
2026	if (fmt && fmt->scnprintf) {
2027		arg->val = val;
2028		if (fmt->parm)
2029			arg->parm = fmt->parm;
2030		return fmt->scnprintf(bf, size, arg);
2031	}
2032	return scnprintf(bf, size, "%ld", val);
2033}
2034
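/*
 * Format all arguments of a syscall into 'bf'. If we have the tracepoint
 * format, walk its fields, masking values and suppressing zeros as
 * configured; when the format file couldn't be read, fall back to dumping
 * the raw args as "argN: value".
 */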
2035static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
2036				      unsigned char *args, void *augmented_args, int augmented_args_size,
2037				      struct trace *trace, struct thread *thread)
2038{
2039	size_t printed = 0;
2040	unsigned long val;
2041	u8 bit = 1;
2042	struct syscall_arg arg = {
2043		.args	= args,
2044		.augmented = {
2045			.size = augmented_args_size,
2046			.args = augmented_args,
2047		},
2048		.idx	= 0,
2049		.mask	= 0,
2050		.trace  = trace,
2051		.thread = thread,
2052		.show_string_prefix = trace->show_string_prefix,
2053	};
2054	struct thread_trace *ttrace = thread__priv(thread);
2055
2056	/*
2057	 * Things like fcntl will set this in its 'cmd' formatter to pick the
2058	 * right formatter for the return value (an fd? file flags?), which is
2059	 * not needed for syscalls that always return a given type, say an fd.
2060	 */
2061	ttrace->ret_scnprintf = NULL;
2062
2063	if (sc->args != NULL) {
2064		struct tep_format_field *field;
2065
2066		for (field = sc->args; field;
2067		     field = field->next, ++arg.idx, bit <<= 1) {
2068			if (arg.mask & bit)
2069				continue;
2070
2071			arg.fmt = &sc->arg_fmt[arg.idx];
2072			val = syscall_arg__val(&arg, arg.idx);
2073			/*
2074			 * Some syscall args need some mask, most don't and
2075			 * return val untouched.
2076			 */
2077			val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);
2078
2079			/*
2080			 * Suppress this argument if its value is zero and
2081			 * we don't have a string associated in a strarray
2082			 * for it.
2083			 */
2084			if (val == 0 &&
2085			    !trace->show_zeros &&
2086			    !(sc->arg_fmt &&
2087			      (sc->arg_fmt[arg.idx].show_zero ||
2088			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
2089			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
2090			      sc->arg_fmt[arg.idx].parm))
2091				continue;
2092
2093			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
2094
2095			if (trace->show_arg_names)
2096				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
2097
2098			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
2099								  bf + printed, size - printed, &arg, val);
2100		}
2101	} else if (IS_ERR(sc->tp_format)) {
2102		/*
2103		 * If we managed to read the tracepoint /format file, then we
2104		 * may end up not having any args, like with gettid(), so only
2105		 * print the raw args when we didn't manage to read it.
2106		 */
2107		while (arg.idx < sc->nr_args) {
2108			if (arg.mask & bit)
2109				goto next_arg;
2110			val = syscall_arg__val(&arg, arg.idx);
2111			if (printed)
2112				printed += scnprintf(bf + printed, size - printed, ", ");
2113			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
2114			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
2115next_arg:
2116			++arg.idx;
2117			bit <<= 1;
2118		}
2119	}
2120
2121	return printed;
2122}
2123
2124typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
2125				  union perf_event *event,
2126				  struct perf_sample *sample);
2127
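/*
 * Map a syscall id from a sample to its struct syscall, reading its info on
 * first use and, in the libaudit case, growing the table as new ids appear;
 * returns NULL (with diagnostics at higher verbosity) when the id can't be
 * resolved.
 */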
2128static struct syscall *trace__syscall_info(struct trace *trace,
2129					   struct evsel *evsel, int id)
2130{
2131	int err = 0;
2132
2133	if (id < 0) {
2134
2135		/*
2136		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
2137		 * before that, leaving at a higher verbosity level till that is
2138		 * explained. Reproduced with plain ftrace with:
2139		 *
2140		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
2141		 * grep "NR -1 " /t/trace_pipe
2142		 *
2143		 * After generating some load on the machine.
2144		 */
2145		if (verbose > 1) {
2146			static u64 n;
2147			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
2148				id, evsel__name(evsel), ++n);
2149		}
2150		return NULL;
2151	}
2152
2153	err = -EINVAL;
2154
2155#ifdef HAVE_SYSCALL_TABLE_SUPPORT
2156	if (id > trace->sctbl->syscalls.max_id) {
2157#else
2158	if (id >= trace->sctbl->syscalls.max_id) {
2159		/*
2160		 * With libaudit we don't know beforehand what is the max_id,
2161		 * so we let trace__read_syscall_info() figure that out as we
2162		 * go on reading syscalls.
2163		 */
2164		err = trace__read_syscall_info(trace, id);
2165		if (err)
2166#endif
2167		goto out_cant_read;
2168	}
2169
2170	if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
2171	    (err = trace__read_syscall_info(trace, id)) != 0)
2172		goto out_cant_read;
2173
2174	if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
2175		goto out_cant_read;
2176
2177	return &trace->syscalls.table[id];
2178
2179out_cant_read:
2180	if (verbose > 0) {
2181		char sbuf[STRERR_BUFSIZE];
2182		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
2183		if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
2184			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
2185		fputs(" information\n", trace->output);
2186	}
2187	return NULL;
2188}
2189
2190struct syscall_stats {
2191	struct stats stats;
2192	u64	     nr_failures;
2193	int	     max_errno;
2194	u32	     *errnos;
2195};
2196
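/*
 * Account one syscall exit into the per-thread summary: update the duration
 * stats and, when --errno-summary is used, bump a per-errno counter in an
 * array grown on demand.
 */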
2197static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace,
2198				 int id, struct perf_sample *sample, long err, bool errno_summary)
2199{
2200	struct int_node *inode;
2201	struct syscall_stats *stats;
2202	u64 duration = 0;
2203
2204	inode = intlist__findnew(ttrace->syscall_stats, id);
2205	if (inode == NULL)
2206		return;
2207
2208	stats = inode->priv;
2209	if (stats == NULL) {
2210		stats = zalloc(sizeof(*stats));
2211		if (stats == NULL)
2212			return;
2213
2214		init_stats(&stats->stats);
2215		inode->priv = stats;
2216	}
2217
2218	if (ttrace->entry_time && sample->time > ttrace->entry_time)
2219		duration = sample->time - ttrace->entry_time;
2220
2221	update_stats(&stats->stats, duration);
2222
2223	if (err < 0) {
2224		++stats->nr_failures;
2225
2226		if (!errno_summary)
2227			return;
2228
2229		err = -err;
2230		if (err > stats->max_errno) {
2231			u32 *new_errnos = realloc(stats->errnos, err * sizeof(u32));
2232
2233			if (new_errnos) {
2234				memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32));
2235			} else {
2236				pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n",
2237					 thread__comm_str(thread), thread__pid(thread),
2238					 thread__tid(thread));
2239				return;
2240			}
2241
2242			stats->errnos = new_errnos;
2243			stats->max_errno = err;
2244		}
2245
2246		++stats->errnos[err - 1];
2247	}
2248}
2249
2250static int trace__printf_interrupted_entry(struct trace *trace)
2251{
2252	struct thread_trace *ttrace;
2253	size_t printed;
2254	int len;
2255
2256	if (trace->failure_only || trace->current == NULL)
2257		return 0;
2258
2259	ttrace = thread__priv(trace->current);
2260
2261	if (!ttrace->entry_pending)
2262		return 0;
2263
2264	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
2265	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
2266
2267	if (len < trace->args_alignment - 4)
2268		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
2269
2270	printed += fprintf(trace->output, " ...\n");
2271
2272	ttrace->entry_pending = false;
2273	++trace->nr_events_printed;
2274
2275	return printed;
2276}
2277
2278static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
2279				 struct perf_sample *sample, struct thread *thread)
2280{
2281	int printed = 0;
2282
2283	if (trace->print_sample) {
2284		double ts = (double)sample->time / NSEC_PER_MSEC;
2285
2286		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
2287				   evsel__name(evsel), ts,
2288				   thread__comm_str(thread),
2289				   sample->pid, sample->tid, sample->cpu);
2290	}
2291
2292	return printed;
2293}
2294
2295static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
2296{
2297	void *augmented_args = NULL;
2298	/*
2299	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
2300	 * and there we get all 6 syscall args plus the tracepoint common fields
2301	 * that get calculated at the start and the syscall_nr (another long).
2302	 * So we check if that is the case and, if so, don't look for the
2303	 * augmented args after sc->args_size but after the full
2304	 * raw_syscalls:sys_enter payload, which is fixed.
2305	 *
2306	 * We'll revisit this later to pass s->args_size to the BPF augmenter
2307	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c), so that it
2308	 * copies only what we need for each syscall, like what happens when we
2309	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
2310	 * traffic to just what is needed for each syscall.
2311	 */
2312	int args_size = raw_augmented_args_size ?: sc->args_size;
2313
2314	*augmented_args_size = sample->raw_size - args_size;
2315	if (*augmented_args_size > 0)
2316		augmented_args = sample->raw_data + args_size;
2317
2318	return augmented_args;
2319}
2320
2321static void syscall__exit(struct syscall *sc)
2322{
2323	if (!sc)
2324		return;
2325
2326	zfree(&sc->arg_fmt);
2327}
2328
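/*
 * raw_syscalls:sys_enter handler: format the entry into ttrace->entry_str
 * and leave it pending so it can be completed with the return value on the
 * matching sys_exit; syscalls that never return (exit, exit_group) are
 * printed right away with a "= ?" return.
 */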
2329static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
2330			    union perf_event *event __maybe_unused,
2331			    struct perf_sample *sample)
2332{
2333	char *msg;
2334	void *args;
2335	int printed = 0;
2336	struct thread *thread;
2337	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2338	int augmented_args_size = 0;
2339	void *augmented_args = NULL;
2340	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2341	struct thread_trace *ttrace;
2342
2343	if (sc == NULL)
2344		return -1;
2345
2346	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2347	ttrace = thread__trace(thread, trace->output);
2348	if (ttrace == NULL)
2349		goto out_put;
2350
2351	trace__fprintf_sample(trace, evsel, sample, thread);
2352
2353	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
2354
2355	if (ttrace->entry_str == NULL) {
2356		ttrace->entry_str = malloc(trace__entry_str_size);
2357		if (!ttrace->entry_str)
2358			goto out_put;
2359	}
2360
2361	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
2362		trace__printf_interrupted_entry(trace);
2363	/*
2364	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
2365	 * arguments, even if the syscall being handled, say "openat", uses only 4;
2366	 * this breaks the syscall__augmented_args() check for augmented args, as we
2367	 * calculate syscall->args_size using each syscalls:sys_enter_NAME tracefs
2368	 * format file. So when handling, say, the openat syscall, we end up getting
2369	 * 6 args for the raw_syscalls:sys_enter event when we expected just 4, and
2370	 * mistakenly think that the extra 2 u64 args are the augmented filename.
2371	 * Check here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
2372	 */
2373	if (evsel != trace->syscalls.events.sys_enter)
2374		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
2375	ttrace->entry_time = sample->time;
2376	msg = ttrace->entry_str;
2377	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
2378
2379	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
2380					   args, augmented_args, augmented_args_size, trace, thread);
2381
2382	if (sc->is_exit) {
2383		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
2384			int alignment = 0;
2385
2386			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
2387			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
2388			if (trace->args_alignment > printed)
2389				alignment = trace->args_alignment - printed;
2390			fprintf(trace->output, "%*s= ?\n", alignment, " ");
2391		}
2392	} else {
2393		ttrace->entry_pending = true;
2394		/* See trace__vfs_getname & trace__sys_exit */
2395		ttrace->filename.pending_open = false;
2396	}
2397
2398	if (trace->current != thread) {
2399		thread__put(trace->current);
2400		trace->current = thread__get(thread);
2401	}
2402	err = 0;
2403out_put:
2404	thread__put(thread);
2405	return err;
2406}
2407
2408static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
2409				    struct perf_sample *sample)
2410{
2411	struct thread_trace *ttrace;
2412	struct thread *thread;
2413	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2414	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2415	char msg[1024];
2416	void *args, *augmented_args = NULL;
2417	int augmented_args_size;
2418
2419	if (sc == NULL)
2420		return -1;
2421
2422	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2423	ttrace = thread__trace(thread, trace->output);
2424	/*
2425	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
2426	 * and the rest of the beautifiers access it via struct syscall_arg.
2427	 */
2428	if (ttrace == NULL)
2429		goto out_put;
2430
2431	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
2432	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
2433	syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
2434	fprintf(trace->output, "%s", msg);
2435	err = 0;
2436out_put:
2437	thread__put(thread);
2438	return err;
2439}
2440
2441static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
2442				    struct perf_sample *sample,
2443				    struct callchain_cursor *cursor)
2444{
2445	struct addr_location al;
2446	int max_stack = evsel->core.attr.sample_max_stack ?
2447			evsel->core.attr.sample_max_stack :
2448			trace->max_stack;
2449	int err = -1;
2450
2451	addr_location__init(&al);
2452	if (machine__resolve(trace->host, &al, sample) < 0)
2453		goto out;
2454
2455	err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2456out:
2457	addr_location__exit(&al);
2458	return err;
2459}
2460
2461static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2462{
2463	/* TODO: user-configurable print_opts */
2464	const unsigned int print_opts = EVSEL__PRINT_SYM |
2465				        EVSEL__PRINT_DSO |
2466				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
2467
2468	return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
2469}
2470
2471static const char *errno_to_name(struct evsel *evsel, int err)
2472{
2473	struct perf_env *env = evsel__env(evsel);
2474
2475	return perf_env__arch_strerrno(env, err);
2476}
2477
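/*
 * raw_syscalls:sys_exit handler: compute the duration from the pending
 * entry, update the summary stats, then print the return value using the
 * syscall's formatter (errno name, hex, pid + comm, "0 (Timeout)", ...).
 */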
2478static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
2479			   union perf_event *event __maybe_unused,
2480			   struct perf_sample *sample)
2481{
2482	long ret;
2483	u64 duration = 0;
2484	bool duration_calculated = false;
2485	struct thread *thread;
2486	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
2487	int alignment = trace->args_alignment;
2488	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2489	struct thread_trace *ttrace;
2490
2491	if (sc == NULL)
2492		return -1;
2493
2494	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2495	ttrace = thread__trace(thread, trace->output);
2496	if (ttrace == NULL)
2497		goto out_put;
2498
2499	trace__fprintf_sample(trace, evsel, sample, thread);
2500
2501	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2502
2503	if (trace->summary)
2504		thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary);
2505
2506	if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2507		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2508		ttrace->filename.pending_open = false;
2509		++trace->stats.vfs_getname;
2510	}
2511
2512	if (ttrace->entry_time) {
2513		duration = sample->time - ttrace->entry_time;
2514		if (trace__filter_duration(trace, duration))
2515			goto out;
2516		duration_calculated = true;
2517	} else if (trace->duration_filter)
2518		goto out;
2519
2520	if (sample->callchain) {
2521		struct callchain_cursor *cursor = get_tls_callchain_cursor();
2522
2523		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
2524		if (callchain_ret == 0) {
2525			if (cursor->nr < trace->min_stack)
2526				goto out;
2527			callchain_ret = 1;
2528		}
2529	}
2530
2531	if (trace->summary_only || (ret >= 0 && trace->failure_only))
2532		goto out;
2533
2534	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2535
2536	if (ttrace->entry_pending) {
2537		printed = fprintf(trace->output, "%s", ttrace->entry_str);
2538	} else {
2539		printed += fprintf(trace->output, " ... [");
2540		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2541		printed += 9;
2542		printed += fprintf(trace->output, "]: %s()", sc->name);
2543	}
2544
2545	printed++; /* the closing ')' */
2546
2547	if (alignment > printed)
2548		alignment -= printed;
2549	else
2550		alignment = 0;
2551
2552	fprintf(trace->output, ")%*s= ", alignment, " ");
2553
2554	if (sc->fmt == NULL) {
2555		if (ret < 0)
2556			goto errno_print;
2557signed_print:
2558		fprintf(trace->output, "%ld", ret);
2559	} else if (ret < 0) {
2560errno_print: {
2561		char bf[STRERR_BUFSIZE];
2562		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2563			   *e = errno_to_name(evsel, -ret);
2564
2565		fprintf(trace->output, "-1 %s (%s)", e, emsg);
2566	}
2567	} else if (ret == 0 && sc->fmt->timeout)
2568		fprintf(trace->output, "0 (Timeout)");
2569	else if (ttrace->ret_scnprintf) {
2570		char bf[1024];
2571		struct syscall_arg arg = {
2572			.val	= ret,
2573			.thread	= thread,
2574			.trace	= trace,
2575		};
2576		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2577		ttrace->ret_scnprintf = NULL;
2578		fprintf(trace->output, "%s", bf);
2579	} else if (sc->fmt->hexret)
2580		fprintf(trace->output, "%#lx", ret);
2581	else if (sc->fmt->errpid) {
2582		struct thread *child = machine__find_thread(trace->host, ret, ret);
2583
2584		if (child != NULL) {
2585			fprintf(trace->output, "%ld", ret);
2586			if (thread__comm_set(child))
2587				fprintf(trace->output, " (%s)", thread__comm_str(child));
2588			thread__put(child);
2589		}
2590	} else
2591		goto signed_print;
2592
2593	fputc('\n', trace->output);
2594
2595	/*
2596	 * We only consider an 'event' for the sake of --max-events a non-filtered
2597	 * sys_enter + sys_exit and other tracepoint events.
2598	 */
2599	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
2600		interrupted = true;
2601
2602	if (callchain_ret > 0)
2603		trace__fprintf_callchain(trace, sample);
2604	else if (callchain_ret < 0)
2605		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
2606out:
2607	ttrace->entry_pending = false;
2608	err = 0;
2609out_put:
2610	thread__put(thread);
2611	return err;
2612}
2613
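/*
 * probe:vfs_getname handler: remember the pathname so that the fd returned
 * by the in-flight open/openat can be associated with it, and if an entry
 * string is pending, splice the filename into it at the position recorded
 * for the pointer argument.
 */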
2614static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
2615			      union perf_event *event __maybe_unused,
2616			      struct perf_sample *sample)
2617{
2618	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2619	struct thread_trace *ttrace;
2620	size_t filename_len, entry_str_len, to_move;
2621	ssize_t remaining_space;
2622	char *pos;
2623	const char *filename = evsel__rawptr(evsel, sample, "pathname");
2624
2625	if (!thread)
2626		goto out;
2627
2628	ttrace = thread__priv(thread);
2629	if (!ttrace)
2630		goto out_put;
2631
2632	filename_len = strlen(filename);
2633	if (filename_len == 0)
2634		goto out_put;
2635
2636	if (ttrace->filename.namelen < filename_len) {
2637		char *f = realloc(ttrace->filename.name, filename_len + 1);
2638
2639		if (f == NULL)
2640			goto out_put;
2641
2642		ttrace->filename.namelen = filename_len;
2643		ttrace->filename.name = f;
2644	}
2645
2646	strcpy(ttrace->filename.name, filename);
2647	ttrace->filename.pending_open = true;
2648
2649	if (!ttrace->filename.ptr)
2650		goto out_put;
2651
2652	entry_str_len = strlen(ttrace->entry_str);
2653	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2654	if (remaining_space <= 0)
2655		goto out_put;
2656
2657	if (filename_len > (size_t)remaining_space) {
2658		filename += filename_len - remaining_space;
2659		filename_len = remaining_space;
2660	}
2661
2662	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2663	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2664	memmove(pos + filename_len, pos, to_move);
2665	memcpy(pos, filename, filename_len);
2666
2667	ttrace->filename.ptr = 0;
2668	ttrace->filename.entry_str_pos = 0;
2669out_put:
2670	thread__put(thread);
2671out:
2672	return 0;
2673}
2674
2675static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
2676				     union perf_event *event __maybe_unused,
2677				     struct perf_sample *sample)
2678{
2679	u64 runtime = evsel__intval(evsel, sample, "runtime");
2680	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2681	struct thread *thread = machine__findnew_thread(trace->host,
2682							sample->pid,
2683							sample->tid);
2684	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2685
2686	if (ttrace == NULL)
2687		goto out_dump;
2688
2689	ttrace->runtime_ms += runtime_ms;
2690	trace->runtime_ms += runtime_ms;
2691out_put:
2692	thread__put(thread);
2693	return 0;
2694
2695out_dump:
2696	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2697	       evsel->name,
2698	       evsel__strval(evsel, sample, "comm"),
2699	       (pid_t)evsel__intval(evsel, sample, "pid"),
2700	       runtime,
2701	       evsel__intval(evsel, sample, "vruntime"));
2702	goto out_put;
2703}
2704
2705static int bpf_output__printer(enum binary_printer_ops op,
2706			       unsigned int val, void *extra __maybe_unused, FILE *fp)
2707{
2708	unsigned char ch = (unsigned char)val;
2709
2710	switch (op) {
2711	case BINARY_PRINT_CHAR_DATA:
2712		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
2713	case BINARY_PRINT_DATA_BEGIN:
2714	case BINARY_PRINT_LINE_BEGIN:
2715	case BINARY_PRINT_ADDR:
2716	case BINARY_PRINT_NUM_DATA:
2717	case BINARY_PRINT_NUM_PAD:
2718	case BINARY_PRINT_SEP:
2719	case BINARY_PRINT_CHAR_PAD:
2720	case BINARY_PRINT_LINE_END:
2721	case BINARY_PRINT_DATA_END:
2722	default:
2723		break;
2724	}
2725
2726	return 0;
2727}
2728
2729static void bpf_output__fprintf(struct trace *trace,
2730				struct perf_sample *sample)
2731{
2732	binary__fprintf(sample->raw_data, sample->raw_size, 8,
2733			bpf_output__printer, NULL, trace->output);
2734	++trace->nr_events_printed;
2735}
2736
2737static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, struct perf_sample *sample,
2738				       struct thread *thread, void *augmented_args, int augmented_args_size)
2739{
2740	char bf[2048];
2741	size_t size = sizeof(bf);
2742	struct tep_format_field *field = evsel->tp_format->format.fields;
2743	struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel);
2744	size_t printed = 0;
2745	unsigned long val;
2746	u8 bit = 1;
2747	struct syscall_arg syscall_arg = {
2748		.augmented = {
2749			.size = augmented_args_size,
2750			.args = augmented_args,
2751		},
2752		.idx	= 0,
2753		.mask	= 0,
2754		.trace  = trace,
2755		.thread = thread,
2756		.show_string_prefix = trace->show_string_prefix,
2757	};
2758
2759	for (; field && arg; field = field->next, ++syscall_arg.idx, bit <<= 1, ++arg) {
2760		if (syscall_arg.mask & bit)
2761			continue;
2762
2763		syscall_arg.len = 0;
2764		syscall_arg.fmt = arg;
2765		if (field->flags & TEP_FIELD_IS_ARRAY) {
2766			int offset = field->offset;
2767
2768			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
2769				offset = format_field__intval(field, sample, evsel->needs_swap);
2770				syscall_arg.len = offset >> 16;
2771				offset &= 0xffff;
2772				if (tep_field_is_relative(field->flags))
2773					offset += field->offset + field->size;
2774			}
2775
2776			val = (uintptr_t)(sample->raw_data + offset);
2777		} else
2778			val = format_field__intval(field, sample, evsel->needs_swap);
2779		/*
2780		 * Some syscall args need some mask, most don't and
2781		 * return val untouched.
2782		 */
2783		val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);
2784
2785		/*
2786		 * Suppress this argument if its value is zero and
2787		 * we don't have a string associated in an
2788		 * strarray for it.
2789		 */
2790		if (val == 0 &&
2791		    !trace->show_zeros &&
2792		    !((arg->show_zero ||
2793		       arg->scnprintf == SCA_STRARRAY ||
2794		       arg->scnprintf == SCA_STRARRAYS) &&
2795		      arg->parm))
2796			continue;
2797
2798		printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
2799
2800		if (trace->show_arg_names)
2801			printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
2802
2803		printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
2804	}
2805
2806	return printed + fprintf(trace->output, "%s", bf);
2807}
2808
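/*
 * Handler for the other tracepoints and bpf-output events: prints a
 * strace-like line and enforces per-event --max-events quotas by disabling
 * and closing the evsel once its limit is reached.
 */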
2809static int trace__event_handler(struct trace *trace, struct evsel *evsel,
2810				union perf_event *event __maybe_unused,
2811				struct perf_sample *sample)
2812{
2813	struct thread *thread;
2814	int callchain_ret = 0;
2815	/*
2816	 * Check if we called perf_evsel__disable(evsel) due to, for instance,
2817	 * this event's max_events having been hit and this is an entry coming
2818	 * from the ring buffer that we should discard, since the max events
2819	 * have already been considered/printed.
2820	 */
2821	if (evsel->disabled)
2822		return 0;
2823
2824	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2825
2826	if (sample->callchain) {
2827		struct callchain_cursor *cursor = get_tls_callchain_cursor();
2828
2829		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
2830		if (callchain_ret == 0) {
2831			if (cursor->nr < trace->min_stack)
2832				goto out;
2833			callchain_ret = 1;
2834		}
2835	}
2836
2837	trace__printf_interrupted_entry(trace);
2838	trace__fprintf_tstamp(trace, sample->time, trace->output);
2839
2840	if (trace->trace_syscalls && trace->show_duration)
2841		fprintf(trace->output, "(         ): ");
2842
2843	if (thread)
2844		trace__fprintf_comm_tid(trace, thread, trace->output);
2845
2846	if (evsel == trace->syscalls.events.bpf_output) {
2847		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
2848		struct syscall *sc = trace__syscall_info(trace, evsel, id);
2849
2850		if (sc) {
2851			fprintf(trace->output, "%s(", sc->name);
2852			trace__fprintf_sys_enter(trace, evsel, sample);
2853			fputc(')', trace->output);
2854			goto newline;
2855		}
2856
2857		/*
2858		 * XXX: Not having the associated syscall info or not finding/adding
2859		 * 	the thread should never happen, but if it does...
2860		 * 	fall thru and print it as a bpf_output event.
2861		 */
2862	}
2863
2864	fprintf(trace->output, "%s(", evsel->name);
2865
2866	if (evsel__is_bpf_output(evsel)) {
2867		bpf_output__fprintf(trace, sample);
2868	} else if (evsel->tp_format) {
2869		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2870		    trace__fprintf_sys_enter(trace, evsel, sample)) {
2871			if (trace->libtraceevent_print) {
2872				event_format__fprintf(evsel->tp_format, sample->cpu,
2873						      sample->raw_data, sample->raw_size,
2874						      trace->output);
2875			} else {
2876				trace__fprintf_tp_fields(trace, evsel, sample, thread, NULL, 0);
2877			}
2878		}
2879	}
2880
2881newline:
2882	fprintf(trace->output, ")\n");
2883
2884	if (callchain_ret > 0)
2885		trace__fprintf_callchain(trace, sample);
2886	else if (callchain_ret < 0)
2887		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
2888
2889	++trace->nr_events_printed;
2890
2891	if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
2892		evsel__disable(evsel);
2893		evsel__close(evsel);
2894	}
2895out:
2896	thread__put(thread);
2897	return 0;
2898}
2899
2900static void print_location(FILE *f, struct perf_sample *sample,
2901			   struct addr_location *al,
2902			   bool print_dso, bool print_sym)
2903{
2904
2905	if ((verbose > 0 || print_dso) && al->map)
2906		fprintf(f, "%s@", map__dso(al->map)->long_name);
2907
2908	if ((verbose > 0 || print_sym) && al->sym)
2909		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2910			al->addr - al->sym->start);
2911	else if (al->map)
2912		fprintf(f, "0x%" PRIx64, al->addr);
2913	else
2914		fprintf(f, "0x%" PRIx64, sample->addr);
2915}
2916
2917static int trace__pgfault(struct trace *trace,
2918			  struct evsel *evsel,
2919			  union perf_event *event __maybe_unused,
2920			  struct perf_sample *sample)
2921{
2922	struct thread *thread;
2923	struct addr_location al;
2924	char map_type = 'd';
2925	struct thread_trace *ttrace;
2926	int err = -1;
2927	int callchain_ret = 0;
2928
2929	addr_location__init(&al);
2930	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2931
2932	if (sample->callchain) {
2933		struct callchain_cursor *cursor = get_tls_callchain_cursor();
2934
2935		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
2936		if (callchain_ret == 0) {
2937			if (cursor->nr < trace->min_stack)
2938				goto out_put;
2939			callchain_ret = 1;
2940		}
2941	}
2942
2943	ttrace = thread__trace(thread, trace->output);
2944	if (ttrace == NULL)
2945		goto out_put;
2946
2947	if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2948		ttrace->pfmaj++;
2949	else
2950		ttrace->pfmin++;
2951
2952	if (trace->summary_only)
2953		goto out;
2954
2955	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2956
2957	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2958
2959	fprintf(trace->output, "%sfault [",
2960		evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2961		"maj" : "min");
2962
2963	print_location(trace->output, sample, &al, false, true);
2964
2965	fprintf(trace->output, "] => ");
2966
2967	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2968
2969	if (!al.map) {
2970		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2971
2972		if (al.map)
2973			map_type = 'x';
2974		else
2975			map_type = '?';
2976	}
2977
2978	print_location(trace->output, sample, &al, true, false);
2979
2980	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2981
2982	if (callchain_ret > 0)
2983		trace__fprintf_callchain(trace, sample);
2984	else if (callchain_ret < 0)
2985		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
2986
2987	++trace->nr_events_printed;
2988out:
2989	err = 0;
2990out_put:
2991	thread__put(thread);
2992	addr_location__exit(&al);
2993	return err;
2994}
2995
2996static void trace__set_base_time(struct trace *trace,
2997				 struct evsel *evsel,
2998				 struct perf_sample *sample)
2999{
3000	/*
3001	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
3002	 * and don't use sample->time unconditionally, we may end up having
3003	 * some other event in the future without PERF_SAMPLE_TIME for good
3004	 * reason, i.e. we may not be interested in its timestamps, just in
3005	 * it taking place, picking some piece of information when it
3006	 * appears in our event stream (vfs_getname comes to mind).
3007	 */
3008	if (trace->base_time == 0 && !trace->full_time &&
3009	    (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
3010		trace->base_time = sample->time;
3011}
3012
3013static int trace__process_sample(struct perf_tool *tool,
3014				 union perf_event *event,
3015				 struct perf_sample *sample,
3016				 struct evsel *evsel,
3017				 struct machine *machine __maybe_unused)
3018{
3019	struct trace *trace = container_of(tool, struct trace, tool);
3020	struct thread *thread;
3021	int err = 0;
3022
3023	tracepoint_handler handler = evsel->handler;
3024
3025	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3026	if (thread && thread__is_filtered(thread))
3027		goto out;
3028
3029	trace__set_base_time(trace, evsel, sample);
3030
3031	if (handler) {
3032		++trace->nr_events;
3033		handler(trace, evsel, event, sample);
3034	}
3035out:
3036	thread__put(thread);
3037	return err;
3038}
3039
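/*
 * 'perf trace record': build a 'perf record' command line with the
 * raw_syscalls (or, on older kernels, syscalls) tracepoints, the requested
 * page fault events and a --filter expression for our own pid, presumably
 * so perf doesn't trace itself, then hand off to cmd_record().
 */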
3040static int trace__record(struct trace *trace, int argc, const char **argv)
3041{
3042	unsigned int rec_argc, i, j;
3043	const char **rec_argv;
3044	const char * const record_args[] = {
3045		"record",
3046		"-R",
3047		"-m", "1024",
3048		"-c", "1",
3049	};
3050	pid_t pid = getpid();
3051	char *filter = asprintf__tp_filter_pids(1, &pid);
3052	const char * const sc_args[] = { "-e", };
3053	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
3054	const char * const majpf_args[] = { "-e", "major-faults" };
3055	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
3056	const char * const minpf_args[] = { "-e", "minor-faults" };
3057	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
3058	int err = -1;
3059
3060	/* +3 is for the event string below plus the "--filter" option and its expression */
3061	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 3 +
3062		majpf_args_nr + minpf_args_nr + argc;
3063	rec_argv = calloc(rec_argc + 1, sizeof(char *));
3064
3065	if (rec_argv == NULL || filter == NULL)
3066		goto out_free;
3067
3068	j = 0;
3069	for (i = 0; i < ARRAY_SIZE(record_args); i++)
3070		rec_argv[j++] = record_args[i];
3071
3072	if (trace->trace_syscalls) {
3073		for (i = 0; i < sc_args_nr; i++)
3074			rec_argv[j++] = sc_args[i];
3075
3076		/* event string may be different for older kernels - e.g., RHEL6 */
3077		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
3078			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
3079		else if (is_valid_tracepoint("syscalls:sys_enter"))
3080			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
3081		else {
3082			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
3083			goto out_free;
3084		}
3085	}
3086
3087	rec_argv[j++] = "--filter";
3088	rec_argv[j++] = filter;
3089
3090	if (trace->trace_pgfaults & TRACE_PFMAJ)
3091		for (i = 0; i < majpf_args_nr; i++)
3092			rec_argv[j++] = majpf_args[i];
3093
3094	if (trace->trace_pgfaults & TRACE_PFMIN)
3095		for (i = 0; i < minpf_args_nr; i++)
3096			rec_argv[j++] = minpf_args[i];
3097
3098	for (i = 0; i < (unsigned int)argc; i++)
3099		rec_argv[j++] = argv[i];
3100
3101	err = cmd_record(j, rec_argv);
3102out_free:
3103	free(filter);
3104	free(rec_argv);
3105	return err;
3106}
3107
3108static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
3109
3110static bool evlist__add_vfs_getname(struct evlist *evlist)
3111{
3112	bool found = false;
3113	struct evsel *evsel, *tmp;
3114	struct parse_events_error err;
3115	int ret;
3116
3117	parse_events_error__init(&err);
3118	ret = parse_events(evlist, "probe:vfs_getname*", &err);
3119	parse_events_error__exit(&err);
3120	if (ret)
3121		return false;
3122
3123	evlist__for_each_entry_safe(evlist, evsel, tmp) {
3124		if (!strstarts(evsel__name(evsel), "probe:vfs_getname"))
3125			continue;
3126
3127		if (evsel__field(evsel, "pathname")) {
3128			evsel->handler = trace__vfs_getname;
3129			found = true;
3130			continue;
3131		}
3132
3133		list_del_init(&evsel->core.node);
3134		evsel->evlist = NULL;
3135		evsel__delete(evsel);
3136	}
3137
3138	return found;
3139}
3140
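/*
 * Software page fault event, major or minor depending on 'config', sampling
 * every fault (period 1), with mmap_data set so data mmaps are captured.
 */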
3141static struct evsel *evsel__new_pgfault(u64 config)
3142{
3143	struct evsel *evsel;
3144	struct perf_event_attr attr = {
3145		.type = PERF_TYPE_SOFTWARE,
3146		.mmap_data = 1,
3147	};
3148
3149	attr.config = config;
3150	attr.sample_period = 1;
3151
3152	event_attr_init(&attr);
3153
3154	evsel = evsel__new(&attr);
3155	if (evsel)
3156		evsel->handler = trace__pgfault;
3157
3158	return evsel;
3159}
3160
3161static void evlist__free_syscall_tp_fields(struct evlist *evlist)
3162{
3163	struct evsel *evsel;
3164
3165	evlist__for_each_entry(evlist, evsel) {
3166		evsel_trace__delete(evsel->priv);
3167		evsel->priv = NULL;
3168	}
3169}
3170
3171static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
3172{
3173	const u32 type = event->header.type;
3174	struct evsel *evsel;
3175
3176	if (type != PERF_RECORD_SAMPLE) {
3177		trace__process_event(trace, trace->host, event, sample);
3178		return;
3179	}
3180
3181	evsel = evlist__id2evsel(trace->evlist, sample->id);
3182	if (evsel == NULL) {
3183		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
3184		return;
3185	}
3186
3187	if (evswitch__discard(&trace->evswitch, evsel))
3188		return;
3189
3190	trace__set_base_time(trace, evsel, sample);
3191
3192	if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
3193	    sample->raw_data == NULL) {
3194		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
3195		       evsel__name(evsel), sample->tid,
3196		       sample->cpu, sample->raw_size);
3197	} else {
3198		tracepoint_handler handler = evsel->handler;
3199		handler(trace, evsel, event, sample);
3200	}
3201
3202	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
3203		interrupted = true;
3204}
3205
3206static int trace__add_syscall_newtp(struct trace *trace)
3207{
3208	int ret = -1;
3209	struct evlist *evlist = trace->evlist;
3210	struct evsel *sys_enter, *sys_exit;
3211
3212	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
3213	if (sys_enter == NULL)
3214		goto out;
3215
3216	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
3217		goto out_delete_sys_enter;
3218
3219	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
3220	if (sys_exit == NULL)
3221		goto out_delete_sys_enter;
3222
3223	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
3224		goto out_delete_sys_exit;
3225
3226	evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
3227	evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
3228
3229	evlist__add(evlist, sys_enter);
3230	evlist__add(evlist, sys_exit);
3231
3232	if (callchain_param.enabled && !trace->kernel_syscallchains) {
3233		/*
3234		 * We're interested only in the user space callchain
3235		 * leading to the syscall; allow overriding that for
3236		 * debugging reasons using --kernel_syscall_callchains.
3237		 */
3238		sys_exit->core.attr.exclude_callchain_kernel = 1;
3239	}
3240
3241	trace->syscalls.events.sys_enter = sys_enter;
3242	trace->syscalls.events.sys_exit  = sys_exit;
3243
3244	ret = 0;
3245out:
3246	return ret;
3247
3248out_delete_sys_exit:
3249	evsel__delete_priv(sys_exit);
3250out_delete_sys_enter:
3251	evsel__delete_priv(sys_enter);
3252	goto out;
3253}
3254
3255static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
3256{
3257	int err = -1;
3258	struct evsel *sys_exit;
3259	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
3260						trace->ev_qualifier_ids.nr,
3261						trace->ev_qualifier_ids.entries);
3262
3263	if (filter == NULL)
3264		goto out_enomem;
3265
3266	if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) {
3267		sys_exit = trace->syscalls.events.sys_exit;
3268		err = evsel__append_tp_filter(sys_exit, filter);
3269	}
3270
3271	free(filter);
3272out:
3273	return err;
3274out_enomem:
3275	errno = ENOMEM;
3276	goto out;
3277}
3278
3279#ifdef HAVE_BPF_SKEL
3280static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
3281{
3282	struct bpf_program *pos, *prog = NULL;
3283	const char *sec_name;
3284
3285	if (trace->skel->obj == NULL)
3286		return NULL;
3287
3288	bpf_object__for_each_program(pos, trace->skel->obj) {
3289		sec_name = bpf_program__section_name(pos);
3290		if (sec_name && !strcmp(sec_name, name)) {
3291			prog = pos;
3292			break;
3293		}
3294	}
3295
3296	return prog;
3297}
3298
3299static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
3300							const char *prog_name, const char *type)
3301{
3302	struct bpf_program *prog;
3303
3304	if (prog_name == NULL) {
3305		char default_prog_name[256];
3306		scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name);
3307		prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3308		if (prog != NULL)
3309			goto out_found;
3310		if (sc->fmt && sc->fmt->alias) {
3311			scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
3312			prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3313			if (prog != NULL)
3314				goto out_found;
3315		}
3316		goto out_unaugmented;
3317	}
3318
3319	prog = trace__find_bpf_program_by_title(trace, prog_name);
3320
3321	if (prog != NULL) {
3322out_found:
3323		return prog;
3324	}
3325
3326	pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
3327		 prog_name, type, sc->name);
3328out_unaugmented:
3329	return trace->skel->progs.syscall_unaugmented;
3330}
3331
3332static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
3333{
3334	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3335
3336	if (sc == NULL)
3337		return;
3338
3339	sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3340	sc->bpf_prog.sys_exit  = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit  : NULL,  "exit");
3341}
3342
3343static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
3344{
3345	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3346	return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3347}
3348
3349static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
3350{
3351	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3352	return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3353}
3354
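/*
 * Look for another syscall whose sys_enter augmenter can be reused for
 * 'sc': the argument lists must pair up with pointers in the same positions
 * and matching types, and the candidate must not copy pointers beyond what
 * the two syscalls have in common.
 */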
3355static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
3356{
3357	struct tep_format_field *field, *candidate_field;
3358	int id;
3359
3360	/*
3361	 * We're only interested in syscalls that have a pointer:
3362	 */
3363	for (field = sc->args; field; field = field->next) {
3364		if (field->flags & TEP_FIELD_IS_POINTER)
3365			goto try_to_find_pair;
3366	}
3367
3368	return NULL;
3369
3370try_to_find_pair:
3371	for (id = 0; id < trace->sctbl->syscalls.nr_entries; ++id) {
3372		struct syscall *pair = trace__syscall_info(trace, NULL, id);
3373		struct bpf_program *pair_prog;
3374		bool is_candidate = false;
3375
3376		if (pair == NULL || pair == sc ||
3377		    pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
3378			continue;
3379
3380		for (field = sc->args, candidate_field = pair->args;
3381		     field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
3382			bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
3383			     candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;
3384
3385			if (is_pointer) {
3386			       if (!candidate_is_pointer) {
3387					// The candidate just doesn't copy our pointer arg, but might copy other pointers we want.
3388					continue;
3389			       }
3390			} else {
3391				if (candidate_is_pointer) {
3392					// The candidate might copy a pointer we don't have, skip it.
3393					goto next_candidate;
3394				}
3395				continue;
3396			}
3397
3398			if (strcmp(field->type, candidate_field->type))
3399				goto next_candidate;
3400
3401			/*
3402			 * This is limited in the BPF program but sys_write
3403			 * uses "const char *" for its "buf" arg so we need to
3404			 * use some heuristic that is kinda future proof...
3405			 */
3406			if (strcmp(field->type, "const char *") == 0 &&
3407			    !(strstr(field->name, "name") ||
3408			      strstr(field->name, "path") ||
3409			      strstr(field->name, "file") ||
3410			      strstr(field->name, "root") ||
3411			      strstr(field->name, "description")))
3412				goto next_candidate;
3413
3414			is_candidate = true;
3415		}
3416
3417		if (!is_candidate)
3418			goto next_candidate;
3419
3420		/*
3421		 * Check if the tentative pair syscall augmenter has more pointers; if it
3422		 * has, it may be collecting those too, and then we can't use it, as it
3423		 * would collect more than what is common to the two syscalls.
3424		 */
3425		if (candidate_field) {
3426			for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
3427				if (candidate_field->flags & TEP_FIELD_IS_POINTER)
3428					goto next_candidate;
3429		}
3430
3431		pair_prog = pair->bpf_prog.sys_enter;
3432		/*
3433		 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
3434		 * have been searched for, so search it here and if it returns the
3435		 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
3436		 * program for a filtered syscall on a non-filtered one.
3437		 *
3438		 * For instance, we have "!syscalls:sys_enter_renameat" and that is
3439		 * useful for "renameat2".
3440		 */
3441		if (pair_prog == NULL) {
3442			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3443			if (pair_prog == trace->skel->progs.syscall_unaugmented)
3444				goto next_candidate;
3445		}
3446
3447		pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
3448		return pair_prog;
3449	next_candidate:
3450		continue;
3451	}
3452
3453	return NULL;
3454}
3455
3456static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
3457{
3458	int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
3459	int map_exit_fd  = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
3460	int err = 0, key;
3461
3462	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
3463		int prog_fd;
3464
3465		if (!trace__syscall_enabled(trace, key))
3466			continue;
3467
3468		trace__init_syscall_bpf_progs(trace, key);
3469
3470		// It'll get at least the "syscall_unaugmented" BPF program
3471		prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
3472		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3473		if (err)
3474			break;
3475		prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
3476		err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
3477		if (err)
3478			break;
3479	}
3480
3481	/*
3482	 * Now lets do a second pass looking for enabled syscalls without
3483	 * an augmenter that have a signature that is a superset of another
3484	 * syscall with an augmenter so that we can auto-reuse it.
3485	 *
3486	 * I.e. if we have an augmenter for the "open" syscall that has
3487	 * this signature:
3488	 *
3489	 *   int open(const char *pathname, int flags, mode_t mode);
3490	 *
3491	 * I.e. that will collect just the first string argument, then we
3492	 * can reuse it for the 'creat' syscall, that has this signature:
3493	 *
3494	 *   int creat(const char *pathname, mode_t mode);
3495	 *
3496	 * and for:
3497	 *
3498	 *   int stat(const char *pathname, struct stat *statbuf);
3499	 *   int lstat(const char *pathname, struct stat *statbuf);
3500	 *
3501	 * because the 'open' augmenter will collect the first arg as a string
3502	 * and leave all the other args alone, which already helps with
3503	 * beautifying the pathname arg of 'stat' and 'lstat'.
3504	 *
3505	 * Then, in time, when 'stat' gets an augmenter that collects both the
3506	 * first and second args (this one via the raw_syscalls:sys_exit prog
3507	 * array tail call), that one will be used instead.
3508	 */
3509	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
3510		struct syscall *sc = trace__syscall_info(trace, NULL, key);
3511		struct bpf_program *pair_prog;
3512		int prog_fd;
3513
3514		if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
3515			continue;
3516
3517		/*
3518		 * For now we're just reusing the sys_enter prog, and if it
3519		 * already has an augmenter, we don't need to find one.
3520		 */
3521		if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
3522			continue;
3523
3524		/*
3525		 * Look at all the other syscalls for one that has a signature
3526		 * that is close enough that we can share:
3527		 */
3528		pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
3529		if (pair_prog == NULL)
3530			continue;
3531
3532		sc->bpf_prog.sys_enter = pair_prog;
3533
3534		/*
3535		 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
3536		 * with the fd for the program we're reusing:
3537		 */
3538		prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
3539		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3540		if (err)
3541			break;
3542	}
3543
3544	return err;
3545}
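
/*
 * Editorial sketch, not part of the original source: the function above
 * fills BPF_MAP_TYPE_PROG_ARRAY maps so the sys_enter/sys_exit dispatchers
 * can bpf_tail_call() into a per-syscall augmenter. The helper name below
 * is hypothetical; the libbpf calls are the same ones used above.
 */
#if 0 /* illustrative only */
static int example__set_tail_call_prog(int prog_array_fd, struct bpf_program *prog, int syscall_id)
{
	int prog_fd = bpf_program__fd(prog);

	/* On the BPF side: bpf_tail_call(ctx, &map, syscall_id) */
	return bpf_map_update_elem(prog_array_fd, &syscall_id, &prog_fd, BPF_ANY);
}
#endif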
3546#endif // HAVE_BPF_SKEL
3547
3548static int trace__set_ev_qualifier_filter(struct trace *trace)
3549{
3550	if (trace->syscalls.events.sys_enter)
3551		return trace__set_ev_qualifier_tp_filter(trace);
3552	return 0;
3553}
3554
3555static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
3556				    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
3557{
3558	int err = 0;
3559#ifdef HAVE_LIBBPF_SUPPORT
3560	bool value = true;
3561	int map_fd = bpf_map__fd(map);
3562	size_t i;
3563
3564	for (i = 0; i < npids; ++i) {
3565		err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
3566		if (err)
3567			break;
3568	}
3569#endif
3570	return err;
3571}
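
/*
 * Editorial note, not in the original source: each filtered pid is stored
 * as a pid -> true entry in what is, from context, presumably the BPF
 * skeleton's pid filtering hash map, letting the BPF programs drop events
 * from those pids before they ever reach the ring buffer.
 */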
3572
3573static int trace__set_filter_loop_pids(struct trace *trace)
3574{
3575	unsigned int nr = 1, err;
3576	pid_t pids[32] = {
3577		getpid(),
3578	};
3579	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
3580
3581	while (thread && nr < ARRAY_SIZE(pids)) {
3582		struct thread *parent = machine__find_thread(trace->host,
3583							     thread__ppid(thread),
3584							     thread__ppid(thread));
3585
3586		if (parent == NULL)
3587			break;
3588
3589		if (!strcmp(thread__comm_str(parent), "sshd") ||
3590		    strstarts(thread__comm_str(parent), "gnome-terminal")) {
3591			pids[nr++] = thread__tid(parent);
3592			break;
3593		}
3594		thread = parent;
3595	}
3596
3597	err = evlist__append_tp_filter_pids(trace->evlist, nr, pids);
3598	if (!err && trace->filter_pids.map)
3599		err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
3600
3601	return err;
3602}
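
/*
 * Editorial note, not in the original source: the parent walk above breaks
 * feedback loops. Besides perf's own pid, it filters the pid of an "sshd"
 * or "gnome-terminal" ancestor, since those processes consume perf's
 * terminal output and would otherwise generate a syscall event for every
 * line perf prints, which in turn produces more output, and so on.
 */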
3603
3604static int trace__set_filter_pids(struct trace *trace)
3605{
3606	int err = 0;
3607	/*
3608	 * Better not use !target__has_task() here because we need to cover the
3609	 * case where no threads were specified in the command line, but a
3610	 * workload was, and in that case we will fill in the thread_map when
3611	 * we fork the workload in evlist__prepare_workload.
3612	 */
3613	if (trace->filter_pids.nr > 0) {
3614		err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
3615						    trace->filter_pids.entries);
3616		if (!err && trace->filter_pids.map) {
3617			err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
3618						       trace->filter_pids.entries);
3619		}
3620	} else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
3621		err = trace__set_filter_loop_pids(trace);
3622	}
3623
3624	return err;
3625}
3626
3627static int __trace__deliver_event(struct trace *trace, union perf_event *event)
3628{
3629	struct evlist *evlist = trace->evlist;
3630	struct perf_sample sample;
3631	int err = evlist__parse_sample(evlist, event, &sample);
3632
3633	if (err)
3634		fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
3635	else
3636		trace__handle_event(trace, event, &sample);
3637
3638	return 0;
3639}
3640
3641static int __trace__flush_events(struct trace *trace)
3642{
3643	u64 first = ordered_events__first_time(&trace->oe.data);
3644	u64 flush = trace->oe.last - NSEC_PER_SEC;
3645
3646	/* Is there something to flush? */
3647	if (first && first < flush)
3648		return ordered_events__flush_time(&trace->oe.data, flush);
3649
3650	return 0;
3651}
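
/*
 * Editorial example, not in the original source: with --sort-events, if
 * the newest queued sample has trace->oe.last = 12.3s, everything older
 * than 11.3s (last - NSEC_PER_SEC) is flushed, giving a one second
 * reordering window for events arriving out of order across mmaps.
 */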
3652
3653static int trace__flush_events(struct trace *trace)
3654{
3655	return !trace->sort_events ? 0 : __trace__flush_events(trace);
3656}
3657
3658static int trace__deliver_event(struct trace *trace, union perf_event *event)
3659{
3660	int err;
3661
3662	if (!trace->sort_events)
3663		return __trace__deliver_event(trace, event);
3664
3665	err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
3666	if (err && err != -1)
3667		return err;
3668
3669	err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL);
3670	if (err)
3671		return err;
3672
3673	return trace__flush_events(trace);
3674}
3675
3676static int ordered_events__deliver_event(struct ordered_events *oe,
3677					 struct ordered_event *event)
3678{
3679	struct trace *trace = container_of(oe, struct trace, oe.data);
3680
3681	return __trace__deliver_event(trace, event->event);
3682}
3683
3684static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg)
3685{
3686	struct tep_format_field *field;
3687	struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel);
3688
3689	if (evsel->tp_format == NULL || fmt == NULL)
3690		return NULL;
3691
3692	for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt)
3693		if (strcmp(field->name, arg) == 0)
3694			return fmt;
3695
3696	return NULL;
3697}
3698
3699static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel *evsel)
3700{
3701	char *tok, *left = evsel->filter, *new_filter = evsel->filter;
3702
3703	while ((tok = strpbrk(left, "=<>!")) != NULL) {
3704		char *right = tok + 1, *right_end;
3705
3706		if (*right == '=')
3707			++right;
3708
3709		while (isspace(*right))
3710			++right;
3711
3712		if (*right == '\0')
3713			break;
3714
3715		while (!isalpha(*left))
3716			if (++left == tok) {
3717				/*
3718				 * Bail out: we can't find the name of the argument being used
3719				 * in the filter, so let the filter be set as is; it will fail later.
3720				 */
3721				return 0;
3722			}
3723
3724		right_end = right + 1;
3725		while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
3726			++right_end;
3727
3728		if (isalpha(*right)) {
3729			struct syscall_arg_fmt *fmt;
3730			int left_size = tok - left,
3731			    right_size = right_end - right;
3732			char arg[128];
3733
3734			while (isspace(left[left_size - 1]))
3735				--left_size;
3736
3737			scnprintf(arg, sizeof(arg), "%.*s", left_size, left);
3738
3739			fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg);
3740			if (fmt == NULL) {
3741				pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
3742				       arg, evsel->name, evsel->filter);
3743				return -1;
3744			}
3745
3746			pr_debug2("trying to expand \"%s\" \"%.*s\" \"%.*s\" -> ",
3747				 arg, (int)(right - tok), tok, right_size, right);
3748
3749			if (fmt->strtoul) {
3750				u64 val;
3751				struct syscall_arg syscall_arg = {
3752					.parm = fmt->parm,
3753				};
3754
3755				if (fmt->strtoul(right, right_size, &syscall_arg, &val)) {
3756					char *n, expansion[19];
3757					int expansion_length = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val);
3758					int expansion_offset = right - new_filter;
3759
3760					pr_debug("%s", expansion);
3761
3762					if (asprintf(&n, "%.*s%s%s", expansion_offset, new_filter, expansion, right_end) < 0) {
3763						pr_debug(" out of memory!\n");
3764						free(new_filter);
3765						return -1;
3766					}
3767					if (new_filter != evsel->filter)
3768						free(new_filter);
3769					left = n + expansion_offset + expansion_length;
3770					new_filter = n;
3771				} else {
3772					pr_err("\"%.*s\" not found for \"%s\" in \"%s\", can't set filter \"%s\"\n",
3773					       right_size, right, arg, evsel->name, evsel->filter);
3774					return -1;
3775				}
3776			} else {
3777				pr_err("No resolver (strtoul) for \"%s\" in \"%s\", can't set filter \"%s\"\n",
3778				       arg, evsel->name, evsel->filter);
3779				return -1;
3780			}
3781
3782			pr_debug("\n");
3783		} else {
3784			left = right_end;
3785		}
3786	}
3787
3788	if (new_filter != evsel->filter) {
3789		pr_debug("New filter for %s: %s\n", evsel->name, new_filter);
3790		evsel__set_filter(evsel, new_filter);
3791		free(new_filter);
3792	}
3793
3794	return 0;
3795}
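
/*
 * Editorial example, not in the original source: this expansion lets a
 * tracefs filter use symbolic names the kernel doesn't resolve, e.g.
 * something like:
 *
 *   # perf trace -e syscalls:sys_enter_openat --filter 'flags==O_CLOEXEC'
 *
 * has its right hand side resolved via the arg's strtoul method, so the
 * filter handed to the kernel reads "flags==0x80000".
 */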
3796
3797static int trace__expand_filters(struct trace *trace, struct evsel **err_evsel)
3798{
3799	struct evlist *evlist = trace->evlist;
3800	struct evsel *evsel;
3801
3802	evlist__for_each_entry(evlist, evsel) {
3803		if (evsel->filter == NULL)
3804			continue;
3805
3806		if (trace__expand_filter(trace, evsel)) {
3807			*err_evsel = evsel;
3808			return -1;
3809		}
3810	}
3811
3812	return 0;
3813}
3814
3815static int trace__run(struct trace *trace, int argc, const char **argv)
3816{
3817	struct evlist *evlist = trace->evlist;
3818	struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
3819	int err = -1, i;
3820	unsigned long before;
3821	const bool forks = argc > 0;
3822	bool draining = false;
3823
3824	trace->live = true;
3825
3826	if (!trace->raw_augmented_syscalls) {
3827		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
3828			goto out_error_raw_syscalls;
3829
3830		if (trace->trace_syscalls)
3831			trace->vfs_getname = evlist__add_vfs_getname(evlist);
3832	}
3833
3834	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
3835		pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
3836		if (pgfault_maj == NULL)
3837			goto out_error_mem;
3838		evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
3839		evlist__add(evlist, pgfault_maj);
3840	}
3841
3842	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
3843		pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
3844		if (pgfault_min == NULL)
3845			goto out_error_mem;
3846		evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
3847		evlist__add(evlist, pgfault_min);
3848	}
3849
3850	/* Enable ignoring missing threads when -u/-p option is defined. */
3851	trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;
3852
3853	if (trace->sched &&
3854	    evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
3855		goto out_error_sched_stat_runtime;
3856	/*
3857	 * If a global cgroup was set, apply it to all the events without an
3858	 * explicit cgroup. I.e.:
3859	 *
3860	 * 	trace -G A -e sched:*switch
3861	 *
3862	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
3863	 * _and_ sched:sched_switch to the 'A' cgroup, while:
3864	 *
3865	 * trace -e sched:*switch -G A
3866	 *
3867	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
3868	 * other events (raw_syscalls:sys_{enter,exit}, etc.) are left "without"
3869	 * a cgroup (on the root cgroup, sys wide, etc.).
3870	 *
3871	 * Multiple cgroups:
3872	 *
3873	 * trace -G A -e sched:*switch -G B
3874	 *
3875	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
3876	 * to the 'B' cgroup.
3877	 *
3878	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
3879	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
3880	 */
3881	if (trace->cgroup)
3882		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
3883
3884	err = evlist__create_maps(evlist, &trace->opts.target);
3885	if (err < 0) {
3886		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
3887		goto out_delete_evlist;
3888	}
3889
3890	err = trace__symbols_init(trace, evlist);
3891	if (err < 0) {
3892		fprintf(trace->output, "Problems initializing symbol libraries!\n");
3893		goto out_delete_evlist;
3894	}
3895
3896	evlist__config(evlist, &trace->opts, &callchain_param);
3897
3898	if (forks) {
3899		err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL);
3900		if (err < 0) {
3901			fprintf(trace->output, "Couldn't run the workload!\n");
3902			goto out_delete_evlist;
3903		}
3904		workload_pid = evlist->workload.pid;
3905	}
3906
3907	err = evlist__open(evlist);
3908	if (err < 0)
3909		goto out_error_open;
3910#ifdef HAVE_BPF_SKEL
3911	if (trace->syscalls.events.bpf_output) {
3912		struct perf_cpu cpu;
3913
3914		/*
3915		 * Set up the __augmented_syscalls__ BPF map to hold for each
3916		 * CPU the bpf-output event's file descriptor.
3917		 */
3918		perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
3919			bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
3920					&cpu.cpu, sizeof(int),
3921					xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
3922						       cpu.cpu, 0),
3923					sizeof(__u32), BPF_ANY);
3924		}
3925	}
3926#endif
3927	err = trace__set_filter_pids(trace);
3928	if (err < 0)
3929		goto out_error_mem;
3930
3931#ifdef HAVE_BPF_SKEL
3932	if (trace->skel && trace->skel->progs.sys_enter)
3933		trace__init_syscalls_bpf_prog_array_maps(trace);
3934#endif
3935
3936	if (trace->ev_qualifier_ids.nr > 0) {
3937		err = trace__set_ev_qualifier_filter(trace);
3938		if (err < 0)
3939			goto out_errno;
3940
3941		if (trace->syscalls.events.sys_exit) {
3942			pr_debug("event qualifier tracepoint filter: %s\n",
3943				 trace->syscalls.events.sys_exit->filter);
3944		}
3945	}
3946
3947	/*
3948	 * If the "close" syscall is not traced, then we will not have the
3949	 * opportunity, in syscall_arg__scnprintf_close_fd(), to invalidate the
3950	 * fd->pathname table, and we'd end up showing the last value set by
3951	 * syscalls opening a pathname and associating it with a descriptor, or
3952	 * reading it from /proc/pid/fd/ in cases where that doesn't make
3953	 * sense.
3954	 *
3955	 * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
3956	 * not in use.
3957	 */
3958	trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));
3959
3960	err = trace__expand_filters(trace, &evsel);
3961	if (err)
3962		goto out_delete_evlist;
3963	err = evlist__apply_filters(evlist, &evsel);
3964	if (err < 0)
3965		goto out_error_apply_filters;
3966
3967	err = evlist__mmap(evlist, trace->opts.mmap_pages);
3968	if (err < 0)
3969		goto out_error_mmap;
3970
3971	if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
3972		evlist__enable(evlist);
3973
3974	if (forks)
3975		evlist__start_workload(evlist);
3976
3977	if (trace->opts.target.initial_delay) {
3978		usleep(trace->opts.target.initial_delay * 1000);
3979		evlist__enable(evlist);
3980	}
3981
3982	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
3983		perf_thread_map__nr(evlist->core.threads) > 1 ||
3984		evlist__first(evlist)->core.attr.inherit;
3985
3986	/*
3987	 * Now that we already used evsel->core.attr to ask the kernel to setup the
3988	 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
3989	 * trace__resolve_callchain(), allowing per-event max-stack settings
3990	 * to override an explicitly set --max-stack global setting.
3991	 */
3992	evlist__for_each_entry(evlist, evsel) {
3993		if (evsel__has_callchain(evsel) &&
3994		    evsel->core.attr.sample_max_stack == 0)
3995			evsel->core.attr.sample_max_stack = trace->max_stack;
3996	}
3997again:
3998	before = trace->nr_events;
3999
4000	for (i = 0; i < evlist->core.nr_mmaps; i++) {
4001		union perf_event *event;
4002		struct mmap *md;
4003
4004		md = &evlist->mmap[i];
4005		if (perf_mmap__read_init(&md->core) < 0)
4006			continue;
4007
4008		while ((event = perf_mmap__read_event(&md->core)) != NULL) {
4009			++trace->nr_events;
4010
4011			err = trace__deliver_event(trace, event);
4012			if (err)
4013				goto out_disable;
4014
4015			perf_mmap__consume(&md->core);
4016
4017			if (interrupted)
4018				goto out_disable;
4019
4020			if (done && !draining) {
4021				evlist__disable(evlist);
4022				draining = true;
4023			}
4024		}
4025		perf_mmap__read_done(&md->core);
4026	}
4027
4028	if (trace->nr_events == before) {
4029		int timeout = done ? 100 : -1;
4030
4031		if (!draining && evlist__poll(evlist, timeout) > 0) {
4032			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
4033				draining = true;
4034
4035			goto again;
4036		} else {
4037			if (trace__flush_events(trace))
4038				goto out_disable;
4039		}
4040	} else {
4041		goto again;
4042	}
4043
4044out_disable:
4045	thread__zput(trace->current);
4046
4047	evlist__disable(evlist);
4048
4049	if (trace->sort_events)
4050		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
4051
4052	if (!err) {
4053		if (trace->summary)
4054			trace__fprintf_thread_summary(trace, trace->output);
4055
4056		if (trace->show_tool_stats) {
4057			fprintf(trace->output, "Stats:\n "
4058					       " vfs_getname : %" PRIu64 "\n"
4059					       " proc_getname: %" PRIu64 "\n",
4060				trace->stats.vfs_getname,
4061				trace->stats.proc_getname);
4062		}
4063	}
4064
4065out_delete_evlist:
4066	trace__symbols__exit(trace);
4067	evlist__free_syscall_tp_fields(evlist);
4068	evlist__delete(evlist);
4069	cgroup__put(trace->cgroup);
4070	trace->evlist = NULL;
4071	trace->live = false;
4072	return err;
4073{
4074	char errbuf[BUFSIZ];
4075
4076out_error_sched_stat_runtime:
4077	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
4078	goto out_error;
4079
4080out_error_raw_syscalls:
4081	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
4082	goto out_error;
4083
4084out_error_mmap:
4085	evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
4086	goto out_error;
4087
4088out_error_open:
4089	evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
4090
4091out_error:
4092	fprintf(trace->output, "%s\n", errbuf);
4093	goto out_delete_evlist;
4094
4095out_error_apply_filters:
4096	fprintf(trace->output,
4097		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
4098		evsel->filter, evsel__name(evsel), errno,
4099		str_error_r(errno, errbuf, sizeof(errbuf)));
4100	goto out_delete_evlist;
4101}
4102out_error_mem:
4103	fprintf(trace->output, "Not enough memory to run!\n");
4104	goto out_delete_evlist;
4105
4106out_errno:
4107	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
4108	goto out_delete_evlist;
4109}
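
/*
 * Editorial note, not in the original source: the bare block above, with
 * "char errbuf[BUFSIZ];" followed by the out_error_* labels, exists only
 * to scope errbuf for those labels, which are reachable exclusively via
 * goto; out_error_mem and out_errno sit outside it as they don't use errbuf.
 */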
4110
4111static int trace__replay(struct trace *trace)
4112{
4113	const struct evsel_str_handler handlers[] = {
4114		{ "probe:vfs_getname",	     trace__vfs_getname, },
4115	};
4116	struct perf_data data = {
4117		.path  = input_name,
4118		.mode  = PERF_DATA_MODE_READ,
4119		.force = trace->force,
4120	};
4121	struct perf_session *session;
4122	struct evsel *evsel;
4123	int err = -1;
4124
4125	trace->tool.sample	  = trace__process_sample;
4126	trace->tool.mmap	  = perf_event__process_mmap;
4127	trace->tool.mmap2	  = perf_event__process_mmap2;
4128	trace->tool.comm	  = perf_event__process_comm;
4129	trace->tool.exit	  = perf_event__process_exit;
4130	trace->tool.fork	  = perf_event__process_fork;
4131	trace->tool.attr	  = perf_event__process_attr;
4132	trace->tool.tracing_data  = perf_event__process_tracing_data;
4133	trace->tool.build_id	  = perf_event__process_build_id;
4134	trace->tool.namespaces	  = perf_event__process_namespaces;
4135
4136	trace->tool.ordered_events = true;
4137	trace->tool.ordering_requires_timestamps = true;
4138
4139	/* add tid to output */
4140	trace->multiple_threads = true;
4141
4142	session = perf_session__new(&data, &trace->tool);
4143	if (IS_ERR(session))
4144		return PTR_ERR(session);
4145
4146	if (trace->opts.target.pid)
4147		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
4148
4149	if (trace->opts.target.tid)
4150		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
4151
4152	if (symbol__init(&session->header.env) < 0)
4153		goto out;
4154
4155	trace->host = &session->machines.host;
4156
4157	err = perf_session__set_tracepoints_handlers(session, handlers);
4158	if (err)
4159		goto out;
4160
4161	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter");
4162	trace->syscalls.events.sys_enter = evsel;
4163	/* older kernels have syscalls:* tracepoints instead of raw_syscalls:* */
4164	if (evsel == NULL)
4165		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter");
4166
4167	if (evsel &&
4168	    (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
4169	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
4170		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
4171		goto out;
4172	}
4173
4174	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit");
4175	trace->syscalls.events.sys_exit = evsel;
4176	if (evsel == NULL)
4177		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit");
4178	if (evsel &&
4179	    (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
4180	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
4181		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
4182		goto out;
4183	}
4184
4185	evlist__for_each_entry(session->evlist, evsel) {
4186		if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
4187		    (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
4188		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
4189		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
4190			evsel->handler = trace__pgfault;
4191	}
4192
4193	setup_pager();
4194
4195	err = perf_session__process_events(session);
4196	if (err)
4197		pr_err("Failed to process events, error %d\n", err);
4199	else if (trace->summary)
4200		trace__fprintf_thread_summary(trace, trace->output);
4201
4202out:
4203	perf_session__delete(session);
4204
4205	return err;
4206}
4207
4208static size_t trace__fprintf_threads_header(FILE *fp)
4209{
4210	size_t printed;
4211
4212	printed  = fprintf(fp, "\n Summary of events:\n\n");
4213
4214	return printed;
4215}
4216
4217DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
4218	struct syscall_stats *stats;
4219	double		     msecs;
4220	int		     syscall;
4221)
4222{
4223	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
4224	struct syscall_stats *stats = source->priv;
4225
4226	entry->syscall = source->i;
4227	entry->stats   = stats;
4228	entry->msecs   = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0;
4229}
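
/*
 * Editorial note, not in the original source: msecs above reconstructs the
 * total time spent in each syscall as n * avg, i.e. the call count times
 * the mean duration in ns, scaled to milliseconds, and the resort tree
 * orders the per-thread summary table by that total.
 */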
4230
4231static size_t thread__dump_stats(struct thread_trace *ttrace,
4232				 struct trace *trace, FILE *fp)
4233{
4234	size_t printed = 0;
4235	struct syscall *sc;
4236	struct rb_node *nd;
4237	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
4238
4239	if (syscall_stats == NULL)
4240		return 0;
4241
4242	printed += fprintf(fp, "\n");
4243
4244	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
4245	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
4246	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
4247
4248	resort_rb__for_each_entry(nd, syscall_stats) {
4249		struct syscall_stats *stats = syscall_stats_entry->stats;
4250		if (stats) {
4251			double min = (double)(stats->stats.min) / NSEC_PER_MSEC;
4252			double max = (double)(stats->stats.max) / NSEC_PER_MSEC;
4253			double avg = avg_stats(&stats->stats);
4254			double pct;
4255			u64 n = (u64)stats->stats.n;
4256
4257			pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
4258			avg /= NSEC_PER_MSEC;
4259
4260			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
4261			printed += fprintf(fp, "   %-15s", sc->name);
4262			printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
4263					   n, stats->nr_failures, syscall_stats_entry->msecs, min, avg);
4264			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
4265
4266			if (trace->errno_summary && stats->nr_failures) {
4267				int e;
4268
4269				for (e = 0; e < stats->max_errno; ++e) {
4270					if (stats->errnos[e] != 0)
4271						fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]);
4272				}
4273			}
4274		}
4275	}
4276
4277	resort_rb__delete(syscall_stats);
4278	printed += fprintf(fp, "\n\n");
4279
4280	return printed;
4281}
4282
4283static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
4284{
4285	size_t printed = 0;
4286	struct thread_trace *ttrace = thread__priv(thread);
4287	double ratio;
4288
4289	if (ttrace == NULL)
4290		return 0;
4291
4292	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
4293
4294	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread));
4295	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
4296	printed += fprintf(fp, "%.1f%%", ratio);
4297	if (ttrace->pfmaj)
4298		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
4299	if (ttrace->pfmin)
4300		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
4301	if (trace->sched)
4302		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
4303	else if (fputc('\n', fp) != EOF)
4304		++printed;
4305
4306	printed += thread__dump_stats(ttrace, trace, fp);
4307
4308	return printed;
4309}
4310
4311static unsigned long thread__nr_events(struct thread_trace *ttrace)
4312{
4313	return ttrace ? ttrace->nr_events : 0;
4314}
4315
4316static int trace_nr_events_cmp(void *priv __maybe_unused,
4317			       const struct list_head *la,
4318			       const struct list_head *lb)
4319{
4320	struct thread_list *a = list_entry(la, struct thread_list, list);
4321	struct thread_list *b = list_entry(lb, struct thread_list, list);
4322	unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread));
4323	unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread));
4324
4325	if (a_nr_events != b_nr_events)
4326		return a_nr_events < b_nr_events ? -1 : 1;
4327
4328	/* Identical number of events; place smaller tids first. */
4329	return thread__tid(a->thread) < thread__tid(b->thread)
4330		? -1
4331		: (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0);
4332}
4333
4334static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
4335{
4336	size_t printed = trace__fprintf_threads_header(fp);
4337	LIST_HEAD(threads);
4338
4339	if (machine__thread_list(trace->host, &threads) == 0) {
4340		struct thread_list *pos;
4341
4342		list_sort(NULL, &threads, trace_nr_events_cmp);
4343
4344		list_for_each_entry(pos, &threads, list)
4345			printed += trace__fprintf_thread(fp, pos->thread, trace);
4346	}
4347	thread_list__delete(&threads);
4348	return printed;
4349}
4350
4351static int trace__set_duration(const struct option *opt, const char *str,
4352			       int unset __maybe_unused)
4353{
4354	struct trace *trace = opt->value;
4355
4356	trace->duration_filter = atof(str);
4357	return 0;
4358}
4359
4360static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
4361					      int unset __maybe_unused)
4362{
4363	int ret = -1;
4364	size_t i;
4365	struct trace *trace = opt->value;
4366	/*
4367	 * FIXME: introduce an intarray class, plainly parse the csv and create
4368	 * a { int nr, int entries[] } struct...
4369	 */
4370	struct intlist *list = intlist__new(str);
4371
4372	if (list == NULL)
4373		return -1;
4374
4375	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
4376	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
4377
4378	if (trace->filter_pids.entries == NULL)
4379		goto out;
4380
4381	trace->filter_pids.entries[0] = getpid();
4382
4383	for (i = 1; i < trace->filter_pids.nr; ++i)
4384		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
4385
4386	intlist__delete(list);
4387	ret = 0;
4388out:
4389	return ret;
4390}
4391
4392static int trace__open_output(struct trace *trace, const char *filename)
4393{
4394	struct stat st;
4395
4396	if (!stat(filename, &st) && st.st_size) {
4397		char oldname[PATH_MAX];
4398
4399		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
4400		unlink(oldname);
4401		rename(filename, oldname);
4402	}
4403
4404	trace->output = fopen(filename, "w");
4405
4406	return trace->output == NULL ? -errno : 0;
4407}
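
/*
 * Editorial note, not in the original source: like perf record's
 * perf.data/perf.data.old rotation, a pre-existing non-empty output file
 * is first renamed with a ".old" suffix, so "perf trace -o out.txt" keeps
 * the previous run in "out.txt.old".
 */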
4408
4409static int parse_pagefaults(const struct option *opt, const char *str,
4410			    int unset __maybe_unused)
4411{
4412	int *trace_pgfaults = opt->value;
4413
4414	if (strcmp(str, "all") == 0)
4415		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
4416	else if (strcmp(str, "maj") == 0)
4417		*trace_pgfaults |= TRACE_PFMAJ;
4418	else if (strcmp(str, "min") == 0)
4419		*trace_pgfaults |= TRACE_PFMIN;
4420	else
4421		return -1;
4422
4423	return 0;
4424}
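
/*
 * Editorial example, not in the original source: the values parsed above
 * back the -F/--pf option, e.g.:
 *
 *   # perf trace --pf=maj        # trace only major page faults
 *   # perf trace --pf=all ls     # major and minor faults while running 'ls'
 */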
4425
4426static void evlist__set_default_evsel_handler(struct evlist *evlist, void *handler)
4427{
4428	struct evsel *evsel;
4429
4430	evlist__for_each_entry(evlist, evsel) {
4431		if (evsel->handler == NULL)
4432			evsel->handler = handler;
4433	}
4434}
4435
4436static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name)
4437{
4438	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
4439
4440	if (fmt) {
4441		const struct syscall_fmt *scfmt = syscall_fmt__find(name);
4442
4443		if (scfmt) {
4444			int skip = 0;
4445
4446			if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 ||
4447			    strcmp(evsel->tp_format->format.fields->name, "nr") == 0)
4448				++skip;
4449
4450			memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt));
4451		}
4452	}
4453}
4454
4455static int evlist__set_syscall_tp_fields(struct evlist *evlist)
4456{
4457	struct evsel *evsel;
4458
4459	evlist__for_each_entry(evlist, evsel) {
4460		if (evsel->priv || !evsel->tp_format)
4461			continue;
4462
4463		if (strcmp(evsel->tp_format->system, "syscalls")) {
4464			evsel__init_tp_arg_scnprintf(evsel);
4465			continue;
4466		}
4467
4468		if (evsel__init_syscall_tp(evsel))
4469			return -1;
4470
4471		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
4472			struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4473
4474			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
4475				return -1;
4476
4477			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1);
4478		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
4479			struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4480
4481			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
4482				return -1;
4483
4484			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1);
4485		}
4486	}
4487
4488	return 0;
4489}
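
/*
 * Editorial example, not in the original source: for an evsel named
 * "syscalls:sys_enter_openat" the loop above initializes the id/args tp
 * fields and then attaches the "openat" argument beautifiers by looking up
 * the name past the "sys_enter_" prefix.
 */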
4490
4491/*
4492 * XXX: Hackish, just splitting the combined -e/--event list (syscalls,
4493 * i.e. raw_syscalls:{sys_{enter,exit}}, plus events: tracepoints, HW, SW,
4494 * etc.) to use existing facilities unchanged (trace->ev_qualifier + parse_options()).
4495 *
4496 * It'd be better to introduce a parse_options() variant that would return a
4497 * list with the terms it didn't match to an event...
4498 */
4499static int trace__parse_events_option(const struct option *opt, const char *str,
4500				      int unset __maybe_unused)
4501{
4502	struct trace *trace = (struct trace *)opt->value;
4503	const char *s = str;
4504	char *sep = NULL, *lists[2] = { NULL, NULL, };
4505	int len = strlen(str) + 1, err = -1, list, idx;
4506	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
4507	char group_name[PATH_MAX];
4508	const struct syscall_fmt *fmt;
4509
4510	if (strace_groups_dir == NULL)
4511		return -1;
4512
4513	if (*s == '!') {
4514		++s;
4515		trace->not_ev_qualifier = true;
4516	}
4517
4518	while (1) {
4519		if ((sep = strchr(s, ',')) != NULL)
4520			*sep = '\0';
4521
4522		list = 0;
4523		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
4524		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
4525			list = 1;
4526			goto do_concat;
4527		}
4528
4529		fmt = syscall_fmt__find_by_alias(s);
4530		if (fmt != NULL) {
4531			list = 1;
4532			s = fmt->name;
4533		} else {
4534			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
4535			if (access(group_name, R_OK) == 0)
4536				list = 1;
4537		}
4538do_concat:
4539		if (lists[list]) {
4540			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
4541		} else {
4542			lists[list] = malloc(len);
4543			if (lists[list] == NULL)
4544				goto out;
4545			strcpy(lists[list], s);
4546		}
4547
4548		if (!sep)
4549			break;
4550
4551		*sep = ',';
4552		s = sep + 1;
4553	}
4554
4555	if (lists[1] != NULL) {
4556		struct strlist_config slist_config = {
4557			.dirname = strace_groups_dir,
4558		};
4559
4560		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
4561		if (trace->ev_qualifier == NULL) {
4562			fputs("Not enough memory to parse event qualifier\n", trace->output);
4563			goto out;
4564		}
4565
4566		if (trace__validate_ev_qualifier(trace))
4567			goto out;
4568		trace->trace_syscalls = true;
4569	}
4570
4571	err = 0;
4572
4573	if (lists[0]) {
4574		struct parse_events_option_args parse_events_option_args = {
4575			.evlistp = &trace->evlist,
4576		};
4577		struct option o = {
4578			.value = &parse_events_option_args,
4579		};
4580		err = parse_events_option(&o, lists[0], 0);
4581	}
4582out:
4583	free(strace_groups_dir);
4584	free(lists[0]);
4585	free(lists[1]);
4586	if (sep)
4587		*sep = ',';
4588
4589	return err;
4590}
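
/*
 * Editorial example, not in the original source: a mixed list such as
 *
 *   # perf trace -e 'open*,sched:sched_switch'
 *
 * is split above into lists[1] = "open*" (syscall names/globs, handled as
 * the event qualifier) and lists[0] = "sched:sched_switch" (handed over to
 * parse_events_option()).
 */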
4591
4592static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
4593{
4594	struct trace *trace = opt->value;
4595
4596	if (!list_empty(&trace->evlist->core.entries)) {
4597		struct option o = {
4598			.value = &trace->evlist,
4599		};
4600		return parse_cgroups(&o, str, unset);
4601	}
4602	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
4603
4604	return 0;
4605}
4606
4607static int trace__config(const char *var, const char *value, void *arg)
4608{
4609	struct trace *trace = arg;
4610	int err = 0;
4611
4612	if (!strcmp(var, "trace.add_events")) {
4613		trace->perfconfig_events = strdup(value);
4614		if (trace->perfconfig_events == NULL) {
4615			pr_err("Not enough memory for %s\n", "trace.add_events");
4616			return -1;
4617		}
4618	} else if (!strcmp(var, "trace.show_timestamp")) {
4619		trace->show_tstamp = perf_config_bool(var, value);
4620	} else if (!strcmp(var, "trace.show_duration")) {
4621		trace->show_duration = perf_config_bool(var, value);
4622	} else if (!strcmp(var, "trace.show_arg_names")) {
4623		trace->show_arg_names = perf_config_bool(var, value);
4624		if (!trace->show_arg_names)
4625			trace->show_zeros = true;
4626	} else if (!strcmp(var, "trace.show_zeros")) {
4627		bool new_show_zeros = perf_config_bool(var, value);
4628		if (!trace->show_arg_names && !new_show_zeros) {
4629			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
4630			goto out;
4631		}
4632		trace->show_zeros = new_show_zeros;
4633	} else if (!strcmp(var, "trace.show_prefix")) {
4634		trace->show_string_prefix = perf_config_bool(var, value);
4635	} else if (!strcmp(var, "trace.no_inherit")) {
4636		trace->opts.no_inherit = perf_config_bool(var, value);
4637	} else if (!strcmp(var, "trace.args_alignment")) {
4638		int args_alignment = 0;
4639		if (perf_config_int(&args_alignment, var, value) == 0)
4640			trace->args_alignment = args_alignment;
4641	} else if (!strcmp(var, "trace.tracepoint_beautifiers")) {
4642		if (strcasecmp(value, "libtraceevent") == 0)
4643			trace->libtraceevent_print = true;
4644		else if (strcasecmp(value, "libbeauty") == 0)
4645			trace->libtraceevent_print = false;
4646	}
4647out:
4648	return err;
4649}
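
/*
 * Editorial example, not in the original source: the knobs handled above
 * live in the [trace] section of ~/.perfconfig, e.g.:
 *
 *   [trace]
 *	show_duration = no
 *	args_alignment = 50
 *	add_events = sched:sched_switch
 */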
4650
4651static void trace__exit(struct trace *trace)
4652{
4653	int i;
4654
4655	strlist__delete(trace->ev_qualifier);
4656	zfree(&trace->ev_qualifier_ids.entries);
4657	if (trace->syscalls.table) {
4658		for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
4659			syscall__exit(&trace->syscalls.table[i]);
4660		zfree(&trace->syscalls.table);
4661	}
4662	syscalltbl__delete(trace->sctbl);
4663	zfree(&trace->perfconfig_events);
4664}
4665
4666#ifdef HAVE_BPF_SKEL
4667static int bpf__setup_bpf_output(struct evlist *evlist)
4668{
4669	int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
4670
4671	if (err)
4672		pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");
4673
4674	return err;
4675}
4676#endif
4677
4678int cmd_trace(int argc, const char **argv)
4679{
4680	const char *trace_usage[] = {
4681		"perf trace [<options>] [<command>]",
4682		"perf trace [<options>] -- <command> [<options>]",
4683		"perf trace record [<options>] [<command>]",
4684		"perf trace record [<options>] -- <command> [<options>]",
4685		NULL
4686	};
4687	struct trace trace = {
4688		.opts = {
4689			.target = {
4690				.uid	   = UINT_MAX,
4691				.uses_mmap = true,
4692			},
4693			.user_freq     = UINT_MAX,
4694			.user_interval = ULLONG_MAX,
4695			.no_buffering  = true,
4696			.mmap_pages    = UINT_MAX,
4697		},
4698		.output = stderr,
4699		.show_comm = true,
4700		.show_tstamp = true,
4701		.show_duration = true,
4702		.show_arg_names = true,
4703		.args_alignment = 70,
4704		.trace_syscalls = false,
4705		.kernel_syscallchains = false,
4706		.max_stack = UINT_MAX,
4707		.max_events = ULONG_MAX,
4708	};
4709	const char *output_name = NULL;
4710	const struct option trace_options[] = {
4711	OPT_CALLBACK('e', "event", &trace, "event",
4712		     "event/syscall selector. use 'perf list' to list available events",
4713		     trace__parse_events_option),
4714	OPT_CALLBACK(0, "filter", &trace.evlist, "filter",
4715		     "event filter", parse_filter),
4716	OPT_BOOLEAN(0, "comm", &trace.show_comm,
4717		    "show the thread COMM next to its id"),
4718	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
4719	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
4720		     trace__parse_events_option),
4721	OPT_STRING('o', "output", &output_name, "file", "output file name"),
4722	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
4723	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
4724		    "trace events on existing process id"),
4725	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
4726		    "trace events on existing thread id"),
4727	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
4728		     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
4729	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
4730		    "system-wide collection from all CPUs"),
4731	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
4732		    "list of cpus to monitor"),
4733	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
4734		    "child tasks do not inherit counters"),
4735	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
4736		     "number of mmap data pages", evlist__parse_mmap_pages),
4737	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
4738		   "user to profile"),
4739	OPT_CALLBACK(0, "duration", &trace, "float",
4740		     "show only events with duration > N.M ms",
4741		     trace__set_duration),
4742	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
4743	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
4744	OPT_BOOLEAN('T', "time", &trace.full_time,
4745		    "Show full timestamp, not time relative to first start"),
4746	OPT_BOOLEAN(0, "failure", &trace.failure_only,
4747		    "Show only syscalls that failed"),
4748	OPT_BOOLEAN('s', "summary", &trace.summary_only,
4749		    "Show only syscall summary with statistics"),
4750	OPT_BOOLEAN('S', "with-summary", &trace.summary,
4751		    "Show all syscalls and summary with statistics"),
4752	OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
4753		    "Show errno stats per syscall, use with -s or -S"),
4754	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
4755		     "Trace pagefaults", parse_pagefaults, "maj"),
4756	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
4757	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
4758	OPT_CALLBACK(0, "call-graph", &trace.opts,
4759		     "record_mode[,record_size]", record_callchain_help,
4760		     &record_parse_callchain_opt),
4761	OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
4762		    "Use libtraceevent to print the tracepoint arguments."),
4763	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
4764		    "Show the kernel callchains on the syscall exit path"),
4765	OPT_ULONG(0, "max-events", &trace.max_events,
4766		"Set the maximum number of events to print, exit after that is reached."),
4767	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
4768		     "Set the minimum stack depth when parsing the callchain, "
4769		     "anything below the specified depth will be ignored."),
4770	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
4771		     "Set the maximum stack depth when parsing the callchain, "
4772		     "anything beyond the specified depth will be ignored. "
4773		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
4774	OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
4775			"Sort batch of events before processing, use if getting out of order events"),
4776	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
4777			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
4778	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
4779			"per thread proc mmap processing timeout in ms"),
4780	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
4781		     trace__parse_cgroups),
4782	OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay,
4783		     "ms to wait before starting measurement after program "
4784		     "start"),
4785	OPTS_EVSWITCH(&trace.evswitch),
4786	OPT_END()
4787	};
4788	bool __maybe_unused max_stack_user_set = true;
4789	bool mmap_pages_user_set = true;
4790	struct evsel *evsel;
4791	const char * const trace_subcommands[] = { "record", NULL };
4792	int err = -1;
4793	char bf[BUFSIZ];
4794	struct sigaction sigchld_act;
4795
4796	signal(SIGSEGV, sighandler_dump_stack);
4797	signal(SIGFPE, sighandler_dump_stack);
4798	signal(SIGINT, sighandler_interrupt);
4799
4800	memset(&sigchld_act, 0, sizeof(sigchld_act));
4801	sigchld_act.sa_flags = SA_SIGINFO;
4802	sigchld_act.sa_sigaction = sighandler_chld;
4803	sigaction(SIGCHLD, &sigchld_act, NULL);
4804
4805	trace.evlist = evlist__new();
4806	trace.sctbl = syscalltbl__new();
4807
4808	if (trace.evlist == NULL || trace.sctbl == NULL) {
4809		pr_err("Not enough memory to run!\n");
4810		err = -ENOMEM;
4811		goto out;
4812	}
4813
4814	/*
4815	 * Parsing .perfconfig may entail creating a BPF event, that may need
4816	 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
4817	 * is too small. This affects just this process, not touching the
4818	 * global setting. If it fails we'll get something in 'perf trace -v'
4819	 * to help diagnose the problem.
4820	 */
4821	rlimit__bump_memlock();
4822
4823	err = perf_config(trace__config, &trace);
4824	if (err)
4825		goto out;
4826
4827	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
4828				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
4829
4830	/*
4831	 * Here we already passed thru trace__parse_events_option() and it has
4832	 * already figured out if -e syscall_name was used; if not, but --event
4833	 * foo:bar was, then the user is interested _just_ in those, say,
4834	 * tracepoint events, not in the strace-like syscall-name-based mode.
4835	 *
4836	 * This is important because we need to check if strace-like mode is
4837	 * needed to decide whether we should filter out the eBPF
4838	 * __augmented_syscalls__ code, if it is in the mix, say, via
4839	 * .perfconfig trace.add_events.
4840	 */
4841	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
4842	    trace.evlist->core.nr_entries == 0 /* Was --event used? */) {
4843		trace.trace_syscalls = true;
4844	}
4845	/*
4846	 * Now that we have --verbose figured out, lets see if we need to parse
4847	 * events from .perfconfig, so that if those events fail parsing, say some
4848	 * BPF program fails, then we'll be able to use --verbose to see what went
4849	 * wrong in more detail.
4850	 */
4851	if (trace.perfconfig_events != NULL) {
4852		struct parse_events_error parse_err;
4853
4854		parse_events_error__init(&parse_err);
4855		err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err);
4856		if (err)
4857			parse_events_error__print(&parse_err, trace.perfconfig_events);
4858		parse_events_error__exit(&parse_err);
4859		if (err)
4860			goto out;
4861	}
4862
4863	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
4864		usage_with_options_msg(trace_usage, trace_options,
4865				       "cgroup monitoring only available in system-wide mode");
4866	}
4867
4868#ifdef HAVE_BPF_SKEL
4869	if (!trace.trace_syscalls)
4870		goto skip_augmentation;
4871
4872	trace.skel = augmented_raw_syscalls_bpf__open();
4873	if (!trace.skel) {
4874		pr_debug("Failed to open augmented syscalls BPF skeleton");
4875	} else {
4876		/*
4877		 * Disable attaching the BPF programs except for sys_enter and
4878		 * sys_exit, which tail call into the others as necessary.
4879		 */
4880		struct bpf_program *prog;
4881
4882		bpf_object__for_each_program(prog, trace.skel->obj) {
4883			if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit)
4884				bpf_program__set_autoattach(prog, /*autoattach=*/false);
4885		}
4886
4887		err = augmented_raw_syscalls_bpf__load(trace.skel);
4888
4889		if (err < 0) {
4890			libbpf_strerror(err, bf, sizeof(bf));
4891			pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf);
4892		} else {
4893			augmented_raw_syscalls_bpf__attach(trace.skel);
4894			trace__add_syscall_newtp(&trace);
4895		}
4896	}
4897
4898	err = bpf__setup_bpf_output(trace.evlist);
4899	if (err) {
4900		libbpf_strerror(err, bf, sizeof(bf));
4901		pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
4902		goto out;
4903	}
4904	trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
4905	assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__"));
4906skip_augmentation:
4907#endif
4908	err = -1;
4909
4910	if (trace.trace_pgfaults) {
4911		trace.opts.sample_address = true;
4912		trace.opts.sample_time = true;
4913	}
4914
4915	if (trace.opts.mmap_pages == UINT_MAX)
4916		mmap_pages_user_set = false;
4917
4918	if (trace.max_stack == UINT_MAX) {
4919		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
4920		max_stack_user_set = false;
4921	}
4922
4923#ifdef HAVE_DWARF_UNWIND_SUPPORT
4924	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
4925		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
4926	}
4927#endif
4928
4929	if (callchain_param.enabled) {
4930		if (!mmap_pages_user_set && geteuid() == 0)
4931			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
4932
4933		symbol_conf.use_callchain = true;
4934	}
4935
4936	if (trace.evlist->core.nr_entries > 0) {
4937		evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
4938		if (evlist__set_syscall_tp_fields(trace.evlist)) {
4939			perror("failed to set syscalls:* tracepoint fields");
4940			goto out;
4941		}
4942	}
4943
4944	if (trace.sort_events) {
4945		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
4946		ordered_events__set_copy_on_queue(&trace.oe.data, true);
4947	}
4948
4949	/*
4950	 * If we are augmenting syscalls, then combine what we put in the
4951	 * __augmented_syscalls__ BPF map with what is in the
4952	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
4953	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
4954	 *
4955	 * We'll switch to looking at two BPF maps, one for sys_enter and the
4956	 * other for sys_exit when we start augmenting the sys_exit paths with
4957	 * buffers that are being copied from kernel to userspace, think 'read'
4958	 * syscall.
4959	 */
4960	if (trace.syscalls.events.bpf_output) {
4961		evlist__for_each_entry(trace.evlist, evsel) {
4962			bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
4963
4964			if (raw_syscalls_sys_exit) {
4965				trace.raw_augmented_syscalls = true;
4966				goto init_augmented_syscall_tp;
4967			}
4968
4969			if (trace.syscalls.events.bpf_output->priv == NULL &&
4970			    strstr(evsel__name(evsel), "syscalls:sys_enter")) {
4971				struct evsel *augmented = trace.syscalls.events.bpf_output;
4972				if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
4973				    evsel__init_augmented_syscall_tp_args(augmented))
4974					goto out;
4975				/*
4976				 * "augmented" is the __augmented_syscalls__ BPF_OUTPUT event.
4977				 * Above we made sure we can get from its payload the tp fields
4978				 * that we get from the syscalls:sys_enter tracefs format file.
4979				 */
4980				augmented->handler = trace__sys_enter;
4981				/*
4982				 * Now we do the same for the *syscalls:sys_enter event so that
4983				 * if we handle it directly, i.e. if the BPF prog returns 0 so
4984				 * as not to filter it, then we'll handle it just like we would
4985				 * for the BPF_OUTPUT one:
4986				 */
4987				if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
4988				    evsel__init_augmented_syscall_tp_args(evsel))
4989					goto out;
4990				evsel->handler = trace__sys_enter;
4991			}
4992
4993			if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) {
4994				struct syscall_tp *sc;
4995init_augmented_syscall_tp:
4996				if (evsel__init_augmented_syscall_tp(evsel, evsel))
4997					goto out;
4998				sc = __evsel__syscall_tp(evsel);
4999				/*
5000				 * For now with BPF raw_augmented we hook into
5001				 * raw_syscalls:sys_enter and there we get all
5002				 * 6 syscall args plus the tracepoint common
5003				 * fields and the syscall_nr (another long).
5004				 * So we check if that is the case and, if so,
5005				 * use not sc->args_size but the size of the
5006				 * full raw_syscalls:sys_enter payload, which
5007				 * is fixed.
5008				 *
5009				 * We'll revisit this later to pass
5010				 * sc->args_size to the BPF augmenter (now
5011				 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
5012				 * so that it copies only what we need for each
5013				 * syscall, like what happens when we use
5014				 * syscalls:sys_enter_NAME, so that we reduce
5015				 * the kernel/userspace traffic to just what is
5016				 * needed for each syscall.
5017				 */
5018				if (trace.raw_augmented_syscalls)
5019					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
5020				evsel__init_augmented_syscall_tp_ret(evsel);
5021				evsel->handler = trace__sys_exit;
5022			}
5023		}
5024	}
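
	/*
	 * Editorial note, not in the original source: on a 64-bit arch the
	 * size computed above is (6 + 1) * sizeof(long) + sc->id.offset =
	 * 56 + sc->id.offset bytes: six syscall args plus the syscall_nr,
	 * past the common tracepoint fields.
	 */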
5025
5026	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
5027		return trace__record(&trace, argc-1, &argv[1]);
5028
5029	/* Using just --errno-summary will trigger --summary */
5030	if (trace.errno_summary && !trace.summary && !trace.summary_only)
5031		trace.summary_only = true;
5032
5033	/* summary_only implies summary option, but don't overwrite summary if set */
5034	if (trace.summary_only)
5035		trace.summary = trace.summary_only;
5036
5037	if (output_name != NULL) {
5038		err = trace__open_output(&trace, output_name);
5039		if (err < 0) {
5040			perror("failed to create output file");
5041			goto out;
5042		}
5043	}
5044
5045	err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
5046	if (err)
5047		goto out_close;
5048
5049	err = target__validate(&trace.opts.target);
5050	if (err) {
5051		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5052		fprintf(trace.output, "%s", bf);
5053		goto out_close;
5054	}
5055
5056	err = target__parse_uid(&trace.opts.target);
5057	if (err) {
5058		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5059		fprintf(trace.output, "%s", bf);
5060		goto out_close;
5061	}
5062
5063	if (!argc && target__none(&trace.opts.target))
5064		trace.opts.target.system_wide = true;
5065
5066	if (input_name)
5067		err = trace__replay(&trace);
5068	else
5069		err = trace__run(&trace, argc, argv);
5070
5071out_close:
5072	if (output_name != NULL)
5073		fclose(trace.output);
5074out:
5075	trace__exit(&trace);
5076#ifdef HAVE_BPF_SKEL
5077	augmented_raw_syscalls_bpf__destroy(trace.skel);
5078#endif
5079	return err;
5080}
v4.17
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
  15 *
  16 * Released under the GPL v2. (and only v2, not any later version)
  17 */
  18
  19#include <traceevent/event-parse.h>
  20#include <api/fs/tracing_path.h>
 
 
 
 
 
 
 
 
 
  21#include "builtin.h"
  22#include "util/cgroup.h"
  23#include "util/color.h"
 
  24#include "util/debug.h"
 
  25#include "util/env.h"
  26#include "util/event.h"
 
 
 
  27#include "util/evlist.h"
 
 
 
  28#include <subcmd/exec-cmd.h>
  29#include "util/machine.h"
 
 
  30#include "util/path.h"
  31#include "util/session.h"
  32#include "util/thread.h"
  33#include <subcmd/parse-options.h>
  34#include "util/strlist.h"
  35#include "util/intlist.h"
  36#include "util/thread_map.h"
  37#include "util/stat.h"
 
 
  38#include "trace/beauty/beauty.h"
  39#include "trace-event.h"
  40#include "util/parse-events.h"
  41#include "util/bpf-loader.h"
  42#include "callchain.h"
  43#include "print_binary.h"
  44#include "string2.h"
  45#include "syscalltbl.h"
  46#include "rb_resort.h"
 
  47
  48#include <errno.h>
  49#include <inttypes.h>
  50#include <poll.h>
  51#include <signal.h>
  52#include <stdlib.h>
  53#include <string.h>
  54#include <linux/err.h>
  55#include <linux/filter.h>
  56#include <linux/kernel.h>
 
  57#include <linux/random.h>
  58#include <linux/stringify.h>
  59#include <linux/time64.h>
 
  60#include <fcntl.h>
 
  61
  62#include "sane_ctype.h"
  63
  64#ifndef O_CLOEXEC
  65# define O_CLOEXEC		02000000
  66#endif
  67
  68#ifndef F_LINUX_SPECIFIC_BASE
  69# define F_LINUX_SPECIFIC_BASE	1024
  70#endif
  71
  72struct trace {
  73	struct perf_tool	tool;
  74	struct syscalltbl	*sctbl;
  75	struct {
  76		int		max;
  77		struct syscall  *table;
  78		struct {
  79			struct perf_evsel *sys_enter,
  80					  *sys_exit;
 
  81		}		events;
  82	} syscalls;
 
 
 
  83	struct record_opts	opts;
  84	struct perf_evlist	*evlist;
  85	struct machine		*host;
  86	struct thread		*current;
  87	struct cgroup		*cgroup;
  88	u64			base_time;
  89	FILE			*output;
  90	unsigned long		nr_events;
 
 
 
  91	struct strlist		*ev_qualifier;
  92	struct {
  93		size_t		nr;
  94		int		*entries;
  95	}			ev_qualifier_ids;
  96	struct {
  97		size_t		nr;
  98		pid_t		*entries;
 
  99	}			filter_pids;
 100	double			duration_filter;
 101	double			runtime_ms;
 102	struct {
 103		u64		vfs_getname,
 104				proc_getname;
 105	} stats;
 106	unsigned int		max_stack;
 107	unsigned int		min_stack;
 108	bool			not_ev_qualifier;
 109	bool			live;
 110	bool			full_time;
 111	bool			sched;
 112	bool			multiple_threads;
 113	bool			summary;
 114	bool			summary_only;
 
 115	bool			failure_only;
 116	bool			show_comm;
 117	bool			print_sample;
 118	bool			show_tool_stats;
 119	bool			trace_syscalls;
 
 120	bool			kernel_syscallchains;
 121	bool			force;
 122	bool			vfs_getname;
 123	int			trace_pgfaults;
 124	int			open_id;
 125};
 126
 127struct tp_field {
 128	int offset;
 129	union {
 130		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
 131		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
 132	};
 133};
 134
 135#define TP_UINT_FIELD(bits) \
 136static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 137{ \
 138	u##bits value; \
 139	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 140	return value;  \
 141}
 142
 143TP_UINT_FIELD(8);
 144TP_UINT_FIELD(16);
 145TP_UINT_FIELD(32);
 146TP_UINT_FIELD(64);
 147
 148#define TP_UINT_FIELD__SWAPPED(bits) \
 149static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 150{ \
 151	u##bits value; \
 152	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 153	return bswap_##bits(value);\
 154}
 155
 156TP_UINT_FIELD__SWAPPED(16);
 157TP_UINT_FIELD__SWAPPED(32);
 158TP_UINT_FIELD__SWAPPED(64);
 159
 160static int tp_field__init_uint(struct tp_field *field,
 161			       struct format_field *format_field,
 162			       bool needs_swap)
 163{
 164	field->offset = format_field->offset;
 165
 166	switch (format_field->size) {
 167	case 1:
 168		field->integer = tp_field__u8;
 169		break;
 170	case 2:
 171		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 172		break;
 173	case 4:
 174		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 175		break;
 176	case 8:
 177		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 178		break;
 179	default:
 180		return -1;
 181	}
 182
 183	return 0;
 184}
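/*
 * Usage sketch (not from the original file): once a tp_field has been
 * initialized, the selected accessor hides both the field size and any
 * cross-endian byte swapping behind a single indirect call. Here 'ff'
 * and 'sample' are assumed to come from an already parsed event.
 */
#if 0
	struct tp_field f;

	/* assume 'ff' describes a 4-byte field in a cross-endian perf.data */
	if (tp_field__init_uint(&f, ff, /* needs_swap = */ true) == 0)
		value = f.integer(&f, sample);	/* calls tp_field__swapped_u32() */
#endif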
 185
 186static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 187{
 188	return sample->raw_data + field->offset;
 189}
 190
 191static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
 192{
 193	field->offset = format_field->offset;
 194	field->pointer = tp_field__ptr;
 195	return 0;
 196}
 197
 198struct syscall_tp {
 199	struct tp_field id;
 200	union {
 201		struct tp_field args, ret;
 202	};
 203};
 204
 205static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
 206					  struct tp_field *field,
 207					  const char *name)
 208{
 209	struct format_field *format_field = perf_evsel__field(evsel, name);
 210
 211	if (format_field == NULL)
 212		return -1;
 213
 214	return tp_field__init_uint(field, format_field, evsel->needs_swap);
 215}
 216
 217#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
 218	({ struct syscall_tp *sc = evsel->priv;\
 219	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 220
 221static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
 222					 struct tp_field *field,
 223					 const char *name)
 224{
 225	struct format_field *format_field = perf_evsel__field(evsel, name);
 226
 227	if (format_field == NULL)
 228		return -1;
 229
 230	return tp_field__init_ptr(field, format_field);
 231}
 232
 233#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
 234	({ struct syscall_tp *sc = evsel->priv;\
 235	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 236
 237static void perf_evsel__delete_priv(struct perf_evsel *evsel)
 238{
 239	zfree(&evsel->priv);
 240	perf_evsel__delete(evsel);
 241}
 242
 243static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
 244{
 245	evsel->priv = malloc(sizeof(struct syscall_tp));
 246	if (evsel->priv != NULL) {
 247		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 248			goto out_delete;
 249
 250		evsel->handler = handler;
 251		return 0;
 252	}
 253
 254	return -ENOMEM;
 255
 256out_delete:
 257	zfree(&evsel->priv);
 258	return -ENOENT;
 259}
 260
 261static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
 262{
 263	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
 264
 265	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
 266	if (IS_ERR(evsel))
 267		evsel = perf_evsel__newtp("syscalls", direction);
 268
 269	if (IS_ERR(evsel))
 270		return NULL;
 271
 272	if (perf_evsel__init_syscall_tp(evsel, handler))
 273		goto out_delete;
 274
 275	return evsel;
 276
 277out_delete:
 278	perf_evsel__delete_priv(evsel);
 279	return NULL;
 280}
 281
 282#define perf_evsel__sc_tp_uint(evsel, name, sample) \
 283	({ struct syscall_tp *fields = evsel->priv; \
 284	   fields->name.integer(&fields->name, sample); })
 285
 286#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
 287	({ struct syscall_tp *fields = evsel->priv; \
 288	   fields->name.pointer(&fields->name, sample); })
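/*
 * Usage sketch (assuming 'evsel' is one of the raw_syscalls evsels set
 * up above and 'sample' a matching perf_sample): the macros expand the
 * field name both as the struct syscall_tp member and as the tracepoint
 * field string.
 */
#if 0
	u64 id = perf_evsel__sc_tp_uint(evsel, id, sample);
	void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
#endif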
 289
 290size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
 291{
 292	int idx = val - sa->offset;
 293
 294	if (idx < 0 || idx >= sa->nr_entries)
 295		return scnprintf(bf, size, intfmt, val);
 296
 297	return scnprintf(bf, size, "%s", sa->entries[idx]);
 298}
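/*
 * Worked example (sketch, using the 'whences' table defined further
 * below): a val of 1 lands on "CUR", while an out-of-range val such as
 * 42 falls back to the integer format, producing "42" for an intfmt of
 * "%d".
 */
#if 0
	char bf[32];

	strarray__scnprintf(&strarray__whences, bf, sizeof(bf), "%d", 1);
#endif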
 299
 300static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 301						const char *intfmt,
 302					        struct syscall_arg *arg)
 303{
 304	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
 305}
 306
 307static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 308					      struct syscall_arg *arg)
 309{
 310	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 311}
 312
 313#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 314
 315struct strarrays {
 316	int		nr_entries;
 317	struct strarray **entries;
 318};
 319
 320#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
 321	.nr_entries = ARRAY_SIZE(array), \
 322	.entries = array, \
 323}
 324
 325size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 326					struct syscall_arg *arg)
 327{
 328	struct strarrays *sas = arg->parm;
 329	int i;
 330
 331	for (i = 0; i < sas->nr_entries; ++i) {
 332		struct strarray *sa = sas->entries[i];
 333		int idx = arg->val - sa->offset;
 334
 335		if (idx >= 0 && idx < sa->nr_entries) {
 336			if (sa->entries[idx] == NULL)
 337				break;
 338			return scnprintf(bf, size, "%s", sa->entries[idx]);
 339		}
 340	}
 341
 342	return scnprintf(bf, size, "%d", arg->val);
 343}
 344
 345#ifndef AT_FDCWD
 346#define AT_FDCWD	-100
 347#endif
 348
 349static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 350					   struct syscall_arg *arg)
 351{
 352	int fd = arg->val;
 
 353
 354	if (fd == AT_FDCWD)
 355		return scnprintf(bf, size, "CWD");
 356
 357	return syscall_arg__scnprintf_fd(bf, size, arg);
 358}
 359
 360#define SCA_FDAT syscall_arg__scnprintf_fd_at
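/*
 * Example of the effect (sketch): for openat(AT_FDCWD, "/etc/passwd", ...)
 * the dirfd argument prints as "CWD" instead of the raw -100; any other
 * fd falls through to syscall_arg__scnprintf_fd() and may get the
 * resolved path appended, e.g. "3</etc/passwd>".
 */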
 361
 362static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 363					      struct syscall_arg *arg);
 364
 365#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 366
 367size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
 368{
 369	return scnprintf(bf, size, "%#lx", arg->val);
 370}
 371
 372size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 373{
 374	return scnprintf(bf, size, "%d", arg->val);
 375}
 376
 377size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
 378{
 379	return scnprintf(bf, size, "%ld", arg->val);
 380}
 381
 382static const char *bpf_cmd[] = {
 383	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
 384	"MAP_GET_NEXT_KEY", "PROG_LOAD",
 385};
 386static DEFINE_STRARRAY(bpf_cmd);
 387
 388static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 389static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
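/*
 * Note (sketch): the offset of 1 aligns the table with the EPOLL_CTL_*
 * uapi values, which start at EPOLL_CTL_ADD == 1, so an op of 2
 * (EPOLL_CTL_DEL) prints as "DEL".
 */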
 390
 391static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
 392static DEFINE_STRARRAY(itimers);
 393
 394static const char *keyctl_options[] = {
 395	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
 396	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
 397	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
 398	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
 399	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 400};
 401static DEFINE_STRARRAY(keyctl_options);
 402
 403static const char *whences[] = { "SET", "CUR", "END",
 404#ifdef SEEK_DATA
 405"DATA",
 406#endif
 407#ifdef SEEK_HOLE
 408"HOLE",
 409#endif
 410};
 411static DEFINE_STRARRAY(whences);
 412
 413static const char *fcntl_cmds[] = {
 414	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
 415	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
 416	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
 417	"GETOWNER_UIDS",
 418};
 419static DEFINE_STRARRAY(fcntl_cmds);
 420
 421static const char *fcntl_linux_specific_cmds[] = {
 422	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
 423	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
 424	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
 425};
 426
 427static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
 428
 429static struct strarray *fcntl_cmds_arrays[] = {
 430	&strarray__fcntl_cmds,
 431	&strarray__fcntl_linux_specific_cmds,
 432};
 433
 434static DEFINE_STRARRAYS(fcntl_cmds_arrays);
 435
 436static const char *rlimit_resources[] = {
 437	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 438	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
 439	"RTTIME",
 440};
 441static DEFINE_STRARRAY(rlimit_resources);
 442
 443static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
 444static DEFINE_STRARRAY(sighow);
 445
 446static const char *clockid[] = {
 447	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
 448	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
 449	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 450};
 451static DEFINE_STRARRAY(clockid);
 452
 453static const char *socket_families[] = {
 454	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
 455	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
 456	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
 457	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
 458	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
 459	"ALG", "NFC", "VSOCK",
 460};
 461static DEFINE_STRARRAY(socket_families);
 462
 463static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 464						 struct syscall_arg *arg)
 465{
 
 
 466	size_t printed = 0;
 467	int mode = arg->val;
 468
 469	if (mode == F_OK) /* 0 */
 470		return scnprintf(bf, size, "F");
 471#define	P_MODE(n) \
 472	if (mode & n##_OK) { \
 473		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
 474		mode &= ~n##_OK; \
 475	}
 476
 477	P_MODE(R);
 478	P_MODE(W);
 479	P_MODE(X);
 480#undef P_MODE
 481
 482	if (mode)
 483		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 484
 485	return printed;
 486}
 487
 488#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
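/*
 * Worked example (sketch): access("/tmp/f", R_OK | W_OK) formats the
 * mode as "RW"; any residual unknown bit, say 0x8, is appended by the
 * fallthrough above as "|0x8".
 */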
 489
 490static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 491					      struct syscall_arg *arg);
 492
 493#define SCA_FILENAME syscall_arg__scnprintf_filename
 494
 495static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 496						struct syscall_arg *arg)
 497{
 
 
 498	int printed = 0, flags = arg->val;
 499
 500#define	P_FLAG(n) \
 501	if (flags & O_##n) { \
 502		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 503		flags &= ~O_##n; \
 504	}
 505
 506	P_FLAG(CLOEXEC);
 507	P_FLAG(NONBLOCK);
 508#undef P_FLAG
 509
 510	if (flags)
 511		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 512
 513	return printed;
 514}
 515
 516#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 517
 518#ifndef GRND_NONBLOCK
 519#define GRND_NONBLOCK	0x0001
 520#endif
 521#ifndef GRND_RANDOM
 522#define GRND_RANDOM	0x0002
 523#endif
 524
 525static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 526						   struct syscall_arg *arg)
 527{
 
 
 528	int printed = 0, flags = arg->val;
 529
 530#define	P_FLAG(n) \
 531	if (flags & GRND_##n) { \
 532		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 533		flags &= ~GRND_##n; \
 534	}
 535
 536	P_FLAG(RANDOM);
 537	P_FLAG(NONBLOCK);
 538#undef P_FLAG
 539
 540	if (flags)
 541		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 542
 543	return printed;
 544}
 545
 546#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 547
 548#define STRARRAY(name, array) \
 549	  { .scnprintf	= SCA_STRARRAY, \
 550	    .parm	= &strarray__##array, }
 551
 552#include "trace/beauty/arch_errno_names.c"
 553#include "trace/beauty/eventfd.c"
 554#include "trace/beauty/futex_op.c"
 555#include "trace/beauty/futex_val3.c"
 556#include "trace/beauty/mmap.c"
 557#include "trace/beauty/mode_t.c"
 558#include "trace/beauty/msg_flags.c"
 559#include "trace/beauty/open_flags.c"
 560#include "trace/beauty/perf_event_open.c"
 561#include "trace/beauty/pid.c"
 562#include "trace/beauty/sched_policy.c"
 563#include "trace/beauty/seccomp.c"
 564#include "trace/beauty/signum.c"
 565#include "trace/beauty/socket_type.c"
 566#include "trace/beauty/waitid_options.c"
 567
 568struct syscall_arg_fmt {
 569	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 570	void	   *parm;
 571	const char *name;
 572	bool	   show_zero;
 573};
 574
 575static struct syscall_fmt {
 576	const char *name;
 577	const char *alias;
 578	struct syscall_arg_fmt arg[6];
 579	u8	   nr_args;
 580	bool	   errpid;
 581	bool	   timeout;
 582	bool	   hexret;
 583} syscall_fmts[] = {
 584	{ .name	    = "access",
 585	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
 586	{ .name	    = "bpf",
 587	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 588	{ .name	    = "brk",	    .hexret = true,
 589	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
 590	{ .name     = "clock_gettime",
 591	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
 
 
 592	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
 593	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
 594		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
 595		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
 596		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
 597		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
 598	{ .name	    = "close",
 599	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
 600	{ .name	    = "epoll_ctl",
 601	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
 602	{ .name	    = "eventfd2",
 603	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
 604	{ .name	    = "fchmodat",
 605	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 606	{ .name	    = "fchownat",
 607	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 608	{ .name	    = "fcntl",
 609	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
 
 610			   .parm      = &strarrays__fcntl_cmds_arrays,
 611			   .show_zero = true, },
 612		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
 613	{ .name	    = "flock",
 614	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
 615	{ .name	    = "fstat", .alias = "newfstat", },
 616	{ .name	    = "fstatat", .alias = "newfstatat", },
 617	{ .name	    = "futex",
 618	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
 619		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
 620	{ .name	    = "futimesat",
 621	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 622	{ .name	    = "getitimer",
 623	  .arg = { [0] = STRARRAY(which, itimers), }, },
 624	{ .name	    = "getpid",	    .errpid = true, },
 625	{ .name	    = "getpgid",    .errpid = true, },
 626	{ .name	    = "getppid",    .errpid = true, },
 627	{ .name	    = "getrandom",
 628	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
 629	{ .name	    = "getrlimit",
 630	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 
 
 631	{ .name	    = "gettid",	    .errpid = true, },
 632	{ .name	    = "ioctl",
 633	  .arg = {
 634#if defined(__i386__) || defined(__x86_64__)
 635/*
 636 * FIXME: Make this available to all arches.
 637 */
 638		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
 639		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 640#else
 641		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 642#endif
 643	{ .name	    = "kcmp",	    .nr_args = 5,
 644	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
 645		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
 646		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
 647		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
 648		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
 649	{ .name	    = "keyctl",
 650	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
 651	{ .name	    = "kill",
 652	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 653	{ .name	    = "linkat",
 654	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 655	{ .name	    = "lseek",
 656	  .arg = { [2] = STRARRAY(whence, whences), }, },
 657	{ .name	    = "lstat", .alias = "newlstat", },
 658	{ .name     = "madvise",
 659	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
 660		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
 661	{ .name	    = "mkdirat",
 662	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 663	{ .name	    = "mknodat",
 664	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
 665	{ .name	    = "mlock",
 666	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 667	{ .name	    = "mlockall",
 668	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 669	{ .name	    = "mmap",	    .hexret = true,
 670/* The standard mmap maps to old_mmap on s390x */
 671#if defined(__s390x__)
 672	.alias = "old_mmap",
 673#endif
 674	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
 675		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
 676		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
 677	{ .name	    = "mprotect",
 678	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
 679		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
 680	{ .name	    = "mq_unlink",
 681	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
 682	{ .name	    = "mremap",	    .hexret = true,
 683	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
 684		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
 685		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
 686	{ .name	    = "munlock",
 687	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 688	{ .name	    = "munmap",
 689	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 690	{ .name	    = "name_to_handle_at",
 691	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 692	{ .name	    = "newfstatat",
 693	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 694	{ .name	    = "open",
 695	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 696	{ .name	    = "open_by_handle_at",
 697	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
 698		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 699	{ .name	    = "openat",
 700	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
 701		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 702	{ .name	    = "perf_event_open",
 703	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
 
 704		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
 705		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
 706	{ .name	    = "pipe2",
 707	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
 708	{ .name	    = "pkey_alloc",
 709	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
 710	{ .name	    = "pkey_free",
 711	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
 712	{ .name	    = "pkey_mprotect",
 713	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
 714		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
 715		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
 716	{ .name	    = "poll", .timeout = true, },
 717	{ .name	    = "ppoll", .timeout = true, },
 718	{ .name	    = "prctl", .alias = "arch_prctl",
 719	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
 
 
 720		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
 721		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
 722	{ .name	    = "pread", .alias = "pread64", },
 723	{ .name	    = "preadv", .alias = "pread", },
 724	{ .name	    = "prlimit64",
 725	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
 726	{ .name	    = "pwrite", .alias = "pwrite64", },
 727	{ .name	    = "readlinkat",
 728	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 729	{ .name	    = "recvfrom",
 730	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 731	{ .name	    = "recvmmsg",
 732	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 733	{ .name	    = "recvmsg",
 734	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 735	{ .name	    = "renameat",
 736	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 737	{ .name	    = "rt_sigaction",
 738	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 739	{ .name	    = "rt_sigprocmask",
 740	  .arg = { [0] = STRARRAY(how, sighow), }, },
 741	{ .name	    = "rt_sigqueueinfo",
 742	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 743	{ .name	    = "rt_tgsigqueueinfo",
 744	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 745	{ .name	    = "sched_setscheduler",
 746	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
 747	{ .name	    = "seccomp",
 748	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
 749		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
 750	{ .name	    = "select", .timeout = true, },
 
 751	{ .name	    = "sendmmsg",
 752	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 753	{ .name	    = "sendmsg",
 754	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 755	{ .name	    = "sendto",
 756	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 
 757	{ .name	    = "set_tid_address", .errpid = true, },
 758	{ .name	    = "setitimer",
 759	  .arg = { [0] = STRARRAY(which, itimers), }, },
 760	{ .name	    = "setrlimit",
 761	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
 
 
 762	{ .name	    = "socket",
 763	  .arg = { [0] = STRARRAY(family, socket_families),
 764		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
 
 765	{ .name	    = "socketpair",
 766	  .arg = { [0] = STRARRAY(family, socket_families),
 767		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
 
 768	{ .name	    = "stat", .alias = "newstat", },
 769	{ .name	    = "statx",
 770	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
 771		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
 772		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
 773	{ .name	    = "swapoff",
 774	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 775	{ .name	    = "swapon",
 776	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
 777	{ .name	    = "symlinkat",
 778	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 
 
 779	{ .name	    = "tgkill",
 780	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 781	{ .name	    = "tkill",
 782	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
 
 
 783	{ .name	    = "uname", .alias = "newuname", },
 784	{ .name	    = "unlinkat",
 785	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
 786	{ .name	    = "utimensat",
 787	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
 788	{ .name	    = "wait4",	    .errpid = true,
 789	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 790	{ .name	    = "waitid",	    .errpid = true,
 791	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 792};
 793
 794static int syscall_fmt__cmp(const void *name, const void *fmtp)
 795{
 796	const struct syscall_fmt *fmt = fmtp;
 797	return strcmp(name, fmt->name);
 798}
 799
 800static struct syscall_fmt *syscall_fmt__find(const char *name)
 801{
 802	const int nmemb = ARRAY_SIZE(syscall_fmts);
 803	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
 804}
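/*
 * Sketch: bsearch() requires syscall_fmts[] to stay sorted by ->name in
 * strcmp() order, which new entries in the table above must preserve.
 */
#if 0
	struct syscall_fmt *fmt = syscall_fmt__find("openat");
	/* fmt->arg[2].scnprintf is SCA_OPEN_FLAGS per the table above */
#endif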
 805
 806struct syscall {
 807	struct event_format *tp_format;
 808	int		    nr_args;
 809	struct format_field *args;
 810	const char	    *name;
 811	bool		    is_exit;
 812	struct syscall_fmt  *fmt;
 813	struct syscall_arg_fmt *arg_fmt;
 814};
 815
 816/*
 817 * We need to have this 'calculated' boolean because in some cases we
 818 * really don't know the duration of a syscall, for instance, when we
 819 * start a session and some threads are waiting for a syscall to finish,
 820 * say 'poll', in which case all we can do is to print "( ? )" for the
 821 * duration and for the start timestamp.
 822 */
 823static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
 824{
 825	double duration = (double)t / NSEC_PER_MSEC;
 826	size_t printed = fprintf(fp, "(");
 827
 828	if (!calculated)
 829		printed += fprintf(fp, "         ");
 830	else if (duration >= 1.0)
 831		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
 832	else if (duration >= 0.01)
 833		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
 834	else
 835		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
 836	return printed + fprintf(fp, "): ");
 837}
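/*
 * Example of the resulting column (sketch): a 2.5ms syscall prints as
 * "( 2.500 ms): " in red, a 0.05ms one in yellow, anything faster in
 * the normal color, and a not-yet-calculated duration as "(         ): ".
 */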
 838
 839/**
 840 * filename.ptr: The filename char pointer that will be vfs_getname'd
 841 * filename.entry_str_pos: Where to insert the string translated from
 842 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 843 * ret_scnprintf: syscall args may set this to a different syscall return
 844 *                formatter, for instance, fcntl may return fds, file flags, etc.
 845 */
 846struct thread_trace {
 847	u64		  entry_time;
 848	bool		  entry_pending;
 849	unsigned long	  nr_events;
 850	unsigned long	  pfmaj, pfmin;
 851	char		  *entry_str;
 852	double		  runtime_ms;
 853	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 854	struct {
 855		unsigned long ptr;
 856		short int     entry_str_pos;
 857		bool	      pending_open;
 858		unsigned int  namelen;
 859		char	      *name;
 860	} filename;
 861	struct {
 862		int	  max;
 863		char	  **table;
 864	} paths;
 865
 866	struct intlist *syscall_stats;
 867};
 868
 869static struct thread_trace *thread_trace__new(void)
 870{
 871	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
 872
 873	/* only touch ttrace if the allocation actually succeeded */
 874	if (ttrace) {
 875		ttrace->paths.max = -1;
 876		ttrace->syscall_stats = intlist__new(NULL);
 877	}
 878	return ttrace;
 879}
 880
 881static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
 882{
 883	struct thread_trace *ttrace;
 884
 885	if (thread == NULL)
 886		goto fail;
 887
 888	if (thread__priv(thread) == NULL)
 889		thread__set_priv(thread, thread_trace__new());
 890
 891	if (thread__priv(thread) == NULL)
 892		goto fail;
 893
 894	ttrace = thread__priv(thread);
 895	++ttrace->nr_events;
 896
 897	return ttrace;
 898fail:
 899	color_fprintf(fp, PERF_COLOR_RED,
 900		      "WARNING: not enough memory, dropping samples!\n");
 901	return NULL;
 902}
 903
 904
 905void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
 906				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
 907{
 908	struct thread_trace *ttrace = thread__priv(arg->thread);
 909
 910	ttrace->ret_scnprintf = ret_scnprintf;
 911}
 912
 913#define TRACE_PFMAJ		(1 << 0)
 914#define TRACE_PFMIN		(1 << 1)
 915
 916static const size_t trace__entry_str_size = 2048;
 917
 918static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
 919{
 920	struct thread_trace *ttrace = thread__priv(thread);
 
 921
 922	if (fd > ttrace->paths.max) {
 923		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
 924
 925		if (npath == NULL)
 926			return -1;
 927
 928		if (ttrace->paths.max != -1) {
 929			memset(npath + ttrace->paths.max + 1, 0,
 930			       (fd - ttrace->paths.max) * sizeof(char *));
 931		} else {
 932			memset(npath, 0, (fd + 1) * sizeof(char *));
 933		}
 934
 935		ttrace->paths.table = npath;
 936		ttrace->paths.max   = fd;
 937	}
 938
 939	ttrace->paths.table[fd] = strdup(pathname);
 
 940
 941	return ttrace->paths.table[fd] != NULL ? 0 : -1;
 942}
 943
 944static int thread__read_fd_path(struct thread *thread, int fd)
 945{
 946	char linkname[PATH_MAX], pathname[PATH_MAX];
 947	struct stat st;
 948	int ret;
 949
 950	if (thread->pid_ == thread->tid) {
 951		scnprintf(linkname, sizeof(linkname),
 952			  "/proc/%d/fd/%d", thread->pid_, fd);
 953	} else {
 954		scnprintf(linkname, sizeof(linkname),
 955			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
 
 956	}
 957
 958	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
 959		return -1;
 960
 961	ret = readlink(linkname, pathname, sizeof(pathname));
 962
 963	if (ret < 0 || ret > st.st_size)
 964		return -1;
 965
 966	pathname[ret] = '\0';
 967	return trace__set_fd_pathname(thread, fd, pathname);
 968}
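/*
 * Sketch of what the readlink() above resolves, assuming pid 1234 and
 * fd 3 on the thread group leader:
 *
 *   /proc/1234/fd/3 -> /var/log/messages
 *
 * The result is cached by trace__set_fd_pathname(), letting later fd
 * arguments be decorated as "3</var/log/messages>".
 */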
 969
 970static const char *thread__fd_path(struct thread *thread, int fd,
 971				   struct trace *trace)
 972{
 973	struct thread_trace *ttrace = thread__priv(thread);
 974
 975	if (ttrace == NULL)
 976		return NULL;
 977
 978	if (fd < 0)
 979		return NULL;
 980
 981	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
 982		if (!trace->live)
 983			return NULL;
 984		++trace->stats.proc_getname;
 985		if (thread__read_fd_path(thread, fd))
 986			return NULL;
 987	}
 988
 989	return ttrace->paths.table[fd];
 990}
 991
 992size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
 993{
 994	int fd = arg->val;
 995	size_t printed = scnprintf(bf, size, "%d", fd);
 996	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
 997
 998	if (path)
 999		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1000
1001	return printed;
1002}
1003
1004size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1005{
1006	size_t printed = scnprintf(bf, size, "%d", fd);
1007	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1008
1009	if (thread) {
1010		const char *path = thread__fd_path(thread, fd, trace);
1011
1012		if (path)
1013			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1014
1015		thread__put(thread);
1016	}
1017
1018	return printed;
1019}
1020
1021static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1022					      struct syscall_arg *arg)
1023{
1024	int fd = arg->val;
1025	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1026	struct thread_trace *ttrace = thread__priv(arg->thread);
1027
1028	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1029		zfree(&ttrace->paths.table[fd]);
1030
1031	return printed;
1032}
1033
1034static void thread__set_filename_pos(struct thread *thread, const char *bf,
1035				     unsigned long ptr)
1036{
1037	struct thread_trace *ttrace = thread__priv(thread);
1038
1039	ttrace->filename.ptr = ptr;
1040	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1041}
1042
1043static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1044					      struct syscall_arg *arg)
1045{
1046	unsigned long ptr = arg->val;
1047
 
 
 
1048	if (!arg->trace->vfs_getname)
1049		return scnprintf(bf, size, "%#x", ptr);
1050
1051	thread__set_filename_pos(arg->thread, bf, ptr);
1052	return 0;
1053}
1054
1055static bool trace__filter_duration(struct trace *trace, double t)
1056{
1057	return t < (trace->duration_filter * NSEC_PER_MSEC);
1058}
1059
1060static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061{
1062	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1063
1064	return fprintf(fp, "%10.3f ", ts);
1065}
1066
1067/*
1068 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1069 * using ttrace->entry_time for a thread that receives a sys_exit without
1070 * first having received a sys_enter ("poll" issued before tracing session
1071 * starts, or a sys_enter lost due to ring buffer overflow).
1072 */
1073static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1074{
1075	if (tstamp > 0)
1076		return __trace__fprintf_tstamp(trace, tstamp, fp);
1077
1078	return fprintf(fp, "         ? ");
1079}
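/*
 * Example (sketch): a poll() that was already in flight when the session
 * started has no entry_time, so its sys_exit line begins with
 * "         ? (         ): " instead of a timestamp and duration.
 */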
1080
1081static bool done = false;
1082static bool interrupted = false;
 
1083
1084static void sig_handler(int sig)
1085{
1086	done = true;
1087	interrupted = sig == SIGINT;
1088}
1089
1090static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1091					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1092{
1093	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1094	printed += fprintf_duration(duration, duration_calculated, fp);
1095
1096	if (trace->multiple_threads) {
1097		if (trace->show_comm)
1098			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1099		printed += fprintf(fp, "%d ", thread->tid);
1100	}
1101
1102	return printed;
1103}
1104
1105static int trace__process_event(struct trace *trace, struct machine *machine,
1106				union perf_event *event, struct perf_sample *sample)
1107{
1108	int ret = 0;
1109
1110	switch (event->header.type) {
1111	case PERF_RECORD_LOST:
1112		color_fprintf(trace->output, PERF_COLOR_RED,
1113			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1114		ret = machine__process_lost_event(machine, event, sample);
1115		break;
1116	default:
1117		ret = machine__process_event(machine, event, sample);
1118		break;
1119	}
1120
1121	return ret;
1122}
1123
1124static int trace__tool_process(struct perf_tool *tool,
1125			       union perf_event *event,
1126			       struct perf_sample *sample,
1127			       struct machine *machine)
1128{
1129	struct trace *trace = container_of(tool, struct trace, tool);
1130	return trace__process_event(trace, machine, event, sample);
1131}
1132
1133static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1134{
1135	struct machine *machine = vmachine;
1136
1137	if (machine->kptr_restrict_warned)
1138		return NULL;
1139
1140	if (symbol_conf.kptr_restrict) {
1141		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1142			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1143			   "Kernel samples will not be resolved.\n");
1144		machine->kptr_restrict_warned = true;
1145		return NULL;
1146	}
1147
1148	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1149}
1150
1151static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1152{
1153	int err = symbol__init(NULL);
1154
1155	if (err)
1156		return err;
1157
1158	trace->host = machine__new_host();
1159	if (trace->host == NULL)
1160		return -ENOMEM;
1161
 
 
1162	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1163	if (err < 0)
1164		goto out;
1165
1166	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1167					    evlist->threads, trace__tool_process, false,
1168					    trace->opts.proc_map_timeout, 1);
1169out:
1170	if (err)
1171		symbol__exit();
1172
1173	return err;
1174}
1175
1176static void trace__symbols__exit(struct trace *trace)
1177{
1178	machine__exit(trace->host);
1179	trace->host = NULL;
1180
1181	symbol__exit();
1182}
1183
1184static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1185{
1186	int idx;
1187
1188	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1189		nr_args = sc->fmt->nr_args;
1190
1191	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1192	if (sc->arg_fmt == NULL)
1193		return -1;
1194
1195	for (idx = 0; idx < nr_args; ++idx) {
1196		if (sc->fmt)
1197			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1198	}
1199
1200	sc->nr_args = nr_args;
1201	return 0;
1202}
1203
1204static int syscall__set_arg_fmts(struct syscall *sc)
1205{
1206	struct format_field *field;
1207	int idx = 0, len;
 
 
 
1208
1209	for (field = sc->args; field; field = field->next, ++idx) {
1210		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1211			continue;
1212
 
 
1213		if (strcmp(field->type, "const char *") == 0 &&
1214			 (strcmp(field->name, "filename") == 0 ||
1215			  strcmp(field->name, "path") == 0 ||
1216			  strcmp(field->name, "pathname") == 0))
1217			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1218		else if (field->flags & FIELD_IS_POINTER)
1219			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1220		else if (strcmp(field->type, "pid_t") == 0)
1221			sc->arg_fmt[idx].scnprintf = SCA_PID;
1222		else if (strcmp(field->type, "umode_t") == 0)
1223			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1224		else if ((strcmp(field->type, "int") == 0 ||
 
 
 
1225			  strcmp(field->type, "unsigned int") == 0 ||
1226			  strcmp(field->type, "long") == 0) &&
1227			 (len = strlen(field->name)) >= 2 &&
1228			 strcmp(field->name + len - 2, "fd") == 0) {
1229			/*
1230			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1231			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1232			 * 65 int
1233			 * 23 unsigned int
1234			 * 7 unsigned long
1235			 */
1236			sc->arg_fmt[idx].scnprintf = SCA_FD;
1237		}
1238	}
1239
1240	return 0;
1241}
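/*
 * Illustration of the heuristics above (sketch), for sys_enter_openat:
 *
 *   dfd      int, name ends in "fd"    -> SCA_FD (unless the fmt table
 *                                         already supplied SCA_FDAT)
 *   filename const char *, named so    -> SCA_FILENAME
 *   mode     umode_t                   -> SCA_MODE_T
 */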
1242
1243static int trace__read_syscall_info(struct trace *trace, int id)
1244{
1245	char tp_name[128];
1246	struct syscall *sc;
1247	const char *name = syscalltbl__name(trace->sctbl, id);
1248
1249	if (name == NULL)
1250		return -1;
1251
1252	if (id > trace->syscalls.max) {
1253		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1254
1255		if (nsyscalls == NULL)
1256			return -1;
1257
1258		if (trace->syscalls.max != -1) {
1259			memset(nsyscalls + trace->syscalls.max + 1, 0,
1260			       (id - trace->syscalls.max) * sizeof(*sc));
1261		} else {
1262			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1263		}
1264
1265		trace->syscalls.table = nsyscalls;
1266		trace->syscalls.max   = id;
 
1267	}
1268
1269	sc = trace->syscalls.table + id;
1270	sc->name = name;
1271
1272	sc->fmt  = syscall_fmt__find(sc->name);
1273
1274	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1275	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1276
1277	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1278		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1279		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1280	}
1281
1282	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1283		return -1;
1284
1285	if (IS_ERR(sc->tp_format))
1286		return -1;
 
1287
1288	sc->args = sc->tp_format->format.fields;
1289	/*
1290	 * Check and discard the first field, '__syscall_nr' ('nr' on older
1291	 * kernels), which carries the syscall number: it is redundant here,
1292	 * as we already looked the syscall up by id.
1293	 */
1294	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1295		sc->args = sc->args->next;
1296		--sc->nr_args;
1297	}
1298
1299	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
 
1300
1301	return syscall__set_arg_fmts(sc);
1302}
1303
1304static int trace__validate_ev_qualifier(struct trace *trace)
1305{
1306	int err = 0, i;
1307	size_t nr_allocated;
1308	struct str_node *pos;
 
1309
1310	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1311	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1312						 sizeof(trace->ev_qualifier_ids.entries[0]));
1313
1314	if (trace->ev_qualifier_ids.entries == NULL) {
1315		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1316		       trace->output);
1317		err = -EINVAL;
1318		goto out;
1319	}
1320
1321	nr_allocated = trace->ev_qualifier_ids.nr;
1322	i = 0;
1323
1324	strlist__for_each_entry(pos, trace->ev_qualifier) {
1325		const char *sc = pos->s;
1326		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1327
1328		if (id < 0) {
1329			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1330			if (id >= 0)
1331				goto matches;
1332
1333			if (err == 0) {
1334				fputs("Error:\tInvalid syscall ", trace->output);
1335				err = -EINVAL;
1336			} else {
1337				fputs(", ", trace->output);
1338			}
1339
1340			fputs(sc, trace->output);
 
1341		}
1342matches:
1343		trace->ev_qualifier_ids.entries[i++] = id;
1344		if (match_next == -1)
1345			continue;
1346
1347		while (1) {
1348			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1349			if (id < 0)
1350				break;
1351			if (nr_allocated == trace->ev_qualifier_ids.nr) {
1352				void *entries;
1353
1354				nr_allocated += 8;
1355				entries = realloc(trace->ev_qualifier_ids.entries,
1356						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1357				if (entries == NULL) {
1358					err = -ENOMEM;
1359					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1360					goto out_free;
1361				}
1362				trace->ev_qualifier_ids.entries = entries;
1363			}
1364			trace->ev_qualifier_ids.nr++;
1365			trace->ev_qualifier_ids.entries[i++] = id;
1366		}
1367	}
1368
1369	if (err < 0) {
1370		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1371		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1372out_free:
1373		zfree(&trace->ev_qualifier_ids.entries);
1374		trace->ev_qualifier_ids.nr = 0;
1375	}
1376out:
 
 
1377	return err;
1378}
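/*
 * Sketch of the expansion above: for 'perf trace -e open*' the exact
 * syscalltbl__id() lookup of "open*" fails, so the strglobmatch
 * first/next loop collects the ids of open, openat, open_by_handle_at,
 * etc., growing the entries array in chunks of 8 as needed.
 */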
1379
1380/*
1381 * args is to be interpreted as a series of longs but we need to handle
1382 * 8-byte unaligned accesses. args points to raw_data within the event
1383 * and raw_data is guaranteed to be 8-byte unaligned because it is
1384 * preceded by raw_size which is a u32. So we need to copy args to a temp
1385 * variable to read it. Most notably this avoids extended load instructions
1386 * on unaligned addresses
1387 */
1388unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1389{
1390	unsigned long val;
1391	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1392
1393	memcpy(&val, p, sizeof(val));
1394	return val;
1395}
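/*
 * Sketch: fetching the third argument (idx == 2) copies sizeof(long)
 * bytes from args + 2 * sizeof(unsigned long); using memcpy() keeps
 * this safe on architectures that trap on unaligned loads.
 */
#if 0
	unsigned long fd = syscall_arg__val(&arg, 2);
#endif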
1396
1397static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1398				      struct syscall_arg *arg)
1399{
1400	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1401		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1402
1403	return scnprintf(bf, size, "arg%d: ", arg->idx);
1404}
1405
1406static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1407				     struct syscall_arg *arg, unsigned long val)
1408{
1409	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1410		arg->val = val;
1411		if (sc->arg_fmt[arg->idx].parm)
1412			arg->parm = sc->arg_fmt[arg->idx].parm;
1413		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1414	}
1415	return scnprintf(bf, size, "%ld", val);
1416}
1417
1418static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1419				      unsigned char *args, struct trace *trace,
1420				      struct thread *thread)
1421{
1422	size_t printed = 0;
1423	unsigned long val;
1424	u8 bit = 1;
1425	struct syscall_arg arg = {
1426		.args	= args,
1427		.idx	= 0,
1428		.mask	= 0,
1429		.trace  = trace,
1430		.thread = thread,
 
1431	};
1432	struct thread_trace *ttrace = thread__priv(thread);
1433
1434	/*
1435	 * Things like fcntl will set this in its 'cmd' formatter to pick the
1436	 * right formatter for the return value (an fd? file flags?), which is
1437	 * not needed for syscalls that always return a given type, say an fd.
1438	 */
1439	ttrace->ret_scnprintf = NULL;
1440
1441	if (sc->args != NULL) {
1442		struct format_field *field;
1443
1444		for (field = sc->args; field;
1445		     field = field->next, ++arg.idx, bit <<= 1) {
1446			if (arg.mask & bit)
1447				continue;
1448
 
1449			val = syscall_arg__val(&arg, arg.idx);
1450
1451			/*
1452			 * Suppress this argument if its value is zero and
1453			 * we don't have a string associated with it in a
1454			 * strarray.
1455			 */
1456			if (val == 0 &&
 
1457			    !(sc->arg_fmt &&
1458			      (sc->arg_fmt[arg.idx].show_zero ||
1459			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1460			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1461			      sc->arg_fmt[arg.idx].parm))
1462				continue;
1463
1464			printed += scnprintf(bf + printed, size - printed,
1465					     "%s%s: ", printed ? ", " : "", field->name);
1466			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1467		}
1468	} else if (IS_ERR(sc->tp_format)) {
1469		/*
1470		 * If we managed to read the tracepoint /format file, then we
1471		 * may end up not having any args, like with gettid(), so only
1472		 * print the raw args when we didn't manage to read it.
1473		 */
1474		while (arg.idx < sc->nr_args) {
1475			if (arg.mask & bit)
1476				goto next_arg;
1477			val = syscall_arg__val(&arg, arg.idx);
1478			if (printed)
1479				printed += scnprintf(bf + printed, size - printed, ", ");
1480			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1481			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1482next_arg:
1483			++arg.idx;
1484			bit <<= 1;
1485		}
1486	}
1487
1488	return printed;
1489}
1490
1491typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1492				  union perf_event *event,
1493				  struct perf_sample *sample);
1494
1495static struct syscall *trace__syscall_info(struct trace *trace,
1496					   struct perf_evsel *evsel, int id)
1497{
 
1498
1499	if (id < 0) {
1500
1501		/*
1502		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1503		 * before that, leaving at a higher verbosity level till that is
1504		 * explained. Reproduced with plain ftrace with:
1505		 *
1506		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1507		 * grep "NR -1 " /t/trace_pipe
1508		 *
1509		 * After generating some load on the machine.
1510 		 */
1511		if (verbose > 1) {
1512			static u64 n;
1513			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1514				id, perf_evsel__name(evsel), ++n);
1515		}
1516		return NULL;
1517	}
1518
1519	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1520	    trace__read_syscall_info(trace, id))
1521		goto out_cant_read;
1522
1523	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1524		goto out_cant_read;
1525
1526	return &trace->syscalls.table[id];
1527
1528out_cant_read:
1529	if (verbose > 0) {
1530		fprintf(trace->output, "Problems reading syscall %d", id);
1531		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
 
1532			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1533		fputs(" information\n", trace->output);
1534	}
1535	return NULL;
1536}
1537
1538static void thread__update_stats(struct thread_trace *ttrace,
1539				 int id, struct perf_sample *sample)
1540{
1541	struct int_node *inode;
1542	struct stats *stats;
1543	u64 duration = 0;
1544
1545	inode = intlist__findnew(ttrace->syscall_stats, id);
1546	if (inode == NULL)
1547		return;
1548
1549	stats = inode->priv;
1550	if (stats == NULL) {
1551		stats = malloc(sizeof(struct stats));
1552		if (stats == NULL)
1553			return;
1554		init_stats(stats);
 
1555		inode->priv = stats;
1556	}
1557
1558	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1559		duration = sample->time - ttrace->entry_time;
1560
1561	update_stats(stats, duration);
1562}
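/*
 * Sketch: each thread keeps an intlist keyed by syscall id; the first
 * exit of a given syscall allocates its stats node, and every later
 * exit feeds (sample->time - entry_time) into update_stats() for the
 * -s/--summary output.
 */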
1563
1564static int trace__printf_interrupted_entry(struct trace *trace)
1565{
1566	struct thread_trace *ttrace;
1567	size_t printed;
 
1568
1569	if (trace->failure_only || trace->current == NULL)
1570		return 0;
1571
1572	ttrace = thread__priv(trace->current);
1573
1574	if (!ttrace->entry_pending)
1575		return 0;
1576
1577	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1578	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1579	ttrace->entry_pending = false;
 
1580
1581	return printed;
1582}
1583
1584static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1585				 struct perf_sample *sample, struct thread *thread)
1586{
1587	int printed = 0;
1588
1589	if (trace->print_sample) {
1590		double ts = (double)sample->time / NSEC_PER_MSEC;
1591
1592		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1593				   perf_evsel__name(evsel), ts,
1594				   thread__comm_str(thread),
1595				   sample->pid, sample->tid, sample->cpu);
1596	}
1597
1598	return printed;
1599}
1600
1601static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1602			    union perf_event *event __maybe_unused,
1603			    struct perf_sample *sample)
1604{
1605	char *msg;
1606	void *args;
1607	size_t printed = 0;
1608	struct thread *thread;
1609	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
 
 
1610	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1611	struct thread_trace *ttrace;
1612
1613	if (sc == NULL)
1614		return -1;
1615
1616	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1617	ttrace = thread__trace(thread, trace->output);
1618	if (ttrace == NULL)
1619		goto out_put;
1620
1621	trace__fprintf_sample(trace, evsel, sample, thread);
1622
1623	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1624
1625	if (ttrace->entry_str == NULL) {
1626		ttrace->entry_str = malloc(trace__entry_str_size);
1627		if (!ttrace->entry_str)
1628			goto out_put;
1629	}
1630
1631	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1632		trace__printf_interrupted_entry(trace);
1633
1634	ttrace->entry_time = sample->time;
1635	msg = ttrace->entry_str;
1636	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1637
1638	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1639					   args, trace, thread);
1640
1641	if (sc->is_exit) {
1642		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
 
 
1643			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1644			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
 
 
 
1645		}
1646	} else {
1647		ttrace->entry_pending = true;
1648		/* See trace__vfs_getname & trace__sys_exit */
1649		ttrace->filename.pending_open = false;
1650	}
1651
1652	if (trace->current != thread) {
1653		thread__put(trace->current);
1654		trace->current = thread__get(thread);
1655	}
1656	err = 0;
1657out_put:
1658	thread__put(thread);
1659	return err;
1660}
1661
1662static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1663				    struct perf_sample *sample,
1664				    struct callchain_cursor *cursor)
1665{
1666	struct addr_location al;
1667	int max_stack = evsel->attr.sample_max_stack ?
1668			evsel->attr.sample_max_stack :
1669			trace->max_stack;
 
1670
1671	if (machine__resolve(trace->host, &al, sample) < 0 ||
1672	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1673		return -1;
1674
1675	return 0;
 
 
 
1676}
1677
1678static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1679{
1680	/* TODO: user-configurable print_opts */
1681	const unsigned int print_opts = EVSEL__PRINT_SYM |
1682				        EVSEL__PRINT_DSO |
1683				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1684
1685	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1686}
1687
1688static const char *errno_to_name(struct perf_evsel *evsel, int err)
1689{
1690	struct perf_env *env = perf_evsel__env(evsel);
1691	const char *arch_name = perf_env__arch(env);
1692
1693	return arch_syscalls__strerrno(arch_name, err);
1694}
1695
1696static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1697			   union perf_event *event __maybe_unused,
1698			   struct perf_sample *sample)
1699{
1700	long ret;
1701	u64 duration = 0;
1702	bool duration_calculated = false;
1703	struct thread *thread;
1704	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
 
1705	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1706	struct thread_trace *ttrace;
1707
1708	if (sc == NULL)
1709		return -1;
1710
1711	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1712	ttrace = thread__trace(thread, trace->output);
1713	if (ttrace == NULL)
1714		goto out_put;
1715
1716	trace__fprintf_sample(trace, evsel, sample, thread);
1717
 
 
1718	if (trace->summary)
1719		thread__update_stats(ttrace, id, sample);
1720
1721	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1722
1723	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1724		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1725		ttrace->filename.pending_open = false;
1726		++trace->stats.vfs_getname;
1727	}
1728
1729	if (ttrace->entry_time) {
1730		duration = sample->time - ttrace->entry_time;
1731		if (trace__filter_duration(trace, duration))
1732			goto out;
1733		duration_calculated = true;
1734	} else if (trace->duration_filter)
1735		goto out;
1736
1737	if (sample->callchain) {
1738		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1739		if (callchain_ret == 0) {
1740			if (callchain_cursor.nr < trace->min_stack)
1741				goto out;
1742			callchain_ret = 1;
1743		}
1744	}
1745
1746	if (trace->summary_only || (ret >= 0 && trace->failure_only))
1747		goto out;
1748
1749	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1750
1751	if (ttrace->entry_pending) {
1752		fprintf(trace->output, "%-70s", ttrace->entry_str);
1753	} else {
1754		fprintf(trace->output, " ... [");
1755		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1756		fprintf(trace->output, "]: %s()", sc->name);
1757	}
1758
1759	if (sc->fmt == NULL) {
1760		if (ret < 0)
1761			goto errno_print;
1762signed_print:
1763		fprintf(trace->output, ") = %ld", ret);
1764	} else if (ret < 0) {
1765errno_print: {
1766		char bf[STRERR_BUFSIZE];
1767		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1768			   *e = errno_to_name(evsel, -ret);
1769
1770		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1771	}
1772	} else if (ret == 0 && sc->fmt->timeout)
1773		fprintf(trace->output, ") = 0 Timeout");
1774	else if (ttrace->ret_scnprintf) {
1775		char bf[1024];
1776		struct syscall_arg arg = {
1777			.val	= ret,
1778			.thread	= thread,
1779			.trace	= trace,
1780		};
1781		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1782		ttrace->ret_scnprintf = NULL;
1783		fprintf(trace->output, ") = %s", bf);
1784	} else if (sc->fmt->hexret)
1785		fprintf(trace->output, ") = %#lx", ret);
1786	else if (sc->fmt->errpid) {
1787		struct thread *child = machine__find_thread(trace->host, ret, ret);
1788
1789		if (child != NULL) {
1790			fprintf(trace->output, ") = %ld", ret);
1791			if (child->comm_set)
1792				fprintf(trace->output, " (%s)", thread__comm_str(child));
1793			thread__put(child);
1794		}
1795	} else
1796		goto signed_print;
1797
1798	fputc('\n', trace->output);
1799
1800	if (callchain_ret > 0)
1801		trace__fprintf_callchain(trace, sample);
1802	else if (callchain_ret < 0)
1803		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1804out:
1805	ttrace->entry_pending = false;
1806	err = 0;
1807out_put:
1808	thread__put(thread);
1809	return err;
1810}
1811
1812static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1813			      union perf_event *event __maybe_unused,
1814			      struct perf_sample *sample)
1815{
1816	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1817	struct thread_trace *ttrace;
1818	size_t filename_len, entry_str_len, to_move;
1819	ssize_t remaining_space;
1820	char *pos;
1821	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1822
1823	if (!thread)
1824		goto out;
1825
1826	ttrace = thread__priv(thread);
1827	if (!ttrace)
1828		goto out_put;
1829
1830	filename_len = strlen(filename);
1831	if (filename_len == 0)
1832		goto out_put;
1833
1834	if (ttrace->filename.namelen < filename_len) {
1835		char *f = realloc(ttrace->filename.name, filename_len + 1);
1836
1837		if (f == NULL)
1838			goto out_put;
1839
1840		ttrace->filename.namelen = filename_len;
1841		ttrace->filename.name = f;
1842	}
1843
1844	strcpy(ttrace->filename.name, filename);
1845	ttrace->filename.pending_open = true;
1846
1847	if (!ttrace->filename.ptr)
1848		goto out_put;
1849
1850	entry_str_len = strlen(ttrace->entry_str);
1851	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1852	if (remaining_space <= 0)
1853		goto out_put;
1854
1855	if (filename_len > (size_t)remaining_space) {
1856		filename += filename_len - remaining_space;
1857		filename_len = remaining_space;
1858	}
1859
1860	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1861	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1862	memmove(pos + filename_len, pos, to_move);
1863	memcpy(pos, filename, filename_len);
1864
1865	ttrace->filename.ptr = 0;
1866	ttrace->filename.entry_str_pos = 0;
1867out_put:
1868	thread__put(thread);
1869out:
1870	return 0;
1871}
1872
1873static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1874				     union perf_event *event __maybe_unused,
1875				     struct perf_sample *sample)
1876{
1877	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1878	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1879	struct thread *thread = machine__findnew_thread(trace->host,
1880							sample->pid,
1881							sample->tid);
1882	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1883
1884	if (ttrace == NULL)
1885		goto out_dump;
1886
1887	ttrace->runtime_ms += runtime_ms;
1888	trace->runtime_ms += runtime_ms;
1889out_put:
1890	thread__put(thread);
1891	return 0;
1892
1893out_dump:
1894	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1895	       evsel->name,
1896	       perf_evsel__strval(evsel, sample, "comm"),
1897	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1898	       runtime,
1899	       perf_evsel__intval(evsel, sample, "vruntime"));
1900	goto out_put;
1901}
1902
1903static int bpf_output__printer(enum binary_printer_ops op,
1904			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1905{
1906	unsigned char ch = (unsigned char)val;
1907
1908	switch (op) {
1909	case BINARY_PRINT_CHAR_DATA:
1910		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911	case BINARY_PRINT_DATA_BEGIN:
1912	case BINARY_PRINT_LINE_BEGIN:
1913	case BINARY_PRINT_ADDR:
1914	case BINARY_PRINT_NUM_DATA:
1915	case BINARY_PRINT_NUM_PAD:
1916	case BINARY_PRINT_SEP:
1917	case BINARY_PRINT_CHAR_PAD:
1918	case BINARY_PRINT_LINE_END:
1919	case BINARY_PRINT_DATA_END:
1920	default:
1921		break;
1922	}
1923
1924	return 0;
1925}
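/*
 * A small worked example, not from the sources: for raw_data bytes
 * { 'h', 'i', 0x00, 0x7f } the callback above makes binary__fprintf()
 * emit "hi.." -- only BINARY_PRINT_CHAR_DATA produces output, and every
 * non-printable byte is rendered as a '.'.
 */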
1926
1927static void bpf_output__fprintf(struct trace *trace,
1928				struct perf_sample *sample)
1929{
1930	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1931			bpf_output__printer, NULL, trace->output);
1932}
1933
1934static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1935				union perf_event *event __maybe_unused,
1936				struct perf_sample *sample)
1937{
1938	int callchain_ret = 0;
1939
1940	if (sample->callchain) {
1941		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1942		if (callchain_ret == 0) {
1943			if (callchain_cursor.nr < trace->min_stack)
1944				goto out;
1945			callchain_ret = 1;
1946		}
1947	}
1948
1949	trace__printf_interrupted_entry(trace);
1950	trace__fprintf_tstamp(trace, sample->time, trace->output);
1951
1952	if (trace->trace_syscalls)
1953		fprintf(trace->output, "(         ): ");
1954
1955	fprintf(trace->output, "%s:", evsel->name);
1956
1957	if (perf_evsel__is_bpf_output(evsel)) {
1958		bpf_output__fprintf(trace, sample);
1959	} else if (evsel->tp_format) {
1960		event_format__fprintf(evsel->tp_format, sample->cpu,
1961				      sample->raw_data, sample->raw_size,
1962				      trace->output);
1963	}
1964
1965	fprintf(trace->output, "\n");
1966
1967	if (callchain_ret > 0)
1968		trace__fprintf_callchain(trace, sample);
1969	else if (callchain_ret < 0)
1970		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1971out:
1972	return 0;
1973}
1974
1975static void print_location(FILE *f, struct perf_sample *sample,
1976			   struct addr_location *al,
1977			   bool print_dso, bool print_sym)
1978{
1979
1980	if ((verbose > 0 || print_dso) && al->map)
1981		fprintf(f, "%s@", al->map->dso->long_name);
1982
1983	if ((verbose > 0 || print_sym) && al->sym)
1984		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985			al->addr - al->sym->start);
1986	else if (al->map)
1987		fprintf(f, "0x%" PRIx64, al->addr);
1988	else
1989		fprintf(f, "0x%" PRIx64, sample->addr);
1990}
1991
1992static int trace__pgfault(struct trace *trace,
1993			  struct perf_evsel *evsel,
1994			  union perf_event *event __maybe_unused,
1995			  struct perf_sample *sample)
1996{
1997	struct thread *thread;
1998	struct addr_location al;
1999	char map_type = 'd';
2000	struct thread_trace *ttrace;
2001	int err = -1;
2002	int callchain_ret = 0;
2003
2004	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2005
2006	if (sample->callchain) {
2007		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2008		if (callchain_ret == 0) {
2009			if (callchain_cursor.nr < trace->min_stack)
2010				goto out_put;
2011			callchain_ret = 1;
2012		}
2013	}
2014
2015	ttrace = thread__trace(thread, trace->output);
2016	if (ttrace == NULL)
2017		goto out_put;
2018
2019	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2020		ttrace->pfmaj++;
2021	else
2022		ttrace->pfmin++;
2023
2024	if (trace->summary_only)
2025		goto out;
2026
2027	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2028			      sample->ip, &al);
2029
2030	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2031
2032	fprintf(trace->output, "%sfault [",
2033		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2034		"maj" : "min");
2035
2036	print_location(trace->output, sample, &al, false, true);
2037
2038	fprintf(trace->output, "] => ");
2039
2040	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2041				   sample->addr, &al);
2042
2043	if (!al.map) {
2044		thread__find_addr_location(thread, sample->cpumode,
2045					   MAP__FUNCTION, sample->addr, &al);
2046
2047		if (al.map)
2048			map_type = 'x';
2049		else
2050			map_type = '?';
2051	}
2052
2053	print_location(trace->output, sample, &al, true, false);
2054
2055	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2056
2057	if (callchain_ret > 0)
2058		trace__fprintf_callchain(trace, sample);
2059	else if (callchain_ret < 0)
2060		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2061out:
2062	err = 0;
2063out_put:
2064	thread__put(thread);
2065	return err;
2066}
2067
2068static void trace__set_base_time(struct trace *trace,
2069				 struct perf_evsel *evsel,
2070				 struct perf_sample *sample)
2071{
2072	/*
2073	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2074	 * and don't use sample->time unconditionally; we may end up having
2075	 * some other event in the future without PERF_SAMPLE_TIME for good
2076	 * reason, i.e. we may not be interested in its timestamps, just in
2077	 * it taking place, picking some piece of information when it
2078	 * appears in our event stream (vfs_getname comes to mind).
2079	 */
2080	if (trace->base_time == 0 && !trace->full_time &&
2081	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2082		trace->base_time = sample->time;
2083}
2084
2085static int trace__process_sample(struct perf_tool *tool,
2086				 union perf_event *event,
2087				 struct perf_sample *sample,
2088				 struct perf_evsel *evsel,
2089				 struct machine *machine __maybe_unused)
2090{
2091	struct trace *trace = container_of(tool, struct trace, tool);
2092	struct thread *thread;
2093	int err = 0;
2094
2095	tracepoint_handler handler = evsel->handler;
2096
2097	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098	if (thread && thread__is_filtered(thread))
2099		goto out;
2100
2101	trace__set_base_time(trace, evsel, sample);
2102
2103	if (handler) {
2104		++trace->nr_events;
2105		handler(trace, evsel, event, sample);
2106	}
2107out:
2108	thread__put(thread);
2109	return err;
2110}
2111
2112static int trace__record(struct trace *trace, int argc, const char **argv)
2113{
2114	unsigned int rec_argc, i, j;
2115	const char **rec_argv;
2116	const char * const record_args[] = {
2117		"record",
2118		"-R",
2119		"-m", "1024",
2120		"-c", "1",
2121	};
2122
2123	const char * const sc_args[] = { "-e", };
2124	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2125	const char * const majpf_args[] = { "-e", "major-faults" };
2126	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2127	const char * const minpf_args[] = { "-e", "minor-faults" };
2128	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2129
2130	/* +1 is for the event string below */
2131	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2132		majpf_args_nr + minpf_args_nr + argc;
2133	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2134
2135	if (rec_argv == NULL)
2136		return -ENOMEM;
2137
2138	j = 0;
2139	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2140		rec_argv[j++] = record_args[i];
2141
2142	if (trace->trace_syscalls) {
2143		for (i = 0; i < sc_args_nr; i++)
2144			rec_argv[j++] = sc_args[i];
2145
2146		/* event string may be different for older kernels - e.g., RHEL6 */
2147		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2148			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2149		else if (is_valid_tracepoint("syscalls:sys_enter"))
2150			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2151		else {
2152			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2153			free(rec_argv);
2154			return -1;
2155		}
2156	}
2157
2158	if (trace->trace_pgfaults & TRACE_PFMAJ)
2159		for (i = 0; i < majpf_args_nr; i++)
2160			rec_argv[j++] = majpf_args[i];
2161
2162	if (trace->trace_pgfaults & TRACE_PFMIN)
2163		for (i = 0; i < minpf_args_nr; i++)
2164			rec_argv[j++] = minpf_args[i];
2165
2166	for (i = 0; i < (unsigned int)argc; i++)
2167		rec_argv[j++] = argv[i];
2168
2169	return cmd_record(j, rec_argv);
2170}
2171
2172static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2173
2174static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2175{
2176	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2177
2178	if (IS_ERR(evsel))
2179		return false;
2180
2181	if (perf_evsel__field(evsel, "pathname") == NULL) {
2182		perf_evsel__delete(evsel);
2183		return false;
2184	}
2185
2186	evsel->handler = trace__vfs_getname;
2187	perf_evlist__add(evlist, evsel);
2188	return true;
2189}
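/*
 * Note: this only succeeds when a "probe:vfs_getname" probe with a
 * "pathname" string argument already exists, e.g. one previously set up
 * with something like (the exact line offset varies per kernel):
 *
 *   perf probe 'vfs_getname=getname_flags:72 pathname=result->name:string'
 */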
2190
2191static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2192{
2193	struct perf_evsel *evsel;
2194	struct perf_event_attr attr = {
2195		.type = PERF_TYPE_SOFTWARE,
2196		.mmap_data = 1,
2197	};
2198
2199	attr.config = config;
2200	attr.sample_period = 1;
2201
2202	event_attr_init(&attr);
2203
2204	evsel = perf_evsel__new(&attr);
2205	if (evsel)
2206		evsel->handler = trace__pgfault;
2207
2208	return evsel;
2209}
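/*
 * trace__run() below instantiates this twice, once per fault kind:
 * perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ) and
 * perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN), both wired to
 * the trace__pgfault handler above.
 */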
2210
2211static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2212{
2213	const u32 type = event->header.type;
2214	struct perf_evsel *evsel;
2215
2216	if (type != PERF_RECORD_SAMPLE) {
2217		trace__process_event(trace, trace->host, event, sample);
2218		return;
2219	}
2220
2221	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2222	if (evsel == NULL) {
2223		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2224		return;
2225	}
2226
2227	trace__set_base_time(trace, evsel, sample);
2228
2229	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2230	    sample->raw_data == NULL) {
2231		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2232		       perf_evsel__name(evsel), sample->tid,
2233		       sample->cpu, sample->raw_size);
2234	} else {
2235		tracepoint_handler handler = evsel->handler;
2236		handler(trace, evsel, event, sample);
2237	}
2238}
2239
2240static int trace__add_syscall_newtp(struct trace *trace)
2241{
2242	int ret = -1;
2243	struct perf_evlist *evlist = trace->evlist;
2244	struct perf_evsel *sys_enter, *sys_exit;
2245
2246	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2247	if (sys_enter == NULL)
2248		goto out;
2249
2250	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2251		goto out_delete_sys_enter;
2252
2253	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2254	if (sys_exit == NULL)
2255		goto out_delete_sys_enter;
2256
2257	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2258		goto out_delete_sys_exit;
2259
2260	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2261	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2262
2263	perf_evlist__add(evlist, sys_enter);
2264	perf_evlist__add(evlist, sys_exit);
2265
2266	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2267		/*
2268		 * We're interested only in the user space callchain
2269		 * leading to the syscall, allow overriding that for
2270		 * debugging reasons using --kernel-syscall-graph
2271		 */
2272		sys_exit->attr.exclude_callchain_kernel = 1;
2273	}
2274
2275	trace->syscalls.events.sys_enter = sys_enter;
2276	trace->syscalls.events.sys_exit  = sys_exit;
2277
2278	ret = 0;
2279out:
2280	return ret;
2281
2282out_delete_sys_exit:
2283	perf_evsel__delete_priv(sys_exit);
2284out_delete_sys_enter:
2285	perf_evsel__delete_priv(sys_enter);
2286	goto out;
2287}
2288
2289static int trace__set_ev_qualifier_filter(struct trace *trace)
2290{
2291	int err = -1;
2292	struct perf_evsel *sys_exit;
2293	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2294						trace->ev_qualifier_ids.nr,
2295						trace->ev_qualifier_ids.entries);
2296
2297	if (filter == NULL)
2298		goto out_enomem;
2299
2300	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2301					  filter)) {
2302		sys_exit = trace->syscalls.events.sys_exit;
2303		err = perf_evsel__append_tp_filter(sys_exit, filter);
2304	}
2305
2306	free(filter);
2307out:
2308	return err;
2309out_enomem:
2310	errno = ENOMEM;
2311	goto out;
2312}
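/*
 * Sketch of the generated filter, assuming asprintf_expr_inout_ints()
 * behaves as its name suggests: "-e open,close" on x86_64 would yield
 * something like "id == 2 || id == 3", and with a '!' qualifier the
 * negated "id != 2 && id != 3", appended to both the sys_enter and
 * sys_exit tracepoint filters.
 */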
2313
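/*
 * Rationale, not spelled out in the code: tracing system wide while
 * writing to a terminal over ssh can feed back on itself -- our own
 * output generates events that generate more output -- so filter out
 * perf trace's own pid and, if one is found walking up the parents,
 * the sshd carrying the connection.
 */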
2314static int trace__set_filter_loop_pids(struct trace *trace)
2315{
2316	unsigned int nr = 1;
2317	pid_t pids[32] = {
2318		getpid(),
2319	};
2320	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2321
2322	while (thread && nr < ARRAY_SIZE(pids)) {
2323		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2324
2325		if (parent == NULL)
2326			break;
2327
2328		if (!strcmp(thread__comm_str(parent), "sshd")) {
2329			pids[nr++] = parent->tid;
2330			break;
2331		}
2332		thread = parent;
2333	}
2334
2335	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2336}
2337
2338static int trace__run(struct trace *trace, int argc, const char **argv)
2339{
2340	struct perf_evlist *evlist = trace->evlist;
2341	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2342	int err = -1, i;
2343	unsigned long before;
2344	const bool forks = argc > 0;
2345	bool draining = false;
2346
2347	trace->live = true;
2348
2349	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2350		goto out_error_raw_syscalls;
2351
2352	if (trace->trace_syscalls)
2353		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2354
2355	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2356		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2357		if (pgfault_maj == NULL)
2358			goto out_error_mem;
2359		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2360		perf_evlist__add(evlist, pgfault_maj);
2361	}
2362
2363	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2364		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2365		if (pgfault_min == NULL)
2366			goto out_error_mem;
2367		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2368		perf_evlist__add(evlist, pgfault_min);
2369	}
2370
2371	if (trace->sched &&
2372	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2373				   trace__sched_stat_runtime))
2374		goto out_error_sched_stat_runtime;
2375
2376	/*
2377	 * If a global cgroup was set, apply it to all the events without an
2378	 * explicit cgroup. I.e.:
2379	 *
2380	 * trace -G A -e sched:*switch
2381	 *
2382	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2383	 * _and_ sched:sched_switch to the 'A' cgroup, while:
2384	 *
2385	 * trace -e sched:*switch -G A
2386	 *
2387	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
2388	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
2389	 * a cgroup (on the root cgroup, sys wide, etc).
2390	 *
2391	 * Multiple cgroups:
2392	 *
2393	 * trace -G A -e sched:*switch -G B
2394	 *
2395	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2396	 * to the 'B' cgroup.
2397	 *
2398	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2399	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2400	 */
2401	if (trace->cgroup)
2402		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2403
2404	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2405	if (err < 0) {
2406		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2407		goto out_delete_evlist;
2408	}
2409
2410	err = trace__symbols_init(trace, evlist);
2411	if (err < 0) {
2412		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2413		goto out_delete_evlist;
2414	}
2415
2416	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2417
2418	signal(SIGCHLD, sig_handler);
2419	signal(SIGINT, sig_handler);
2420
2421	if (forks) {
2422		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2423						    argv, false, NULL);
2424		if (err < 0) {
2425			fprintf(trace->output, "Couldn't run the workload!\n");
2426			goto out_delete_evlist;
2427		}
2428	}
2429
2430	err = perf_evlist__open(evlist);
2431	if (err < 0)
2432		goto out_error_open;
2433
2434	err = bpf__apply_obj_config();
2435	if (err) {
2436		char errbuf[BUFSIZ];
2437
2438		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2439		pr_err("ERROR: Apply config to BPF failed: %s\n",
2440			 errbuf);
2441		goto out_error_open;
2442	}
2443
2444	/*
2445	 * Better not use !target__has_task() here because we need to cover the
2446	 * case where no threads were specified in the command line, but a
2447	 * workload was, and in that case we will fill in the thread_map when
2448	 * we fork the workload in perf_evlist__prepare_workload.
2449	 */
2450	if (trace->filter_pids.nr > 0)
2451		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2452	else if (thread_map__pid(evlist->threads, 0) == -1)
2453		err = trace__set_filter_loop_pids(trace);
2454
2455	if (err < 0)
2456		goto out_error_mem;
2457
2458	if (trace->ev_qualifier_ids.nr > 0) {
2459		err = trace__set_ev_qualifier_filter(trace);
2460		if (err < 0)
2461			goto out_errno;
2462
2463		pr_debug("event qualifier tracepoint filter: %s\n",
2464			 trace->syscalls.events.sys_exit->filter);
2465	}
2466
2467	err = perf_evlist__apply_filters(evlist, &evsel);
2468	if (err < 0)
2469		goto out_error_apply_filters;
2470
2471	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2472	if (err < 0)
2473		goto out_error_mmap;
2474
2475	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2476		perf_evlist__enable(evlist);
2477
2478	if (forks)
2479		perf_evlist__start_workload(evlist);
2480
2481	if (trace->opts.initial_delay) {
2482		usleep(trace->opts.initial_delay * 1000);
2483		perf_evlist__enable(evlist);
2484	}
2485
2486	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2487				  evlist->threads->nr > 1 ||
2488				  perf_evlist__first(evlist)->attr.inherit;
2489
2490	/*
2491	 * Now that we already used evsel->attr to ask the kernel to setup the
2492	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2493	 * trace__resolve_callchain(), allowing per-event max-stack settings
2494	 * to override an explicitly set --max-stack global setting.
2495	 */
2496	evlist__for_each_entry(evlist, evsel) {
2497		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2498		    evsel->attr.sample_max_stack == 0)
2499			evsel->attr.sample_max_stack = trace->max_stack;
2500	}
2501again:
2502	before = trace->nr_events;
2503
2504	for (i = 0; i < evlist->nr_mmaps; i++) {
2505		union perf_event *event;
2506		struct perf_mmap *md;
2507
2508		md = &evlist->mmap[i];
2509		if (perf_mmap__read_init(md) < 0)
2510			continue;
2511
2512		while ((event = perf_mmap__read_event(md)) != NULL) {
2513			struct perf_sample sample;
2514
2515			++trace->nr_events;
2516
2517			err = perf_evlist__parse_sample(evlist, event, &sample);
2518			if (err) {
2519				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2520				goto next_event;
2521			}
2522
2523			trace__handle_event(trace, event, &sample);
2524next_event:
2525			perf_mmap__consume(md);
2526
2527			if (interrupted)
2528				goto out_disable;
2529
2530			if (done && !draining) {
2531				perf_evlist__disable(evlist);
2532				draining = true;
2533			}
2534		}
2535		perf_mmap__read_done(md);
2536	}
2537
2538	if (trace->nr_events == before) {
2539		int timeout = done ? 100 : -1;
2540
2541		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2542			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2543				draining = true;
2544
2545			goto again;
2546		}
2547	} else {
2548		goto again;
2549	}
2550
2551out_disable:
2552	thread__zput(trace->current);
2553
2554	perf_evlist__disable(evlist);
2555
2556	if (!err) {
2557		if (trace->summary)
2558			trace__fprintf_thread_summary(trace, trace->output);
2559
2560		if (trace->show_tool_stats) {
2561			fprintf(trace->output, "Stats:\n "
2562					       " vfs_getname : %" PRIu64 "\n"
2563					       " proc_getname: %" PRIu64 "\n",
2564				trace->stats.vfs_getname,
2565				trace->stats.proc_getname);
2566		}
2567	}
2568
2569out_delete_evlist:
2570	trace__symbols__exit(trace);
2571
2572	perf_evlist__delete(evlist);
2573	cgroup__put(trace->cgroup);
2574	trace->evlist = NULL;
2575	trace->live = false;
2576	return err;
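/*
 * Only reachable via the error gotos above: this bare block exists just
 * to scope errbuf for the out_error* labels it contains.
 */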
2577{
2578	char errbuf[BUFSIZ];
2579
2580out_error_sched_stat_runtime:
2581	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2582	goto out_error;
2583
2584out_error_raw_syscalls:
2585	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2586	goto out_error;
2587
2588out_error_mmap:
2589	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2590	goto out_error;
2591
2592out_error_open:
2593	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2594
2595out_error:
2596	fprintf(trace->output, "%s\n", errbuf);
2597	goto out_delete_evlist;
2598
2599out_error_apply_filters:
2600	fprintf(trace->output,
2601		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2602		evsel->filter, perf_evsel__name(evsel), errno,
2603		str_error_r(errno, errbuf, sizeof(errbuf)));
2604	goto out_delete_evlist;
2605}
2606out_error_mem:
2607	fprintf(trace->output, "Not enough memory to run!\n");
2608	goto out_delete_evlist;
2609
2610out_errno:
2611	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2612	goto out_delete_evlist;
2613}
2614
2615static int trace__replay(struct trace *trace)
2616{
2617	const struct perf_evsel_str_handler handlers[] = {
2618		{ "probe:vfs_getname",	     trace__vfs_getname, },
2619	};
2620	struct perf_data data = {
2621		.file      = {
2622			.path = input_name,
2623		},
2624		.mode      = PERF_DATA_MODE_READ,
2625		.force     = trace->force,
2626	};
2627	struct perf_session *session;
2628	struct perf_evsel *evsel;
2629	int err = -1;
2630
2631	trace->tool.sample	  = trace__process_sample;
2632	trace->tool.mmap	  = perf_event__process_mmap;
2633	trace->tool.mmap2	  = perf_event__process_mmap2;
2634	trace->tool.comm	  = perf_event__process_comm;
2635	trace->tool.exit	  = perf_event__process_exit;
2636	trace->tool.fork	  = perf_event__process_fork;
2637	trace->tool.attr	  = perf_event__process_attr;
2638	trace->tool.tracing_data  = perf_event__process_tracing_data;
2639	trace->tool.build_id	  = perf_event__process_build_id;
2640	trace->tool.namespaces	  = perf_event__process_namespaces;
2641
2642	trace->tool.ordered_events = true;
2643	trace->tool.ordering_requires_timestamps = true;
2644
2645	/* add tid to output */
2646	trace->multiple_threads = true;
2647
2648	session = perf_session__new(&data, false, &trace->tool);
2649	if (session == NULL)
2650		return -1;
2651
2652	if (trace->opts.target.pid)
2653		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2654
2655	if (trace->opts.target.tid)
2656		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2657
2658	if (symbol__init(&session->header.env) < 0)
2659		goto out;
2660
2661	trace->host = &session->machines.host;
2662
2663	err = perf_session__set_tracepoints_handlers(session, handlers);
2664	if (err)
2665		goto out;
2666
2667	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2668						     "raw_syscalls:sys_enter");
2669	/* older kernels have syscalls tp versus raw_syscalls */
2670	if (evsel == NULL)
2671		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2672							     "syscalls:sys_enter");
2673
2674	if (evsel &&
2675	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2676	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2677		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
2678		goto out;
2679	}
2680
2681	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2682						     "raw_syscalls:sys_exit");
2683	if (evsel == NULL)
2684		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2685							     "syscalls:sys_exit");
2686	if (evsel &&
2687	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2688	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2689		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
2690		goto out;
2691	}
2692
2693	evlist__for_each_entry(session->evlist, evsel) {
2694		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2695		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2696		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2697		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2698			evsel->handler = trace__pgfault;
2699	}
2700
2701	setup_pager();
2702
2703	err = perf_session__process_events(session);
2704	if (err)
2705		pr_err("Failed to process events, error %d\n", err);
2706
2707	else if (trace->summary)
2708		trace__fprintf_thread_summary(trace, trace->output);
2709
2710out:
2711	perf_session__delete(session);
2712
2713	return err;
2714}
2715
2716static size_t trace__fprintf_threads_header(FILE *fp)
2717{
2718	size_t printed;
2719
2720	printed  = fprintf(fp, "\n Summary of events:\n\n");
2721
2722	return printed;
2723}
2724
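/*
 * DEFINE_RESORT_RB (rb_resort.h) declares helpers to re-sort an existing
 * rb tree -- here a thread's syscall_stats intlist -- into a new tree
 * ordered by the given comparison (msecs, descending), with the block
 * below filling in each re-sorted entry.
 */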
2725DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2726	struct stats 	*stats;
2727	double		msecs;
2728	int		syscall;
2729)
2730{
2731	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2732	struct stats *stats = source->priv;
2733
2734	entry->syscall = source->i;
2735	entry->stats   = stats;
2736	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2737}
2738
2739static size_t thread__dump_stats(struct thread_trace *ttrace,
2740				 struct trace *trace, FILE *fp)
2741{
2742	size_t printed = 0;
2743	struct syscall *sc;
2744	struct rb_node *nd;
2745	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2746
2747	if (syscall_stats == NULL)
2748		return 0;
2749
2750	printed += fprintf(fp, "\n");
2751
2752	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2753	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2754	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2755
2756	resort_rb__for_each_entry(nd, syscall_stats) {
2757		struct stats *stats = syscall_stats_entry->stats;
2758		if (stats) {
2759			double min = (double)(stats->min) / NSEC_PER_MSEC;
2760			double max = (double)(stats->max) / NSEC_PER_MSEC;
2761			double avg = avg_stats(stats);
2762			double pct;
2763			u64 n = (u64) stats->n;
2764
2765			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2766			avg /= NSEC_PER_MSEC;
2767
2768			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2769			printed += fprintf(fp, "   %-15s", sc->name);
2770			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2771					   n, syscall_stats_entry->msecs, min, avg);
2772			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2773		}
2774	}
2775
2776	resort_rb__delete(syscall_stats);
2777	printed += fprintf(fp, "\n\n");
2778
2779	return printed;
2780}
2781
2782static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2783{
2784	size_t printed = 0;
2785	struct thread_trace *ttrace = thread__priv(thread);
2786	double ratio;
2787
2788	if (ttrace == NULL)
2789		return 0;
2790
2791	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2792
2793	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2794	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2795	printed += fprintf(fp, "%.1f%%", ratio);
2796	if (ttrace->pfmaj)
2797		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2798	if (ttrace->pfmin)
2799		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2800	if (trace->sched)
2801		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2802	else if (fputc('\n', fp) != EOF)
2803		++printed;
2804
2805	printed += thread__dump_stats(ttrace, trace, fp);
2806
2807	return printed;
2808}
2809
2810static unsigned long thread__nr_events(struct thread_trace *ttrace)
2811{
2812	return ttrace ? ttrace->nr_events : 0;
2813}
2814
2815DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2816	struct thread *thread;
2817)
2818{
2819	entry->thread = rb_entry(nd, struct thread, rb_node);
2820}
2821
2822static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2823{
2824	size_t printed = trace__fprintf_threads_header(fp);
2825	struct rb_node *nd;
2826	int i;
2827
2828	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2829		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2830
2831		if (threads == NULL) {
2832			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2833			return 0;
2834		}
2835
2836		resort_rb__for_each_entry(nd, threads)
2837			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2838
2839		resort_rb__delete(threads);
2840	}
2841	return printed;
2842}
2843
2844static int trace__set_duration(const struct option *opt, const char *str,
2845			       int unset __maybe_unused)
2846{
2847	struct trace *trace = opt->value;
2848
2849	trace->duration_filter = atof(str);
2850	return 0;
2851}
2852
2853static int trace__set_filter_pids(const struct option *opt, const char *str,
2854				  int unset __maybe_unused)
2855{
2856	int ret = -1;
2857	size_t i;
2858	struct trace *trace = opt->value;
2859	/*
2860	 * FIXME: introduce a intarray class, plain parse csv and create a
2861	 * { int nr, int entries[] } struct...
2862	 */
2863	struct intlist *list = intlist__new(str);
2864
2865	if (list == NULL)
2866		return -1;
2867
2868	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2869	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2870
2871	if (trace->filter_pids.entries == NULL)
2872		goto out;
2873
2874	trace->filter_pids.entries[0] = getpid();
2875
2876	for (i = 1; i < trace->filter_pids.nr; ++i)
2877		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2878
2879	intlist__delete(list);
2880	ret = 0;
2881out:
2882	return ret;
2883}
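/*
 * entries[0] is always perf trace's own pid, so e.g. "--filter-pids 1,2"
 * really installs a three entry filter; trace__run() hands it to the
 * kernel via perf_evlist__set_filter_pids().
 */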
2884
2885static int trace__open_output(struct trace *trace, const char *filename)
2886{
2887	struct stat st;
2888
2889	if (!stat(filename, &st) && st.st_size) {
2890		char oldname[PATH_MAX];
2891
2892		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2893		unlink(oldname);
2894		rename(filename, oldname);
2895	}
2896
2897	trace->output = fopen(filename, "w");
2898
2899	return trace->output == NULL ? -errno : 0;
2900}
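/*
 * i.e. "perf trace -o trace.txt" first rotates a pre-existing non-empty
 * trace.txt to trace.txt.old, much like perf record does with perf.data.
 */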
2901
2902static int parse_pagefaults(const struct option *opt, const char *str,
2903			    int unset __maybe_unused)
2904{
2905	int *trace_pgfaults = opt->value;
2906
2907	if (strcmp(str, "all") == 0)
2908		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2909	else if (strcmp(str, "maj") == 0)
2910		*trace_pgfaults |= TRACE_PFMAJ;
2911	else if (strcmp(str, "min") == 0)
2912		*trace_pgfaults |= TRACE_PFMIN;
2913	else
2914		return -1;
2915
2916	return 0;
2917}
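/*
 * Wired to -F/--pf below via OPT_CALLBACK_DEFAULT, so "perf trace -F all"
 * sets TRACE_PFMAJ|TRACE_PFMIN while a bare "-F" falls back to the "maj"
 * default.
 */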
2918
2919static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2920{
2921	struct perf_evsel *evsel;
2922
2923	evlist__for_each_entry(evlist, evsel)
2924		evsel->handler = handler;
2925}
2926
2927/*
2928 * XXX: Hackish, just splitting the combined -e/--event list (syscalls
2929 * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc)) to use
2930 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2931 *
2932 * It'd be better to introduce a parse_options() variant that would return a
2933 * list with the terms it didn't match to an event...
2934 */
2935static int trace__parse_events_option(const struct option *opt, const char *str,
2936				      int unset __maybe_unused)
2937{
2938	struct trace *trace = (struct trace *)opt->value;
2939	const char *s = str;
2940	char *sep = NULL, *lists[2] = { NULL, NULL, };
2941	int len = strlen(str) + 1, err = -1, list, idx;
2942	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2943	char group_name[PATH_MAX];
2944
2945	if (strace_groups_dir == NULL)
2946		return -1;
2947
2948	if (*s == '!') {
2949		++s;
2950		trace->not_ev_qualifier = true;
2951	}
2952
2953	while (1) {
2954		if ((sep = strchr(s, ',')) != NULL)
2955			*sep = '\0';
2956
2957		list = 0;
2958		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2959		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2960			list = 1;
2961		} else {
2962			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2963			if (access(group_name, R_OK) == 0)
2964				list = 1;
2965		}
2966
2967		if (lists[list]) {
2968			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2969		} else {
2970			lists[list] = malloc(len);
2971			if (lists[list] == NULL)
2972				goto out;
2973			strcpy(lists[list], s);
2974		}
2975
2976		if (!sep)
2977			break;
2978
2979		*sep = ',';
2980		s = sep + 1;
2981	}
2982
2983	if (lists[1] != NULL) {
2984		struct strlist_config slist_config = {
2985			.dirname = strace_groups_dir,
2986		};
2987
2988		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2989		if (trace->ev_qualifier == NULL) {
2990			fputs("Not enough memory to parse event qualifier\n", trace->output);
2991			goto out;
2992		}
2993
2994		if (trace__validate_ev_qualifier(trace))
2995			goto out;
2996	}
2997
2998	err = 0;
2999
3000	if (lists[0]) {
3001		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3002					       "event selector. use 'perf list' to list available events",
3003					       parse_events_option);
3004		err = parse_events_option(&o, lists[0], 0);
3005	}
3006out:
3007	if (sep)
3008		*sep = ',';
3009
3010	return err;
3011}
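/*
 * Worked example of the split above, assuming an x86_64 syscall table:
 * for "-e openat,sched:sched_switch", "openat" lands in lists[1] (the
 * syscall qualifier, checked via trace__validate_ev_qualifier()) while
 * "sched:sched_switch" lands in lists[0] and is fed back to the stock
 * parse_events_option() as if --event had been used.
 */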
3012
3013static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3014{
3015	struct trace *trace = opt->value;
3016
3017	if (!list_empty(&trace->evlist->entries))
3018		return parse_cgroups(opt, str, unset);
3019
3020	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3021
3022	return 0;
3023}
3024
3025int cmd_trace(int argc, const char **argv)
3026{
3027	const char *trace_usage[] = {
3028		"perf trace [<options>] [<command>]",
3029		"perf trace [<options>] -- <command> [<options>]",
3030		"perf trace record [<options>] [<command>]",
3031		"perf trace record [<options>] -- <command> [<options>]",
3032		NULL
3033	};
3034	struct trace trace = {
3035		.syscalls = {
3036			.max = -1,
3037		},
3038		.opts = {
3039			.target = {
3040				.uid	   = UINT_MAX,
3041				.uses_mmap = true,
3042			},
3043			.user_freq     = UINT_MAX,
3044			.user_interval = ULLONG_MAX,
3045			.no_buffering  = true,
3046			.mmap_pages    = UINT_MAX,
3047			.proc_map_timeout  = 500,
3048		},
3049		.output = stderr,
3050		.show_comm = true,
3051		.trace_syscalls = true,
3052		.kernel_syscallchains = false,
3053		.max_stack = UINT_MAX,
3054	};
3055	const char *output_name = NULL;
3056	const struct option trace_options[] = {
3057	OPT_CALLBACK('e', "event", &trace, "event",
3058		     "event/syscall selector. use 'perf list' to list available events",
3059		     trace__parse_events_option),
3060	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3061		    "show the thread COMM next to its id"),
3062	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3063	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3064		     trace__parse_events_option),
3065	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3066	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3067	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3068		    "trace events on existing process id"),
3069	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3070		    "trace events on existing thread id"),
3071	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3072		     "pids to filter (by the kernel)", trace__set_filter_pids),
3073	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3074		    "system-wide collection from all CPUs"),
3075	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3076		    "list of cpus to monitor"),
3077	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3078		    "child tasks do not inherit counters"),
3079	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3080		     "number of mmap data pages",
3081		     perf_evlist__parse_mmap_pages),
3082	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3083		   "user to profile"),
3084	OPT_CALLBACK(0, "duration", &trace, "float",
3085		     "show only events with duration > N.M ms",
3086		     trace__set_duration),
3087	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3088	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3089	OPT_BOOLEAN('T', "time", &trace.full_time,
3090		    "Show full timestamp, not time relative to first start"),
3091	OPT_BOOLEAN(0, "failure", &trace.failure_only,
3092		    "Show only syscalls that failed"),
3093	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3094		    "Show only syscall summary with statistics"),
3095	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3096		    "Show all syscalls and summary with statistics"),
3097	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3098		     "Trace pagefaults", parse_pagefaults, "maj"),
3099	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3100	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3101	OPT_CALLBACK(0, "call-graph", &trace.opts,
3102		     "record_mode[,record_size]", record_callchain_help,
3103		     &record_parse_callchain_opt),
3104	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3105		    "Show the kernel callchains on the syscall exit path"),
3106	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3107		     "Set the minimum stack depth when parsing the callchain, "
3108		     "anything below the specified depth will be ignored."),
3109	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3110		     "Set the maximum stack depth when parsing the callchain, "
3111		     "anything beyond the specified depth will be ignored. "
3112		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3113	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3114			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3115	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3116			"per thread proc mmap processing timeout in ms"),
3117	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3118		     trace__parse_cgroups),
3119	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3120		     "ms to wait before starting measurement after program "
3121		     "start"),
3122	OPT_END()
3123	};
3124	bool __maybe_unused max_stack_user_set = true;
3125	bool mmap_pages_user_set = true;
3126	const char * const trace_subcommands[] = { "record", NULL };
3127	int err;
3128	char bf[BUFSIZ];
3129
3130	signal(SIGSEGV, sighandler_dump_stack);
3131	signal(SIGFPE, sighandler_dump_stack);
3132
3133	trace.evlist = perf_evlist__new();
3134	trace.sctbl = syscalltbl__new();
3135
3136	if (trace.evlist == NULL || trace.sctbl == NULL) {
3137		pr_err("Not enough memory to run!\n");
3138		err = -ENOMEM;
3139		goto out;
3140	}
3141
3142	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3143				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3144
3145	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3146		usage_with_options_msg(trace_usage, trace_options,
3147				       "cgroup monitoring only available in system-wide mode");
3148	}
3149
3150	err = bpf__setup_stdout(trace.evlist);
3151	if (err) {
3152		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3153		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3154		goto out;
3155	}
3156
3157	err = -1;
3158
3159	if (trace.trace_pgfaults) {
3160		trace.opts.sample_address = true;
3161		trace.opts.sample_time = true;
3162	}
3163
3164	if (trace.opts.mmap_pages == UINT_MAX)
3165		mmap_pages_user_set = false;
3166
3167	if (trace.max_stack == UINT_MAX) {
3168		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3169		max_stack_user_set = false;
3170	}
3171
3172#ifdef HAVE_DWARF_UNWIND_SUPPORT
3173	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3174		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3175	}
3176#endif
3177
3178	if (callchain_param.enabled) {
3179		if (!mmap_pages_user_set && geteuid() == 0)
3180			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3181
3182		symbol_conf.use_callchain = true;
3183	}
3184
3185	if (trace.evlist->nr_entries > 0)
3186		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3187
3188	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3189		return trace__record(&trace, argc-1, &argv[1]);
3190
3191	/* summary_only implies summary option, but don't overwrite summary if set */
3192	if (trace.summary_only)
3193		trace.summary = trace.summary_only;
3194
3195	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3196	    trace.evlist->nr_entries == 0 /* Was --event used? */) {
3197		pr_err("Please specify something to trace.\n");
3198		return -1;
3199	}
3200
3201	if (!trace.trace_syscalls && trace.ev_qualifier) {
3202		pr_err("The -e option can't be used with --no-syscalls.\n");
3203		goto out;
3204	}
3205
3206	if (output_name != NULL) {
3207		err = trace__open_output(&trace, output_name);
3208		if (err < 0) {
3209			perror("failed to create output file");
3210			goto out;
3211		}
3212	}
3213
3214	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3215
3216	err = target__validate(&trace.opts.target);
3217	if (err) {
3218		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3219		fprintf(trace.output, "%s", bf);
3220		goto out_close;
3221	}
3222
3223	err = target__parse_uid(&trace.opts.target);
3224	if (err) {
3225		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3226		fprintf(trace.output, "%s", bf);
3227		goto out_close;
3228	}
3229
3230	if (!argc && target__none(&trace.opts.target))
3231		trace.opts.target.system_wide = true;
3232
3233	if (input_name)
3234		err = trace__replay(&trace);
3235	else
3236		err = trace__run(&trace, argc, argv);
3237
3238out_close:
3239	if (output_name != NULL)
3240		fclose(trace.output);
3241out:
3242	return err;
3243}