Linux Audio

Check our new training course

Loading...
v4.17
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
  15 *
  16 * Released under the GPL v2. (and only v2, not any later version)
  17 */
  18
  19#include <traceevent/event-parse.h>
  20#include <api/fs/tracing_path.h>
 
 
 
 
 
 
 
 
 
 
  21#include "builtin.h"
  22#include "util/cgroup.h"
  23#include "util/color.h"
 
  24#include "util/debug.h"
 
  25#include "util/env.h"
  26#include "util/event.h"
 
 
 
  27#include "util/evlist.h"
 
 
 
  28#include <subcmd/exec-cmd.h>
  29#include "util/machine.h"
 
 
  30#include "util/path.h"
  31#include "util/session.h"
  32#include "util/thread.h"
  33#include <subcmd/parse-options.h>
  34#include "util/strlist.h"
  35#include "util/intlist.h"
  36#include "util/thread_map.h"
  37#include "util/stat.h"
 
 
  38#include "trace/beauty/beauty.h"
  39#include "trace-event.h"
  40#include "util/parse-events.h"
  41#include "util/bpf-loader.h"
  42#include "callchain.h"
  43#include "print_binary.h"
  44#include "string2.h"
  45#include "syscalltbl.h"
  46#include "rb_resort.h"
 
 
  47
  48#include <errno.h>
  49#include <inttypes.h>
  50#include <poll.h>
  51#include <signal.h>
  52#include <stdlib.h>
  53#include <string.h>
  54#include <linux/err.h>
  55#include <linux/filter.h>
  56#include <linux/kernel.h>
 
  57#include <linux/random.h>
  58#include <linux/stringify.h>
  59#include <linux/time64.h>
 
  60#include <fcntl.h>
 
  61
  62#include "sane_ctype.h"
 
 
 
 
 
  63
  64#ifndef O_CLOEXEC
  65# define O_CLOEXEC		02000000
  66#endif
  67
  68#ifndef F_LINUX_SPECIFIC_BASE
  69# define F_LINUX_SPECIFIC_BASE	1024
  70#endif
  71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Global state for one 'perf trace' session: the perf_tool callbacks, the
 * syscall table, the evlist being monitored and the knobs set from the
 * command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest slot in 'table' — presumably max syscall id; confirm where it is filled */
		struct syscall  *table;
		struct {	/* tracepoint evsels hooked on syscall entry/exit */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread whose syscall is currently being printed */
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;	/* where formatted events are written */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to filter out of the trace */
	}			filter_pids;
	double			duration_filter;	/* only show syscalls longer than this (ms) */
	double			runtime_ms;
	struct {	/* tool self-statistics (vfs_getname hits vs. /proc fallbacks) */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclusion list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* vfs_getname probe is available */
	int			trace_pgfaults;
	int			open_id;	/* syscall id of open(), used to hook filename resolution */
};
 126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Accessor for one tracepoint payload field: 'offset' into the sample's
 * raw data plus a reader returning either the value widened to u64
 * ('integer') or a pointer into the payload ('pointer').
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
 134
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer
 * of the given width from the sample payload at the field's offset.
 * memcpy() is used instead of a cast+deref so unaligned offsets are safe.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
 147
/*
 * Byte-swapping variants of the readers above, used when the perf.data
 * file was recorded on a machine of the opposite endianness (selected via
 * 'needs_swap' in tp_field__init_uint()).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
 159
 160static int tp_field__init_uint(struct tp_field *field,
 161			       struct format_field *format_field,
 162			       bool needs_swap)
 163{
 164	field->offset = format_field->offset;
 165
 166	switch (format_field->size) {
 167	case 1:
 168		field->integer = tp_field__u8;
 169		break;
 170	case 2:
 171		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 172		break;
 173	case 4:
 174		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 175		break;
 176	case 8:
 177		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 178		break;
 179	default:
 180		return -1;
 181	}
 182
 183	return 0;
 184}
 185
 
 
 
 
 
 186static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 187{
 188	return sample->raw_data + field->offset;
 189}
 190
 191static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
 192{
 193	field->offset = format_field->offset;
 194	field->pointer = tp_field__ptr;
 195	return 0;
 196}
 197
 
 
 
 
 
/*
 * Fields read from the raw_syscalls:sys_{enter,exit} tracepoints: the
 * syscall id plus either the entry arguments or the exit return value —
 * a union, since any given evsel is only ever one of the two directions.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
 204
 205static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
 206					  struct tp_field *field,
 207					  const char *name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 208{
 209	struct format_field *format_field = perf_evsel__field(evsel, name);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 210
 211	if (format_field == NULL)
 212		return -1;
 213
 214	return tp_field__init_uint(field, format_field, evsel->needs_swap);
 215}
 216
/*
 * Initialize the integer field 'name' in the evsel's private syscall_tp
 * from the tracepoint format field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 220
 221static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
 222					 struct tp_field *field,
 223					 const char *name)
 224{
 225	struct format_field *format_field = perf_evsel__field(evsel, name);
 226
 227	if (format_field == NULL)
 228		return -1;
 229
 230	return tp_field__init_ptr(field, format_field);
 231}
 232
/*
 * Initialize the pointer field 'name' in the evsel's private syscall_tp
 * from the tracepoint format field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 236
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
 242
 243static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
 244{
 245	evsel->priv = malloc(sizeof(struct syscall_tp));
 246	if (evsel->priv != NULL) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 247		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 248			goto out_delete;
 249
 250		evsel->handler = handler;
 251		return 0;
 252	}
 253
 254	return -ENOMEM;
 255
 256out_delete:
 257	zfree(&evsel->priv);
 258	return -ENOENT;
 259}
 260
 261static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
 262{
 263	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
 264
 265	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
 266	if (IS_ERR(evsel))
 267		evsel = perf_evsel__newtp("syscalls", direction);
 268
 269	if (IS_ERR(evsel))
 270		return NULL;
 271
 272	if (perf_evsel__init_syscall_tp(evsel, handler))
 273		goto out_delete;
 274
 275	return evsel;
 276
 277out_delete:
 278	perf_evsel__delete_priv(evsel);
 279	return NULL;
 280}
 281
/* Read the integer field 'name' of this evsel's tracepoint from a sample. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Likewise, but yielding a pointer into the sample's raw payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
 289
 290size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
 291{
 292	int idx = val - sa->offset;
 293
 294	if (idx < 0 || idx >= sa->nr_entries)
 295		return scnprintf(bf, size, intfmt, val);
 
 
 
 
 296
 297	return scnprintf(bf, size, "%s", sa->entries[idx]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 298}
 299
 300static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 301						const char *intfmt,
 302					        struct syscall_arg *arg)
 303{
 304	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
 305}
 306
 307static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 308					      struct syscall_arg *arg)
 309{
 310	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 311}
 312
 313#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 314
/*
 * A set of string tables searched in order, used where one argument's
 * symbolic names come from disjoint ranges (e.g. fcntl's generic plus
 * Linux-specific commands).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define a struct strarrays named strarrays__<array> wrapping 'array'. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
 324
 325size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 326					struct syscall_arg *arg)
 
 
 
 
 
 
 
 
 
 327{
 328	struct strarrays *sas = arg->parm;
 329	int i;
 330
 331	for (i = 0; i < sas->nr_entries; ++i) {
 332		struct strarray *sa = sas->entries[i];
 333		int idx = arg->val - sa->offset;
 334
 335		if (idx >= 0 && idx < sa->nr_entries) {
 336			if (sa->entries[idx] == NULL)
 337				break;
 338			return scnprintf(bf, size, "%s", sa->entries[idx]);
 339		}
 340	}
 341
 342	return scnprintf(bf, size, "%d", arg->val);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 343}
 344
 345#ifndef AT_FDCWD
 346#define AT_FDCWD	-100
 347#endif
 348
 349static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 350					   struct syscall_arg *arg)
 351{
 352	int fd = arg->val;
 
 353
 354	if (fd == AT_FDCWD)
 355		return scnprintf(bf, size, "CWD");
 356
 357	return syscall_arg__scnprintf_fd(bf, size, arg);
 358}
 359
 360#define SCA_FDAT syscall_arg__scnprintf_fd_at
 361
 362static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 363					      struct syscall_arg *arg);
 364
 365#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 366
 367size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
 368{
 369	return scnprintf(bf, size, "%#lx", arg->val);
 370}
 371
 
 
 
 
 
 
 
 372size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 373{
 374	return scnprintf(bf, size, "%d", arg->val);
 375}
 376
 377size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
 378{
 379	return scnprintf(bf, size, "%ld", arg->val);
 380}
 381
 
 
 
 
 
 
 
 
 
 
/*
 * String tables backing the SCA_STRARRAY formatters.  DEFINE_STRARRAY*()
 * (defined elsewhere) wraps each in a struct strarray; the _OFFSET variant
 * presumably biases indexing by the first valid value (e.g. EPOLL_CTL_ADD
 * is 1) — confirm against the macro definition.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE only exist on newer systems, hence the guards. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* Starts at F_LINUX_SPECIFIC_BASE (1024); [5] skips a hole in the numbering. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, searched in order by syscall_arg__scnprintf_strarrays(). */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
 462
 463static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 464						 struct syscall_arg *arg)
 465{
 
 
 466	size_t printed = 0;
 467	int mode = arg->val;
 468
 469	if (mode == F_OK) /* 0 */
 470		return scnprintf(bf, size, "F");
 471#define	P_MODE(n) \
 472	if (mode & n##_OK) { \
 473		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
 474		mode &= ~n##_OK; \
 475	}
 476
 477	P_MODE(R);
 478	P_MODE(W);
 479	P_MODE(X);
 480#undef P_MODE
 481
 482	if (mode)
 483		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 484
 485	return printed;
 486}
 487
 488#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
 489
 490static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 491					      struct syscall_arg *arg);
 492
 493#define SCA_FILENAME syscall_arg__scnprintf_filename
 494
 
 
 
 
 
 
 
 
 
 495static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 496						struct syscall_arg *arg)
 497{
 
 
 498	int printed = 0, flags = arg->val;
 499
 500#define	P_FLAG(n) \
 501	if (flags & O_##n) { \
 502		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 503		flags &= ~O_##n; \
 504	}
 505
 506	P_FLAG(CLOEXEC);
 507	P_FLAG(NONBLOCK);
 508#undef P_FLAG
 509
 510	if (flags)
 511		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 512
 513	return printed;
 514}
 515
 516#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 517
 518#ifndef GRND_NONBLOCK
 519#define GRND_NONBLOCK	0x0001
 520#endif
 521#ifndef GRND_RANDOM
 522#define GRND_RANDOM	0x0002
 523#endif
 524
 525static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 526						   struct syscall_arg *arg)
 527{
 
 
 528	int printed = 0, flags = arg->val;
 529
 530#define	P_FLAG(n) \
 531	if (flags & GRND_##n) { \
 532		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
 533		flags &= ~GRND_##n; \
 534	}
 535
 536	P_FLAG(RANDOM);
 537	P_FLAG(NONBLOCK);
 538#undef P_FLAG
 539
 540	if (flags)
 541		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 542
 543	return printed;
 544}
 545
 546#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Designated-initializer fragment for a syscall_arg_fmt entry that formats
 * the argument via SCA_STRARRAY backed by strarray__<array>.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
 551
 552#include "trace/beauty/arch_errno_names.c"
 553#include "trace/beauty/eventfd.c"
 554#include "trace/beauty/futex_op.c"
 555#include "trace/beauty/futex_val3.c"
 556#include "trace/beauty/mmap.c"
 557#include "trace/beauty/mode_t.c"
 558#include "trace/beauty/msg_flags.c"
 559#include "trace/beauty/open_flags.c"
 560#include "trace/beauty/perf_event_open.c"
 561#include "trace/beauty/pid.c"
 562#include "trace/beauty/sched_policy.c"
 563#include "trace/beauty/seccomp.c"
 564#include "trace/beauty/signum.c"
 565#include "trace/beauty/socket_type.c"
 566#include "trace/beauty/waitid_options.c"
 567
/*
 * How to pretty-print one syscall argument: the formatter callback, an
 * opaque parameter for it (e.g. a strarray), the argument name, and
 * show_zero — presumably forcing the argument to be printed even when 0;
 * the consuming code is outside this view, confirm there.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
 574
/*
 * Per-syscall pretty-printing overrides: an optional alias (the name the
 * kernel/arch actually uses), per-argument formatters, and flags — errpid
 * (presumably: return value is a pid, handled outside this view), timeout
 * (syscall takes a timeout) and hexret (print the return value in hex).
 *
 * NOTE: must be kept sorted by ->name — syscall_fmt__find() bsearch()es it.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "gettid",	    .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
 793
 794static int syscall_fmt__cmp(const void *name, const void *fmtp)
 795{
 796	const struct syscall_fmt *fmt = fmtp;
 797	return strcmp(name, fmt->name);
 798}
 799
 800static struct syscall_fmt *syscall_fmt__find(const char *name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 801{
 802	const int nmemb = ARRAY_SIZE(syscall_fmts);
 803	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
 804}
 805
 
 
 
 
 
 
/*
 * Everything cached about one syscall: its tracepoint event format, the
 * argument list parsed from it, the name, whether it never returns
 * (is_exit), and the pretty-printing overrides (fmt/arg_fmt).
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* linked list from tp_format — see format_field usage above */
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;	/* entry from syscall_fmts[], may be NULL */
	struct syscall_arg_fmt *arg_fmt;
};
 815
 816/*
 817 * We need to have this 'calculated' boolean because in some cases we really
 818 * don't know what is the duration of a syscall, for instance, when we start
 819 * a session and some threads are waiting for a syscall to finish, say 'poll',
 820 * in which case all we can do is to print "( ? ) for duration and for the
 821 * start timestamp.
 822 */
 823static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
 824{
 825	double duration = (double)t / NSEC_PER_MSEC;
 826	size_t printed = fprintf(fp, "(");
 827
 828	if (!calculated)
 829		printed += fprintf(fp, "         ");
 830	else if (duration >= 1.0)
 831		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
 832	else if (duration >= 0.01)
 833		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
 834	else
 835		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
 836	return printed + fprintf(fp, "): ");
 837}
 838
/**
 * Per-thread trace state, stashed in thread__priv().
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {	/* fd -> pathname cache, grown on demand by trace__set_fd_pathname() */
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
 868
 869static struct thread_trace *thread_trace__new(void)
 870{
 871	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
 872
 873	if (ttrace)
 874		ttrace->paths.max = -1;
 875
 876	ttrace->syscall_stats = intlist__new(NULL);
 877
 878	return ttrace;
 879}
 880
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 881static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
 882{
 883	struct thread_trace *ttrace;
 884
 885	if (thread == NULL)
 886		goto fail;
 887
 888	if (thread__priv(thread) == NULL)
 889		thread__set_priv(thread, thread_trace__new());
 890
 891	if (thread__priv(thread) == NULL)
 892		goto fail;
 893
 894	ttrace = thread__priv(thread);
 895	++ttrace->nr_events;
 896
 897	return ttrace;
 898fail:
 899	color_fprintf(fp, PERF_COLOR_RED,
 900		      "WARNING: not enough memory, dropping samples!\n");
 901	return NULL;
 902}
 903
 904
/*
 * Install a one-shot formatter for this syscall's return value; used by
 * argument beautifiers (e.g. fcntl's 'cmd') whose return type depends on
 * the arguments. Consumed and cleared in trace__sys_exit().
 * NOTE(review): assumes arg->thread already has a thread_trace attached.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
 912
/* Bit flags selecting major/minor page-fault tracing. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer used to format one syscall entry line. */
static const size_t trace__entry_str_size = 2048;
 917
 918static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
 919{
 920	struct thread_trace *ttrace = thread__priv(thread);
 
 
 
 921
 922	if (fd > ttrace->paths.max) {
 923		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
 
 924
 925		if (npath == NULL)
 926			return -1;
 
 
 
 
 
 
 
 
 927
 928		if (ttrace->paths.max != -1) {
 929			memset(npath + ttrace->paths.max + 1, 0,
 930			       (fd - ttrace->paths.max) * sizeof(char *));
 931		} else {
 932			memset(npath, 0, (fd + 1) * sizeof(char *));
 933		}
 934
 935		ttrace->paths.table = npath;
 936		ttrace->paths.max   = fd;
 937	}
 938
 939	ttrace->paths.table[fd] = strdup(pathname);
 
 940
 941	return ttrace->paths.table[fd] != NULL ? 0 : -1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 942}
 943
/*
 * Resolve @fd to a pathname by reading the /proc fd symlink for the
 * thread (via /proc/PID/task/TID for non-leader threads) and cache it
 * with trace__set_fd_pathname(). Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* st.st_size is the symlink target length; bail if it can't fit. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/*
	 * ret > st.st_size means the link changed between lstat() and
	 * readlink(); together with the check above this guarantees
	 * ret < sizeof(pathname), so the NUL write below is in bounds.
	 */
	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
 969
 970static const char *thread__fd_path(struct thread *thread, int fd,
 971				   struct trace *trace)
 972{
 973	struct thread_trace *ttrace = thread__priv(thread);
 974
 975	if (ttrace == NULL)
 976		return NULL;
 977
 978	if (fd < 0)
 979		return NULL;
 980
 981	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
 982		if (!trace->live)
 983			return NULL;
 984		++trace->stats.proc_getname;
 985		if (thread__read_fd_path(thread, fd))
 986			return NULL;
 987	}
 988
 989	return ttrace->paths.table[fd];
 990}
 991
 992size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
 993{
 994	int fd = arg->val;
 995	size_t printed = scnprintf(bf, size, "%d", fd);
 996	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
 997
 998	if (path)
 999		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1000
1001	return printed;
1002}
1003
1004size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1005{
1006        size_t printed = scnprintf(bf, size, "%d", fd);
1007	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1008
1009	if (thread) {
1010		const char *path = thread__fd_path(thread, fd, trace);
1011
1012		if (path)
1013			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1014
1015		thread__put(thread);
1016	}
1017
1018        return printed;
1019}
1020
1021static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1022					      struct syscall_arg *arg)
1023{
1024	int fd = arg->val;
1025	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1026	struct thread_trace *ttrace = thread__priv(arg->thread);
1027
1028	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1029		zfree(&ttrace->paths.table[fd]);
1030
1031	return printed;
1032}
1033
1034static void thread__set_filename_pos(struct thread *thread, const char *bf,
1035				     unsigned long ptr)
1036{
1037	struct thread_trace *ttrace = thread__priv(thread);
1038
1039	ttrace->filename.ptr = ptr;
1040	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1041}
1042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1043static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1044					      struct syscall_arg *arg)
1045{
1046	unsigned long ptr = arg->val;
1047
 
 
 
1048	if (!arg->trace->vfs_getname)
1049		return scnprintf(bf, size, "%#x", ptr);
1050
1051	thread__set_filename_pos(arg->thread, bf, ptr);
1052	return 0;
1053}
1054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1055static bool trace__filter_duration(struct trace *trace, double t)
1056{
1057	return t < (trace->duration_filter * NSEC_PER_MSEC);
1058}
1059
1060static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061{
1062	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1063
1064	return fprintf(fp, "%10.3f ", ts);
1065}
1066
1067/*
1068 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1069 * using ttrace->entry_time for a thread that receives a sys_exit without
1070 * first having received a sys_enter ("poll" issued before tracing session
1071 * starts, lost sys_enter exit due to ring buffer overflow).
1072 */
1073static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1074{
1075	if (tstamp > 0)
1076		return __trace__fprintf_tstamp(trace, tstamp, fp);
1077
1078	return fprintf(fp, "         ? ");
1079}
1080
/*
 * Set asynchronously from sig_handler() and polled by the main loop;
 * they must be volatile so the compiler cannot cache them in a register
 * across the polling loop (C11 7.14.1.1, CERT SIG31-C).
 */
static volatile bool done = false;
static volatile bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1089
1090static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1091					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1092{
1093	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1094	printed += fprintf_duration(duration, duration_calculated, fp);
 
 
 
 
 
1095
1096	if (trace->multiple_threads) {
1097		if (trace->show_comm)
1098			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1099		printed += fprintf(fp, "%d ", thread->tid);
1100	}
1101
1102	return printed;
1103}
1104
 
 
 
 
 
 
 
 
 
 
 
 
1105static int trace__process_event(struct trace *trace, struct machine *machine,
1106				union perf_event *event, struct perf_sample *sample)
1107{
1108	int ret = 0;
1109
1110	switch (event->header.type) {
1111	case PERF_RECORD_LOST:
1112		color_fprintf(trace->output, PERF_COLOR_RED,
1113			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1114		ret = machine__process_lost_event(machine, event, sample);
1115		break;
1116	default:
1117		ret = machine__process_event(machine, event, sample);
1118		break;
1119	}
1120
1121	return ret;
1122}
1123
1124static int trace__tool_process(struct perf_tool *tool,
1125			       union perf_event *event,
1126			       struct perf_sample *sample,
1127			       struct machine *machine)
1128{
1129	struct trace *trace = container_of(tool, struct trace, tool);
1130	return trace__process_event(trace, machine, event, sample);
1131}
1132
1133static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1134{
1135	struct machine *machine = vmachine;
1136
1137	if (machine->kptr_restrict_warned)
1138		return NULL;
1139
1140	if (symbol_conf.kptr_restrict) {
1141		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1142			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1143			   "Kernel samples will not be resolved.\n");
1144		machine->kptr_restrict_warned = true;
1145		return NULL;
1146	}
1147
1148	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1149}
1150
1151static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1152{
1153	int err = symbol__init(NULL);
1154
1155	if (err)
1156		return err;
1157
1158	trace->host = machine__new_host();
1159	if (trace->host == NULL)
1160		return -ENOMEM;
1161
 
 
1162	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1163	if (err < 0)
1164		goto out;
1165
1166	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1167					    evlist->threads, trace__tool_process, false,
1168					    trace->opts.proc_map_timeout, 1);
1169out:
1170	if (err)
1171		symbol__exit();
1172
1173	return err;
1174}
1175
/* Tear down what trace__symbols_init() set up: host machine, then symbols. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1183
1184static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1185{
1186	int idx;
1187
1188	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1189		nr_args = sc->fmt->nr_args;
1190
1191	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1192	if (sc->arg_fmt == NULL)
1193		return -1;
1194
1195	for (idx = 0; idx < nr_args; ++idx) {
1196		if (sc->fmt)
1197			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1198	}
1199
1200	sc->nr_args = nr_args;
1201	return 0;
1202}
1203
/*
 * Choose a default pretty-printer for each syscall argument from its
 * tracepoint field type/name, for args the static fmt table didn't
 * already cover. Always returns 0.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit formatter from the fmt table wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		/* "const char *" args with filename-ish names get vfs_getname treatment. */
		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Integer args whose name ends in "fd" are file descriptors:
			 *
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1242
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, find
 * its static format entry, read the sys_enter_* tracepoint format and
 * set up per-argument formatters. Grows the table as needed. Returns 0
 * on success, -1 on failure (unknown id, OOM or missing tracepoint).
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table to cover this id, zeroing the newly added slots. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry with the alias when the straight name has no tracepoint. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Without a tp_format assume the 6-register maximum for arg slots. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	/* exit/exit_group never return, so their entry is printed immediately. */
	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1303
/*
 * Translate the event qualifier strlist (syscall names, possibly globs)
 * into trace->ev_qualifier_ids.entries, expanding glob matches and
 * growing the array as needed. On invalid names, prints them all and
 * returns -EINVAL with the ids array freed; -ENOMEM on allocation
 * failure.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name; try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* First invalid name starts the error line, later ones append. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Collect remaining glob matches, growing in chunks of 8. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1379
1380/*
1381 * args is to be interpreted as a series of longs but we need to handle
1382 * 8-byte unaligned accesses. args points to raw_data within the event
1383 * and raw_data is guaranteed to be 8-byte unaligned because it is
1384 * preceded by raw_size which is a u32. So we need to copy args to a temp
1385 * variable to read it. Most notably this avoids extended load instructions
1386 * on unaligned addresses
1387 */
1388unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1389{
1390	unsigned long val;
1391	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1392
1393	memcpy(&val, p, sizeof(val));
1394	return val;
1395}
1396
1397static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1398				      struct syscall_arg *arg)
1399{
1400	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1401		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1402
1403	return scnprintf(bf, size, "arg%d: ", arg->idx);
1404}
1405
1406static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1407				     struct syscall_arg *arg, unsigned long val)
 
 
 
 
1408{
1409	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
 
 
 
 
 
 
 
 
 
1410		arg->val = val;
1411		if (sc->arg_fmt[arg->idx].parm)
1412			arg->parm = sc->arg_fmt[arg->idx].parm;
1413		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1414	}
1415	return scnprintf(bf, size, "%ld", val);
1416}
1417
/*
 * Format all arguments of a syscall into @bf. When the tracepoint format
 * was read, walk its fields; otherwise (IS_ERR(tp_format)) print the raw
 * register values. arg.mask bits suppress args consumed by an earlier
 * beautifier. Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1490
/* Common signature for the per-tracepoint sample handlers in this tool. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1494
/*
 * Return the struct syscall for @id, lazily reading its info on first
 * use. Returns NULL (with diagnostics gated on the verbosity level) for
 * invalid ids or when the info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Slot empty or beyond the table: try to (re)read the syscall info. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may still have left it unset. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1537
1538static void thread__update_stats(struct thread_trace *ttrace,
1539				 int id, struct perf_sample *sample)
 
 
 
 
 
 
 
1540{
1541	struct int_node *inode;
1542	struct stats *stats;
1543	u64 duration = 0;
1544
1545	inode = intlist__findnew(ttrace->syscall_stats, id);
1546	if (inode == NULL)
1547		return;
1548
1549	stats = inode->priv;
1550	if (stats == NULL) {
1551		stats = malloc(sizeof(struct stats));
1552		if (stats == NULL)
1553			return;
1554		init_stats(stats);
 
1555		inode->priv = stats;
1556	}
1557
1558	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1559		duration = sample->time - ttrace->entry_time;
1560
1561	update_stats(stats, duration);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1562}
1563
1564static int trace__printf_interrupted_entry(struct trace *trace)
1565{
1566	struct thread_trace *ttrace;
1567	size_t printed;
 
1568
1569	if (trace->failure_only || trace->current == NULL)
1570		return 0;
1571
1572	ttrace = thread__priv(trace->current);
1573
1574	if (!ttrace->entry_pending)
1575		return 0;
1576
1577	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1578	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
 
 
 
 
 
 
1579	ttrace->entry_pending = false;
 
1580
1581	return printed;
1582}
1583
1584static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1585				 struct perf_sample *sample, struct thread *thread)
1586{
1587	int printed = 0;
1588
1589	if (trace->print_sample) {
1590		double ts = (double)sample->time / NSEC_PER_MSEC;
1591
1592		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1593				   perf_evsel__name(evsel), ts,
1594				   thread__comm_str(thread),
1595				   sample->pid, sample->tid, sample->cpu);
1596	}
1597
1598	return printed;
1599}
1600
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the thread's
 * entry_str. Exit-like syscalls (which never produce a sys_exit) are
 * printed right away; all others are left pending for trace__sys_exit()
 * to complete with duration and return value. Returns 0 on success.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread entry line buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Another thread's entry may still be pending; flush it as "...". */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit/exit_group never return: print the entry line now. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the most recent thread for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1661
1662static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1663				    struct perf_sample *sample,
1664				    struct callchain_cursor *cursor)
1665{
1666	struct addr_location al;
1667	int max_stack = evsel->attr.sample_max_stack ?
1668			evsel->attr.sample_max_stack :
1669			trace->max_stack;
 
1670
1671	if (machine__resolve(trace->host, &al, sample) < 0 ||
1672	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1673		return -1;
1674
1675	return 0;
 
 
 
1676}
1677
1678static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1679{
1680	/* TODO: user-configurable print_opts */
1681	const unsigned int print_opts = EVSEL__PRINT_SYM |
1682				        EVSEL__PRINT_DSO |
1683				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1684
1685	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1686}
1687
/*
 * Map a (positive) errno value to its symbolic name, using the errno
 * table of the architecture the perf.data file was recorded on.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
1695
/*
 * raw_syscalls:sys_exit handler: complete the pending entry line (or
 * print "... [continued]" if the entry was flushed or never seen) with
 * the syscall's duration and a formatted return value. Also feeds the
 * --summary stats and resolves pending vfs_getname fd pathnames.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* open()-family returned an fd: cache the name vfs_getname captured. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0 means we never saw the sys_enter (see tstamp doc). */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was printed earlier (or lost); mark the continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Pick the return value representation, most specific rule first. */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter installed by an arg beautifier (e.g. fcntl). */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* The return value is a pid: show its comm when known. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1811
/*
 * vfs_getname tracepoint/kprobe handler: stash the resolved filename for
 * the fd cache (pending_open) and, when a syscall entry recorded a
 * filename pointer, splice the name into entry_str at the recorded
 * position, truncating the name's head if the buffer can't hold it all.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread name buffer to fit this filename. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No syscall entry recorded a filename pointer: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Not enough room: keep only the tail of the filename. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Shift the rest of entry_str right and drop the filename in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1872
/*
 * sched:sched_stat_runtime handler: accumulate per-thread and global
 * runtime in ms for the summary; with no thread_trace (OOM) just dump
 * the raw event fields instead. Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1902
1903static int bpf_output__printer(enum binary_printer_ops op,
1904			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1905{
1906	unsigned char ch = (unsigned char)val;
1907
1908	switch (op) {
1909	case BINARY_PRINT_CHAR_DATA:
1910		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911	case BINARY_PRINT_DATA_BEGIN:
1912	case BINARY_PRINT_LINE_BEGIN:
1913	case BINARY_PRINT_ADDR:
1914	case BINARY_PRINT_NUM_DATA:
1915	case BINARY_PRINT_NUM_PAD:
1916	case BINARY_PRINT_SEP:
1917	case BINARY_PRINT_CHAR_PAD:
1918	case BINARY_PRINT_LINE_END:
1919	case BINARY_PRINT_DATA_END:
1920	default:
1921		break;
1922	}
1923
1924	return 0;
1925}
1926
1927static void bpf_output__fprintf(struct trace *trace,
1928				struct perf_sample *sample)
1929{
1930	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1931			bpf_output__printer, NULL, trace->output);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1932}
1933
/*
 * Generic handler for non-syscall tracepoints and other sampled events:
 * prints a timestamped line with the event name and its pretty-printed
 * payload, plus the callchain when one was requested and resolved.
 * Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but shallower than --min-stack: drop the event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Placeholder to keep columns aligned with the syscall duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/* Tracepoint with format info: libtraceevent pretty-prints it. */
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1974
1975static void print_location(FILE *f, struct perf_sample *sample,
1976			   struct addr_location *al,
1977			   bool print_dso, bool print_sym)
1978{
1979
1980	if ((verbose > 0 || print_dso) && al->map)
1981		fprintf(f, "%s@", al->map->dso->long_name);
1982
1983	if ((verbose > 0 || print_sym) && al->sym)
1984		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985			al->addr - al->sym->start);
1986	else if (al->map)
1987		fprintf(f, "0x%" PRIx64, al->addr);
1988	else
1989		fprintf(f, "0x%" PRIx64, sample->addr);
1990}
1991
/*
 * Handler for the software page-fault events (major and minor): bumps the
 * per-thread fault counters and, unless --summary-only, prints a line
 * resolving both the faulting IP and the accessed address.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd' data; becomes 'x' (text) or '?' below */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but shallower than --min-stack: drop the event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	/* Counters updated; with --summary-only nothing is printed per event. */
	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction pointer. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the accessed address: try the data maps first... */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* ...then fall back to the code maps (fault on a text page). */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2067
2068static void trace__set_base_time(struct trace *trace,
2069				 struct perf_evsel *evsel,
2070				 struct perf_sample *sample)
2071{
2072	/*
2073	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2074	 * and don't use sample->time unconditionally, we may end up having
2075	 * some other event in the future without PERF_SAMPLE_TIME for good
2076	 * reason, i.e. we may not be interested in its timestamps, just in
2077	 * it taking place, picking some piece of information when it
2078	 * appears in our event stream (vfs_getname comes to mind).
2079	 */
2080	if (trace->base_time == 0 && !trace->full_time &&
2081	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2082		trace->base_time = sample->time;
2083}
2084
2085static int trace__process_sample(struct perf_tool *tool,
2086				 union perf_event *event,
2087				 struct perf_sample *sample,
2088				 struct perf_evsel *evsel,
2089				 struct machine *machine __maybe_unused)
2090{
2091	struct trace *trace = container_of(tool, struct trace, tool);
2092	struct thread *thread;
2093	int err = 0;
2094
2095	tracepoint_handler handler = evsel->handler;
2096
2097	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098	if (thread && thread__is_filtered(thread))
2099		goto out;
2100
2101	trace__set_base_time(trace, evsel, sample);
2102
2103	if (handler) {
2104		++trace->nr_events;
2105		handler(trace, evsel, event, sample);
2106	}
2107out:
2108	thread__put(thread);
2109	return err;
2110}
2111
2112static int trace__record(struct trace *trace, int argc, const char **argv)
2113{
2114	unsigned int rec_argc, i, j;
2115	const char **rec_argv;
2116	const char * const record_args[] = {
2117		"record",
2118		"-R",
2119		"-m", "1024",
2120		"-c", "1",
2121	};
2122
 
2123	const char * const sc_args[] = { "-e", };
2124	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2125	const char * const majpf_args[] = { "-e", "major-faults" };
2126	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2127	const char * const minpf_args[] = { "-e", "minor-faults" };
2128	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
 
2129
2130	/* +1 is for the event string below */
2131	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2132		majpf_args_nr + minpf_args_nr + argc;
2133	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2134
2135	if (rec_argv == NULL)
2136		return -ENOMEM;
2137
2138	j = 0;
2139	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2140		rec_argv[j++] = record_args[i];
2141
2142	if (trace->trace_syscalls) {
2143		for (i = 0; i < sc_args_nr; i++)
2144			rec_argv[j++] = sc_args[i];
2145
2146		/* event string may be different for older kernels - e.g., RHEL6 */
2147		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2148			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2149		else if (is_valid_tracepoint("syscalls:sys_enter"))
2150			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2151		else {
2152			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2153			free(rec_argv);
2154			return -1;
2155		}
2156	}
2157
 
 
 
2158	if (trace->trace_pgfaults & TRACE_PFMAJ)
2159		for (i = 0; i < majpf_args_nr; i++)
2160			rec_argv[j++] = majpf_args[i];
2161
2162	if (trace->trace_pgfaults & TRACE_PFMIN)
2163		for (i = 0; i < minpf_args_nr; i++)
2164			rec_argv[j++] = minpf_args[i];
2165
2166	for (i = 0; i < (unsigned int)argc; i++)
2167		rec_argv[j++] = argv[i];
2168
2169	return cmd_record(j, rec_argv);
 
 
 
 
2170}
2171
2172static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2173
2174static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2175{
2176	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
 
 
 
2177
2178	if (IS_ERR(evsel))
 
 
 
2179		return false;
2180
2181	if (perf_evsel__field(evsel, "pathname") == NULL) {
2182		perf_evsel__delete(evsel);
2183		return false;
 
 
 
 
 
 
 
 
 
 
2184	}
2185
2186	evsel->handler = trace__vfs_getname;
2187	perf_evlist__add(evlist, evsel);
2188	return true;
2189}
2190
2191static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2192{
2193	struct perf_evsel *evsel;
2194	struct perf_event_attr attr = {
2195		.type = PERF_TYPE_SOFTWARE,
2196		.mmap_data = 1,
2197	};
2198
2199	attr.config = config;
2200	attr.sample_period = 1;
2201
2202	event_attr_init(&attr);
2203
2204	evsel = perf_evsel__new(&attr);
2205	if (evsel)
2206		evsel->handler = trace__pgfault;
2207
2208	return evsel;
2209}
2210
 
 
 
 
 
 
 
 
 
 
2211static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2212{
2213	const u32 type = event->header.type;
2214	struct perf_evsel *evsel;
2215
2216	if (type != PERF_RECORD_SAMPLE) {
2217		trace__process_event(trace, trace->host, event, sample);
2218		return;
2219	}
2220
2221	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2222	if (evsel == NULL) {
2223		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2224		return;
2225	}
2226
 
 
 
2227	trace__set_base_time(trace, evsel, sample);
2228
2229	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2230	    sample->raw_data == NULL) {
2231		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2232		       perf_evsel__name(evsel), sample->tid,
2233		       sample->cpu, sample->raw_size);
2234	} else {
2235		tracepoint_handler handler = evsel->handler;
2236		handler(trace, evsel, event, sample);
2237	}
 
 
 
2238}
2239
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * wiring them to the strace-like formatters and caching them in
 * trace->syscalls.events for later per-syscall filtering.
 * Returns 0 on success, -1 on failure (both evsels cleaned up).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

/* Unwind in reverse order of creation. */
out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2288
2289static int trace__set_ev_qualifier_filter(struct trace *trace)
2290{
2291	int err = -1;
2292	struct perf_evsel *sys_exit;
2293	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2294						trace->ev_qualifier_ids.nr,
2295						trace->ev_qualifier_ids.entries);
2296
2297	if (filter == NULL)
2298		goto out_enomem;
2299
2300	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2301					  filter)) {
2302		sys_exit = trace->syscalls.events.sys_exit;
2303		err = perf_evsel__append_tp_filter(sys_exit, filter);
2304	}
2305
2306	free(filter);
2307out:
2308	return err;
2309out_enomem:
2310	errno = ENOMEM;
2311	goto out;
2312}
2313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Build the default pid filter used in system-wide mode: always filter out
 * our own pid, and walk up our ancestry looking for an "sshd" parent to
 * filter out too.  NOTE(review): presumably this avoids a feedback loop
 * when running over ssh — sshd shipping our output generates events we
 * would then print, generating more events; confirm against the commit
 * that introduced it.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* Bounded walk: at most ARRAY_SIZE(pids) ancestors are considered. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2337
/*
 * Live tracing mode: set up all requested events (syscalls, vfs_getname,
 * page faults, sched_stat_runtime), open/mmap them, optionally fork the
 * workload, then loop consuming the ring buffers until interrupted or the
 * workload finishes.  Returns 0 on success or a negative error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* leftover argv == workload to run */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* --delay: start the workload first, enable counting later. */
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitely set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	/* Drain every ring buffer once, then poll if nothing new arrived. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* First SIGINT/child exit: stop producing, keep draining. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Deliberately unreachable fall-through: this brace-block only exists to
 * scope errbuf for the error labels below, which are entered via goto.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2614
/*
 * Replay mode (-i): process a previously recorded perf.data file through
 * the same per-event handlers used for live tracing, falling back to the
 * legacy syscalls:sys_{enter,exit} tracepoint names on older recordings.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page-fault software events to our handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2715
/* Print the heading for the per-thread summary; returns chars written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2724
/*
 * Re-sortable copy of a thread's per-syscall stats rb-tree, ordered by
 * total time spent (msecs), descending.  The body below is the per-node
 * copy callback the DEFINE_RESORT_RB() macro expects: it turns each
 * intlist node (key = syscall id, priv = struct stats) into a resort entry.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total msecs = number of calls * average duration */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2738
/*
 * Print one thread's per-syscall summary table (calls, total/min/avg/max
 * duration and stddev), sorted by total time descending.  Returns the
 * number of characters written.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Declares and fills 'syscall_stats' sorted from ttrace->syscall_stats. */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Stats are kept in nanoseconds; convert for display. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the mean. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2781
/*
 * Print the summary line for one thread (comm, tid, event count and its
 * share of all events, fault counts, optional runtime) followed by its
 * per-syscall stats table.  Returns the number of characters written.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen, as a percentage. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)	/* count the newline too */
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2809
2810static unsigned long thread__nr_events(struct thread_trace *ttrace)
2811{
2812	return ttrace ? ttrace->nr_events : 0;
2813}
2814
/*
 * Re-sortable copy of a machine's threads rb-tree, ordered by each
 * thread's event count (thread->priv is its struct thread_trace),
 * descending — note the '<' comparison yields most-active-first with
 * this macro.  The body is the per-node copy callback.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2821
/*
 * --summary output: header plus one entry per traced thread, walking every
 * bucket of the machine's thread table and sorting each bucket's threads
 * by event count.  Returns characters written (0 on sort failure).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		/* Declares 'threads' as a sorted copy of this hash bucket. */
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2843
2844static int trace__set_duration(const struct option *opt, const char *str,
2845			       int unset __maybe_unused)
2846{
2847	struct trace *trace = opt->value;
2848
2849	trace->duration_filter = atof(str);
2850	return 0;
2851}
2852
2853static int trace__set_filter_pids(const struct option *opt, const char *str,
2854				  int unset __maybe_unused)
2855{
2856	int ret = -1;
2857	size_t i;
2858	struct trace *trace = opt->value;
2859	/*
2860	 * FIXME: introduce a intarray class, plain parse csv and create a
2861	 * { int nr, int entries[] } struct...
2862	 */
2863	struct intlist *list = intlist__new(str);
2864
2865	if (list == NULL)
2866		return -1;
2867
2868	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2869	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2870
2871	if (trace->filter_pids.entries == NULL)
2872		goto out;
2873
2874	trace->filter_pids.entries[0] = getpid();
2875
2876	for (i = 1; i < trace->filter_pids.nr; ++i)
2877		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2878
2879	intlist__delete(list);
2880	ret = 0;
2881out:
2882	return ret;
2883}
2884
/*
 * Open the -o/--output file for writing.  An existing non-empty file of
 * the same name is first rotated to "<name>.old".  Returns 0 on success
 * or -errno if fopen() failed.
 */
2885static int trace__open_output(struct trace *trace, const char *filename)
2886{
2887	struct stat st;
2888
2889	if (!stat(filename, &st) && st.st_size) {
2890		char oldname[PATH_MAX];
2891
2892		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
		/* NOTE(review): unlink()/rename() results are ignored — rotation is best-effort */
2893		unlink(oldname);
2894		rename(filename, oldname);
2895	}
2896
2897	trace->output = fopen(filename, "w");
2898
2899	return trace->output == NULL ? -errno : 0;
2900}
2901
2902static int parse_pagefaults(const struct option *opt, const char *str,
2903			    int unset __maybe_unused)
2904{
2905	int *trace_pgfaults = opt->value;
2906
2907	if (strcmp(str, "all") == 0)
2908		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2909	else if (strcmp(str, "maj") == 0)
2910		*trace_pgfaults |= TRACE_PFMAJ;
2911	else if (strcmp(str, "min") == 0)
2912		*trace_pgfaults |= TRACE_PFMIN;
2913	else
2914		return -1;
2915
2916	return 0;
2917}
2918
/*
 * Install the same sample handler on every evsel in the list; used to
 * funnel all --event tracepoints/events to trace__event_handler.
 */
2919static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2920{
2921	struct perf_evsel *evsel;
2922
2923	evlist__for_each_entry(evlist, evsel)
2924		evsel->handler = handler;
2925}
2926
2927/*
2928 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2929 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2930 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2931 *
2932 * It'd be better to introduce a parse_options() variant that would return a
2933 * list with the terms it didn't match to an event...
2934 */
2935static int trace__parse_events_option(const struct option *opt, const char *str,
2936				      int unset __maybe_unused)
2937{
2938	struct trace *trace = (struct trace *)opt->value;
2939	const char *s = str;
2940	char *sep = NULL, *lists[2] = { NULL, NULL, };
2941	int len = strlen(str) + 1, err = -1, list, idx;
2942	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2943	char group_name[PATH_MAX];
 
2944
2945	if (strace_groups_dir == NULL)
2946		return -1;
2947
2948	if (*s == '!') {
2949		++s;
2950		trace->not_ev_qualifier = true;
2951	}
2952
2953	while (1) {
2954		if ((sep = strchr(s, ',')) != NULL)
2955			*sep = '\0';
2956
2957		list = 0;
2958		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2959		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2960			list = 1;
 
 
 
 
 
 
 
2961		} else {
2962			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2963			if (access(group_name, R_OK) == 0)
2964				list = 1;
2965		}
2966
2967		if (lists[list]) {
2968			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2969		} else {
2970			lists[list] = malloc(len);
2971			if (lists[list] == NULL)
2972				goto out;
2973			strcpy(lists[list], s);
2974		}
2975
2976		if (!sep)
2977			break;
2978
2979		*sep = ',';
2980		s = sep + 1;
2981	}
2982
2983	if (lists[1] != NULL) {
2984		struct strlist_config slist_config = {
2985			.dirname = strace_groups_dir,
2986		};
2987
2988		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2989		if (trace->ev_qualifier == NULL) {
2990			fputs("Not enough memory to parse event qualifier", trace->output);
2991			goto out;
2992		}
2993
2994		if (trace__validate_ev_qualifier(trace))
2995			goto out;
 
2996	}
2997
2998	err = 0;
2999
3000	if (lists[0]) {
3001		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3002					       "event selector. use 'perf list' to list available events",
3003					       parse_events_option);
 
 
 
3004		err = parse_events_option(&o, lists[0], 0);
3005	}
3006out:
 
 
 
3007	if (sep)
3008		*sep = ',';
3009
3010	return err;
3011}
3012
/*
 * -G/--cgroup: if events were already parsed (-e appeared before -G on
 * the command line) hand off to the generic parse_cgroups() so they get
 * the cgroup attached; otherwise remember the cgroup in trace->cgroup
 * for the syscall events that are created later.
 */
3013static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3014{
3015	struct trace *trace = opt->value;
3016
3017	if (!list_empty(&trace->evlist->entries))
3018		return parse_cgroups(opt, str, unset);
3019
3020	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3021
3022	return 0;
3023}
3024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Entry point for 'perf trace': builds the option table, parses the
 * command line, sets up callchain/pagefault/BPF plumbing and then either
 * records ('perf trace record'), replays a perf.data file (-i) or runs
 * live tracing of the given workload/target.
 */
3025int cmd_trace(int argc, const char **argv)
3026{
3027	const char *trace_usage[] = {
3028		"perf trace [<options>] [<command>]",
3029		"perf trace [<options>] -- <command> [<options>]",
3030		"perf trace record [<options>] [<command>]",
3031		"perf trace record [<options>] -- <command> [<options>]",
3032		NULL
3033	};
3034	struct trace trace = {
3035		.syscalls = {
3036			. max = -1,
3037		},
3038		.opts = {
3039			.target = {
3040				.uid	   = UINT_MAX,
3041				.uses_mmap = true,
3042			},
3043			.user_freq     = UINT_MAX,
3044			.user_interval = ULLONG_MAX,
3045			.no_buffering  = true,
3046			.mmap_pages    = UINT_MAX,
3047			.proc_map_timeout  = 500,
3048		},
3049		.output = stderr,
3050		.show_comm = true,
3051		.trace_syscalls = true,
3052		.kernel_syscallchains = false,
3053		.max_stack = UINT_MAX,
3054	};
3055	const char *output_name = NULL;
3056	const struct option trace_options[] = {
3057	OPT_CALLBACK('e', "event", &trace, "event",
3058		     "event/syscall selector. use 'perf list' to list available events",
3059		     trace__parse_events_option),
3060	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3061		    "show the thread COMM next to its id"),
3062	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3063	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3064		     trace__parse_events_option),
3065	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3066	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3067	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3068		    "trace events on existing process id"),
3069	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3070		    "trace events on existing thread id"),
3071	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3072		     "pids to filter (by the kernel)", trace__set_filter_pids),
3073	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3074		    "system-wide collection from all CPUs"),
3075	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3076		    "list of cpus to monitor"),
3077	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3078		    "child tasks do not inherit counters"),
3079	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3080		     "number of mmap data pages",
3081		     perf_evlist__parse_mmap_pages),
3082	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3083		   "user to profile"),
3084	OPT_CALLBACK(0, "duration", &trace, "float",
3085		     "show only events with duration > N.M ms",
3086		     trace__set_duration),
3087	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3088	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3089	OPT_BOOLEAN('T', "time", &trace.full_time,
3090		    "Show full timestamp, not time relative to first start"),
3091	OPT_BOOLEAN(0, "failure", &trace.failure_only,
3092		    "Show only syscalls that failed"),
3093	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3094		    "Show only syscall summary with statistics"),
3095	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3096		    "Show all syscalls and summary with statistics"),
3097	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3098		     "Trace pagefaults", parse_pagefaults, "maj"),
3099	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3100	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3101	OPT_CALLBACK(0, "call-graph", &trace.opts,
3102		     "record_mode[,record_size]", record_callchain_help,
3103		     &record_parse_callchain_opt),
3104	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3105		    "Show the kernel callchains on the syscall exit path"),
3106	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3107		     "Set the minimum stack depth when parsing the callchain, "
3108		     "anything below the specified depth will be ignored."),
3109	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3110		     "Set the maximum stack depth when parsing the callchain, "
3111		     "anything beyond the specified depth will be ignored. "
3112		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3113	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3114			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3115	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3116			"per thread proc mmap processing timeout in ms"),
3117	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3118		     trace__parse_cgroups),
3119	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3120		     "ms to wait before starting measurement after program "
3121		     "start"),
3122	OPT_END()
3123	};
3124	bool __maybe_unused max_stack_user_set = true;
3125	bool mmap_pages_user_set = true;
3126	const char * const trace_subcommands[] = { "record", NULL };
3127	int err;
3128	char bf[BUFSIZ];
3129
3130	signal(SIGSEGV, sighandler_dump_stack);
3131	signal(SIGFPE, sighandler_dump_stack);
3132
3133	trace.evlist = perf_evlist__new();
3134	trace.sctbl = syscalltbl__new();
3135
3136	if (trace.evlist == NULL || trace.sctbl == NULL) {
3137		pr_err("Not enough memory to run!\n");
3138		err = -ENOMEM;
3139		goto out;
3140	}
3141
3142	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3143				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3144
	/* cgroup filtering needs a system-wide session to make sense. */
3145	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3146		usage_with_options_msg(trace_usage, trace_options,
3147				       "cgroup monitoring only available in system-wide mode");
3148	}
3149
3150	err = bpf__setup_stdout(trace.evlist);
3151	if (err) {
3152		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3153		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3154		goto out;
3155	}
3156
3157	err = -1;
3158
	/* Page fault tracing needs the faulting address and a timestamp. */
3159	if (trace.trace_pgfaults) {
3160		trace.opts.sample_address = true;
3161		trace.opts.sample_time = true;
3162	}
3163
3164	if (trace.opts.mmap_pages == UINT_MAX)
3165		mmap_pages_user_set = false;
3166
3167	if (trace.max_stack == UINT_MAX) {
3168		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3169		max_stack_user_set = false;
3170	}
3171
3172#ifdef HAVE_DWARF_UNWIND_SUPPORT
3173	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3174		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3175	}
3176#endif
3177
3178	if (callchain_param.enabled) {
3179		if (!mmap_pages_user_set && geteuid() == 0)
3180			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3181
3182		symbol_conf.use_callchain = true;
3183	}
3184
3185	if (trace.evlist->nr_entries > 0)
3186		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3187
3188	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3189		return trace__record(&trace, argc-1, &argv[1]);
3190
3191	/* summary_only implies summary option, but don't overwrite summary if set */
3192	if (trace.summary_only)
3193		trace.summary = trace.summary_only;
3194
	/* NOTE(review): these early 'return -1' paths skip 'out:' and leak evlist/sctbl — confirm intended */
3195	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3196	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3197		pr_err("Please specify something to trace.\n");
3198		return -1;
3199	}
3200
3201	if (!trace.trace_syscalls && trace.ev_qualifier) {
3202		pr_err("The -e option can't be used with --no-syscalls.\n");
3203		goto out;
3204	}
3205
3206	if (output_name != NULL) {
3207		err = trace__open_output(&trace, output_name);
3208		if (err < 0) {
3209			perror("failed to create output file");
3210			goto out;
3211		}
3212	}
3213
3214	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3215
3216	err = target__validate(&trace.opts.target);
3217	if (err) {
3218		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3219		fprintf(trace.output, "%s", bf);
3220		goto out_close;
3221	}
3222
3223	err = target__parse_uid(&trace.opts.target);
3224	if (err) {
3225		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3226		fprintf(trace.output, "%s", bf);
3227		goto out_close;
3228	}
3229
	/* No workload and no target given: default to tracing the whole system. */
3230	if (!argc && target__none(&trace.opts.target))
3231		trace.opts.target.system_wide = true;
3232
3233	if (input_name)
3234		err = trace__replay(&trace);
3235	else
3236		err = trace__run(&trace, argc, argv);
3237
3238out_close:
3239	if (output_name != NULL)
3240		fclose(trace.output);
3241out:
3242	return err;
3243}
v6.13.7 — second scraped copy of tools/perf/builtin-trace.c begins below (newer kernel version from the same code-browser capture); the listing above is v4.17.
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 
 
  15 */
  16
  17#include "util/record.h"
  18#include <api/fs/tracing_path.h>
  19#ifdef HAVE_LIBBPF_SUPPORT
  20#include <bpf/bpf.h>
  21#include <bpf/libbpf.h>
  22#include <bpf/btf.h>
  23#ifdef HAVE_BPF_SKEL
  24#include "bpf_skel/augmented_raw_syscalls.skel.h"
  25#endif
  26#endif
  27#include "util/bpf_map.h"
  28#include "util/rlimit.h"
  29#include "builtin.h"
  30#include "util/cgroup.h"
  31#include "util/color.h"
  32#include "util/config.h"
  33#include "util/debug.h"
  34#include "util/dso.h"
  35#include "util/env.h"
  36#include "util/event.h"
  37#include "util/evsel.h"
  38#include "util/evsel_fprintf.h"
  39#include "util/synthetic-events.h"
  40#include "util/evlist.h"
  41#include "util/evswitch.h"
  42#include "util/mmap.h"
  43#include <subcmd/pager.h>
  44#include <subcmd/exec-cmd.h>
  45#include "util/machine.h"
  46#include "util/map.h"
  47#include "util/symbol.h"
  48#include "util/path.h"
  49#include "util/session.h"
  50#include "util/thread.h"
  51#include <subcmd/parse-options.h>
  52#include "util/strlist.h"
  53#include "util/intlist.h"
  54#include "util/thread_map.h"
  55#include "util/stat.h"
  56#include "util/tool.h"
  57#include "util/util.h"
  58#include "trace/beauty/beauty.h"
  59#include "trace-event.h"
  60#include "util/parse-events.h"
  61#include "util/tracepoint.h"
  62#include "callchain.h"
  63#include "print_binary.h"
  64#include "string2.h"
  65#include "syscalltbl.h"
  66#include "rb_resort.h"
  67#include "../perf.h"
  68#include "trace_augment.h"
  69
  70#include <errno.h>
  71#include <inttypes.h>
  72#include <poll.h>
  73#include <signal.h>
  74#include <stdlib.h>
  75#include <string.h>
  76#include <linux/err.h>
  77#include <linux/filter.h>
  78#include <linux/kernel.h>
  79#include <linux/list_sort.h>
  80#include <linux/random.h>
  81#include <linux/stringify.h>
  82#include <linux/time64.h>
  83#include <linux/zalloc.h>
  84#include <fcntl.h>
  85#include <sys/sysmacros.h>
  86
  87#include <linux/ctype.h>
  88#include <perf/mmap.h>
  89
  90#ifdef HAVE_LIBTRACEEVENT
  91#include <event-parse.h>
  92#endif
  93
  94#ifndef O_CLOEXEC
  95# define O_CLOEXEC		02000000
  96#endif
  97
  98#ifndef F_LINUX_SPECIFIC_BASE
  99# define F_LINUX_SPECIFIC_BASE	1024
 100#endif
 101
 102#define RAW_SYSCALL_ARGS_NUM	6
 103
 104/*
 105 * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100
 106 *
 107 * We have to explicitely mark the direction of the flow of data, if from the
 108 * kernel to user space or the other way around, since the BPF collector we
 109 * have so far copies only from user to kernel space, mark the arguments that
 110 * go that direction, so that we don´t end up collecting the previous contents
 111 * for syscall args that goes from kernel to user space.
 112 */
/* Per-argument formatter/parser hooks for one syscall argument. */
  113struct syscall_arg_fmt {
  114	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
  115	bool	   (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val);
  116	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
  117	void	   *parm;
  118	const char *name;
  119	u16	   nr_entries; // for arrays
  120	bool	   from_user;
  121	bool	   show_zero;
  122#ifdef HAVE_LIBBPF_SUPPORT
  123	const struct btf_type *type;
  124	int	   type_id; /* used in btf_dump */
  125#endif
  126};
  127
/* Static formatting description of one syscall: alias, BPF prog names and per-arg formats. */
  128struct syscall_fmt {
  129	const char *name;
  130	const char *alias;
  131	struct {
  132		const char *sys_enter,
  133			   *sys_exit;
  134	}	   bpf_prog_name;
  135	struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM];
  136	u8	   nr_args;
  137	bool	   errpid;
  138	bool	   timeout;
  139	bool	   hexret;
  140};
 141
/*
 * All the state for one 'perf trace' session: the event lists, syscall
 * table, filters, output stream, per-feature (BPF skeleton, BTF,
 * callchains) handles and the many formatting knobs.
 */
  142struct trace {
  143	struct perf_tool	tool;
  144	struct syscalltbl	*sctbl;
  145	struct {

  146		struct syscall  *table;
  147		struct {
  148			struct evsel *sys_enter,
  149				*sys_exit,
  150				*bpf_output;
  151		}		events;
  152	} syscalls;
  153#ifdef HAVE_BPF_SKEL
  154	struct augmented_raw_syscalls_bpf *skel;
  155#endif
  156#ifdef HAVE_LIBBPF_SUPPORT
  157	struct btf		*btf;
  158#endif
  159	struct record_opts	opts;
  160	struct evlist	*evlist;
  161	struct machine		*host;
  162	struct thread		*current;
  163	struct cgroup		*cgroup;
  164	u64			base_time;
  165	FILE			*output;
  166	unsigned long		nr_events;
  167	unsigned long		nr_events_printed;
  168	unsigned long		max_events;
  169	struct evswitch		evswitch;
  170	struct strlist		*ev_qualifier;
  171	struct {
  172		size_t		nr;
  173		int		*entries;
  174	}			ev_qualifier_ids;
  175	struct {
  176		size_t		nr;
  177		pid_t		*entries;
  178		struct bpf_map  *map;
  179	}			filter_pids;
  180	double			duration_filter;
  181	double			runtime_ms;
  182	struct {
  183		u64		vfs_getname,
  184				proc_getname;
  185	} stats;
  186	unsigned int		max_stack;
  187	unsigned int		min_stack;
  188	int			raw_augmented_syscalls_args_size;
  189	bool			raw_augmented_syscalls;
  190	bool			fd_path_disabled;
  191	bool			sort_events;
  192	bool			not_ev_qualifier;
  193	bool			live;
  194	bool			full_time;
  195	bool			sched;
  196	bool			multiple_threads;
  197	bool			summary;
  198	bool			summary_only;
  199	bool			errno_summary;
  200	bool			failure_only;
  201	bool			show_comm;
  202	bool			print_sample;
  203	bool			show_tool_stats;
  204	bool			trace_syscalls;
  205	bool			libtraceevent_print;
  206	bool			kernel_syscallchains;
  207	s16			args_alignment;
  208	bool			show_tstamp;
  209	bool			show_duration;
  210	bool			show_zeros;
  211	bool			show_arg_names;
  212	bool			show_string_prefix;
  213	bool			force;
  214	bool			vfs_getname;
  215	bool			force_btf;
  216	int			trace_pgfaults;
  217	char			*perfconfig_events;
  218	struct {
  219		struct ordered_events	data;
  220		u64			last;
  221	} oe;
  222};
 223
/* Lazily load vmlinux BTF for pretty-printing syscall args; no-op without libbpf. */
  224static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused)
  225{
  226#ifdef HAVE_LIBBPF_SUPPORT
  227	if (trace->btf != NULL)
  228		return;
  229
  230	trace->btf = btf__load_vmlinux_btf();
  231	if (verbose > 0) {
  232		fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" :
  233						    "Failed to load vmlinux BTF\n");
  234	}
  235#endif
  236}
 237
/* One tracepoint field: raw offset into sample->raw_data plus a typed fetch callback. */
  238struct tp_field {
  239	int offset;
  240	union {
  241		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
  242		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
  243	};
  244};
  245
/* Generate tp_field__u{8,16,32,64}(): memcpy the value out of raw_data (alignment-safe). */
  246#define TP_UINT_FIELD(bits) \
  247static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
  248{ \
  249	u##bits value; \
  250	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
  251	return value;  \
  252}
  253
  254TP_UINT_FIELD(8);
  255TP_UINT_FIELD(16);
  256TP_UINT_FIELD(32);
  257TP_UINT_FIELD(64);
  258
/* Byte-swapping variants for samples recorded with the opposite endianness. */
  259#define TP_UINT_FIELD__SWAPPED(bits) \
  260static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
  261{ \
  262	u##bits value; \
  263	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
  264	return bswap_##bits(value);\
  265}
  266
  267TP_UINT_FIELD__SWAPPED(16);
  268TP_UINT_FIELD__SWAPPED(32);
  269TP_UINT_FIELD__SWAPPED(64);
 270
/* Bind an integer fetcher of the right width and endianness to 'field'; -1 on odd sizes. */
  271static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)


  272{
  273	field->offset = offset;
  274
  275	switch (size) {
  276	case 1:
  277		field->integer = tp_field__u8;
  278		break;
  279	case 2:
  280		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
  281		break;
  282	case 4:
  283		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
  284		break;
  285	case 8:
  286		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
  287		break;
  288	default:
  289		return -1;
  290	}
  291
  292	return 0;
  293}
  294
  295static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
  296{
  297	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
  298}
  299
/* Pointer fetcher: returns the address of the field inside raw_data (no copy). */
  300static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
  301{
  302	return sample->raw_data + field->offset;
  303}
  304
  305static int __tp_field__init_ptr(struct tp_field *field, int offset)
  306{
  307	field->offset = offset;
  308	field->pointer = tp_field__ptr;
  309	return 0;
  310}
  311
  312static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
  313{
  314	return __tp_field__init_ptr(field, format_field->offset);
  315}
 316
/* Field accessors for a syscall tracepoint: id plus either args (enter) or ret (exit). */
  317struct syscall_tp {
  318	struct tp_field id;
  319	union {
  320		struct tp_field args, ret;
  321	};
  322};
  323
  324/*
  325 * The evsel->priv as used by 'perf trace'
  326 * sc:	for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
  327 * fmt: for all the other tracepoints
  328 */
  329struct evsel_trace {
  330	struct syscall_tp	sc;
  331	struct syscall_arg_fmt  *fmt;
  332};
  333
  334static struct evsel_trace *evsel_trace__new(void)
  335{
  336	return zalloc(sizeof(struct evsel_trace));
  337}
  338
  339static void evsel_trace__delete(struct evsel_trace *et)
  340{
  341	if (et == NULL)
  342		return;
  343
  344	zfree(&et->fmt);
  345	free(et);
  346}
  347
  348/*
  349 * Used with raw_syscalls:sys_{enter,exit} and with the
  350 * syscalls:sys_{enter,exit}_SYSCALL tracepoints
  351 */
  352static inline struct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
  353{
  354	struct evsel_trace *et = evsel->priv;
  355
  356	return &et->sc;
  357}
  358
/* Like __evsel__syscall_tp(), but allocates evsel->priv on first use; NULL on OOM. */
  359static struct syscall_tp *evsel__syscall_tp(struct evsel *evsel)
  360{
  361	if (evsel->priv == NULL) {
  362		evsel->priv = evsel_trace__new();
  363		if (evsel->priv == NULL)
  364			return NULL;
  365	}
  366
  367	return __evsel__syscall_tp(evsel);
  368}
 369
  370/*
  371 * Used with all the other tracepoints.
  372 */
  373static inline struct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
  374{
  375	struct evsel_trace *et = evsel->priv;
  376
  377	return et->fmt;
  378}
  379
/* Allocate (once) a per-field syscall_arg_fmt array sized by the tracepoint's field count. */
  380static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel)
  381{
  382	struct evsel_trace *et = evsel->priv;
  383
  384	if (evsel->priv == NULL) {
  385		et = evsel->priv = evsel_trace__new();
  386
  387		if (et == NULL)
  388			return NULL;
  389	}
  390
  391	if (et->fmt == NULL) {
  392		et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt));
  393		if (et->fmt == NULL)
  394			goto out_delete;
  395	}
  396
  397	return __evsel__syscall_arg_fmt(evsel);
  398
  399out_delete:
  400	evsel_trace__delete(evsel->priv);
  401	evsel->priv = NULL;
  402	return NULL;
  403}
  404
/* Bind tracepoint field 'name' (integer) of 'evsel' to 'field'; -1 if the field is absent. */
  405static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name)
  406{
  407	struct tep_format_field *format_field = evsel__field(evsel, name);
  408
  409	if (format_field == NULL)
  410		return -1;
  411
  412	return tp_field__init_uint(field, format_field, evsel->needs_swap);
  413}
  414
  415#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
  416	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
  417	   evsel__init_tp_uint_field(evsel, &sc->name, #name); })
  418
/* Same as above for pointer-typed fields. */
  419static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name)


  420{
  421	struct tep_format_field *format_field = evsel__field(evsel, name);
  422
  423	if (format_field == NULL)
  424		return -1;
  425
  426	return tp_field__init_ptr(field, format_field);
  427}
  428
  429#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
  430	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
  431	   evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 432
/* Free the trace-private data before deleting the evsel itself. */
  433static void evsel__delete_priv(struct evsel *evsel)
  434{
  435	zfree(&evsel->priv);
  436	evsel__delete(evsel);
  437}
  438
/* Resolve the syscall-id field: "__syscall_nr" on current kernels, "nr" on older ones. */
  439static int evsel__init_syscall_tp(struct evsel *evsel)
  440{
  441	struct syscall_tp *sc = evsel__syscall_tp(evsel);
  442
  443	if (sc != NULL) {
  444		if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
  445		    evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
  446			return -ENOENT;
  447
  448		return 0;
  449	}
  450
  451	return -ENOMEM;
  452}
  453
/* Init the id field of an augmented (BPF output) event from the format of tracepoint 'tp'. */
  454static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
  455{
  456	struct syscall_tp *sc = evsel__syscall_tp(evsel);
  457
  458	if (sc != NULL) {
  459		struct tep_format_field *syscall_id = evsel__field(tp, "id");
  460		if (syscall_id == NULL)
  461			syscall_id = evsel__field(tp, "__syscall_nr");
  462		if (syscall_id == NULL ||
  463		    __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
  464			return -EINVAL;
  465
  466		return 0;
  467	}
  468
  469	return -ENOMEM;
  470}
  471
/* args/ret of an augmented event sit right after the u64-aligned id field. */
  472static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
  473{
  474	struct syscall_tp *sc = __evsel__syscall_tp(evsel);
  475
  476	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
  477}
  478
  479static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
  480{
  481	struct syscall_tp *sc = __evsel__syscall_tp(evsel);
  482
  483	return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
  484}
  485
/* Set up a raw_syscalls:* evsel: bind its id field and install the sample handler. */
  486static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
  487{
  488	if (evsel__syscall_tp(evsel) != NULL) {
  489		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
  490			return -ENOENT;
  491
  492		evsel->handler = handler;
  493		return 0;
  494	}
  495
  496	return -ENOMEM;
  497}
 498
/*
 * Create a raw_syscalls:sys_{enter,exit} evsel, falling back to the
 * pre-raw_syscalls "syscalls" subsystem name on old kernels.  Returns
 * NULL when neither tracepoint exists or init fails.
 */
  499static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
  500{
  501	struct evsel *evsel = evsel__newtp("raw_syscalls", direction);
  502
  503	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
  504	if (IS_ERR(evsel))
  505		evsel = evsel__newtp("syscalls", direction);
  506
  507	if (IS_ERR(evsel))
  508		return NULL;
  509
  510	if (evsel__init_raw_syscall_tp(evsel, handler))
  511		goto out_delete;
  512
  513	return evsel;
  514
  515out_delete:
  516	evsel__delete_priv(evsel);
  517	return NULL;
  518}
  519
/* Fetch the named syscall_tp field (integer or pointer flavor) from a sample. */
  520#define perf_evsel__sc_tp_uint(evsel, name, sample) \
  521	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
  522	   fields->name.integer(&fields->name, sample); })
  523
  524#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
  525	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
  526	   fields->name.pointer(&fields->name, sample); })
 527
/*
 * Format 'val' via a string table: the name with sa->prefix appended as
 * a suffix, or intfmt plus a "/* prefix??? *​/" marker when unknown.
 */
  528size_t strarray__scnprintf_suffix(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_suffix, int val)
  529{
  530	int idx = val - sa->offset;
  531
  532	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
  533		size_t printed = scnprintf(bf, size, intfmt, val);
  534		if (show_suffix)
  535			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
  536		return printed;
  537	}
  538
  539	return scnprintf(bf, size, "%s%s", sa->entries[idx], show_suffix ? sa->prefix : "");
  540}
  541
/* Same lookup as above, but with sa->prefix prepended instead of appended. */
  542size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
  543{
  544	int idx = val - sa->offset;
  545
  546	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
  547		size_t printed = scnprintf(bf, size, intfmt, val);
  548		if (show_prefix)
  549			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
  550		return printed;
  551	}
  552
  553	return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
  554}
  555
  556static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
  557						const char *intfmt,
  558					        struct syscall_arg *arg)
  559{
  560	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
  561}
  562
  563static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
  564					      struct syscall_arg *arg)
  565{
  566	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
  567}
  568
  569#define SCA_STRARRAY syscall_arg__scnprintf_strarray
  570
/* Thin syscall_arg adapters over the strarray(s) string->value parsers. */
  571bool syscall_arg__strtoul_strarray(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
  572{
  573	return strarray__strtoul(arg->parm, bf, size, ret);
  574}
  575
  576bool syscall_arg__strtoul_strarray_flags(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
  577{
  578	return strarray__strtoul_flags(arg->parm, bf, size, ret);
  579}
  580
  581bool syscall_arg__strtoul_strarrays(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
  582{
  583	return strarrays__strtoul(arg->parm, bf, size, ret);
  584}
  585
  586size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
  587{
  588	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
  589}
 590
/* Like strarray__scnprintf() but tries each table in 'sas' until one covers 'val'. */
  591size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
  592{
  593	size_t printed;
  594	int i;
  595
  596	for (i = 0; i < sas->nr_entries; ++i) {
  597		struct strarray *sa = sas->entries[i];
  598		int idx = val - sa->offset;
  599
  600		if (idx >= 0 && idx < sa->nr_entries) {
			/* In range but unnamed: fall through to the numeric form. */
  601			if (sa->entries[idx] == NULL)
  602				break;
  603			return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
  604		}
  605	}
  606
  607	printed = scnprintf(bf, size, intfmt, val);
  608	if (show_prefix)
  609		printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
  610	return printed;
  611}
  612
/* Reverse lookup: exact match of bf[0..size) against the table; writes the value to *ret. */
  613bool strarray__strtoul(struct strarray *sa, char *bf, size_t size, u64 *ret)
  614{
  615	int i;
  616
  617	for (i = 0; i < sa->nr_entries; ++i) {
  618		if (sa->entries[i] && strncmp(sa->entries[i], bf, size) == 0 && sa->entries[i][size] == '\0') {
  619			*ret = sa->offset + i;
  620			return true;
  621		}
  622	}
  623
  624	return false;
  625}
 626
/*
 * Parse a '|'-separated list of flag names (or numbers) from bf[0..size)
 * into a bitmask in *ret.  Returns false if a symbolic token is not in
 * the table.
 */
  627bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *ret)
  628{
  629	u64 val = 0;
  630	char *tok = bf, *sep, *end;
  631
  632	*ret = 0;
  633
  634	while (size != 0) {
  635		int toklen = size;
  636
  637		sep = memchr(tok, '|', size);
  638		if (sep != NULL) {
  639			size -= sep - tok + 1;
  640
			/* Trim trailing whitespace before the '|'. */
  641			end = sep - 1;
  642			while (end > tok && isspace(*end))
  643				--end;
  644
  645			toklen = end - tok + 1;
  646		}
  647
  648		while (isspace(*tok))
  649			++tok;
  650
  651		if (isalpha(*tok) || *tok == '_') {
  652			if (!strarray__strtoul(sa, tok, toklen, &val))
  653				return false;
  654		} else
  655			val = strtoul(tok, NULL, 0);
  656
		/* NOTE(review): assumes each entry encodes bit (val - 1); val == 0 or val > 32
		 * would make this int shift undefined — confirm against the tables used. */
  657		*ret |= (1 << (val - 1));
  658
  659		if (sep == NULL)
  660			break;
  661		tok = sep + 1;
  662	}
  663
  664	return true;
  665}
 666
 667bool strarrays__strtoul(struct strarrays *sas, char *bf, size_t size, u64 *ret)
 668{
 669	int i;
 670
 671	for (i = 0; i < sas->nr_entries; ++i) {
 672		struct strarray *sa = sas->entries[i];
 673
 674		if (strarray__strtoul(sa, bf, size, ret))
 675			return true;
 676	}
 677
 678	return false;
 679}
 680
 681size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 682					struct syscall_arg *arg)
 683{
 684	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
 685}
 686
 687#ifndef AT_FDCWD
 688#define AT_FDCWD	-100
 689#endif
 690
 691static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 692					   struct syscall_arg *arg)
 693{
 694	int fd = arg->val;
 695	const char *prefix = "AT_FD";
 696
 697	if (fd == AT_FDCWD)
 698		return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
 699
 700	return syscall_arg__scnprintf_fd(bf, size, arg);
 701}
 702
 703#define SCA_FDAT syscall_arg__scnprintf_fd_at
 704
 705static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 706					      struct syscall_arg *arg);
 707
 708#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 709
/* Default formatter: print the raw argument value in hex. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
 714
 715size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
 716{
 717	if (arg->val == 0)
 718		return scnprintf(bf, size, "NULL");
 719	return syscall_arg__scnprintf_hex(bf, size, arg);
 720}
 721
/* Format the argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}
 726
/* Format the argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
 731
 732static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
 733{
 734	// XXX Hey, maybe for sched:sched_switch prev/next comm fields we can
 735	//     fill missing comms using thread__set_comm()...
 736	//     here or in a special syscall_arg__scnprintf_pid_sched_tp...
 737	return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
 738}
 739
 740#define SCA_CHAR_ARRAY syscall_arg__scnprintf_char_array
 741
/*
 * String tables used by the syscall argument beautifiers.  Each
 * DEFINE_STRARRAY*() pairs a table with the common "PREFIX_" that is
 * printed only when string prefixes are requested (see strarray usage
 * in strarrays__scnprintf()).
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH",
	"PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID",
	"PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD",
	"PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID",
	"TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE",
	"BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH",
	"MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE",
	"LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE",
	"LINK_DETACH", "PROG_BIND_MAP",
};
static DEFINE_STRARRAY(bpf_cmd, "BPF_");

/* Index 0 left NULL on purpose: there is no FSMOUNT_* flag with value 0. */
static const char *fsmount_flags[] = {
	[1] = "CLOEXEC",
};
static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");

#include "trace/beauty/generated/fsconfig_arrays.c"

static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");

/* EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers, "ITIMER_");

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");

/* SEEK_DATA/SEEK_HOLE may be missing from older libc headers. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences, "SEEK_");

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds, "F_");

/* These start at F_LINUX_SPECIFIC_BASE; [5] skips the unused slot 4. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);

/* fcntl commands come from two ranges, so group both tables. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow, "SIG_");

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid, "CLOCK_");
 
 
 
 
 
 
 
 
 
 
 829
/*
 * Format access(2)/faccessat(2) mode: F_OK alone (it is 0), otherwise
 * the R/W/X bits by name, with the "_OK" suffix shown only when string
 * prefixes are requested.  Unknown leftover bits are printed in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *suffix = "_OK";
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* Whatever bits remain are unknown: print them numerically. */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
 856
 857#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
 858
 859static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 860					      struct syscall_arg *arg);
 861
 862#define SCA_FILENAME syscall_arg__scnprintf_filename
 863
 864// 'argname' is just documentational at this point, to remove the previous comment with that info
 865#define SCA_FILENAME_FROM_USER(argname) \
 866	  { .scnprintf	= SCA_FILENAME, \
 867	    .from_user	= true, }
 868
 869static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg);
 870
 871#define SCA_BUF syscall_arg__scnprintf_buf
 872
/*
 * Format pipe2(2) flags: the known O_* bits by name, separated by '|',
 * with any unknown leftover bits appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "O_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* Bits this beautifier doesn't know about are printed numerically. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
 895
 896#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 897
 898#ifndef GRND_NONBLOCK
 899#define GRND_NONBLOCK	0x0001
 900#endif
 901#ifndef GRND_RANDOM
 902#define GRND_RANDOM	0x0002
 903#endif
 904
/*
 * Format getrandom(2) flags: GRND_RANDOM/GRND_NONBLOCK by name,
 * separated by '|', with unknown leftover bits appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "GRND_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* Bits this beautifier doesn't know about are printed numerically. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
 927
 928#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 929
 930#ifdef HAVE_LIBBPF_SUPPORT
 931static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
 932{
 933	int id;
 934
 935	type = strstr(type, "enum ");
 936	if (type == NULL)
 937		return;
 938
 939	type += 5; // skip "enum " to get the enumeration name
 940
 941	id = btf__find_by_name(btf, type);
 942	if (id < 0)
 943		return;
 944
 945	arg_fmt->type = btf__type_by_id(btf, id);
 946}
 947
/*
 * Reverse lookup of an enumerator name in 'bf' against the cached BTF
 * enum type for this argument, storing the enumerator's value in 'val'.
 */
static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
{
	const struct btf_type *bt = arg->fmt->type;
	struct btf *btf = arg->trace->btf;
	struct btf_enum *be = btf_enum(bt);

	for (int i = 0; i < btf_vlen(bt); ++i, ++be) {
		const char *name = btf__name_by_offset(btf, be->name_off);
		/*
		 * NOTE(review): comparing max(size, strlen(name)) chars means
		 * strncmp() may look past 'size' bytes of 'bf' when the
		 * enumerator name is longer -- presumably 'bf' is NUL
		 * terminated there; confirm against callers.
		 */
		int max_len = max(size, strlen(name));

		if (strncmp(name, bf, max_len) == 0) {
			*val = be->val;
			return true;
		}
	}

	return false;
}
 966
/*
 * Reverse lookup of a symbolic value in 'bf' using vmlinux BTF: loads
 * BTF on demand, resolves/caches this argument's type, and currently
 * handles only enumerations.
 */
static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
{
	const struct btf_type *bt;
	char *type = arg->type_name;
	struct btf *btf;

	trace__load_vmlinux_btf(arg->trace);

	btf = arg->trace->btf;
	if (btf == NULL) /* no vmlinux BTF available */
		return false;

	if (arg->fmt->type == NULL) {
		// See if this is an enum
		syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type);
	}

	// Now let's see if we have a BTF type resolved
	bt = arg->fmt->type;
	if (bt == NULL)
		return false;

	// If it is an enum:
	if (btf_is_enum(arg->fmt->type))
		return syscall_arg__strtoul_btf_enum(bf, size, arg, val);

	return false;
}
 995
 996static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val)
 997{
 998	struct btf_enum *be = btf_enum(type);
 999	const int nr_entries = btf_vlen(type);
1000
1001	for (int i = 0; i < nr_entries; ++i, ++be) {
1002		if (be->val == val) {
1003			return scnprintf(bf, size, "%s",
1004					 btf__name_by_offset(btf, be->name_off));
1005		}
1006	}
1007
1008	return 0;
1009}
1010
/*
 * State threaded through trace__btf_dump_snprintf() while libbpf's
 * btf_dump pretty-prints type data into a fixed-size buffer.
 */
struct trace_btf_dump_snprintf_ctx {
	char   *bf;		/* destination buffer */
	size_t printed, size;	/* bytes written so far / buffer capacity */
};
1015
/* btf_dump printf callback: append formatted output to the ctx buffer. */
static void trace__btf_dump_snprintf(void *vctx, const char *fmt, va_list args)
{
	struct trace_btf_dump_snprintf_ctx *ctx = vctx;

	ctx->printed += vscnprintf(ctx->bf + ctx->printed, ctx->size - ctx->printed, fmt, args);
}
1022
1023static size_t btf_struct_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, struct syscall_arg *arg)
1024{
1025	struct trace_btf_dump_snprintf_ctx ctx = {
1026		.bf   = bf,
1027		.size = size,
1028	};
1029	struct augmented_arg *augmented_arg = arg->augmented.args;
1030	int type_id = arg->fmt->type_id, consumed;
1031	struct btf_dump *btf_dump;
1032
1033	LIBBPF_OPTS(btf_dump_opts, dump_opts);
1034	LIBBPF_OPTS(btf_dump_type_data_opts, dump_data_opts);
1035
1036	if (arg == NULL || arg->augmented.args == NULL)
1037		return 0;
1038
1039	dump_data_opts.compact	  = true;
1040	dump_data_opts.skip_names = !arg->trace->show_arg_names;
1041
1042	btf_dump = btf_dump__new(btf, trace__btf_dump_snprintf, &ctx, &dump_opts);
1043	if (btf_dump == NULL)
1044		return 0;
1045
1046	/* pretty print the struct data here */
1047	if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0)
1048		return 0;
1049
1050	consumed = sizeof(*augmented_arg) + augmented_arg->size;
1051	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1052	arg->augmented.size -= consumed;
1053
1054	btf_dump__free(btf_dump);
1055
1056	return ctx.printed;
1057}
1058
1059static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf,
1060				   size_t size, int val, char *type)
1061{
1062	struct syscall_arg_fmt *arg_fmt = arg->fmt;
1063
1064	if (trace->btf == NULL)
1065		return 0;
1066
1067	if (arg_fmt->type == NULL) {
1068		// Check if this is an enum and if we have the BTF type for it.
1069		syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type);
1070	}
1071
1072	// Did we manage to find a BTF type for the syscall/tracepoint argument?
1073	if (arg_fmt->type == NULL)
1074		return 0;
1075
1076	if (btf_is_enum(arg_fmt->type))
1077		return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val);
1078	else if (btf_is_struct(arg_fmt->type) || btf_is_union(arg_fmt->type))
1079		return btf_struct_scnprintf(arg_fmt->type, trace->btf, bf, size, arg);
1080
1081	return 0;
1082}
1083
#else // HAVE_LIBBPF_SUPPORT
/* Stubs for builds without libbpf: no BTF-based formatting or parsing. */
static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused,
				   char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused,
				   char *type __maybe_unused)
{
	return 0;
}

static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused,
					  struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused)
{
	return false;
}
#endif // HAVE_LIBBPF_SUPPORT
1098
1099#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type
1100
1101#define STRARRAY(name, array) \
1102	  { .scnprintf	= SCA_STRARRAY, \
1103	    .strtoul	= STUL_STRARRAY, \
1104	    .parm	= &strarray__##array, }
1105
1106#define STRARRAY_FLAGS(name, array) \
1107	  { .scnprintf	= SCA_STRARRAY_FLAGS, \
1108	    .strtoul	= STUL_STRARRAY_FLAGS, \
1109	    .parm	= &strarray__##array, }
1110
1111#include "trace/beauty/arch_errno_names.c"
1112#include "trace/beauty/eventfd.c"
1113#include "trace/beauty/futex_op.c"
1114#include "trace/beauty/futex_val3.c"
1115#include "trace/beauty/mmap.c"
1116#include "trace/beauty/mode_t.c"
1117#include "trace/beauty/msg_flags.c"
1118#include "trace/beauty/open_flags.c"
1119#include "trace/beauty/perf_event_open.c"
1120#include "trace/beauty/pid.c"
1121#include "trace/beauty/sched_policy.c"
1122#include "trace/beauty/seccomp.c"
1123#include "trace/beauty/signum.c"
1124#include "trace/beauty/socket_type.c"
1125#include "trace/beauty/waitid_options.c"
1126
/*
 * Per-syscall formatting overrides.  Must be kept sorted by ->name:
 * syscall_fmt__find() bsearch()es this table.
 */
static const struct syscall_fmt syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
		   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
	{ .name	    = "bind",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
		   [1] = SCA_SOCKADDR_FROM_USER(umyaddr),
		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd),
		   [1] = { .from_user = true /* attr */, }, } },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clock_nanosleep",
	  .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "connect",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
		   [1] = SCA_SOCKADDR_FROM_USER(servaddr),
		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name     = "faccessat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dirfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_ACCMODE,	  /* mode */ }, }, },
	{ .name     = "faccessat2",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dirfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_ACCMODE,	  /* mode */ },
		   [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD,  /* cmd */
			   .strtoul   = STUL_STRARRAYS,
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name     = "fsconfig",
	  .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
	{ .name     = "fsmount",
	  .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
		   [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
	{ .name     = "fspick",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dfd */ },
		   [1] = SCA_FILENAME_FROM_USER(path),
		   [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "getsockopt",
	  .arg = { [1] = STRARRAY(level, socket_level), }, },
	{ .name	    = "gettid",	    .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */
			   .strtoul   = STUL_STRARRAY_FLAGS,
			   .parm      = &strarray__mmap_flags, },
		   [5] = { .scnprintf = SCA_HEX,	/* offset */ }, }, },
	{ .name	    = "mount",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(devname),
		   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
			   .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name	    = "move_mount",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* from_dfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_FDAT,	/* to_dfd */ },
		   [3] = SCA_FILENAME_FROM_USER(pathname),
		   [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "nanosleep",
	  .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, },
	{ .name	    = "newfstatat", .alias = "fstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dirfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr),
		   [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */
			   .strtoul   = STUL_STRARRAY,
			   .parm      = &strarray__prctl_options, },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources),
		   [2] = { .from_user = true /* new_rlim */, }, }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
	{ .name	    = "renameat2",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
		   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
	{ .name	    = "rseq",	    .errpid = true,
	  .arg = { [0] = { .from_user = true /* rseq */, }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendfile", .alias = "sendfile64", },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
		   [4] = SCA_SOCKADDR_FROM_USER(addr), }, },
	{ .name	    = "set_robust_list",	    .errpid = true,
	  .arg = { [0] = { .from_user = true /* head */, }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources),
		   [1] = { .from_user = true /* rlim */, }, }, },
	{ .name	    = "setsockopt",
	  .arg = { [1] = STRARRAY(level, socket_level), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
	{ .name	    = "swapon",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "sync_file_range",
	  .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "umount2", .alias = "umount",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_FS_AT_FLAGS,  /* flags */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "write",
	  .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
};
1405
1406static int syscall_fmt__cmp(const void *name, const void *fmtp)
1407{
1408	const struct syscall_fmt *fmt = fmtp;
1409	return strcmp(name, fmt->name);
1410}
1411
1412static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts,
1413						     const int nmemb,
1414						     const char *name)
1415{
1416	return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1417}
1418
1419static const struct syscall_fmt *syscall_fmt__find(const char *name)
1420{
1421	const int nmemb = ARRAY_SIZE(syscall_fmts);
1422	return __syscall_fmt__find(syscall_fmts, nmemb, name);
1423}
1424
1425static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts,
1426							      const int nmemb, const char *alias)
1427{
1428	int i;
1429
1430	for (i = 0; i < nmemb; ++i) {
1431		if (fmts[i].alias && strcmp(fmts[i].alias, alias) == 0)
1432			return &fmts[i];
1433	}
1434
1435	return NULL;
1436}
1437
1438static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
1439{
1440	const int nmemb = ARRAY_SIZE(syscall_fmts);
1441	return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
1442}
1443
1444/*
1445 * is_exit: is this "exit" or "exit_group"?
1446 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
1447 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
1448 * nonexistent: Just a hole in the syscall table, syscall id not allocated
1449 */
struct syscall {
	struct tep_event    *tp_format;	/* tracepoint format for this syscall's events */
	int		    nr_args;
	int		    args_size;	/* see comment above: augmented payload follows the raw args */
	struct {
		struct bpf_program *sys_enter,
				   *sys_exit;
	}		    bpf_prog;	/* BPF programs augmenting this syscall's enter/exit */
	bool		    is_exit;
	bool		    is_open;
	bool		    nonexistent;
	bool		    use_btf;	/* format args via vmlinux BTF -- see trace__btf_scnprintf() */
	struct tep_format_field *args;
	const char	    *name;
	const struct syscall_fmt  *fmt;		/* entry in syscall_fmts[], if any */
	struct syscall_arg_fmt *arg_fmt;	/* per-argument formatters */
};
1467
1468/*
1469 * We need to have this 'calculated' boolean because in some cases we really
1470 * don't know what is the duration of a syscall, for instance, when we start
1471 * a session and some threads are waiting for a syscall to finish, say 'poll',
1472 * in which case all we can do is to print "( ? ) for duration and for the
1473 * start timestamp.
1474 */
1475static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
1476{
1477	double duration = (double)t / NSEC_PER_MSEC;
1478	size_t printed = fprintf(fp, "(");
1479
1480	if (!calculated)
1481		printed += fprintf(fp, "         ");
1482	else if (duration >= 1.0)
1483		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1484	else if (duration >= 0.01)
1485		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1486	else
1487		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1488	return printed + fprintf(fp, "): ");
1489}
1490
1491/**
1492 * filename.ptr: The filename char pointer that will be vfs_getname'd
1493 * filename.entry_str_pos: Where to insert the string translated from
1494 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1495 * ret_scnprintf: syscall args may set this to a different syscall return
1496 *                formatter, for instance, fcntl may return fds, file flags, etc.
1497 */
1498struct thread_trace {
1499	u64		  entry_time;
1500	bool		  entry_pending;
1501	unsigned long	  nr_events;
1502	unsigned long	  pfmaj, pfmin;
1503	char		  *entry_str;
1504	double		  runtime_ms;
1505	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1506        struct {
1507		unsigned long ptr;
1508		short int     entry_str_pos;
1509		bool	      pending_open;
1510		unsigned int  namelen;
1511		char	      *name;
1512	} filename;
1513	struct {
1514		int	      max;
1515		struct file   *table;
1516	} files;
1517
1518	struct intlist *syscall_stats;
1519};
1520
1521static struct thread_trace *thread_trace__new(void)
1522{
1523	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1524
1525	if (ttrace) {
1526		ttrace->files.max = -1;
1527		ttrace->syscall_stats = intlist__new(NULL);
1528	}
1529
1530	return ttrace;
1531}
1532
1533static void thread_trace__free_files(struct thread_trace *ttrace);
1534
1535static void thread_trace__delete(void *pttrace)
1536{
1537	struct thread_trace *ttrace = pttrace;
1538
1539	if (!ttrace)
1540		return;
1541
1542	intlist__delete(ttrace->syscall_stats);
1543	ttrace->syscall_stats = NULL;
1544	thread_trace__free_files(ttrace);
1545	zfree(&ttrace->entry_str);
1546	free(ttrace);
1547}
1548
1549static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1550{
1551	struct thread_trace *ttrace;
1552
1553	if (thread == NULL)
1554		goto fail;
1555
1556	if (thread__priv(thread) == NULL)
1557		thread__set_priv(thread, thread_trace__new());
1558
1559	if (thread__priv(thread) == NULL)
1560		goto fail;
1561
1562	ttrace = thread__priv(thread);
1563	++ttrace->nr_events;
1564
1565	return ttrace;
1566fail:
1567	color_fprintf(fp, PERF_COLOR_RED,
1568		      "WARNING: not enough memory, dropping samples!\n");
1569	return NULL;
1570}
1571
1572
1573void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1574				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1575{
1576	struct thread_trace *ttrace = thread__priv(arg->thread);
1577
1578	ttrace->ret_scnprintf = ret_scnprintf;
1579}
1580
/* Page fault selection bits — presumably paired with thread_trace->pfmaj/pfmin; consumers outside this chunk. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer where the sys_enter line is assembled. */
static const size_t trace__entry_str_size = 2048;
1585
1586static void thread_trace__free_files(struct thread_trace *ttrace)
1587{
1588	for (int i = 0; i < ttrace->files.max; ++i) {
1589		struct file *file = ttrace->files.table + i;
1590		zfree(&file->pathname);
1591	}
1592
1593	zfree(&ttrace->files.table);
1594	ttrace->files.max  = -1;
1595}
1596
1597static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1598{
1599	if (fd < 0)
1600		return NULL;
1601
1602	if (fd > ttrace->files.max) {
1603		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1604
1605		if (nfiles == NULL)
1606			return NULL;
1607
1608		if (ttrace->files.max != -1) {
1609			memset(nfiles + ttrace->files.max + 1, 0,
1610			       (fd - ttrace->files.max) * sizeof(struct file));
1611		} else {
1612			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1613		}
1614
1615		ttrace->files.table = nfiles;
1616		ttrace->files.max   = fd;
1617	}
1618
1619	return ttrace->files.table + fd;
1620}
1621
/* Public wrapper: fd cache slot for a thread, via its private trace state. */
struct file *thread__files_entry(struct thread *thread, int fd)
{
	struct thread_trace *ttrace = thread__priv(thread);

	return thread_trace__files_entry(ttrace, fd);
}
1626
1627static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1628{
1629	struct thread_trace *ttrace = thread__priv(thread);
1630	struct file *file = thread_trace__files_entry(ttrace, fd);
1631
1632	if (file != NULL) {
1633		struct stat st;
1634		if (stat(pathname, &st) == 0)
1635			file->dev_maj = major(st.st_rdev);
1636		file->pathname = strdup(pathname);
1637		if (file->pathname)
1638			return 0;
1639	}
1640
1641	return -1;
1642}
1643
1644static int thread__read_fd_path(struct thread *thread, int fd)
1645{
1646	char linkname[PATH_MAX], pathname[PATH_MAX];
1647	struct stat st;
1648	int ret;
1649
1650	if (thread__pid(thread) == thread__tid(thread)) {
1651		scnprintf(linkname, sizeof(linkname),
1652			  "/proc/%d/fd/%d", thread__pid(thread), fd);
1653	} else {
1654		scnprintf(linkname, sizeof(linkname),
1655			  "/proc/%d/task/%d/fd/%d",
1656			  thread__pid(thread), thread__tid(thread), fd);
1657	}
1658
1659	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1660		return -1;
1661
1662	ret = readlink(linkname, pathname, sizeof(pathname));
1663
1664	if (ret < 0 || ret > st.st_size)
1665		return -1;
1666
1667	pathname[ret] = '\0';
1668	return trace__set_fd_pathname(thread, fd, pathname);
1669}
1670
1671static const char *thread__fd_path(struct thread *thread, int fd,
1672				   struct trace *trace)
1673{
1674	struct thread_trace *ttrace = thread__priv(thread);
1675
1676	if (ttrace == NULL || trace->fd_path_disabled)
1677		return NULL;
1678
1679	if (fd < 0)
1680		return NULL;
1681
1682	if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1683		if (!trace->live)
1684			return NULL;
1685		++trace->stats.proc_getname;
1686		if (thread__read_fd_path(thread, fd))
1687			return NULL;
1688	}
1689
1690	return ttrace->files.table[fd].pathname;
1691}
1692
1693size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1694{
1695	int fd = arg->val;
1696	size_t printed = scnprintf(bf, size, "%d", fd);
1697	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1698
1699	if (path)
1700		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1701
1702	return printed;
1703}
1704
1705size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1706{
1707        size_t printed = scnprintf(bf, size, "%d", fd);
1708	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1709
1710	if (thread) {
1711		const char *path = thread__fd_path(thread, fd, trace);
1712
1713		if (path)
1714			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1715
1716		thread__put(thread);
1717	}
1718
1719        return printed;
1720}
1721
1722static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1723					      struct syscall_arg *arg)
1724{
1725	int fd = arg->val;
1726	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1727	struct thread_trace *ttrace = thread__priv(arg->thread);
1728
1729	if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1730		zfree(&ttrace->files.table[fd].pathname);
1731
1732	return printed;
1733}
1734
1735static void thread__set_filename_pos(struct thread *thread, const char *bf,
1736				     unsigned long ptr)
1737{
1738	struct thread_trace *ttrace = thread__priv(thread);
1739
1740	ttrace->filename.ptr = ptr;
1741	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1742}
1743
1744static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1745{
1746	struct augmented_arg *augmented_arg = arg->augmented.args;
1747	size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1748	/*
1749	 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1750	 * we would have two strings, each prefixed by its size.
1751	 */
1752	int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1753
1754	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1755	arg->augmented.size -= consumed;
1756
1757	return printed;
1758}
1759
1760static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1761					      struct syscall_arg *arg)
1762{
1763	unsigned long ptr = arg->val;
1764
1765	if (arg->augmented.args)
1766		return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1767
1768	if (!arg->trace->vfs_getname)
1769		return scnprintf(bf, size, "%#x", ptr);
1770
1771	thread__set_filename_pos(arg->thread, bf, ptr);
1772	return 0;
1773}
1774
1775#define MAX_CONTROL_CHAR 31
1776#define MAX_ASCII 127
1777
1778static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg)
1779{
1780	struct augmented_arg *augmented_arg = arg->augmented.args;
1781	unsigned char *orig = (unsigned char *)augmented_arg->value;
1782	size_t printed = 0;
1783	int consumed;
1784
1785	if (augmented_arg == NULL)
1786		return 0;
1787
1788	for (int j = 0; j < augmented_arg->size; ++j) {
1789		bool control_char = orig[j] <= MAX_CONTROL_CHAR || orig[j] >= MAX_ASCII;
1790		/* print control characters (0~31 and 127), and non-ascii characters in \(digits) */
1791		printed += scnprintf(bf + printed, size - printed, control_char ? "\\%d" : "%c", (int)orig[j]);
1792	}
1793
1794	consumed = sizeof(*augmented_arg) + augmented_arg->size;
1795	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1796	arg->augmented.size -= consumed;
1797
1798	return printed;
1799}
1800
1801static bool trace__filter_duration(struct trace *trace, double t)
1802{
1803	return t < (trace->duration_filter * NSEC_PER_MSEC);
1804}
1805
1806static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1807{
1808	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1809
1810	return fprintf(fp, "%10.3f ", ts);
1811}
1812
1813/*
1814 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1815 * using ttrace->entry_time for a thread that receives a sys_exit without
1816 * first having received a sys_enter ("poll" issued before tracing session
1817 * starts, lost sys_enter exit due to ring buffer overflow).
1818 */
1819static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1820{
1821	if (tstamp > 0)
1822		return __trace__fprintf_tstamp(trace, tstamp, fp);
1823
1824	return fprintf(fp, "         ? ");
1825}
1826
1827static pid_t workload_pid = -1;
1828static volatile sig_atomic_t done = false;
1829static volatile sig_atomic_t interrupted = false;
1830
1831static void sighandler_interrupt(int sig __maybe_unused)
1832{
1833	done = interrupted = true;
 
1834}
1835
1836static void sighandler_chld(int sig __maybe_unused, siginfo_t *info,
1837			    void *context __maybe_unused)
1838{
1839	if (info->si_pid == workload_pid)
1840		done = true;
1841}
1842
1843static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1844{
1845	size_t printed = 0;
1846
1847	if (trace->multiple_threads) {
1848		if (trace->show_comm)
1849			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1850		printed += fprintf(fp, "%d ", thread__tid(thread));
1851	}
1852
1853	return printed;
1854}
1855
1856static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1857					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1858{
1859	size_t printed = 0;
1860
1861	if (trace->show_tstamp)
1862		printed = trace__fprintf_tstamp(trace, tstamp, fp);
1863	if (trace->show_duration)
1864		printed += fprintf_duration(duration, duration_calculated, fp);
1865	return printed + trace__fprintf_comm_tid(trace, thread, fp);
1866}
1867
1868static int trace__process_event(struct trace *trace, struct machine *machine,
1869				union perf_event *event, struct perf_sample *sample)
1870{
1871	int ret = 0;
1872
1873	switch (event->header.type) {
1874	case PERF_RECORD_LOST:
1875		color_fprintf(trace->output, PERF_COLOR_RED,
1876			      "LOST %" PRIu64 " events!\n", (u64)event->lost.lost);
1877		ret = machine__process_lost_event(machine, event, sample);
1878		break;
1879	default:
1880		ret = machine__process_event(machine, event, sample);
1881		break;
1882	}
1883
1884	return ret;
1885}
1886
1887static int trace__tool_process(const struct perf_tool *tool,
1888			       union perf_event *event,
1889			       struct perf_sample *sample,
1890			       struct machine *machine)
1891{
1892	struct trace *trace = container_of(tool, struct trace, tool);
1893	return trace__process_event(trace, machine, event, sample);
1894}
1895
1896static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1897{
1898	struct machine *machine = vmachine;
1899
1900	if (machine->kptr_restrict_warned)
1901		return NULL;
1902
1903	if (symbol_conf.kptr_restrict) {
1904		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1905			   "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1906			   "Kernel samples will not be resolved.\n");
1907		machine->kptr_restrict_warned = true;
1908		return NULL;
1909	}
1910
1911	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1912}
1913
1914static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
1915{
1916	int err = symbol__init(NULL);
1917
1918	if (err)
1919		return err;
1920
1921	trace->host = machine__new_host();
1922	if (trace->host == NULL)
1923		return -ENOMEM;
1924
1925	thread__set_priv_destructor(thread_trace__delete);
1926
1927	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1928	if (err < 0)
1929		goto out;
1930
1931	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1932					    evlist->core.threads, trace__tool_process,
1933					    true, false, 1);
1934out:
1935	if (err)
1936		symbol__exit();
1937
1938	return err;
1939}
1940
1941static void trace__symbols__exit(struct trace *trace)
1942{
1943	machine__exit(trace->host);
1944	trace->host = NULL;
1945
1946	symbol__exit();
1947}
1948
1949static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1950{
1951	int idx;
1952
1953	if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0)
1954		nr_args = sc->fmt->nr_args;
1955
1956	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1957	if (sc->arg_fmt == NULL)
1958		return -1;
1959
1960	for (idx = 0; idx < nr_args; ++idx) {
1961		if (sc->fmt)
1962			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1963	}
1964
1965	sc->nr_args = nr_args;
1966	return 0;
1967}
1968
/* Formatters looked up by tracepoint field name; must stay sorted by .name for bsearch(). */
static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = {
	{ .name = "msr",	.scnprintf = SCA_X86_MSR,	  .strtoul = STUL_X86_MSR,	   },
	{ .name = "vector",	.scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, },
};
1973
1974static int syscall_arg_fmt__cmp(const void *name, const void *fmtp)
1975{
1976       const struct syscall_arg_fmt *fmt = fmtp;
1977       return strcmp(name, fmt->name);
1978}
1979
1980static const struct syscall_arg_fmt *
1981__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb,
1982				const char *name)
1983{
1984       return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp);
1985}
1986
1987static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name)
1988{
1989       const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name);
1990       return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
1991}
1992
/*
 * Walk the tracepoint's arg fields and pick a formatter for each arg that
 * doesn't already have one, using field type/name heuristics.  Sets *use_btf
 * when an enum arg should be pretty-printed via vmlinux BTF.  Returns the
 * last field visited (its offset + size gives the total args size).
 *
 * NOTE: the order of the heuristics below matters — e.g. "const char *"
 * name/path fields must be matched before the generic pointer case.
 */
static struct tep_format_field *
syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
			    bool *use_btf)
{
	struct tep_format_field *last_field = NULL;
	int len;

	for (; field; field = field->next, ++arg) {
		last_field = field;

		/* Already set from the hand-written table: don't override. */
		if (arg->scnprintf)
			continue;

		len = strlen(field->name);

		// As far as heuristics (or intention) goes this seems to hold true, and makes sense!
		if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const "))
			arg->from_user = true;

		/* "const char *" args ending in "name" or containing "path" are filenames. */
		if (strcmp(field->type, "const char *") == 0 &&
		    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
		     strstr(field->name, "path") != NULL)) {
			arg->scnprintf = SCA_FILENAME;
		} else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
			arg->scnprintf = SCA_PTR;

		else if (strcmp(field->type, "pid_t") == 0)
			arg->scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			arg->scnprintf = SCA_MODE_T;
		else if ((field->flags & TEP_FIELD_IS_ARRAY) && strstr(field->type, "char")) {
			arg->scnprintf = SCA_CHAR_ARRAY;
			arg->nr_entries = field->arraylen;
		} else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {

			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			arg->scnprintf = SCA_FD;
		} else if (strstr(field->type, "enum") && use_btf != NULL) {
			/* Defer to BTF to turn the numeric value into the enumerator name. */
			*use_btf = true;
			arg->strtoul = STUL_BTF_TYPE;
		} else {
			/* Last resort: a formatter registered for this exact field name. */
			const struct syscall_arg_fmt *fmt =
				syscall_arg_fmt__find_by_name(field->name);

			if (fmt) {
				arg->scnprintf = fmt->scnprintf;
				arg->strtoul   = fmt->strtoul;
			}
		}
	}

	return last_field;
}
2053
2054static int syscall__set_arg_fmts(struct syscall *sc)
2055{
2056	struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args,
2057									  &sc->use_btf);
2058
2059	if (last_field)
2060		sc->args_size = last_field->offset + last_field->size;
2061
2062	return 0;
2063}
2064
/*
 * Populate trace->syscalls.table[id]: resolve the syscall name, find its
 * tracefs sys_enter tracepoint format and set up per-argument formatters.
 * Returns 0, -ENOMEM, -EEXIST for holes in the syscall table, or the
 * PTR_ERR() of a missing tracepoint format.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);
	int err;

#ifdef HAVE_SYSCALL_TABLE_SUPPORT
	/* With a generated syscall table the max id is known up front. */
	if (trace->syscalls.table == NULL) {
		trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
		if (trace->syscalls.table == NULL)
			return -ENOMEM;
	}
#else
	if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) {
		// When using libaudit we don't know beforehand what is the max syscall id
		struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (table == NULL)
			return -ENOMEM;

		// Need to memset from offset 0 and +1 members if brand new
		if (trace->syscalls.table == NULL)
			memset(table, 0, (id + 1) * sizeof(*sc));
		else
			memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc));

		trace->syscalls.table	      = table;
		trace->sctbl->syscalls.max_id = id;
	}
#endif
	sc = trace->syscalls.table + id;
	/* Previously determined to be a hole: don't retry. */
	if (sc->nonexistent)
		return -EEXIST;

	if (name == NULL) {
		sc->nonexistent = true;
		return -EEXIST;
	}


	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry under the tracepoint's alias, e.g. when the arch renames the syscall. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/*
	 * Fails to read trace point format via sysfs node, so the trace point
	 * doesn't exist.  Set the 'nonexistent' flag as true.
	 */
	if (IS_ERR(sc->tp_format)) {
		sc->nonexistent = true;
		return PTR_ERR(sc->tp_format);
	}

	/*
	 * The tracepoint format contains __syscall_nr field, so it's one more
	 * than the actual number of syscall arguments.
	 */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
					RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields - 1))
		return -ENOMEM;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	err = syscall__set_arg_fmts(sc);

	/* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */
	if (sc->use_btf)
		trace__load_vmlinux_btf(trace);

	return err;
}
2155
2156static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf)
2157{
2158	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
2159
2160	if (fmt != NULL) {
2161		syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, use_btf);
2162		return 0;
2163	}
2164
2165	return -ENOMEM;
2166}
2167
/*
 * qsort()/bsearch() comparator for ints (syscall ids).
 *
 * Use explicit comparisons instead of '*one - *another': the subtraction can
 * overflow (undefined behavior) for operands of opposite sign with large
 * magnitude, yielding a wrong sign.
 */
static int intcmp(const void *a, const void *b)
{
	const int *one = a, *another = b;

	return (*one > *another) - (*one < *another);
}
2174
/*
 * Turn the -e/--expr string list (trace->ev_qualifier) into a sorted array of
 * syscall ids (trace->ev_qualifier_ids), expanding glob patterns; unknown
 * names are skipped with a debug message.  Returns 0, -EINVAL or -ENOMEM.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0;
	bool printed_invalid_prefix = false;
	struct str_node *pos;
	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);

	/* Initial guess: one id per entry; globs below may force growth. */
	trace->ev_qualifier_ids.entries = malloc(nr_allocated *

						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}



	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: try it as a glob pattern. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (!printed_invalid_prefix) {
				pr_debug("Skipping unknown syscalls: ");
				printed_invalid_prefix = true;
			} else {
				pr_debug(", ");
			}

			pr_debug("%s", sc);
			continue;
		}
matches:
		trace->ev_qualifier_ids.entries[nr_used++] = id;
		/* match_next >= 0 means 'sc' was a glob: collect the remaining matches. */
		if (match_next == -1)
			continue;

		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == nr_used) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.entries[nr_used++] = id;

		}
	}

	/* Keep the ids sorted so trace__syscall_enabled() can bsearch() them. */
	trace->ev_qualifier_ids.nr = nr_used;
	qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);





out:
	if (printed_invalid_prefix)
		pr_debug("\n");
	return err;
out_free:
	zfree(&trace->ev_qualifier_ids.entries);
	trace->ev_qualifier_ids.nr = 0;
	goto out;
}
2248
2249static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
2250{
2251	bool in_ev_qualifier;
2252
2253	if (trace->ev_qualifier_ids.nr == 0)
2254		return true;
2255
2256	in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
2257				  trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
2258
2259	if (in_ev_qualifier)
2260	       return !trace->not_ev_qualifier;
2261
2262	return trace->not_ev_qualifier;
2263}
2264
2265/*
2266 * args is to be interpreted as a series of longs but we need to handle
2267 * 8-byte unaligned accesses. args points to raw_data within the event
2268 * and raw_data is guaranteed to be 8-byte unaligned because it is
2269 * preceded by raw_size which is a u32. So we need to copy args to a temp
2270 * variable to read it. Most notably this avoids extended load instructions
2271 * on unaligned addresses
2272 */
2273unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
2274{
2275	unsigned long val;
2276	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
2277
2278	memcpy(&val, p, sizeof(val));
2279	return val;
2280}
2281
2282static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
2283				      struct syscall_arg *arg)
2284{
2285	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
2286		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
2287
2288	return scnprintf(bf, size, "arg%d: ", arg->idx);
2289}
2290
2291/*
2292 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
2293 * as mount 'flags' argument that needs ignoring some magic flag, see comment
2294 * in tools/perf/trace/beauty/mount_flags.c
2295 */
2296static unsigned long syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsigned long val)
2297{
2298	if (fmt && fmt->mask_val)
2299		return fmt->mask_val(arg, val);
2300
2301	return val;
2302}
2303
2304static size_t syscall_arg_fmt__scnprintf_val(struct syscall_arg_fmt *fmt, char *bf, size_t size,
2305					     struct syscall_arg *arg, unsigned long val)
2306{
2307	if (fmt && fmt->scnprintf) {
2308		arg->val = val;
2309		if (fmt->parm)
2310			arg->parm = fmt->parm;
2311		return fmt->scnprintf(bf, size, arg);
2312	}
2313	return scnprintf(bf, size, "%ld", val);
2314}
2315
/*
 * Format all of a syscall's arguments into 'bf', consuming any BPF-augmented
 * payloads (pathnames, buffers) as the per-arg formatters walk them.
 * Returns the number of characters printed.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, void *augmented_args, int augmented_args_size,
				      struct trace *trace, struct thread *thread)
{
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;	/* per-arg bit, matched against arg.mask to skip suppressed args */
	struct syscall_arg arg = {
		.args	= args,
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};
	struct thread_trace *ttrace = thread__priv(thread);
	void *default_scnprintf;

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct tep_format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			arg.fmt = &sc->arg_fmt[arg.idx];
			val = syscall_arg__val(&arg, arg.idx);
			/*
			 * Some syscall args need some mask, most don't and
			 * return val untouched.
			 */
			val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);

			/*
			 * Suppress this argument if its value is zero and show_zero
			 * property isn't set.
			 *
			 * If it has a BTF type, then override the zero suppression knob
			 * as the common case is for zero in an enum to have an associated entry.
			 */
			if (val == 0 && !trace->show_zeros &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE))

				continue;

			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

			if (trace->show_arg_names)
				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

			default_scnprintf = sc->arg_fmt[arg.idx].scnprintf;

			/* Prefer BTF pretty-printing when forced, or when there's no better formatter. */
			if (trace->force_btf || default_scnprintf == NULL || default_scnprintf == SCA_PTR) {
				btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed,
								   size - printed, val, field->type);
				if (btf_printed) {
					printed += btf_printed;
					continue;
				}
			}

			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
								  bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
2414
typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);

/*
 * Return the (lazily initialized) struct syscall for 'id', or NULL when the
 * id is invalid, a hole in the syscall table, or its info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct evsel *evsel, int id)
{
	int err = 0;

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
 		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, evsel__name(evsel), ++n);
		}
		return NULL;
	}

	err = -EINVAL;

	/*
	 * NB: the 'if' below deliberately straddles the #ifdef: with a
	 * generated syscall table, ids beyond max_id are simply invalid;
	 * with libaudit the table is grown on demand instead.
	 */
#ifdef HAVE_SYSCALL_TABLE_SUPPORT
	if (id > trace->sctbl->syscalls.max_id) {
#else
	if (id >= trace->sctbl->syscalls.max_id) {
		/*
		 * With libaudit we don't know beforehand what is the max_id,
		 * so we let trace__read_syscall_info() figure that out as we
		 * go on reading syscalls.
		 */
		err = trace__read_syscall_info(trace, id);
		if (err)
#endif
		goto out_cant_read;
	}

	if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
	    (err = trace__read_syscall_info(trace, id)) != 0)
		goto out_cant_read;

	if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		char sbuf[STRERR_BUFSIZE];
		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
		if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
2480
/*
 * Per-syscall accumulated statistics, kept per-thread in an intlist keyed by
 * syscall id (see thread__update_stats()).
 */
struct syscall_stats {
	struct stats stats;	  /* duration statistics for this syscall */
	u64	     nr_failures; /* invocations that returned a negative value */
	int	     max_errno;	  /* highest errno seen == length of errnos[] */
	u32	     *errnos;	  /* per-errno counters, indexed by errno - 1 */
};
2487
/*
 * Account one completed syscall into the per-thread summary stats: duration
 * (measured from the matching sys_enter timestamp), failure count and, when
 * errno_summary is set, a per-errno histogram.
 *
 * @err is the raw syscall return value; negative values count as failures.
 * Allocation failures here are tolerated silently (a pr_debug for the errno
 * array), only making the summary less complete.
 */
static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace,
				 int id, struct perf_sample *sample, long err, bool errno_summary)
{
	struct int_node *inode;
	struct syscall_stats *stats;
	u64 duration = 0;

	/* One syscall_stats node per syscall id, created on first use. */
	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = zalloc(sizeof(*stats));
		if (stats == NULL)
			return;

		init_stats(&stats->stats);
		inode->priv = stats;
	}

	/* Only trust the duration if we saw the matching sys_enter first. */
	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(&stats->stats, duration);

	if (err < 0) {
		++stats->nr_failures;

		if (!errno_summary)
			return;

		err = -err;
		if (err > stats->max_errno) {
			/* Grow the histogram to fit this errno, zeroing the new tail. */
			u32 *new_errnos = realloc(stats->errnos, err * sizeof(u32));

			if (new_errnos) {
				memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32));
			} else {
				pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n",
					 thread__comm_str(thread), thread__pid(thread),
					 thread__tid(thread));
				return;
			}

			stats->errnos = new_errnos;
			stats->max_errno = err;
		}

		++stats->errnos[err - 1];
	}
}
2540
/*
 * If a sys_enter line is still pending for the current thread (we got another
 * event before its sys_exit), finish that line with " ..." so the output
 * doesn't end up interleaved.  Returns the number of characters printed.
 */
static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;
	int len;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);

	/* Pad out to the configured argument alignment column. */
	if (len < trace->args_alignment - 4)
		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");

	printed += fprintf(trace->output, " ...\n");

	ttrace->entry_pending = false;
	++trace->nr_events_printed;

	return printed;
}
2568
2569static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
2570				 struct perf_sample *sample, struct thread *thread)
2571{
2572	int printed = 0;
2573
2574	if (trace->print_sample) {
2575		double ts = (double)sample->time / NSEC_PER_MSEC;
2576
2577		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
2578				   evsel__name(evsel), ts,
2579				   thread__comm_str(thread),
2580				   sample->pid, sample->tid, sample->cpu);
2581	}
2582
2583	return printed;
2584}
2585
2586static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
2587{
2588	void *augmented_args = NULL;
2589	/*
2590	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
2591	 * and there we get all 6 syscall args plus the tracepoint common fields
2592	 * that gets calculated at the start and the syscall_nr (another long).
2593	 * So we check if that is the case and if so don't look after the
2594	 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
2595	 * which is fixed.
2596	 *
2597	 * We'll revisit this later to pass s->args_size to the BPF augmenter
2598	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
2599	 * copies only what we need for each syscall, like what happens when we
2600	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
2601	 * traffic to just what is needed for each syscall.
2602	 */
2603	int args_size = raw_augmented_args_size ?: sc->args_size;
2604
2605	*augmented_args_size = sample->raw_size - args_size;
2606	if (*augmented_args_size > 0)
2607		augmented_args = sample->raw_data + args_size;
2608
2609	return augmented_args;
2610}
2611
2612static void syscall__exit(struct syscall *sc)
2613{
2614	if (!sc)
2615		return;
2616
2617	zfree(&sc->arg_fmt);
2618}
2619
/*
 * Handler for syscall entry tracepoints: formats "name(args" into the
 * per-thread ttrace->entry_str, to be completed and printed when the
 * matching sys_exit arrives.  Syscalls that never return (sc->is_exit)
 * are printed right away with "= ?" as the return value.
 */
static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	int printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	int augmented_args_size = 0;
	void *augmented_args = NULL;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer holding the formatted entry line. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);
	/*
	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
	 * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
	 * this breaks syscall__augmented_args() check for augmented args, as we calculate
	 * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
	 * so when handling, say the openat syscall, we end up getting 6 args for the
	 * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
	 * thinking that the extra 2 u64 args are the augmented filename, so just check
	 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
	 */
	if (evsel != trace->syscalls.events.sys_enter)
		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, augmented_args, augmented_args_size, trace, thread);

	if (sc->is_exit) {
		/* This syscall won't return: print the entry now, with "= ?". */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			int alignment = 0;

			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
			if (trace->args_alignment > printed)
				alignment = trace->args_alignment - printed;
			fprintf(trace->output, "%*s= ?\n", alignment, " ");
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread that entered a syscall, for interrupted entries. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2698
/*
 * Format and print just the argument list for a sys_enter-style sample,
 * used when pretty-printing syscalls:sys_enter_NAME and bpf-output events
 * (see trace__event_handler()).  Returns 0 on success, -1 otherwise.
 */
static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args, *augmented_args = NULL;
	int augmented_args_size;
	size_t printed = 0;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	printed += syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
	fprintf(trace->output, "%.*s", (int)printed, msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2732
2733static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
2734				    struct perf_sample *sample,
2735				    struct callchain_cursor *cursor)
2736{
2737	struct addr_location al;
2738	int max_stack = evsel->core.attr.sample_max_stack ?
2739			evsel->core.attr.sample_max_stack :
2740			trace->max_stack;
2741	int err = -1;
2742
2743	addr_location__init(&al);
2744	if (machine__resolve(trace->host, &al, sample) < 0)
2745		goto out;
2746
2747	err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2748out:
2749	addr_location__exit(&al);
2750	return err;
2751}
2752
2753static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2754{
2755	/* TODO: user-configurable print_opts */
2756	const unsigned int print_opts = EVSEL__PRINT_SYM |
2757				        EVSEL__PRINT_DSO |
2758				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
2759
2760	return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
2761}
2762
/* Map an errno value to its symbolic name for the architecture of this evsel. */
static const char *errno_to_name(struct evsel *evsel, int err)
{
	return perf_env__arch_strerrno(evsel__env(evsel), err);
}
2769
/*
 * Handler for syscall exit tracepoints: completes the pending entry line
 * (or prints a "... [continued]" stub if the entry was already flushed),
 * beautifies the return value (errno name, pid comm, hex, timeout, etc.),
 * updates summary stats and optionally prints the callchain.
 */
static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
	int alignment = trace->args_alignment;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	if (trace->summary)
		thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary);

	/*
	 * An open-like syscall returned a new fd and vfs_getname captured the
	 * path: associate it with the fd for later fd beautification.
	 */
	if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* Duration is only meaningful if we saw the matching sys_enter. */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			/* Skip events whose callchain is shallower than --min-stack. */
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		printed = fprintf(trace->output, "%s", ttrace->entry_str);
	} else {
		/* The entry line was flushed earlier (interrupted), mark continuation. */
		printed += fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		printed += 9;
		printed += fprintf(trace->output, "]: %s()", sc->name);
	}

	printed++; /* the closing ')' */

	if (alignment > printed)
		alignment -= printed;
	else
		alignment = 0;

	fprintf(trace->output, ")%*s= ", alignment, " ");

	/* Pick the most informative representation of the return value. */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, "%ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, "-1 %s (%s)", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, "0 (Timeout)");
	else if (ttrace->ret_scnprintf) {
		/* One-shot beautifier installed by an argument beautifier. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, "%s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, "%#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (fork/clone-style): show its comm too. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, "%ld", ret);
			if (thread__comm_set(child))
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	/*
	 * We only consider an 'event' for the sake of --max-events a non-filtered
	 * sys_enter + sys_exit and other tracepoint events.
	 */
	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2905
/*
 * Handler for the probe:vfs_getname probe: stash the resolved pathname for
 * the syscall in flight and, if the entry line is still pending, splice the
 * filename into ttrace->entry_str at the position recorded by the pointer
 * beautifier (ttrace->filename.entry_str_pos), truncating from the left if
 * it doesn't fit.
 */
static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Keep a copy of the path, for trace__sys_exit()'s fd association. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No pointer argument waiting to be beautified in the entry line. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* If the path doesn't fit, keep its tail (the most specific part). */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and drop the filename in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
2966
2967static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
2968				     union perf_event *event __maybe_unused,
2969				     struct perf_sample *sample)
2970{
2971        u64 runtime = evsel__intval(evsel, sample, "runtime");
2972	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2973	struct thread *thread = machine__findnew_thread(trace->host,
2974							sample->pid,
2975							sample->tid);
2976	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2977
2978	if (ttrace == NULL)
2979		goto out_dump;
2980
2981	ttrace->runtime_ms += runtime_ms;
2982	trace->runtime_ms += runtime_ms;
2983out_put:
2984	thread__put(thread);
2985	return 0;
2986
2987out_dump:
2988	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2989	       evsel->name,
2990	       evsel__strval(evsel, sample, "comm"),
2991	       (pid_t)evsel__intval(evsel, sample, "pid"),
2992	       runtime,
2993	       evsel__intval(evsel, sample, "vruntime"));
2994	goto out_put;
2995}
2996
2997static int bpf_output__printer(enum binary_printer_ops op,
2998			       unsigned int val, void *extra __maybe_unused, FILE *fp)
2999{
3000	unsigned char ch = (unsigned char)val;
3001
3002	switch (op) {
3003	case BINARY_PRINT_CHAR_DATA:
3004		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
3005	case BINARY_PRINT_DATA_BEGIN:
3006	case BINARY_PRINT_LINE_BEGIN:
3007	case BINARY_PRINT_ADDR:
3008	case BINARY_PRINT_NUM_DATA:
3009	case BINARY_PRINT_NUM_PAD:
3010	case BINARY_PRINT_SEP:
3011	case BINARY_PRINT_CHAR_PAD:
3012	case BINARY_PRINT_LINE_END:
3013	case BINARY_PRINT_DATA_END:
3014	default:
3015		break;
3016	}
3017
3018	return 0;
3019}
3020
3021static void bpf_output__fprintf(struct trace *trace,
3022				struct perf_sample *sample)
3023{
3024	binary__fprintf(sample->raw_data, sample->raw_size, 8,
3025			bpf_output__printer, NULL, trace->output);
3026	++trace->nr_events_printed;
3027}
3028
/*
 * Print a generic tracepoint's fields using the syscall-arg beautifiers,
 * walking the libtraceevent format fields in lockstep with the evsel's
 * syscall_arg_fmt array.  Returns the number of characters produced.
 */
static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, struct perf_sample *sample,
				       struct thread *thread, void *augmented_args, int augmented_args_size)
{
	char bf[2048];
	size_t size = sizeof(bf);
	struct tep_format_field *field = evsel->tp_format->format.fields;
	struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel);
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg syscall_arg = {
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};

	for (; field && arg; field = field->next, ++syscall_arg.idx, bit <<= 1, ++arg) {
		/* A previous beautifier may have consumed this field already. */
		if (syscall_arg.mask & bit)
			continue;

		syscall_arg.len = 0;
		syscall_arg.fmt = arg;
		if (field->flags & TEP_FIELD_IS_ARRAY) {
			int offset = field->offset;

			/* Dynamic arrays store (len << 16 | offset) in the field slot. */
			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
				offset = format_field__intval(field, sample, evsel->needs_swap);
				syscall_arg.len = offset >> 16;
				offset &= 0xffff;
				if (tep_field_is_relative(field->flags))
					offset += field->offset + field->size;
			}

			val = (uintptr_t)(sample->raw_data + offset);
		} else
			val = format_field__intval(field, sample, evsel->needs_swap);
		/*
		 * Some syscall args need some mask, most don't and
		 * return val untouched.
		 */
		val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);

		/* Suppress this argument if its value is zero and show_zero property isn't set. */
		if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE)
			continue;

		printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

		if (trace->show_arg_names)
			printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

		/* Prefer a BTF-based pretty-printer for the field type, if available. */
		btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type);
		if (btf_printed) {
			printed += btf_printed;
			continue;
		}

		printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
	}

	return printed + fprintf(trace->output, "%.*s", (int)printed, bf);
}
3097
/*
 * Generic handler for non-syscall tracepoints and bpf-output events: prints
 * "event_name(fields)" using, in order of preference, the sys_enter
 * beautifiers (for bpf-output carrying syscall payloads), libtraceevent's
 * printer, or the tp-fields beautifier, honouring per-event --max-events.
 */
static int trace__event_handler(struct trace *trace, struct evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	struct thread *thread;
	int callchain_ret = 0;

	/* Per-event --max-events quota already exhausted? */
	if (evsel->nr_events_printed >= evsel->max_events)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	if (trace->trace_syscalls && trace->show_duration)
		fprintf(trace->output, "(         ): ");

	if (thread)
		trace__fprintf_comm_tid(trace, thread, trace->output);

	if (evsel == trace->syscalls.events.bpf_output) {
		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
		struct syscall *sc = trace__syscall_info(trace, evsel, id);

		if (sc) {
			fprintf(trace->output, "%s(", sc->name);
			trace__fprintf_sys_enter(trace, evsel, sample);
			fputc(')', trace->output);
			goto newline;
		}

		/*
		 * XXX: Not having the associated syscall info or not finding/adding
		 * 	the thread should never happen, but if it does...
		 * 	fall thru and print it as a bpf_output event.
		 */
	}

	fprintf(trace->output, "%s(", evsel->name);

	if (evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/* For syscalls:sys_enter_* use the syscall beautifiers when possible. */
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			if (trace->libtraceevent_print) {
				event_format__fprintf(evsel->tp_format, sample->cpu,
						      sample->raw_data, sample->raw_size,
						      trace->output);
			} else {
				trace__fprintf_tp_fields(trace, evsel, sample, thread, NULL, 0);
			}
		}
	}

newline:
	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));

	++trace->nr_events_printed;

	/* Quota reached: stop collecting this event altogether. */
	if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
		evsel__disable(evsel);
		evsel__close(evsel);
	}
out:
	thread__put(thread);
	return 0;
}
3183
3184static void print_location(FILE *f, struct perf_sample *sample,
3185			   struct addr_location *al,
3186			   bool print_dso, bool print_sym)
3187{
3188
3189	if ((verbose > 0 || print_dso) && al->map)
3190		fprintf(f, "%s@", dso__long_name(map__dso(al->map)));
3191
3192	if ((verbose > 0 || print_sym) && al->sym)
3193		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
3194			al->addr - al->sym->start);
3195	else if (al->map)
3196		fprintf(f, "0x%" PRIx64, al->addr);
3197	else
3198		fprintf(f, "0x%" PRIx64, sample->addr);
3199}
3200
/*
 * Handler for page fault software events: bumps the per-thread maj/min
 * fault counters and prints "majfault [ip-location] => addr-location (type)",
 * where type is 'd'ata, e'x'ecutable or '?' unknown.
 */
static int trace__pgfault(struct trace *trace,
			  struct evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	addr_location__init(&al);
	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			if (cursor->nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction address. */
	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the faulted-on data address. */
	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) {
		/*
		 * NOTE(review): this retry is byte-identical to the lookup just
		 * above, so it can never succeed and the 'x' branch below looks
		 * unreachable; it was presumably meant to be a fallback lookup
		 * (e.g. in the executable maps) — verify against upstream.
		 */
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));

	++trace->nr_events_printed;
out:
	err = 0;
out_put:
	thread__put(thread);
	addr_location__exit(&al);
	return err;
}
3279
3280static void trace__set_base_time(struct trace *trace,
3281				 struct evsel *evsel,
3282				 struct perf_sample *sample)
3283{
3284	/*
3285	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
3286	 * and don't use sample->time unconditionally, we may end up having
3287	 * some other event in the future without PERF_SAMPLE_TIME for good
3288	 * reason, i.e. we may not be interested in its timestamps, just in
3289	 * it taking place, picking some piece of information when it
3290	 * appears in our event stream (vfs_getname comes to mind).
3291	 */
3292	if (trace->base_time == 0 && !trace->full_time &&
3293	    (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
3294		trace->base_time = sample->time;
3295}
3296
3297static int trace__process_sample(const struct perf_tool *tool,
3298				 union perf_event *event,
3299				 struct perf_sample *sample,
3300				 struct evsel *evsel,
3301				 struct machine *machine __maybe_unused)
3302{
3303	struct trace *trace = container_of(tool, struct trace, tool);
3304	struct thread *thread;
3305	int err = 0;
3306
3307	tracepoint_handler handler = evsel->handler;
3308
3309	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3310	if (thread && thread__is_filtered(thread))
3311		goto out;
3312
3313	trace__set_base_time(trace, evsel, sample);
3314
3315	if (handler) {
3316		++trace->nr_events;
3317		handler(trace, evsel, event, sample);
3318	}
3319out:
3320	thread__put(thread);
3321	return err;
3322}
3323
/*
 * Implements 'perf trace record': builds a 'perf record' argv with the
 * raw_syscalls (or legacy syscalls) events, a filter excluding our own pid,
 * the requested pagefault events and the user's extra args, then hands off
 * to cmd_record().
 */
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};
	pid_t pid = getpid();
	char *filter = asprintf__tp_filter_pids(1, &pid);
	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
	int err = -1;

	/* +3 is for the event string below and the pid filter */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 3 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL || filter == NULL)
		goto out_free;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			goto out_free;
		}
	}

	/* Exclude perf's own pid from the recorded events. */
	rec_argv[j++] = "--filter";
	rec_argv[j++] = filter;

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	err = cmd_record(j, rec_argv);
out_free:
	free(filter);
	free(rec_argv);
	return err;
}
3391
3392static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
3393
/*
 * Try to add the probe:vfs_getname* probe events to the evlist, wiring the
 * trace__vfs_getname handler on every variant that has a "pathname" field
 * and discarding the ones that don't.  Returns true if at least one usable
 * variant was added.
 */
static bool evlist__add_vfs_getname(struct evlist *evlist)
{
	bool found = false;
	struct evsel *evsel, *tmp;
	struct parse_events_error err;
	int ret;

	parse_events_error__init(&err);
	ret = parse_events(evlist, "probe:vfs_getname*", &err);
	parse_events_error__exit(&err);
	if (ret)
		return false;

	evlist__for_each_entry_safe(evlist, evsel, tmp) {
		if (!strstarts(evsel__name(evsel), "probe:vfs_getname"))
			continue;

		if (evsel__field(evsel, "pathname")) {
			evsel->handler = trace__vfs_getname;
			found = true;
			continue;
		}

		/* Variant without a pathname field: useless to us, drop it. */
		list_del_init(&evsel->core.node);
		evsel->evlist = NULL;
		evsel__delete(evsel);
	}

	return found;
}
3424
3425static struct evsel *evsel__new_pgfault(u64 config)
3426{
3427	struct evsel *evsel;
3428	struct perf_event_attr attr = {
3429		.type = PERF_TYPE_SOFTWARE,
3430		.mmap_data = 1,
3431	};
3432
3433	attr.config = config;
3434	attr.sample_period = 1;
3435
3436	event_attr_init(&attr);
3437
3438	evsel = evsel__new(&attr);
3439	if (evsel)
3440		evsel->handler = trace__pgfault;
3441
3442	return evsel;
3443}
3444
3445static void evlist__free_syscall_tp_fields(struct evlist *evlist)
3446{
3447	struct evsel *evsel;
3448
3449	evlist__for_each_entry(evlist, evsel) {
3450		evsel_trace__delete(evsel->priv);
3451		evsel->priv = NULL;
3452	}
3453}
3454
/*
 * Live-mode event dispatcher: route non-sample records to the generic
 * machinery, map the sample id back to its evsel, honour --switch-on/off
 * event switching, and invoke the evsel's tracepoint handler.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	if (evswitch__discard(&trace->evswitch, evsel))
		return;

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample with no payload can't be beautified, report it. */
	if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}

	/* Global --max-events quota reached: stop the session. */
	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;
}
3489
/*
 * Set up the raw_syscalls:sys_enter/sys_exit tracepoint pair used for
 * strace-like formatting, resolving the tracepoint fields we need
 * ("args" pointer on enter, "ret" integer on exit) and adding both
 * events to trace->evlist.
 *
 * Returns 0 on success, -1 on failure (partial setup is torn down).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct evlist *evlist = trace->evlist;
	struct evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset/size of the "args" field for fast access later. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Likewise for the "ret" field of sys_exit. */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	evlist__add(evlist, sys_enter);
	evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->core.attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

	/* Error unwinding: delete in reverse order of creation. */
out_delete_sys_exit:
	evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	evsel__delete_priv(sys_enter);
	goto out;
}
3538
3539static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
3540{
3541	int err = -1;
3542	struct evsel *sys_exit;
3543	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
3544						trace->ev_qualifier_ids.nr,
3545						trace->ev_qualifier_ids.entries);
3546
3547	if (filter == NULL)
3548		goto out_enomem;
3549
3550	if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) {
 
3551		sys_exit = trace->syscalls.events.sys_exit;
3552		err = evsel__append_tp_filter(sys_exit, filter);
3553	}
3554
3555	free(filter);
3556out:
3557	return err;
3558out_enomem:
3559	errno = ENOMEM;
3560	goto out;
3561}
3562
3563#ifdef HAVE_BPF_SKEL
3564static int syscall_arg_fmt__cache_btf_struct(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
3565{
3566       int id;
3567
3568	if (arg_fmt->type != NULL)
3569		return -1;
3570
3571       id = btf__find_by_name(btf, type);
3572       if (id < 0)
3573		return -1;
3574
3575       arg_fmt->type    = btf__type_by_id(btf, id);
3576       arg_fmt->type_id = id;
3577
3578       return 0;
3579}
3580
3581static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
3582{
3583	struct bpf_program *pos, *prog = NULL;
3584	const char *sec_name;
3585
3586	if (trace->skel->obj == NULL)
3587		return NULL;
3588
3589	bpf_object__for_each_program(pos, trace->skel->obj) {
3590		sec_name = bpf_program__section_name(pos);
3591		if (sec_name && !strcmp(sec_name, name)) {
3592			prog = pos;
3593			break;
3594		}
3595	}
3596
3597	return prog;
3598}
3599
3600static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
3601							const char *prog_name, const char *type)
3602{
3603	struct bpf_program *prog;
3604
3605	if (prog_name == NULL) {
3606		char default_prog_name[256];
3607		scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name);
3608		prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3609		if (prog != NULL)
3610			goto out_found;
3611		if (sc->fmt && sc->fmt->alias) {
3612			scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
3613			prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3614			if (prog != NULL)
3615				goto out_found;
3616		}
3617		goto out_unaugmented;
3618	}
3619
3620	prog = trace__find_bpf_program_by_title(trace, prog_name);
3621
3622	if (prog != NULL) {
3623out_found:
3624		return prog;
3625	}
3626
3627	pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
3628		 prog_name, type, sc->name);
3629out_unaugmented:
3630	return trace->skel->progs.syscall_unaugmented;
3631}
3632
3633static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
3634{
3635	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3636
3637	if (sc == NULL)
3638		return;
3639
3640	sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3641	sc->bpf_prog.sys_exit  = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit  : NULL,  "exit");
3642}
3643
3644static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
3645{
3646	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3647	return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3648}
3649
3650static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
3651{
3652	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3653	return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3654}
3655
3656static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int key, unsigned int *beauty_array)
3657{
3658	struct tep_format_field *field;
3659	struct syscall *sc = trace__syscall_info(trace, NULL, key);
3660	const struct btf_type *bt;
3661	char *struct_offset, *tmp, name[32];
3662	bool can_augment = false;
3663	int i, cnt;
3664
3665	if (sc == NULL)
3666		return -1;
3667
3668	trace__load_vmlinux_btf(trace);
3669	if (trace->btf == NULL)
3670		return -1;
3671
3672	for (i = 0, field = sc->args; field; ++i, field = field->next) {
3673		// XXX We're only collecting pointer payloads _from_ user space
3674		if (!sc->arg_fmt[i].from_user)
3675			continue;
3676
3677		struct_offset = strstr(field->type, "struct ");
3678		if (struct_offset == NULL)
3679			struct_offset = strstr(field->type, "union ");
3680		else
3681			struct_offset++; // "union" is shorter
3682
3683		if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */
3684			struct_offset += 6;
3685
3686			/* for 'struct foo *', we only want 'foo' */
3687			for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) {
3688			}
3689
3690			strncpy(name, struct_offset, cnt);
3691			name[cnt] = '\0';
3692
3693			/* cache struct's btf_type and type_id */
3694			if (syscall_arg_fmt__cache_btf_struct(&sc->arg_fmt[i], trace->btf, name))
3695				continue;
3696
3697			bt = sc->arg_fmt[i].type;
3698			beauty_array[i] = bt->size;
3699			can_augment = true;
3700		} else if (field->flags & TEP_FIELD_IS_POINTER && /* string */
3701			   strcmp(field->type, "const char *") == 0 &&
3702			   (strstr(field->name, "name") ||
3703			    strstr(field->name, "path") ||
3704			    strstr(field->name, "file") ||
3705			    strstr(field->name, "root") ||
3706			    strstr(field->name, "key") ||
3707			    strstr(field->name, "special") ||
3708			    strstr(field->name, "type") ||
3709			    strstr(field->name, "description"))) {
3710			beauty_array[i] = 1;
3711			can_augment = true;
3712		} else if (field->flags & TEP_FIELD_IS_POINTER && /* buffer */
3713			   strstr(field->type, "char *") &&
3714			   (strstr(field->name, "buf") ||
3715			    strstr(field->name, "val") ||
3716			    strstr(field->name, "msg"))) {
3717			int j;
3718			struct tep_format_field *field_tmp;
3719
3720			/* find the size of the buffer that appears in pairs with buf */
3721			for (j = 0, field_tmp = sc->args; field_tmp; ++j, field_tmp = field_tmp->next) {
3722				if (!(field_tmp->flags & TEP_FIELD_IS_POINTER) && /* only integers */
3723				    (strstr(field_tmp->name, "count") ||
3724				     strstr(field_tmp->name, "siz") ||  /* size, bufsiz */
3725				     (strstr(field_tmp->name, "len") && strcmp(field_tmp->name, "filename")))) {
3726					 /* filename's got 'len' in it, we don't want that */
3727					beauty_array[i] = -(j + 1);
3728					can_augment = true;
3729					break;
3730				}
3731			}
3732		}
3733	}
3734
3735	if (can_augment)
3736		return 0;
3737
3738	return -1;
3739}
3740
/*
 * Look for another syscall whose sys_enter augmenter has a compatible
 * signature with 'sc' (same pointer args in the same positions, same
 * types) so that augmenter can be reused for 'sc', which has none of
 * its own. Returns the reusable BPF program or NULL.
 */
static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
{
	struct tep_format_field *field, *candidate_field;
	/*
	 * We're only interested in syscalls that have a pointer:
	 */
	for (field = sc->args; field; field = field->next) {
		if (field->flags & TEP_FIELD_IS_POINTER)
			goto try_to_find_pair;
	}

	return NULL;

try_to_find_pair:
	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
		int id = syscalltbl__id_at_idx(trace->sctbl, i);
		struct syscall *pair = trace__syscall_info(trace, NULL, id);
		struct bpf_program *pair_prog;
		bool is_candidate = false;

		/* Skip ourselves and syscalls with only the fallback prog. */
		if (pair == NULL || pair == sc ||
		    pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
			continue;

		/* Walk both arg lists in lock-step, comparing pointer-ness and types. */
		for (field = sc->args, candidate_field = pair->args;
		     field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
			bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
			     candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;

			if (is_pointer) {
			       if (!candidate_is_pointer) {
					// The candidate just doesn't copy our pointer arg, it might copy other pointers we want.
					continue;
			       }
			} else {
				if (candidate_is_pointer) {
					// The candidate might copy a pointer we don't have, skip it.
					goto next_candidate;
				}
				continue;
			}

			if (strcmp(field->type, candidate_field->type))
				goto next_candidate;

			/*
			 * This is limited in the BPF program but sys_write
			 * uses "const char *" for its "buf" arg so we need to
			 * use some heuristic that is kinda future proof...
			 */
			if (strcmp(field->type, "const char *") == 0 &&
			    !(strstr(field->name, "name") ||
			      strstr(field->name, "path") ||
			      strstr(field->name, "file") ||
			      strstr(field->name, "root") ||
			      strstr(field->name, "description")))
				goto next_candidate;

			is_candidate = true;
		}

		if (!is_candidate)
			goto next_candidate;

		/*
		 * Check if the tentative pair syscall augmenter has more pointers, if it has,
		 * then it may be collecting that and we then can't use it, as it would collect
		 * more than what is common to the two syscalls.
		 */
		if (candidate_field) {
			for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
				if (candidate_field->flags & TEP_FIELD_IS_POINTER)
					goto next_candidate;
		}

		pair_prog = pair->bpf_prog.sys_enter;
		/*
		 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
		 * have been searched for, so search it here and if it returns the
		 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
		 * program for a filtered syscall on a non-filtered one.
		 *
		 * For instance, we have "!syscalls:sys_enter_renameat" and that is
		 * useful for "renameat2".
		 */
		if (pair_prog == NULL) {
			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
			if (pair_prog == trace->skel->progs.syscall_unaugmented)
				goto next_candidate;
		}

		pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
		return pair_prog;
	next_candidate:
		continue;
	}

	return NULL;
}
3840
3841static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
3842{
3843	int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
3844	int map_exit_fd  = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
3845	int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter);
3846	int err = 0;
3847	unsigned int beauty_array[6];
3848
3849	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3850		int prog_fd, key = syscalltbl__id_at_idx(trace->sctbl, i);
3851
3852		if (!trace__syscall_enabled(trace, key))
3853			continue;
3854
3855		trace__init_syscall_bpf_progs(trace, key);
3856
3857		// It'll get at least the "!raw_syscalls:unaugmented"
3858		prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
3859		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3860		if (err)
3861			break;
3862		prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
3863		err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
3864		if (err)
3865			break;
3866
3867		/* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */
3868		memset(beauty_array, 0, sizeof(beauty_array));
3869		err = trace__bpf_sys_enter_beauty_map(trace, key, (unsigned int *)beauty_array);
3870		if (err)
3871			continue;
3872		err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY);
3873		if (err)
3874			break;
3875	}
3876
3877	/*
3878	 * Now lets do a second pass looking for enabled syscalls without
3879	 * an augmenter that have a signature that is a superset of another
3880	 * syscall with an augmenter so that we can auto-reuse it.
3881	 *
3882	 * I.e. if we have an augmenter for the "open" syscall that has
3883	 * this signature:
3884	 *
3885	 *   int open(const char *pathname, int flags, mode_t mode);
3886	 *
3887	 * I.e. that will collect just the first string argument, then we
3888	 * can reuse it for the 'creat' syscall, that has this signature:
3889	 *
3890	 *   int creat(const char *pathname, mode_t mode);
3891	 *
3892	 * and for:
3893	 *
3894	 *   int stat(const char *pathname, struct stat *statbuf);
3895	 *   int lstat(const char *pathname, struct stat *statbuf);
3896	 *
3897	 * Because the 'open' augmenter will collect the first arg as a string,
3898	 * and leave alone all the other args, which already helps with
3899	 * beautifying 'stat' and 'lstat''s pathname arg.
3900	 *
3901	 * Then, in time, when 'stat' gets an augmenter that collects both
3902	 * first and second arg (this one on the raw_syscalls:sys_exit prog
3903	 * array tail call, then that one will be used.
3904	 */
3905	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3906		int key = syscalltbl__id_at_idx(trace->sctbl, i);
3907		struct syscall *sc = trace__syscall_info(trace, NULL, key);
3908		struct bpf_program *pair_prog;
3909		int prog_fd;
3910
3911		if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
3912			continue;
3913
3914		/*
3915		 * For now we're just reusing the sys_enter prog, and if it
3916		 * already has an augmenter, we don't need to find one.
3917		 */
3918		if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
3919			continue;
3920
3921		/*
3922		 * Look at all the other syscalls for one that has a signature
3923		 * that is close enough that we can share:
3924		 */
3925		pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
3926		if (pair_prog == NULL)
3927			continue;
3928
3929		sc->bpf_prog.sys_enter = pair_prog;
3930
3931		/*
3932		 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
3933		 * with the fd for the program we're reusing:
3934		 */
3935		prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
3936		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3937		if (err)
3938			break;
3939	}
3940
3941	return err;
3942}
3943#endif // HAVE_BPF_SKEL
3944
3945static int trace__set_ev_qualifier_filter(struct trace *trace)
3946{
3947	if (trace->syscalls.events.sys_enter)
3948		return trace__set_ev_qualifier_tp_filter(trace);
3949	return 0;
3950}
3951
3952static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
3953				    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
3954{
3955	int err = 0;
3956#ifdef HAVE_LIBBPF_SUPPORT
3957	bool value = true;
3958	int map_fd = bpf_map__fd(map);
3959	size_t i;
3960
3961	for (i = 0; i < npids; ++i) {
3962		err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
3963		if (err)
3964			break;
3965	}
3966#endif
3967	return err;
3968}
3969
3970static int trace__set_filter_loop_pids(struct trace *trace)
3971{
3972	unsigned int nr = 1, err;
3973	pid_t pids[32] = {
3974		getpid(),
3975	};
3976	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
3977
3978	while (thread && nr < ARRAY_SIZE(pids)) {
3979		struct thread *parent = machine__find_thread(trace->host,
3980							     thread__ppid(thread),
3981							     thread__ppid(thread));
3982
3983		if (parent == NULL)
3984			break;
3985
3986		if (!strcmp(thread__comm_str(parent), "sshd") ||
3987		    strstarts(thread__comm_str(parent), "gnome-terminal")) {
3988			pids[nr++] = thread__tid(parent);
3989			break;
3990		}
3991		thread = parent;
3992	}
3993
3994	err = evlist__append_tp_filter_pids(trace->evlist, nr, pids);
3995	if (!err && trace->filter_pids.map)
3996		err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
3997
3998	return err;
3999}
4000
4001static int trace__set_filter_pids(struct trace *trace)
4002{
4003	int err = 0;
4004	/*
4005	 * Better not use !target__has_task() here because we need to cover the
4006	 * case where no threads were specified in the command line, but a
4007	 * workload was, and in that case we will fill in the thread_map when
4008	 * we fork the workload in evlist__prepare_workload.
4009	 */
4010	if (trace->filter_pids.nr > 0) {
4011		err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
4012						    trace->filter_pids.entries);
4013		if (!err && trace->filter_pids.map) {
4014			err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
4015						       trace->filter_pids.entries);
4016		}
4017	} else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
4018		err = trace__set_filter_loop_pids(trace);
4019	}
4020
4021	return err;
4022}
4023
4024static int __trace__deliver_event(struct trace *trace, union perf_event *event)
4025{
4026	struct evlist *evlist = trace->evlist;
4027	struct perf_sample sample;
4028	int err = evlist__parse_sample(evlist, event, &sample);
4029
4030	if (err)
4031		fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
4032	else
4033		trace__handle_event(trace, event, &sample);
4034
4035	return 0;
4036}
4037
4038static int __trace__flush_events(struct trace *trace)
4039{
4040	u64 first = ordered_events__first_time(&trace->oe.data);
4041	u64 flush = trace->oe.last - NSEC_PER_SEC;
4042
4043	/* Is there some thing to flush.. */
4044	if (first && first < flush)
4045		return ordered_events__flush_time(&trace->oe.data, flush);
4046
4047	return 0;
4048}
4049
4050static int trace__flush_events(struct trace *trace)
4051{
4052	return !trace->sort_events ? 0 : __trace__flush_events(trace);
4053}
4054
/*
 * Deliver one event: directly when not sorting, otherwise queue it on
 * the ordered_events machinery (keyed by timestamp) and opportunistically
 * flush what's old enough.
 */
static int trace__deliver_event(struct trace *trace, union perf_event *event)
{
	int err;

	if (!trace->sort_events)
		return __trace__deliver_event(trace, event);

	/* Also advances trace->oe.last to this event's timestamp. */
	err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
	/* NOTE(review): -1 appears to mean "no timestamp", still queued below — confirm */
	if (err && err != -1)
		return err;

	err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL);
	if (err)
		return err;

	return trace__flush_events(trace);
}
4072
/*
 * ordered_events delivery callback: events pop out of the queue in
 * timestamp order and are funneled into the common delivery path.
 */
static int ordered_events__deliver_event(struct ordered_events *oe,
					 struct ordered_event *event)
{
	struct trace *trace = container_of(oe, struct trace, oe.data);

	return __trace__deliver_event(trace, event->event);
}
4080
4081static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg,
4082								   char **type)
4083{
4084	struct tep_format_field *field;
4085	struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel);
4086
4087	if (evsel->tp_format == NULL || fmt == NULL)
4088		return NULL;
4089
4090	for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt)
4091		if (strcmp(field->name, arg) == 0) {
4092			*type = field->type;
4093			return fmt;
4094		}
4095
4096	return NULL;
4097}
4098
4099static int trace__expand_filter(struct trace *trace, struct evsel *evsel)
4100{
4101	char *tok, *left = evsel->filter, *new_filter = evsel->filter;
4102
4103	while ((tok = strpbrk(left, "=<>!")) != NULL) {
4104		char *right = tok + 1, *right_end;
4105
4106		if (*right == '=')
4107			++right;
4108
4109		while (isspace(*right))
4110			++right;
4111
4112		if (*right == '\0')
4113			break;
4114
4115		while (!isalpha(*left))
4116			if (++left == tok) {
4117				/*
4118				 * Bail out, can't find the name of the argument that is being
4119				 * used in the filter, let it try to set this filter, will fail later.
4120				 */
4121				return 0;
4122			}
4123
4124		right_end = right + 1;
4125		while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
4126			++right_end;
4127
4128		if (isalpha(*right)) {
4129			struct syscall_arg_fmt *fmt;
4130			int left_size = tok - left,
4131			    right_size = right_end - right;
4132			char arg[128], *type;
4133
4134			while (isspace(left[left_size - 1]))
4135				--left_size;
4136
4137			scnprintf(arg, sizeof(arg), "%.*s", left_size, left);
4138
4139			fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type);
4140			if (fmt == NULL) {
4141				pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
4142				       arg, evsel->name, evsel->filter);
4143				return -1;
4144			}
4145
4146			pr_debug2("trying to expand \"%s\" \"%.*s\" \"%.*s\" -> ",
4147				 arg, (int)(right - tok), tok, right_size, right);
4148
4149			if (fmt->strtoul) {
4150				u64 val;
4151				struct syscall_arg syscall_arg = {
4152					.trace = trace,
4153					.fmt   = fmt,
4154					.type_name = type,
4155					.parm = fmt->parm,
4156				};
4157
4158				if (fmt->strtoul(right, right_size, &syscall_arg, &val)) {
4159					char *n, expansion[19];
4160					int expansion_lenght = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val);
4161					int expansion_offset = right - new_filter;
4162
4163					pr_debug("%s", expansion);
4164
4165					if (asprintf(&n, "%.*s%s%s", expansion_offset, new_filter, expansion, right_end) < 0) {
4166						pr_debug(" out of memory!\n");
4167						free(new_filter);
4168						return -1;
4169					}
4170					if (new_filter != evsel->filter)
4171						free(new_filter);
4172					left = n + expansion_offset + expansion_lenght;
4173					new_filter = n;
4174				} else {
4175					pr_err("\"%.*s\" not found for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4176					       right_size, right, arg, evsel->name, evsel->filter);
4177					return -1;
4178				}
4179			} else {
4180				pr_err("No resolver (strtoul) for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4181				       arg, evsel->name, evsel->filter);
4182				return -1;
4183			}
4184
4185			pr_debug("\n");
4186		} else {
4187			left = right_end;
4188		}
4189	}
4190
4191	if (new_filter != evsel->filter) {
4192		pr_debug("New filter for %s: %s\n", evsel->name, new_filter);
4193		evsel__set_filter(evsel, new_filter);
4194		free(new_filter);
4195	}
4196
4197	return 0;
4198}
4199
4200static int trace__expand_filters(struct trace *trace, struct evsel **err_evsel)
4201{
4202	struct evlist *evlist = trace->evlist;
4203	struct evsel *evsel;
4204
4205	evlist__for_each_entry(evlist, evsel) {
4206		if (evsel->filter == NULL)
4207			continue;
4208
4209		if (trace__expand_filter(trace, evsel)) {
4210			*err_evsel = evsel;
4211			return -1;
4212		}
4213	}
4214
4215	return 0;
4216}
4217
4218static int trace__run(struct trace *trace, int argc, const char **argv)
4219{
4220	struct evlist *evlist = trace->evlist;
4221	struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
4222	int err = -1, i;
4223	unsigned long before;
4224	const bool forks = argc > 0;
4225	bool draining = false;
4226
4227	trace->live = true;
4228
4229	if (!trace->raw_augmented_syscalls) {
4230		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
4231			goto out_error_raw_syscalls;
4232
4233		if (trace->trace_syscalls)
4234			trace->vfs_getname = evlist__add_vfs_getname(evlist);
4235	}
4236
4237	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
4238		pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
4239		if (pgfault_maj == NULL)
4240			goto out_error_mem;
4241		evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
4242		evlist__add(evlist, pgfault_maj);
4243	}
4244
4245	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
4246		pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
4247		if (pgfault_min == NULL)
4248			goto out_error_mem;
4249		evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
4250		evlist__add(evlist, pgfault_min);
4251	}
4252
4253	/* Enable ignoring missing threads when -u/-p option is defined. */
4254	trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;
4255
4256	if (trace->sched &&
4257	    evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
 
4258		goto out_error_sched_stat_runtime;
 
4259	/*
4260	 * If a global cgroup was set, apply it to all the events without an
4261	 * explicit cgroup. I.e.:
4262	 *
4263	 * 	trace -G A -e sched:*switch
4264	 *
4265	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
4266	 * _and_ sched:sched_switch to the 'A' cgroup, while:
4267	 *
4268	 * trace -e sched:*switch -G A
4269	 *
4270	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
4271	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
4272	 * a cgroup (on the root cgroup, sys wide, etc).
4273	 *
4274	 * Multiple cgroups:
4275	 *
4276	 * trace -G A -e sched:*switch -G B
4277	 *
4278	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
4279	 * to the 'B' cgroup.
4280	 *
4281	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
4282	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
4283	 */
4284	if (trace->cgroup)
4285		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
4286
4287	err = evlist__create_maps(evlist, &trace->opts.target);
4288	if (err < 0) {
4289		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
4290		goto out_delete_evlist;
4291	}
4292
4293	err = trace__symbols_init(trace, evlist);
4294	if (err < 0) {
4295		fprintf(trace->output, "Problems initializing symbol libraries!\n");
4296		goto out_delete_evlist;
4297	}
4298
4299	evlist__config(evlist, &trace->opts, &callchain_param);
 
 
 
4300
4301	if (forks) {
4302		err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL);
 
4303		if (err < 0) {
4304			fprintf(trace->output, "Couldn't run the workload!\n");
4305			goto out_delete_evlist;
4306		}
4307		workload_pid = evlist->workload.pid;
4308	}
4309
4310	err = evlist__open(evlist);
4311	if (err < 0)
4312		goto out_error_open;
4313#ifdef HAVE_BPF_SKEL
4314	if (trace->syscalls.events.bpf_output) {
4315		struct perf_cpu cpu;
4316
4317		/*
4318		 * Set up the __augmented_syscalls__ BPF map to hold for each
4319		 * CPU the bpf-output event's file descriptor.
4320		 */
4321		perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
4322			bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
4323					&cpu.cpu, sizeof(int),
4324					xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
4325						       cpu.cpu, 0),
4326					sizeof(__u32), BPF_ANY);
4327		}
4328	}
4329
4330	if (trace->skel)
4331		trace->filter_pids.map = trace->skel->maps.pids_filtered;
4332#endif
4333	err = trace__set_filter_pids(trace);
 
 
 
 
 
 
 
4334	if (err < 0)
4335		goto out_error_mem;
4336
4337#ifdef HAVE_BPF_SKEL
4338	if (trace->skel && trace->skel->progs.sys_enter)
4339		trace__init_syscalls_bpf_prog_array_maps(trace);
4340#endif
4341
4342	if (trace->ev_qualifier_ids.nr > 0) {
4343		err = trace__set_ev_qualifier_filter(trace);
4344		if (err < 0)
4345			goto out_errno;
4346
4347		if (trace->syscalls.events.sys_exit) {
4348			pr_debug("event qualifier tracepoint filter: %s\n",
4349				 trace->syscalls.events.sys_exit->filter);
4350		}
4351	}
4352
4353	/*
4354	 * If the "close" syscall is not traced, then we will not have the
4355	 * opportunity to, in syscall_arg__scnprintf_close_fd() invalidate the
4356	 * fd->pathname table and were ending up showing the last value set by
4357	 * syscalls opening a pathname and associating it with a descriptor or
4358	 * reading it from /proc/pid/fd/ in cases where that doesn't make
4359	 * sense.
4360	 *
4361	 *  So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
4362	 *  not in use.
4363	 */
4364	trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));
4365
4366	err = trace__expand_filters(trace, &evsel);
4367	if (err)
4368		goto out_delete_evlist;
4369	err = evlist__apply_filters(evlist, &evsel, &trace->opts.target);
4370	if (err < 0)
4371		goto out_error_apply_filters;
4372
4373	err = evlist__mmap(evlist, trace->opts.mmap_pages);
4374	if (err < 0)
4375		goto out_error_mmap;
4376
4377	if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
4378		evlist__enable(evlist);
4379
4380	if (forks)
4381		evlist__start_workload(evlist);
4382
4383	if (trace->opts.target.initial_delay) {
4384		usleep(trace->opts.target.initial_delay * 1000);
4385		evlist__enable(evlist);
4386	}
4387
4388	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
4389		perf_thread_map__nr(evlist->core.threads) > 1 ||
4390		evlist__first(evlist)->core.attr.inherit;
4391
4392	/*
4393	 * Now that we already used evsel->core.attr to ask the kernel to setup the
4394	 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
4395	 * trace__resolve_callchain(), allowing per-event max-stack settings
4396	 * to override an explicitly set --max-stack global setting.
4397	 */
4398	evlist__for_each_entry(evlist, evsel) {
4399		if (evsel__has_callchain(evsel) &&
4400		    evsel->core.attr.sample_max_stack == 0)
4401			evsel->core.attr.sample_max_stack = trace->max_stack;
4402	}
4403again:
4404	before = trace->nr_events;
4405
4406	for (i = 0; i < evlist->core.nr_mmaps; i++) {
4407		union perf_event *event;
4408		struct mmap *md;
4409
4410		md = &evlist->mmap[i];
4411		if (perf_mmap__read_init(&md->core) < 0)
4412			continue;
4413
4414		while ((event = perf_mmap__read_event(&md->core)) != NULL) {
 
 
4415			++trace->nr_events;
4416
4417			err = trace__deliver_event(trace, event);
4418			if (err)
4419				goto out_disable;
 
 
4420
4421			perf_mmap__consume(&md->core);
 
 
4422
4423			if (interrupted)
4424				goto out_disable;
4425
4426			if (done && !draining) {
4427				evlist__disable(evlist);
4428				draining = true;
4429			}
4430		}
4431		perf_mmap__read_done(&md->core);
4432	}
4433
4434	if (trace->nr_events == before) {
4435		int timeout = done ? 100 : -1;
4436
4437		if (!draining && evlist__poll(evlist, timeout) > 0) {
4438			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
4439				draining = true;
4440
4441			goto again;
4442		} else {
4443			if (trace__flush_events(trace))
4444				goto out_disable;
4445		}
4446	} else {
4447		goto again;
4448	}
4449
4450out_disable:
4451	thread__zput(trace->current);
4452
4453	evlist__disable(evlist);
4454
4455	if (trace->sort_events)
4456		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
4457
4458	if (!err) {
4459		if (trace->summary)
4460			trace__fprintf_thread_summary(trace, trace->output);
4461
4462		if (trace->show_tool_stats) {
4463			fprintf(trace->output, "Stats:\n "
4464					       " vfs_getname : %" PRIu64 "\n"
4465					       " proc_getname: %" PRIu64 "\n",
4466				trace->stats.vfs_getname,
4467				trace->stats.proc_getname);
4468		}
4469	}
4470
4471out_delete_evlist:
4472	trace__symbols__exit(trace);
4473	evlist__free_syscall_tp_fields(evlist);
4474	evlist__delete(evlist);
4475	cgroup__put(trace->cgroup);
4476	trace->evlist = NULL;
4477	trace->live = false;
4478	return err;
4479{
4480	char errbuf[BUFSIZ];
4481
4482out_error_sched_stat_runtime:
4483	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
4484	goto out_error;
4485
4486out_error_raw_syscalls:
4487	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
4488	goto out_error;
4489
4490out_error_mmap:
4491	evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
4492	goto out_error;
4493
4494out_error_open:
4495	evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
4496
4497out_error:
4498	fprintf(trace->output, "%s\n", errbuf);
4499	goto out_delete_evlist;
4500
4501out_error_apply_filters:
4502	fprintf(trace->output,
4503		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
4504		evsel->filter, evsel__name(evsel), errno,
4505		str_error_r(errno, errbuf, sizeof(errbuf)));
4506	goto out_delete_evlist;
4507}
4508out_error_mem:
4509	fprintf(trace->output, "Not enough memory to run!\n");
4510	goto out_delete_evlist;
4511
4512out_errno:
4513	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
4514	goto out_delete_evlist;
4515}
4516
/*
 * Replay mode ('perf trace -i perf.data'): process a previously recorded
 * session, formatting the recorded syscall tracepoints and page fault
 * events with the same strace-like output as live tracing.
 * Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,


	};
	struct perf_session *session;
	struct evsel *evsel;
	int err = -1;

	/* Wire up the stock processors for the perf.data record types. */
	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	/* Pairing sys_enter with sys_exit needs timestamp-ordered delivery. */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, &trace->tool);
	if (IS_ERR(session))
		return PTR_ERR(session);

	/*
	 * Restrict symbol resolution to the recorded pids/tids, if given.
	 * NOTE(review): strdup() results are unchecked; on allocation
	 * failure the filter is silently dropped — presumably tolerable.
	 */
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter");
	trace->syscalls.events.sys_enter = evsel;
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter");


	if (evsel &&
	    (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit");
	trace->syscalls.events.sys_exit = evsel;
	if (evsel == NULL)
		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit");

	if (evsel &&
	    (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to our formatter. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err); /* NOTE(review): message lacks a trailing '\n' */

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
4613
static size_t trace__fprintf_threads_header(FILE *fp)
{
	/* Banner printed once, before the per-thread summaries. */
	return fprintf(fp, "\n Summary of events:\n\n");
}
4622
/*
 * Re-sort the per-thread syscall stats rb-tree (keyed by syscall id in an
 * intlist) into one ordered by total time spent ('msecs', descending),
 * which is the order used by the summary table.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct syscall_stats *stats;
	double		     msecs;
	int		     syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct syscall_stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* Total time ~= number of calls * average duration (ns -> msec). */
	entry->msecs   = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0;
}
4636
/*
 * Print one thread's per-syscall statistics table (calls, errors,
 * total/min/avg/max duration and relative stddev), ordered by total time
 * spent, descending.  Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Builds the msecs-ordered resorted tree declared above. */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct syscall_stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Durations are accumulated in ns; convert to msec. */
			double min = (double)(stats->stats.min) / NSEC_PER_MSEC;
			double max = (double)(stats->stats.max) / NSEC_PER_MSEC;
			double avg = avg_stats(&stats->stats);
			double pct;
			u64 n = (u64)stats->stats.n;

			/* Relative standard deviation, as a percentage of the mean. */
			pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
					   n, stats->nr_failures, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);

			/* Optional per-errno failure breakdown (--errno-summary). */
			if (trace->errno_summary && stats->nr_failures) {
				int e;

				for (e = 0; e < stats->max_errno; ++e) {
					if (stats->errnos[e] != 0)
						fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]);
				}
			}
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
4688
/*
 * Print one thread's summary: a header line (comm, tid, event count and
 * its share of all events, fault counts, optional runtime) followed by
 * its per-syscall stats table.  Returns the number of characters printed.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	/* Threads we never handled an event for carry no private state. */
	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen in the session. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread));
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
4716
4717static unsigned long thread__nr_events(struct thread_trace *ttrace)
4718{
4719	return ttrace ? ttrace->nr_events : 0;
4720}
4721
4722static int trace_nr_events_cmp(void *priv __maybe_unused,
4723			       const struct list_head *la,
4724			       const struct list_head *lb)
4725{
4726	struct thread_list *a = list_entry(la, struct thread_list, list);
4727	struct thread_list *b = list_entry(lb, struct thread_list, list);
4728	unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread));
4729	unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread));
4730
4731	if (a_nr_events != b_nr_events)
4732		return a_nr_events < b_nr_events ? -1 : 1;
4733
4734	/* Identical number of threads, place smaller tids first. */
4735	return thread__tid(a->thread) < thread__tid(b->thread)
4736		? -1
4737		: (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0);
4738}
4739
4740static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
4741{
4742	size_t printed = trace__fprintf_threads_header(fp);
4743	LIST_HEAD(threads);
 
4744
4745	if (machine__thread_list(trace->host, &threads) == 0) {
4746		struct thread_list *pos;
4747
4748		list_sort(NULL, &threads, trace_nr_events_cmp);
 
 
 
 
 
 
4749
4750		list_for_each_entry(pos, &threads, list)
4751			printed += trace__fprintf_thread(fp, pos->thread, trace);
4752	}
4753	thread_list__delete(&threads);
4754	return printed;
4755}
4756
4757static int trace__set_duration(const struct option *opt, const char *str,
4758			       int unset __maybe_unused)
4759{
4760	struct trace *trace = opt->value;
4761
4762	trace->duration_filter = atof(str);
4763	return 0;
4764}
4765
4766static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
4767					      int unset __maybe_unused)
4768{
4769	int ret = -1;
4770	size_t i;
4771	struct trace *trace = opt->value;
4772	/*
4773	 * FIXME: introduce a intarray class, plain parse csv and create a
4774	 * { int nr, int entries[] } struct...
4775	 */
4776	struct intlist *list = intlist__new(str);
4777
4778	if (list == NULL)
4779		return -1;
4780
4781	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
4782	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
4783
4784	if (trace->filter_pids.entries == NULL)
4785		goto out;
4786
4787	trace->filter_pids.entries[0] = getpid();
4788
4789	for (i = 1; i < trace->filter_pids.nr; ++i)
4790		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
4791
4792	intlist__delete(list);
4793	ret = 0;
4794out:
4795	return ret;
4796}
4797
4798static int trace__open_output(struct trace *trace, const char *filename)
4799{
4800	struct stat st;
4801
4802	if (!stat(filename, &st) && st.st_size) {
4803		char oldname[PATH_MAX];
4804
4805		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
4806		unlink(oldname);
4807		rename(filename, oldname);
4808	}
4809
4810	trace->output = fopen(filename, "w");
4811
4812	return trace->output == NULL ? -errno : 0;
4813}
4814
4815static int parse_pagefaults(const struct option *opt, const char *str,
4816			    int unset __maybe_unused)
4817{
4818	int *trace_pgfaults = opt->value;
4819
4820	if (strcmp(str, "all") == 0)
4821		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
4822	else if (strcmp(str, "maj") == 0)
4823		*trace_pgfaults |= TRACE_PFMAJ;
4824	else if (strcmp(str, "min") == 0)
4825		*trace_pgfaults |= TRACE_PFMIN;
4826	else
4827		return -1;
4828
4829	return 0;
4830}
4831
4832static void evlist__set_default_evsel_handler(struct evlist *evlist, void *handler)
4833{
4834	struct evsel *evsel;
4835
4836	evlist__for_each_entry(evlist, evsel) {
4837		if (evsel->handler == NULL)
4838			evsel->handler = handler;
4839	}
4840}
4841
/*
 * Copy the hand-crafted per-argument pretty-printers for syscall 'name'
 * (if any) onto this evsel's formatter array, skipping the leading
 * tracepoint field holding the syscall number ("__syscall_nr" or "nr"),
 * which is not a real syscall argument.
 */
static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name)
{
	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);

	if (fmt) {
		const struct syscall_fmt *scfmt = syscall_fmt__find(name);

		if (scfmt) {
			int skip = 0;

			if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 ||
			    strcmp(evsel->tp_format->format.fields->name, "nr") == 0)
				++skip;

			memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt));
		}
	}
}
4860
/*
 * Initialize formatting state for every not-yet-initialized tracepoint
 * evsel: non-syscall tracepoints get the generic per-field scnprintf
 * setup (possibly BTF assisted, in which case *use_btf is set by the
 * callee), while syscalls:sys_{enter,exit}_* events get their payload
 * accessors (id + args, or id + ret) plus the hand-crafted argument
 * formatters.  Returns 0 on success, -1 on error.
 */
static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf)
{
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		/* Already initialized, or no tracefs format to work from. */
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls")) {
			evsel__init_tp_arg_scnprintf(evsel, use_btf);
			continue;
		}

		if (evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = __evsel__syscall_tp(evsel);

			/* The args payload starts right after the u64 syscall id. */
			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;

			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1);
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = __evsel__syscall_tp(evsel);

			/* The return value is the u64 right after the syscall id. */
			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;

			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1);
		}
	}

	return 0;
}
4896
4897/*
4898 * XXX: Hackish, just splitting the combined -e+--event (syscalls
4899 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
4900 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
4901 *
4902 * It'd be better to introduce a parse_options() variant that would return a
4903 * list with the terms it didn't match to an event...
4904 */
/*
 * -e/--expr option callback: split a comma-separated list into syscall
 * names/globs/aliases/strace-group files (lists[1], which become the
 * syscall qualifier) and everything else (lists[0], handed to the regular
 * event parser).  Returns 0 on success, -1 on error.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	/* lists[1]: syscall terms; lists[0]: ordinary event terms. */
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];
	const struct syscall_fmt *fmt;

	if (strace_groups_dir == NULL)
		return -1;

	/* A leading '!' negates the whole syscall qualifier list. */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* Temporarily NUL-terminate at the next comma to isolate one term. */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		/* Is this term a syscall name, or a glob matching one? */
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
			goto do_concat;
		}

		/* An strace-style alias for a syscall? */
		fmt = syscall_fmt__find_by_alias(s);
		if (fmt != NULL) {
			list = 1;
			s = fmt->name;
		} else {
			/* Or a readable strace groups file (e.g. "file", "net")? */
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}
do_concat:
		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			/* 'len' covers all of 'str', so one buffer per list suffices. */
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* Restore the comma and move on to the next term. */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	/* Everything that wasn't a syscall goes to the regular event parser. */
	if (lists[0]) {
		struct parse_events_option_args parse_events_option_args = {
			.evlistp = &trace->evlist,
		};
		struct option o = {
			.value = &parse_events_option_args,
		};
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	free(strace_groups_dir);
	free(lists[0]);
	free(lists[1]);
	/* Undo any in-place split still pending on the caller's string. */
	if (sep)
		*sep = ',';

	return err;
}
4997
4998static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
4999{
5000	struct trace *trace = opt->value;
5001
5002	if (!list_empty(&trace->evlist->core.entries)) {
5003		struct option o = {
5004			.value = &trace->evlist,
5005		};
5006		return parse_cgroups(&o, str, unset);
5007	}
5008	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
5009
5010	return 0;
5011}
5012
5013static int trace__config(const char *var, const char *value, void *arg)
5014{
5015	struct trace *trace = arg;
5016	int err = 0;
5017
5018	if (!strcmp(var, "trace.add_events")) {
5019		trace->perfconfig_events = strdup(value);
5020		if (trace->perfconfig_events == NULL) {
5021			pr_err("Not enough memory for %s\n", "trace.add_events");
5022			return -1;
5023		}
5024	} else if (!strcmp(var, "trace.show_timestamp")) {
5025		trace->show_tstamp = perf_config_bool(var, value);
5026	} else if (!strcmp(var, "trace.show_duration")) {
5027		trace->show_duration = perf_config_bool(var, value);
5028	} else if (!strcmp(var, "trace.show_arg_names")) {
5029		trace->show_arg_names = perf_config_bool(var, value);
5030		if (!trace->show_arg_names)
5031			trace->show_zeros = true;
5032	} else if (!strcmp(var, "trace.show_zeros")) {
5033		bool new_show_zeros = perf_config_bool(var, value);
5034		if (!trace->show_arg_names && !new_show_zeros) {
5035			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
5036			goto out;
5037		}
5038		trace->show_zeros = new_show_zeros;
5039	} else if (!strcmp(var, "trace.show_prefix")) {
5040		trace->show_string_prefix = perf_config_bool(var, value);
5041	} else if (!strcmp(var, "trace.no_inherit")) {
5042		trace->opts.no_inherit = perf_config_bool(var, value);
5043	} else if (!strcmp(var, "trace.args_alignment")) {
5044		int args_alignment = 0;
5045		if (perf_config_int(&args_alignment, var, value) == 0)
5046			trace->args_alignment = args_alignment;
5047	} else if (!strcmp(var, "trace.tracepoint_beautifiers")) {
5048		if (strcasecmp(value, "libtraceevent") == 0)
5049			trace->libtraceevent_print = true;
5050		else if (strcasecmp(value, "libbeauty") == 0)
5051			trace->libtraceevent_print = false;
5052	}
5053out:
5054	return err;
5055}
5056
5057static void trace__exit(struct trace *trace)
5058{
5059	int i;
5060
5061	strlist__delete(trace->ev_qualifier);
5062	zfree(&trace->ev_qualifier_ids.entries);
5063	if (trace->syscalls.table) {
5064		for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
5065			syscall__exit(&trace->syscalls.table[i]);
5066		zfree(&trace->syscalls.table);
5067	}
5068	syscalltbl__delete(trace->sctbl);
5069	zfree(&trace->perfconfig_events);
5070}
5071
5072#ifdef HAVE_BPF_SKEL
/*
 * Add the bpf-output event that the syscall-augmenting BPF programs
 * write their payloads into.  Returns parse_event()'s result.
 */
static int bpf__setup_bpf_output(struct evlist *evlist)
{
	int err;

	err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
	if (err != 0)
		pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");

	return err;
}
5082#endif
5083
5084int cmd_trace(int argc, const char **argv)
5085{
5086	const char *trace_usage[] = {
5087		"perf trace [<options>] [<command>]",
5088		"perf trace [<options>] -- <command> [<options>]",
5089		"perf trace record [<options>] [<command>]",
5090		"perf trace record [<options>] -- <command> [<options>]",
5091		NULL
5092	};
5093	struct trace trace = {
 
 
 
5094		.opts = {
5095			.target = {
5096				.uid	   = UINT_MAX,
5097				.uses_mmap = true,
5098			},
5099			.user_freq     = UINT_MAX,
5100			.user_interval = ULLONG_MAX,
5101			.no_buffering  = true,
5102			.mmap_pages    = UINT_MAX,
 
5103		},
5104		.output = stderr,
5105		.show_comm = true,
5106		.show_tstamp = true,
5107		.show_duration = true,
5108		.show_arg_names = true,
5109		.args_alignment = 70,
5110		.trace_syscalls = false,
5111		.kernel_syscallchains = false,
5112		.max_stack = UINT_MAX,
5113		.max_events = ULONG_MAX,
5114	};
5115	const char *output_name = NULL;
5116	const struct option trace_options[] = {
5117	OPT_CALLBACK('e', "event", &trace, "event",
5118		     "event/syscall selector. use 'perf list' to list available events",
5119		     trace__parse_events_option),
5120	OPT_CALLBACK(0, "filter", &trace.evlist, "filter",
5121		     "event filter", parse_filter),
5122	OPT_BOOLEAN(0, "comm", &trace.show_comm,
5123		    "show the thread COMM next to its id"),
5124	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
5125	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
5126		     trace__parse_events_option),
5127	OPT_STRING('o', "output", &output_name, "file", "output file name"),
5128	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
5129	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
5130		    "trace events on existing process id"),
5131	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
5132		    "trace events on existing thread id"),
5133	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
5134		     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
5135	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
5136		    "system-wide collection from all CPUs"),
5137	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
5138		    "list of cpus to monitor"),
5139	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
5140		    "child tasks do not inherit counters"),
5141	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
5142		     "number of mmap data pages", evlist__parse_mmap_pages),
 
5143	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
5144		   "user to profile"),
5145	OPT_CALLBACK(0, "duration", &trace, "float",
5146		     "show only events with duration > N.M ms",
5147		     trace__set_duration),
5148	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
5149	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
5150	OPT_BOOLEAN('T', "time", &trace.full_time,
5151		    "Show full timestamp, not time relative to first start"),
5152	OPT_BOOLEAN(0, "failure", &trace.failure_only,
5153		    "Show only syscalls that failed"),
5154	OPT_BOOLEAN('s', "summary", &trace.summary_only,
5155		    "Show only syscall summary with statistics"),
5156	OPT_BOOLEAN('S', "with-summary", &trace.summary,
5157		    "Show all syscalls and summary with statistics"),
5158	OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
5159		    "Show errno stats per syscall, use with -s or -S"),
5160	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
5161		     "Trace pagefaults", parse_pagefaults, "maj"),
5162	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
5163	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
5164	OPT_CALLBACK(0, "call-graph", &trace.opts,
5165		     "record_mode[,record_size]", record_callchain_help,
5166		     &record_parse_callchain_opt),
5167	OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
5168		    "Use libtraceevent to print the tracepoint arguments."),
5169	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
5170		    "Show the kernel callchains on the syscall exit path"),
5171	OPT_ULONG(0, "max-events", &trace.max_events,
5172		"Set the maximum number of events to print, exit after that is reached. "),
5173	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
5174		     "Set the minimum stack depth when parsing the callchain, "
5175		     "anything below the specified depth will be ignored."),
5176	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
5177		     "Set the maximum stack depth when parsing the callchain, "
5178		     "anything beyond the specified depth will be ignored. "
5179		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
5180	OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
5181			"Sort batch of events before processing, use if getting out of order events"),
5182	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
5183			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
5184	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
5185			"per thread proc mmap processing timeout in ms"),
5186	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
5187		     trace__parse_cgroups),
5188	OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay,
5189		     "ms to wait before starting measurement after program "
5190		     "start"),
5191	OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
5192		       "to customized ones"),
5193	OPTS_EVSWITCH(&trace.evswitch),
5194	OPT_END()
5195	};
5196	bool __maybe_unused max_stack_user_set = true;
5197	bool mmap_pages_user_set = true;
5198	struct evsel *evsel;
5199	const char * const trace_subcommands[] = { "record", NULL };
5200	int err = -1;
5201	char bf[BUFSIZ];
5202	struct sigaction sigchld_act;
5203
5204	signal(SIGSEGV, sighandler_dump_stack);
5205	signal(SIGFPE, sighandler_dump_stack);
5206	signal(SIGINT, sighandler_interrupt);
5207
5208	memset(&sigchld_act, 0, sizeof(sigchld_act));
5209	sigchld_act.sa_flags = SA_SIGINFO;
5210	sigchld_act.sa_sigaction = sighandler_chld;
5211	sigaction(SIGCHLD, &sigchld_act, NULL);
5212
5213	trace.evlist = evlist__new();
5214	trace.sctbl = syscalltbl__new();
5215
5216	if (trace.evlist == NULL || trace.sctbl == NULL) {
5217		pr_err("Not enough memory to run!\n");
5218		err = -ENOMEM;
5219		goto out;
5220	}
5221
5222	/*
5223	 * Parsing .perfconfig may entail creating a BPF event, that may need
5224	 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
5225	 * is too small. This affects just this process, not touching the
5226	 * global setting. If it fails we'll get something in 'perf trace -v'
5227	 * to help diagnose the problem.
5228	 */
5229	rlimit__bump_memlock();
5230
5231	err = perf_config(trace__config, &trace);
5232	if (err)
5233		goto out;
5234
5235	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
5236				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
5237
5238	/*
5239	 * Here we already passed thru trace__parse_events_option() and it has
5240	 * already figured out if -e syscall_name, if not but if --event
5241	 * foo:bar was used, the user is interested _just_ in those, say,
5242	 * tracepoint events, not in the strace-like syscall-name-based mode.
5243	 *
5244	 * This is important because we need to check if strace-like mode is
5245	 * needed to decided if we should filter out the eBPF
5246	 * __augmented_syscalls__ code, if it is in the mix, say, via
5247	 * .perfconfig trace.add_events, and filter those out.
5248	 */
5249	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
5250	    trace.evlist->core.nr_entries == 0 /* Was --events used? */) {
5251		trace.trace_syscalls = true;
5252	}
5253	/*
5254	 * Now that we have --verbose figured out, lets see if we need to parse
5255	 * events from .perfconfig, so that if those events fail parsing, say some
5256	 * BPF program fails, then we'll be able to use --verbose to see what went
5257	 * wrong in more detail.
5258	 */
5259	if (trace.perfconfig_events != NULL) {
5260		struct parse_events_error parse_err;
5261
5262		parse_events_error__init(&parse_err);
5263		err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err);
5264		if (err)
5265			parse_events_error__print(&parse_err, trace.perfconfig_events);
5266		parse_events_error__exit(&parse_err);
5267		if (err)
5268			goto out;
5269	}
5270
5271	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
5272		usage_with_options_msg(trace_usage, trace_options,
5273				       "cgroup monitoring only available in system-wide mode");
5274	}
5275
5276#ifdef HAVE_BPF_SKEL
5277	if (!trace.trace_syscalls)
5278		goto skip_augmentation;
5279
5280	if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) {
5281		pr_debug("Syscall augmentation fails with record, disabling augmentation");
5282		goto skip_augmentation;
5283	}
5284
5285	trace.skel = augmented_raw_syscalls_bpf__open();
5286	if (!trace.skel) {
5287		pr_debug("Failed to open augmented syscalls BPF skeleton");
5288	} else {
5289		/*
5290		 * Disable attaching the BPF programs except for sys_enter and
5291		 * sys_exit that tail call into this as necessary.
5292		 */
5293		struct bpf_program *prog;
5294
5295		bpf_object__for_each_program(prog, trace.skel->obj) {
5296			if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit)
5297				bpf_program__set_autoattach(prog, /*autoattach=*/false);
5298		}
5299
5300		err = augmented_raw_syscalls_bpf__load(trace.skel);
5301
5302		if (err < 0) {
5303			libbpf_strerror(err, bf, sizeof(bf));
5304			pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf);
5305		} else {
5306			augmented_raw_syscalls_bpf__attach(trace.skel);
5307			trace__add_syscall_newtp(&trace);
5308		}
5309	}
5310
5311	err = bpf__setup_bpf_output(trace.evlist);
5312	if (err) {
5313		libbpf_strerror(err, bf, sizeof(bf));
5314		pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
5315		goto out;
5316	}
5317	trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
5318	assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__"));
5319skip_augmentation:
5320#endif
5321	err = -1;
5322
5323	if (trace.trace_pgfaults) {
5324		trace.opts.sample_address = true;
5325		trace.opts.sample_time = true;
5326	}
5327
5328	if (trace.opts.mmap_pages == UINT_MAX)
5329		mmap_pages_user_set = false;
5330
5331	if (trace.max_stack == UINT_MAX) {
5332		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
5333		max_stack_user_set = false;
5334	}
5335
5336#ifdef HAVE_DWARF_UNWIND_SUPPORT
5337	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
5338		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
5339	}
5340#endif
5341
5342	if (callchain_param.enabled) {
5343		if (!mmap_pages_user_set && geteuid() == 0)
5344			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
5345
5346		symbol_conf.use_callchain = true;
5347	}
5348
5349	if (trace.evlist->core.nr_entries > 0) {
5350		bool use_btf = false;
5351
5352		evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
5353		if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) {
5354			perror("failed to set syscalls:* tracepoint fields");
5355			goto out;
5356		}
5357
5358		if (use_btf)
5359			trace__load_vmlinux_btf(&trace);
5360	}
5361
5362	if (trace.sort_events) {
5363		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
5364		ordered_events__set_copy_on_queue(&trace.oe.data, true);
5365	}
5366
5367	/*
5368	 * If we are augmenting syscalls, then combine what we put in the
5369	 * __augmented_syscalls__ BPF map with what is in the
5370	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
5371	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
5372	 *
5373	 * We'll switch to look at two BPF maps, one for sys_enter and the
5374	 * other for sys_exit when we start augmenting the sys_exit paths with
5375	 * buffers that are being copied from kernel to userspace, think 'read'
5376	 * syscall.
5377	 */
5378	if (trace.syscalls.events.bpf_output) {
5379		evlist__for_each_entry(trace.evlist, evsel) {
5380			bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit");
5381
5382			if (raw_syscalls_sys_exit) {
5383				trace.raw_augmented_syscalls = true;
5384				goto init_augmented_syscall_tp;
5385			}
5386
5387			if (trace.syscalls.events.bpf_output->priv == NULL &&
5388			    strstr(evsel__name(evsel), "syscalls:sys_enter")) {
5389				struct evsel *augmented = trace.syscalls.events.bpf_output;
5390				if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
5391				    evsel__init_augmented_syscall_tp_args(augmented))
5392					goto out;
5393				/*
5394				 * Augmented is __augmented_syscalls__ BPF_OUTPUT event
5395				 * Above we made sure we can get from the payload the tp fields
5396				 * that we get from syscalls:sys_enter tracefs format file.
5397				 */
5398				augmented->handler = trace__sys_enter;
5399				/*
5400				 * Now we do the same for the *syscalls:sys_enter event so that
5401				 * if we handle it directly, i.e. if the BPF prog returns 0 so
5402				 * as not to filter it, then we'll handle it just like we would
5403				 * for the BPF_OUTPUT one:
5404				 */
5405				if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
5406				    evsel__init_augmented_syscall_tp_args(evsel))
5407					goto out;
5408				evsel->handler = trace__sys_enter;
5409			}
5410
5411			if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) {
5412				struct syscall_tp *sc;
5413init_augmented_syscall_tp:
5414				if (evsel__init_augmented_syscall_tp(evsel, evsel))
5415					goto out;
5416				sc = __evsel__syscall_tp(evsel);
5417				/*
5418				 * For now with BPF raw_augmented we hook into
5419				 * raw_syscalls:sys_enter and there we get all
5420				 * 6 syscall args plus the tracepoint common
5421				 * fields and the syscall_nr (another long).
5422				 * So we check if that is the case and if so
5423				 * don't look after the sc->args_size but
5424				 * always after the full raw_syscalls:sys_enter
5425				 * payload, which is fixed.
5426				 *
5427				 * We'll revisit this later to pass
5428				 * s->args_size to the BPF augmenter (now
5429				 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
5430				 * so that it copies only what we need for each
5431				 * syscall, like what happens when we use
5432				 * syscalls:sys_enter_NAME, so that we reduce
5433				 * the kernel/userspace traffic to just what is
5434				 * needed for each syscall.
5435				 */
5436				if (trace.raw_augmented_syscalls)
5437					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
5438				evsel__init_augmented_syscall_tp_ret(evsel);
5439				evsel->handler = trace__sys_exit;
5440			}
5441		}
5442	}
5443
5444	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
5445		return trace__record(&trace, argc-1, &argv[1]);
5446
5447	/* Using just --errno-summary will trigger --summary */
5448	if (trace.errno_summary && !trace.summary && !trace.summary_only)
5449		trace.summary_only = true;
5450
5451	/* summary_only implies summary option, but don't overwrite summary if set */
5452	if (trace.summary_only)
5453		trace.summary = trace.summary_only;
5454
5455	/* Keep exited threads, otherwise information might be lost for summary */
5456	if (trace.summary)
5457		symbol_conf.keep_exited_threads = true;
 
 
 
 
 
 
 
5458
5459	if (output_name != NULL) {
5460		err = trace__open_output(&trace, output_name);
5461		if (err < 0) {
5462			perror("failed to create output file");
5463			goto out;
5464		}
5465	}
5466
5467	err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
5468	if (err)
5469		goto out_close;
5470
5471	err = target__validate(&trace.opts.target);
5472	if (err) {
5473		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5474		fprintf(trace.output, "%s", bf);
5475		goto out_close;
5476	}
5477
5478	err = target__parse_uid(&trace.opts.target);
5479	if (err) {
5480		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5481		fprintf(trace.output, "%s", bf);
5482		goto out_close;
5483	}
5484
5485	if (!argc && target__none(&trace.opts.target))
5486		trace.opts.target.system_wide = true;
5487
5488	if (input_name)
5489		err = trace__replay(&trace);
5490	else
5491		err = trace__run(&trace, argc, argv);
5492
5493out_close:
5494	if (output_name != NULL)
5495		fclose(trace.output);
5496out:
5497	trace__exit(&trace);
5498#ifdef HAVE_BPF_SKEL
5499	augmented_raw_syscalls_bpf__destroy(trace.skel);
5500#endif
5501	return err;
5502}