/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
#include <stdlib.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};

struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
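
/*
 * Each TP_UINT_FIELD(bits) invocation above stamps out one fixed-width
 * accessor; e.g. TP_UINT_FIELD(32) expands to:
 *
 *	static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u32 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 *
 * The memcpy, instead of a plain cast-and-dereference, keeps the read safe
 * when raw_data + offset is not naturally aligned.
 */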

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	switch (format_field->size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}

struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
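
/*
 * Usage sketch: with a struct syscall_tp cached in evsel->priv, the
 * statement-expression macros above let handlers pull typed values straight
 * out of a raw sample, as done in trace__sys_enter()/trace__sys_exit() below:
 *
 *	int id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *	void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 */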

struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;
	u8	      idx;
	u8	      mask;
};

struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
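
/*
 * DEFINE_STRARRAY() pairs an array of names with its length so that
 * SCA_STRARRAY can map a syscall argument value to a string; the _OFFSET
 * variant handles enums that don't start at 0. E.g., with the declaration
 * further below:
 *
 *	static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 *	static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
 *
 * an epoll_ctl() op of 1 (EPOLL_CTL_ADD) prints as "ADD".
 */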

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	struct strarray *sa = arg->parm;
	int idx = arg->val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		return scnprintf(bf, size, intfmt, arg->val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */

static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex

static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
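
/*
 * Example output: access("foo", R_OK|W_OK) renders the second argument as
 * "RW", a plain F_OK as "F", and any leftover unknown bits are appended in
 * hex, e.g. "R|0x80".
 */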

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
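
/*
 * STRARRAY(arg, name, array) is shorthand for wiring a strarray beautifier
 * to argument slot 'arg' in a syscall_fmt entry below (the middle 'name'
 * parameter is purely documentary). E.g.:
 *
 *	{ .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
 *
 * expands to:
 *
 *	{ .name = "bpf", .errmsg = true,
 *	  .arg_scnprintf = { [0] = SCA_STRARRAY, },
 *	  .arg_parm	 = { [0] = &strarray__bpf_cmd, }, },
 */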

#include "trace/beauty/eventfd.c"
#include "trace/beauty/flock.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"
static struct syscall_fmt {
	const char *name;
	const char *alias;
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void	   *arg_parm[6];
	bool	   errmsg;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name	    = "chdir",	    .errmsg = true, },
	{ .name	    = "chmod",	    .errmsg = true, },
	{ .name	    = "chroot",	    .errmsg = true, },
	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name	    = "clone",	    .errpid = true, },
	{ .name	    = "close",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name	    = "connect",    .errmsg = true, },
	{ .name	    = "creat",	    .errmsg = true, },
	{ .name	    = "dup",	    .errmsg = true, },
	{ .name	    = "dup2",	    .errmsg = true, },
	{ .name	    = "dup3",	    .errmsg = true, },
	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name	    = "eventfd2",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name	    = "faccessat",  .errmsg = true, },
	{ .name	    = "fadvise64",  .errmsg = true, },
	{ .name	    = "fallocate",  .errmsg = true, },
	{ .name	    = "fchdir",	    .errmsg = true, },
	{ .name	    = "fchmod",	    .errmsg = true, },
	{ .name	    = "fchmodat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fchown",	    .errmsg = true, },
	{ .name	    = "fchownat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fcntl",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name	    = "fdatasync",  .errmsg = true, },
	{ .name	    = "flock",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name	    = "fsetxattr",  .errmsg = true, },
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
	{ .name	    = "fstatfs",    .errmsg = true, },
	{ .name	    = "fsync",    .errmsg = true, },
	{ .name	    = "ftruncate", .errmsg = true, },
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name	    = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "getdents",   .errmsg = true, },
	{ .name	    = "getdents64", .errmsg = true, },
	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",  .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "getxattr",   .errmsg = true, },
	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name	    = "kill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "lchown",    .errmsg = true, },
	{ .name	    = "lgetxattr",  .errmsg = true, },
	{ .name	    = "linkat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "listxattr",  .errmsg = true, },
	{ .name	    = "llistxattr", .errmsg = true, },
	{ .name	    = "lremovexattr",  .errmsg = true, },
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
	{ .name	    = "lsetxattr",  .errmsg = true, },
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
	{ .name	    = "lsxattr",    .errmsg = true, },
	{ .name     = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name	    = "mkdir",    .errmsg = true, },
	{ .name	    = "mkdirat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mknod",      .errmsg = true, },
	{ .name	    = "mknodat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mlock",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mlockall",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mmap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
	{ .name	    = "mprotect",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name	    = "munlock",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "open",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "openat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD,  /* group_fd */
			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
	{ .name	    = "pipe2",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
	{ .name	    = "pwritev",    .errmsg = true, },
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "readlink",   .errmsg = true, },
	{ .name	    = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "readv",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmmsg",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "removexattr", .errmsg = true, },
	{ .name	    = "renameat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "rmdir",    .errmsg = true, },
	{ .name	    = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "sched_getattr",	      .errmsg = true, },
	{ .name	    = "sched_setattr",	      .errmsg = true, },
	{ .name	    = "sched_setscheduler",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
	{ .name	    = "seccomp", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
	{ .name	    = "sendmmsg",    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendto",	    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "setpgid",    .errmsg = true, },
	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "setxattr",   .errmsg = true, },
	{ .name	    = "shutdown",   .errmsg = true, },
	{ .name	    = "socket",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
	{ .name	    = "statfs",	    .errmsg = true, },
	{ .name	    = "swapoff",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "swapon",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "symlinkat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "tgkill",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "tkill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "truncate",   .errmsg = true, },
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
	{ .name	    = "unlinkat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "utime",  .errmsg = true, },
	{ .name	    = "utimensat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name	    = "utimes",  .errmsg = true, },
	{ .name	    = "vmsplice",  .errmsg = true, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "write",	    .errmsg = true, },
	{ .name	    = "writev",	    .errmsg = true, },
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
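
/*
 * Note: bsearch(3) only works on a sorted array, so syscall_fmts[] above
 * must be kept in strcmp() order by ->name; an out-of-order entry would
 * silently never be found.
 */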

struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};

static size_t fprintf_duration(unsigned long t, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));

	if (ttrace) {
		ttrace->paths.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}

#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;

static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
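
/*
 * paths.table grows lazily and stays sparse: if the first fd seen for a
 * thread is, say, 5, the realloc() above sizes the table for fds 0..5 and
 * the memset() leaves slots 0..4 NULL until those fds show up.
 */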

static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}

static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}

static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#x", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}
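
/*
 * The two returns above implement a two-phase scheme: when the vfs_getname
 * probe is armed, the argument prints nothing here; instead its position in
 * entry_str is remembered via thread__set_filename_pos() and the resolved
 * string is spliced in later by trace__vfs_getname(). Without the probe we
 * fall back to printing the raw user pointer in hex.
 */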

static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
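
/*
 * Example of the fallback heuristics above for a syscall with no explicit
 * syscall_fmts[] entry: a "const char *filename" argument gets SCA_FILENAME,
 * any other pointer is printed in hex, a "pid_t" gets SCA_PID, and an
 * "int fd"-style name gets SCA_FD.
 */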

static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * Check for and discard the first field, '__syscall_nr' or 'nr',
	 * which holds the syscall number and is redundant here. Newer
	 * kernels call this field '__syscall_nr'; older ones, which predate
	 * that rename, call it 'nr'.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}

static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}

typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);

static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}

static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}

static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	u64 duration;
	size_t printed;

	if (trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	duration = sample->time - ttrace->entry_time;

	printed  = trace__fprintf_entry_head(trace, trace->current, duration, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}

static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 1, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;

	if (machine__resolve(trace->host, &al, sample) < 0 ||
	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
		return -1;

	return 0;
}

static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
				        EVSEL__PRINT_DSO |
				        EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}
1540
1541static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 
 
 
 
 
 
 
1542			   union perf_event *event __maybe_unused,
1543			   struct perf_sample *sample)
1544{
1545	long ret;
1546	u64 duration = 0;
 
1547	struct thread *thread;
1548	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
 
1549	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1550	struct thread_trace *ttrace;
1551
1552	if (sc == NULL)
1553		return -1;
1554
1555	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1556	ttrace = thread__trace(thread, trace->output);
1557	if (ttrace == NULL)
1558		goto out_put;
1559
1560	if (trace->summary)
1561		thread__update_stats(ttrace, id, sample);
1562
1563	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1564
1565	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
 
 
 
1566		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1567		ttrace->filename.pending_open = false;
1568		++trace->stats.vfs_getname;
1569	}
1570
1571	if (ttrace->entry_time) {
1572		duration = sample->time - ttrace->entry_time;
1573		if (trace__filter_duration(trace, duration))
1574			goto out;
 
1575	} else if (trace->duration_filter)
1576		goto out;
1577
1578	if (sample->callchain) {
1579		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
 
 
1580		if (callchain_ret == 0) {
1581			if (callchain_cursor.nr < trace->min_stack)
1582				goto out;
1583			callchain_ret = 1;
1584		}
1585	}
1586
1587	if (trace->summary_only)
1588		goto out;
1589
1590	trace__fprintf_entry_head(trace, thread, duration, ttrace->entry_time, trace->output);
1591
1592	if (ttrace->entry_pending) {
1593		fprintf(trace->output, "%-70s", ttrace->entry_str);
1594	} else {
1595		fprintf(trace->output, " ... [");
1596		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1597		fprintf(trace->output, "]: %s()", sc->name);
 
1598	}
1599
1600	if (sc->fmt == NULL) {
1601signed_print:
1602		fprintf(trace->output, ") = %ld", ret);
1603	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1604		char bf[STRERR_BUFSIZE];
1605		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1606			   *e = audit_errno_to_name(-ret);
1607
1608		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1609	} else if (ret == 0 && sc->fmt->timeout)
1610		fprintf(trace->output, ") = 0 Timeout");
1611	else if (sc->fmt->hexret)
1612		fprintf(trace->output, ") = %#lx", ret);
1613	else if (sc->fmt->errpid) {
1614		struct thread *child = machine__find_thread(trace->host, ret, ret);
1615
1616		if (child != NULL) {
1617			fprintf(trace->output, ") = %ld", ret);
1618			if (child->comm_set)
1619				fprintf(trace->output, " (%s)", thread__comm_str(child));
1620			thread__put(child);
1621		}
1622	} else
1623		goto signed_print;
1624
1625	fputc('\n', trace->output);
1626
1627	if (callchain_ret > 0)
1628		trace__fprintf_callchain(trace, sample);
1629	else if (callchain_ret < 0)
1630		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1631out:
1632	ttrace->entry_pending = false;
1633	err = 0;
1634out_put:
1635	thread__put(thread);
1636	return err;
1637}
1638
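    /*
     * probe:vfs_getname handler: stash the pathname being resolved so that
     * trace__sys_exit() can associate it with the fd open returns and, if
     * an entry string is still pending, splice the pathname into it at the
     * position recorded for the pointer argument.
     */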
1639static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1640			      union perf_event *event __maybe_unused,
1641			      struct perf_sample *sample)
1642{
1643	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1644	struct thread_trace *ttrace;
1645	size_t filename_len, entry_str_len, to_move;
1646	ssize_t remaining_space;
1647	char *pos;
1648	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1649
1650	if (!thread)
1651		goto out;
1652
1653	ttrace = thread__priv(thread);
1654	if (!ttrace)
1655		goto out;
1656
1657	filename_len = strlen(filename);
1658
1659	if (ttrace->filename.namelen < filename_len) {
1660		char *f = realloc(ttrace->filename.name, filename_len + 1);
1661
1662		if (f == NULL)
1663			goto out;
1664
1665		ttrace->filename.namelen = filename_len;
1666		ttrace->filename.name = f;
1667	}
1668
1669	strcpy(ttrace->filename.name, filename);
1670	ttrace->filename.pending_open = true;
1671
1672	if (!ttrace->filename.ptr)
1673		goto out;
1674
1675	entry_str_len = strlen(ttrace->entry_str);
1676	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1677	if (remaining_space <= 0)
1678		goto out;
1679
1680	if (filename_len > (size_t)remaining_space) {
1681		filename += filename_len - remaining_space;
1682		filename_len = remaining_space;
1683	}
1684
1685	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1686	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1687	memmove(pos + filename_len, pos, to_move);
1688	memcpy(pos, filename, filename_len);
1689
1690	ttrace->filename.ptr = 0;
1691	ttrace->filename.entry_str_pos = 0;
1692out:
1693	return 0;
1694}
1695
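    /* Used with --sched: accumulate on-CPU time per thread and globally. */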
1696static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1697				     union perf_event *event __maybe_unused,
1698				     struct perf_sample *sample)
1699{
1700	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1701	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1702	struct thread *thread = machine__findnew_thread(trace->host,
1703							sample->pid,
1704							sample->tid);
1705	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1706
1707	if (ttrace == NULL)
1708		goto out_dump;
1709
1710	ttrace->runtime_ms += runtime_ms;
1711	trace->runtime_ms += runtime_ms;
1712	thread__put(thread);
1713	return 0;
1714
1715out_dump:
1716	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1717	       evsel->name,
1718	       perf_evsel__strval(evsel, sample, "comm"),
1719	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1720	       runtime,
1721	       perf_evsel__intval(evsel, sample, "vruntime"));
1722	thread__put(thread);
1723	return 0;
1724}
1725
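    /*
     * Helper for print_binary(): emit the printable characters of the
     * "__bpf_output__" payload, everything else as '.'.
     */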
1726static void bpf_output__printer(enum binary_printer_ops op,
1727				unsigned int val, void *extra)
1728{
1729	FILE *output = extra;
1730	unsigned char ch = (unsigned char)val;
1731
1732	switch (op) {
1733	case BINARY_PRINT_CHAR_DATA:
1734		fprintf(output, "%c", isprint(ch) ? ch : '.');
1735		break;
1736	case BINARY_PRINT_DATA_BEGIN:
1737	case BINARY_PRINT_LINE_BEGIN:
1738	case BINARY_PRINT_ADDR:
1739	case BINARY_PRINT_NUM_DATA:
1740	case BINARY_PRINT_NUM_PAD:
1741	case BINARY_PRINT_SEP:
1742	case BINARY_PRINT_CHAR_PAD:
1743	case BINARY_PRINT_LINE_END:
1744	case BINARY_PRINT_DATA_END:
1745	default:
1746		break;
1747	}
1748}
1749
1750static void bpf_output__fprintf(struct trace *trace,
1751				struct perf_sample *sample)
1752{
1753	print_binary(sample->raw_data, sample->raw_size, 8,
1754		     bpf_output__printer, trace->output);
1755}
1756
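    /* Catch-all handler for the tracepoints requested via --event. */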
1757static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1758				union perf_event *event __maybe_unused,
1759				struct perf_sample *sample)
1760{
1761	int callchain_ret = 0;
1762
1763	if (sample->callchain) {
1764		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1765		if (callchain_ret == 0) {
1766			if (callchain_cursor.nr < trace->min_stack)
1767				goto out;
1768			callchain_ret = 1;
1769		}
1770	}
1771
1772	trace__printf_interrupted_entry(trace, sample);
1773	trace__fprintf_tstamp(trace, sample->time, trace->output);
1774
1775	if (trace->trace_syscalls)
1776		fprintf(trace->output, "(         ): ");
1777
1778	fprintf(trace->output, "%s:", evsel->name);
1779
1780	if (perf_evsel__is_bpf_output(evsel)) {
1781		bpf_output__fprintf(trace, sample);
1782	} else if (evsel->tp_format) {
1783		event_format__fprintf(evsel->tp_format, sample->cpu,
1784				      sample->raw_data, sample->raw_size,
1785				      trace->output);
1786	}
1787
1788	fprintf(trace->output, ")\n");
1789
1790	if (callchain_ret > 0)
1791		trace__fprintf_callchain(trace, sample);
1792	else if (callchain_ret < 0)
1793		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1794out:
1795	return 0;
1796}
1797
1798static void print_location(FILE *f, struct perf_sample *sample,
1799			   struct addr_location *al,
1800			   bool print_dso, bool print_sym)
1801{
1802
1803	if ((verbose || print_dso) && al->map)
1804		fprintf(f, "%s@", al->map->dso->long_name);
1805
1806	if ((verbose || print_sym) && al->sym)
1807		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1808			al->addr - al->sym->start);
1809	else if (al->map)
1810		fprintf(f, "0x%" PRIx64, al->addr);
1811	else
1812		fprintf(f, "0x%" PRIx64, sample->addr);
1813}
1814
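    /*
     * Page fault handler: account major/minor faults per thread, then
     * print the faulting code location and the data/exec map the target
     * address falls into.
     */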
1815static int trace__pgfault(struct trace *trace,
1816			  struct perf_evsel *evsel,
1817			  union perf_event *event __maybe_unused,
1818			  struct perf_sample *sample)
1819{
1820	struct thread *thread;
1821	struct addr_location al;
1822	char map_type = 'd';
1823	struct thread_trace *ttrace;
1824	int err = -1;
1825	int callchain_ret = 0;
1826
1827	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1828
1829	if (sample->callchain) {
1830		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1831		if (callchain_ret == 0) {
1832			if (callchain_cursor.nr < trace->min_stack)
1833				goto out_put;
1834			callchain_ret = 1;
1835		}
1836	}
1837
1838	ttrace = thread__trace(thread, trace->output);
1839	if (ttrace == NULL)
1840		goto out_put;
1841
1842	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1843		ttrace->pfmaj++;
1844	else
1845		ttrace->pfmin++;
1846
1847	if (trace->summary_only)
1848		goto out;
1849
1850	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1851			      sample->ip, &al);
1852
1853	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1854
1855	fprintf(trace->output, "%sfault [",
1856		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1857		"maj" : "min");
1858
1859	print_location(trace->output, sample, &al, false, true);
1860
1861	fprintf(trace->output, "] => ");
1862
1863	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1864				   sample->addr, &al);
1865
1866	if (!al.map) {
1867		thread__find_addr_location(thread, sample->cpumode,
1868					   MAP__FUNCTION, sample->addr, &al);
1869
1870		if (al.map)
1871			map_type = 'x';
1872		else
1873			map_type = '?';
1874	}
1875
1876	print_location(trace->output, sample, &al, true, false);
1877
1878	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1879
1880	if (callchain_ret > 0)
1881		trace__fprintf_callchain(trace, sample);
1882	else if (callchain_ret < 0)
1883		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1884out:
1885	err = 0;
1886out_put:
1887	thread__put(thread);
1888	return err;
1889}
1890
1891static void trace__set_base_time(struct trace *trace,
1892				 struct perf_evsel *evsel,
1893				 struct perf_sample *sample)
1894{
1895	/*
1896	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1897	 * and don't use sample->time unconditionally: we may end up having
1898	 * some other event in the future without PERF_SAMPLE_TIME for a good
1899	 * reason, i.e. we may not be interested in its timestamps, just in
1900	 * it taking place, picking some piece of information when it
1901	 * appears in our event stream (vfs_getname comes to mind).
1902	 */
1903	if (trace->base_time == 0 && !trace->full_time &&
1904	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1905		trace->base_time = sample->time;
1906}
1907
1908static int trace__process_sample(struct perf_tool *tool,
1909				 union perf_event *event,
1910				 struct perf_sample *sample,
1911				 struct perf_evsel *evsel,
1912				 struct machine *machine __maybe_unused)
1913{
1914	struct trace *trace = container_of(tool, struct trace, tool);
1915	struct thread *thread;
1916	int err = 0;
1917
1918	tracepoint_handler handler = evsel->handler;
1919
1920	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1921	if (thread && thread__is_filtered(thread))
1922		return 0;
1923
1924	trace__set_base_time(trace, evsel, sample);
1925
1926	if (handler) {
1927		++trace->nr_events;
1928		handler(trace, evsel, event, sample);
1929	}
1930
1931	return err;
1932}
1933
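    /*
     * 'perf trace record': synthesize a 'perf record' command line with
     * the raw_syscalls (or, on older kernels, syscalls) tracepoints plus
     * the requested page fault events, then hand it to cmd_record().
     */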
1934static int trace__record(struct trace *trace, int argc, const char **argv)
1935{
1936	unsigned int rec_argc, i, j;
1937	const char **rec_argv;
1938	const char * const record_args[] = {
1939		"record",
1940		"-R",
1941		"-m", "1024",
1942		"-c", "1",
1943	};
1944
1945	const char * const sc_args[] = { "-e", };
1946	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1947	const char * const majpf_args[] = { "-e", "major-faults" };
1948	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1949	const char * const minpf_args[] = { "-e", "minor-faults" };
1950	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1951
1952	/* +1 is for the event string below */
1953	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1954		majpf_args_nr + minpf_args_nr + argc;
1955	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1956
1957	if (rec_argv == NULL)
1958		return -ENOMEM;
1959
1960	j = 0;
1961	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1962		rec_argv[j++] = record_args[i];
1963
1964	if (trace->trace_syscalls) {
1965		for (i = 0; i < sc_args_nr; i++)
1966			rec_argv[j++] = sc_args[i];
1967
1968		/* event string may be different for older kernels - e.g., RHEL6 */
1969		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1970			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
1971		else if (is_valid_tracepoint("syscalls:sys_enter"))
1972			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
1973		else {
1974			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
1975			return -1;
1976		}
1977	}
1978
1979	if (trace->trace_pgfaults & TRACE_PFMAJ)
1980		for (i = 0; i < majpf_args_nr; i++)
1981			rec_argv[j++] = majpf_args[i];
1982
1983	if (trace->trace_pgfaults & TRACE_PFMIN)
1984		for (i = 0; i < minpf_args_nr; i++)
1985			rec_argv[j++] = minpf_args[i];
1986
1987	for (i = 0; i < (unsigned int)argc; i++)
1988		rec_argv[j++] = argv[i];
1989
1990	return cmd_record(j, rec_argv, NULL);
1991}
1992
1993static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
1994
1995static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
1996{
1997	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
1998
1999	if (IS_ERR(evsel))
2000		return false;
2001
2002	if (perf_evsel__field(evsel, "pathname") == NULL) {
2003		perf_evsel__delete(evsel);
2004		return false;
2005	}
2006
2007	evsel->handler = trace__vfs_getname;
2008	perf_evlist__add(evlist, evsel);
2009	return true;
2010}
2011
2012static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2013{
2014	struct perf_evsel *evsel;
2015	struct perf_event_attr attr = {
2016		.type = PERF_TYPE_SOFTWARE,
2017		.mmap_data = 1,
2018	};
2019
2020	attr.config = config;
2021	attr.sample_period = 1;
2022
2023	event_attr_init(&attr);
2024
2025	evsel = perf_evsel__new(&attr);
2026	if (evsel)
2027		evsel->handler = trace__pgfault;
2028
2029	return evsel;
2030}
2031
2032static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2033{
2034	const u32 type = event->header.type;
2035	struct perf_evsel *evsel;
2036
2037	if (type != PERF_RECORD_SAMPLE) {
2038		trace__process_event(trace, trace->host, event, sample);
2039		return;
2040	}
2041
2042	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2043	if (evsel == NULL) {
2044		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2045		return;
2046	}
2047
2048	trace__set_base_time(trace, evsel, sample);
2049
2050	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2051	    sample->raw_data == NULL) {
2052		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2053		       perf_evsel__name(evsel), sample->tid,
2054		       sample->cpu, sample->raw_size);
2055	} else {
2056		tracepoint_handler handler = evsel->handler;
2057		handler(trace, evsel, event, sample);
2058	}
2059}
2060
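    /*
     * Create the raw_syscalls:sys_{enter,exit} tracepoint events and add
     * them to the evlist, restricting sys_exit callchains to user space
     * unless --kernel-syscall-graph was given.
     */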
2061static int trace__add_syscall_newtp(struct trace *trace)
2062{
2063	int ret = -1;
2064	struct perf_evlist *evlist = trace->evlist;
2065	struct perf_evsel *sys_enter, *sys_exit;
2066
2067	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2068	if (sys_enter == NULL)
2069		goto out;
2070
2071	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2072		goto out_delete_sys_enter;
2073
2074	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2075	if (sys_exit == NULL)
2076		goto out_delete_sys_enter;
2077
2078	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2079		goto out_delete_sys_exit;
2080
2081	perf_evlist__add(evlist, sys_enter);
2082	perf_evlist__add(evlist, sys_exit);
2083
2084	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2085		/*
2086		 * We're interested only in the user space callchain
2087		 * leading to the syscall, allow overriding that for
2088		 * debugging reasons using --kernel_syscall_callchains
2089		 */
2090		sys_exit->attr.exclude_callchain_kernel = 1;
2091	}
2092
2093	trace->syscalls.events.sys_enter = sys_enter;
2094	trace->syscalls.events.sys_exit  = sys_exit;
2095
2096	ret = 0;
2097out:
2098	return ret;
2099
2100out_delete_sys_exit:
2101	perf_evsel__delete_priv(sys_exit);
2102out_delete_sys_enter:
2103	perf_evsel__delete_priv(sys_enter);
2104	goto out;
2105}
2106
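    /*
     * Translate the -e/--expr qualifier into a tracepoint filter on the
     * syscall id, e.g. "id == 2 || id == 3" (or the "!= &&" form when the
     * list is negated; the actual ids are architecture dependent), and
     * append it to both the sys_enter and sys_exit events.
     */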
2107static int trace__set_ev_qualifier_filter(struct trace *trace)
2108{
2109	int err = -1;
2110	struct perf_evsel *sys_exit;
2111	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2112						trace->ev_qualifier_ids.nr,
2113						trace->ev_qualifier_ids.entries);
2114
2115	if (filter == NULL)
2116		goto out_enomem;
2117
2118	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2119					  filter)) {
2120		sys_exit = trace->syscalls.events.sys_exit;
2121		err = perf_evsel__append_tp_filter(sys_exit, filter);
2122	}
2123
2124	free(filter);
2125out:
2126	return err;
2127out_enomem:
2128	errno = ENOMEM;
2129	goto out;
2130}
2131
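    /*
     * Live mode: set up the requested events, fork or attach to the
     * target, then consume the ring buffers until the workload finishes
     * or we are interrupted, optionally printing a summary at the end.
     */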
2132static int trace__run(struct trace *trace, int argc, const char **argv)
2133{
2134	struct perf_evlist *evlist = trace->evlist;
2135	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2136	int err = -1, i;
2137	unsigned long before;
2138	const bool forks = argc > 0;
2139	bool draining = false;
2140
2141	trace->live = true;
2142
2143	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2144		goto out_error_raw_syscalls;
2145
2146	if (trace->trace_syscalls)
2147		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2148
2149	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2150		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2151		if (pgfault_maj == NULL)
2152			goto out_error_mem;
2153		perf_evlist__add(evlist, pgfault_maj);
2154	}
2155
2156	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2157		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2158		if (pgfault_min == NULL)
2159			goto out_error_mem;
2160		perf_evlist__add(evlist, pgfault_min);
2161	}
2162
2163	if (trace->sched &&
2164	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2165				   trace__sched_stat_runtime))
2166		goto out_error_sched_stat_runtime;
2167
2168	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2169	if (err < 0) {
2170		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2171		goto out_delete_evlist;
2172	}
2173
2174	err = trace__symbols_init(trace, evlist);
2175	if (err < 0) {
2176		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2177		goto out_delete_evlist;
2178	}
2179
2180	perf_evlist__config(evlist, &trace->opts, NULL);
2181
2182	if (callchain_param.enabled) {
2183		bool use_identifier = false;
2184
2185		if (trace->syscalls.events.sys_exit) {
2186			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2187						     &trace->opts, &callchain_param);
2188			use_identifier = true;
2189		}
2190
2191		if (pgfault_maj) {
2192			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2193			use_identifier = true;
2194		}
2195
2196		if (pgfault_min) {
2197			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2198			use_identifier = true;
2199		}
2200
2201		if (use_identifier) {
2202		       /*
2203			* Now we have evsels with different sample_ids, use
2204			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2205			* from a fixed position in each ring buffer record.
2206			*
2207			* As of the changeset introducing this comment, this
2208			* isn't strictly needed, as the fields that can come before
2209			* PERF_SAMPLE_ID are all used, but we'll probably disable
2210			* some of those for things like copying the payload of
2211			* pointer syscall arguments, and for vfs_getname we don't
2212			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2213			* here as a reminder that we need to use PERF_SAMPLE_IDENTIFIER.
2214			*/
2215			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2216			perf_evlist__reset_sample_bit(evlist, ID);
2217		}
2218	}
2219
2220	signal(SIGCHLD, sig_handler);
2221	signal(SIGINT, sig_handler);
2222
2223	if (forks) {
2224		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2225						    argv, false, NULL);
2226		if (err < 0) {
2227			fprintf(trace->output, "Couldn't run the workload!\n");
2228			goto out_delete_evlist;
2229		}
2230	}
2231
2232	err = perf_evlist__open(evlist);
2233	if (err < 0)
2234		goto out_error_open;
2235
2236	err = bpf__apply_obj_config();
2237	if (err) {
2238		char errbuf[BUFSIZ];
2239
2240		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2241		pr_err("ERROR: Apply config to BPF failed: %s\n",
2242			 errbuf);
2243		goto out_error_open;
2244	}
2245
2246	/*
2247	 * Better not use !target__has_task() here because we need to cover the
2248	 * case where no threads were specified in the command line, but a
2249	 * workload was, and in that case we will fill in the thread_map when
2250	 * we fork the workload in perf_evlist__prepare_workload.
2251	 */
2252	if (trace->filter_pids.nr > 0)
2253		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2254	else if (thread_map__pid(evlist->threads, 0) == -1)
2255		err = perf_evlist__set_filter_pid(evlist, getpid());
2256
2257	if (err < 0)
2258		goto out_error_mem;
2259
2260	if (trace->ev_qualifier_ids.nr > 0) {
2261		err = trace__set_ev_qualifier_filter(trace);
2262		if (err < 0)
2263			goto out_errno;
2264
2265		pr_debug("event qualifier tracepoint filter: %s\n",
2266			 trace->syscalls.events.sys_exit->filter);
2267	}
2268
2269	err = perf_evlist__apply_filters(evlist, &evsel);
2270	if (err < 0)
2271		goto out_error_apply_filters;
2272
2273	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2274	if (err < 0)
2275		goto out_error_mmap;
2276
2277	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2278		perf_evlist__enable(evlist);
2279
2280	if (forks)
2281		perf_evlist__start_workload(evlist);
2282
2283	if (trace->opts.initial_delay) {
2284		usleep(trace->opts.initial_delay * 1000);
2285		perf_evlist__enable(evlist);
2286	}
2287
2288	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2289				  evlist->threads->nr > 1 ||
2290				  perf_evlist__first(evlist)->attr.inherit;
2291again:
2292	before = trace->nr_events;
2293
2294	for (i = 0; i < evlist->nr_mmaps; i++) {
2295		union perf_event *event;
2296
2297		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2298			struct perf_sample sample;
2299
2300			++trace->nr_events;
2301
2302			err = perf_evlist__parse_sample(evlist, event, &sample);
2303			if (err) {
2304				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2305				goto next_event;
2306			}
2307
2308			trace__handle_event(trace, event, &sample);
2309next_event:
2310			perf_evlist__mmap_consume(evlist, i);
2311
2312			if (interrupted)
2313				goto out_disable;
2314
2315			if (done && !draining) {
2316				perf_evlist__disable(evlist);
2317				draining = true;
2318			}
2319		}
2320	}
2321
2322	if (trace->nr_events == before) {
2323		int timeout = done ? 100 : -1;
2324
2325		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2326			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2327				draining = true;
2328
2329			goto again;
2330		}
2331	} else {
2332		goto again;
2333	}
2334
2335out_disable:
2336	thread__zput(trace->current);
2337
2338	perf_evlist__disable(evlist);
2339
2340	if (!err) {
2341		if (trace->summary)
2342			trace__fprintf_thread_summary(trace, trace->output);
2343
2344		if (trace->show_tool_stats) {
2345			fprintf(trace->output, "Stats:\n "
2346					       " vfs_getname : %" PRIu64 "\n"
2347					       " proc_getname: %" PRIu64 "\n",
2348				trace->stats.vfs_getname,
2349				trace->stats.proc_getname);
2350		}
2351	}
2352
2353out_delete_evlist:
2354	perf_evlist__delete(evlist);
2355	trace->evlist = NULL;
2356	trace->live = false;
2357	return err;
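    	/*
    	 * Only reachable through the gotos above, never by falling
    	 * through: this block exists so the error paths can share errbuf.
    	 */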
2358{
2359	char errbuf[BUFSIZ];
2360
2361out_error_sched_stat_runtime:
2362	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2363	goto out_error;
2364
2365out_error_raw_syscalls:
2366	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2367	goto out_error;
2368
2369out_error_mmap:
2370	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2371	goto out_error;
2372
2373out_error_open:
2374	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2375
2376out_error:
2377	fprintf(trace->output, "%s\n", errbuf);
2378	goto out_delete_evlist;
2379
2380out_error_apply_filters:
2381	fprintf(trace->output,
2382		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2383		evsel->filter, perf_evsel__name(evsel), errno,
2384		str_error_r(errno, errbuf, sizeof(errbuf)));
2385	goto out_delete_evlist;
2386}
2387out_error_mem:
2388	fprintf(trace->output, "Not enough memory to run!\n");
2389	goto out_delete_evlist;
2390
2391out_errno:
2392	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2393	goto out_delete_evlist;
2394}
2395
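    /*
     * 'perf trace -i': replay a recorded perf.data session, wiring the
     * syscall, page fault and vfs_getname handlers to the events found
     * in the file.
     */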
2396static int trace__replay(struct trace *trace)
2397{
2398	const struct perf_evsel_str_handler handlers[] = {
2399		{ "probe:vfs_getname",	     trace__vfs_getname, },
2400	};
2401	struct perf_data_file file = {
2402		.path  = input_name,
2403		.mode  = PERF_DATA_MODE_READ,
2404		.force = trace->force,
2405	};
2406	struct perf_session *session;
2407	struct perf_evsel *evsel;
2408	int err = -1;
2409
2410	trace->tool.sample	  = trace__process_sample;
2411	trace->tool.mmap	  = perf_event__process_mmap;
2412	trace->tool.mmap2	  = perf_event__process_mmap2;
2413	trace->tool.comm	  = perf_event__process_comm;
2414	trace->tool.exit	  = perf_event__process_exit;
2415	trace->tool.fork	  = perf_event__process_fork;
2416	trace->tool.attr	  = perf_event__process_attr;
2417	trace->tool.tracing_data = perf_event__process_tracing_data;
2418	trace->tool.build_id	  = perf_event__process_build_id;
2419
2420	trace->tool.ordered_events = true;
2421	trace->tool.ordering_requires_timestamps = true;
2422
2423	/* add tid to output */
2424	trace->multiple_threads = true;
2425
2426	session = perf_session__new(&file, false, &trace->tool);
2427	if (session == NULL)
2428		return -1;
2429
2430	if (trace->opts.target.pid)
2431		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2432
2433	if (trace->opts.target.tid)
2434		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2435
2436	if (symbol__init(&session->header.env) < 0)
2437		goto out;
2438
2439	trace->host = &session->machines.host;
2440
2441	err = perf_session__set_tracepoints_handlers(session, handlers);
2442	if (err)
2443		goto out;
2444
2445	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2446						     "raw_syscalls:sys_enter");
2447	/* older kernels have syscalls tp versus raw_syscalls */
2448	if (evsel == NULL)
2449		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2450							     "syscalls:sys_enter");
2451
2452	if (evsel &&
2453	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2454	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2455		pr_err("Error initializing raw_syscalls:sys_enter event\n");
2456		goto out;
2457	}
2458
2459	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2460						     "raw_syscalls:sys_exit");
2461	if (evsel == NULL)
2462		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2463							     "syscalls:sys_exit");
2464	if (evsel &&
2465	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2466	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2467		pr_err("Error initializing raw_syscalls:sys_exit event\n");
2468		goto out;
2469	}
2470
2471	evlist__for_each_entry(session->evlist, evsel) {
2472		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2473		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2474		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2475		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2476			evsel->handler = trace__pgfault;
2477	}
2478
2479	setup_pager();
2480
2481	err = perf_session__process_events(session);
2482	if (err)
2483		pr_err("Failed to process events, error %d\n", err);
2484
2485	else if (trace->summary)
2486		trace__fprintf_thread_summary(trace, trace->output);
2487
2488out:
2489	perf_session__delete(session);
2490
2491	return err;
2492}
2493
2494static size_t trace__fprintf_threads_header(FILE *fp)
2495{
2496	size_t printed;
2497
2498	printed  = fprintf(fp, "\n Summary of events:\n\n");
2499
2500	return printed;
2501}
2502
2503DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2504	struct stats 	*stats;
2505	double		msecs;
2506	int		syscall;
2507)
2508{
2509	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2510	struct stats *stats = source->priv;
2511
2512	entry->syscall = source->i;
2513	entry->stats   = stats;
2514	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2515}
2516
2517static size_t thread__dump_stats(struct thread_trace *ttrace,
2518				 struct trace *trace, FILE *fp)
2519{
2520	size_t printed = 0;
2521	struct syscall *sc;
2522	struct rb_node *nd;
2523	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2524
2525	if (syscall_stats == NULL)
2526		return 0;
2527
2528	printed += fprintf(fp, "\n");
2529
2530	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2531	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2532	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2533
2534	resort_rb__for_each_entry(nd, syscall_stats) {
2535		struct stats *stats = syscall_stats_entry->stats;
2536		if (stats) {
2537			double min = (double)(stats->min) / NSEC_PER_MSEC;
2538			double max = (double)(stats->max) / NSEC_PER_MSEC;
2539			double avg = avg_stats(stats);
2540			double pct;
2541			u64 n = (u64) stats->n;
2542
2543			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2544			avg /= NSEC_PER_MSEC;
2545
2546			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2547			printed += fprintf(fp, "   %-15s", sc->name);
2548			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2549					   n, syscall_stats_entry->msecs, min, avg);
2550			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2551		}
2552	}
2553
2554	resort_rb__delete(syscall_stats);
2555	printed += fprintf(fp, "\n\n");
2556
2557	return printed;
2558}
2559
2560static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2561{
2562	size_t printed = 0;
2563	struct thread_trace *ttrace = thread__priv(thread);
2564	double ratio;
2565
2566	if (ttrace == NULL)
2567		return 0;
2568
2569	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2570
2571	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2572	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2573	printed += fprintf(fp, "%.1f%%", ratio);
2574	if (ttrace->pfmaj)
2575		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2576	if (ttrace->pfmin)
2577		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2578	if (trace->sched)
2579		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2580	else if (fputc('\n', fp) != EOF)
2581		++printed;
2582
2583	printed += thread__dump_stats(ttrace, trace, fp);
2584
2585	return printed;
2586}
2587
2588static unsigned long thread__nr_events(struct thread_trace *ttrace)
2589{
2590	return ttrace ? ttrace->nr_events : 0;
2591}
2592
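    /* Sort the threads in the summary by how many events each generated. */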
2593DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2594	struct thread *thread;
2595)
2596{
2597	entry->thread = rb_entry(nd, struct thread, rb_node);
2598}
2599
2600static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2601{
2602	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2603	size_t printed = trace__fprintf_threads_header(fp);
2604	struct rb_node *nd;
2605
2606	if (threads == NULL) {
2607		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2608		return 0;
2609	}
2610
2611	resort_rb__for_each_entry(nd, threads)
2612		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2613
2614	resort_rb__delete(threads);
2615
2616	return printed;
2617}
2618
2619static int trace__set_duration(const struct option *opt, const char *str,
2620			       int unset __maybe_unused)
2621{
2622	struct trace *trace = opt->value;
2623
2624	trace->duration_filter = atof(str);
2625	return 0;
2626}
2627
2628static int trace__set_filter_pids(const struct option *opt, const char *str,
2629				  int unset __maybe_unused)
2630{
2631	int ret = -1;
2632	size_t i;
2633	struct trace *trace = opt->value;
2634	/*
2635	 * FIXME: introduce an intarray class, plainly parse the csv and create a
2636	 * { int nr, int entries[] } struct...
2637	 */
2638	struct intlist *list = intlist__new(str);
2639
2640	if (list == NULL)
2641		return -1;
2642
2643	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2644	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2645
2646	if (trace->filter_pids.entries == NULL)
2647		goto out;
2648
2649	trace->filter_pids.entries[0] = getpid();
2650
2651	for (i = 1; i < trace->filter_pids.nr; ++i)
2652		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2653
2654	intlist__delete(list);
2655	ret = 0;
2656out:
2657	return ret;
2658}
2659
2660static int trace__open_output(struct trace *trace, const char *filename)
2661{
2662	struct stat st;
2663
2664	if (!stat(filename, &st) && st.st_size) {
2665		char oldname[PATH_MAX];
2666
2667		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2668		unlink(oldname);
2669		rename(filename, oldname);
2670	}
2671
2672	trace->output = fopen(filename, "w");
2673
2674	return trace->output == NULL ? -errno : 0;
2675}
2676
2677static int parse_pagefaults(const struct option *opt, const char *str,
2678			    int unset __maybe_unused)
2679{
2680	int *trace_pgfaults = opt->value;
2681
2682	if (strcmp(str, "all") == 0)
2683		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2684	else if (strcmp(str, "maj") == 0)
2685		*trace_pgfaults |= TRACE_PFMAJ;
2686	else if (strcmp(str, "min") == 0)
2687		*trace_pgfaults |= TRACE_PFMIN;
2688	else
2689		return -1;
2690
2691	return 0;
2692}
2693
2694static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2695{
2696	struct perf_evsel *evsel;
2697
2698	evlist__for_each_entry(evlist, evsel)
2699		evsel->handler = handler;
2700}
2701
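    /*
     * Entry point for the 'trace' builtin: parse the options, set up the
     * evlist and dispatch to record, replay or live tracing.
     */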
2702int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2703{
2704	const char *trace_usage[] = {
2705		"perf trace [<options>] [<command>]",
2706		"perf trace [<options>] -- <command> [<options>]",
2707		"perf trace record [<options>] [<command>]",
2708		"perf trace record [<options>] -- <command> [<options>]",
2709		NULL
2710	};
2711	struct trace trace = {
2712		.syscalls = {
2713			.max = -1,
2714		},
2715		.opts = {
2716			.target = {
2717				.uid	   = UINT_MAX,
2718				.uses_mmap = true,
2719			},
2720			.user_freq     = UINT_MAX,
2721			.user_interval = ULLONG_MAX,
2722			.no_buffering  = true,
2723			.mmap_pages    = UINT_MAX,
2724			.proc_map_timeout  = 500,
2725		},
2726		.output = stderr,
2727		.show_comm = true,
2728		.trace_syscalls = true,
2729		.kernel_syscallchains = false,
2730		.max_stack = UINT_MAX,
2731	};
2732	const char *output_name = NULL;
2733	const char *ev_qualifier_str = NULL;
2734	const struct option trace_options[] = {
2735	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2736		     "event selector. use 'perf list' to list available events",
2737		     parse_events_option),
2738	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2739		    "show the thread COMM next to its id"),
2740	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2741	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2742	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2743	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2744	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2745		    "trace events on existing process id"),
2746	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2747		    "trace events on existing thread id"),
2748	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2749		     "pids to filter (by the kernel)", trace__set_filter_pids),
2750	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2751		    "system-wide collection from all CPUs"),
2752	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2753		    "list of cpus to monitor"),
2754	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2755		    "child tasks do not inherit counters"),
2756	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2757		     "number of mmap data pages",
2758		     perf_evlist__parse_mmap_pages),
2759	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2760		   "user to profile"),
2761	OPT_CALLBACK(0, "duration", &trace, "float",
2762		     "show only events with duration > N.M ms",
2763		     trace__set_duration),
2764	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2765	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2766	OPT_BOOLEAN('T', "time", &trace.full_time,
2767		    "Show full timestamp, not time relative to first start"),
2768	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2769		    "Show only syscall summary with statistics"),
2770	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2771		    "Show all syscalls and summary with statistics"),
2772	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2773		     "Trace pagefaults", parse_pagefaults, "maj"),
2774	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2775	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2776	OPT_CALLBACK(0, "call-graph", &trace.opts,
2777		     "record_mode[,record_size]", record_callchain_help,
2778		     &record_parse_callchain_opt),
2779	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2780		    "Show the kernel callchains on the syscall exit path"),
2781	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2782		     "Set the minimum stack depth when parsing the callchain, "
2783		     "anything below the specified depth will be ignored."),
2784	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2785		     "Set the maximum stack depth when parsing the callchain, "
2786		     "anything beyond the specified depth will be ignored. "
2787		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2788	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2789			"per thread proc mmap processing timeout in ms"),
2790	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2791		     "ms to wait before starting measurement after program "
2792		     "start"),
2793	OPT_END()
2794	};
2795	bool __maybe_unused max_stack_user_set = true;
2796	bool mmap_pages_user_set = true;
2797	const char * const trace_subcommands[] = { "record", NULL };
2798	int err;
2799	char bf[BUFSIZ];
2800
2801	signal(SIGSEGV, sighandler_dump_stack);
2802	signal(SIGFPE, sighandler_dump_stack);
2803
2804	trace.evlist = perf_evlist__new();
2805	trace.sctbl = syscalltbl__new();
2806
2807	if (trace.evlist == NULL || trace.sctbl == NULL) {
2808		pr_err("Not enough memory to run!\n");
2809		err = -ENOMEM;
2810		goto out;
2811	}
2812
2813	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2814				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2815
2816	err = bpf__setup_stdout(trace.evlist);
2817	if (err) {
2818		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2819		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2820		goto out;
2821	}
2822
2823	err = -1;
2824
2825	if (trace.trace_pgfaults) {
2826		trace.opts.sample_address = true;
2827		trace.opts.sample_time = true;
2828	}
2829
2830	if (trace.opts.mmap_pages == UINT_MAX)
2831		mmap_pages_user_set = false;
2832
2833	if (trace.max_stack == UINT_MAX) {
2834		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2835		max_stack_user_set = false;
2836	}
2837
2838#ifdef HAVE_DWARF_UNWIND_SUPPORT
2839	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2840		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2841#endif
2842
2843	if (callchain_param.enabled) {
2844		if (!mmap_pages_user_set && geteuid() == 0)
2845			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2846
2847		symbol_conf.use_callchain = true;
2848	}
2849
2850	if (trace.evlist->nr_entries > 0)
2851		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2852
2853	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2854		return trace__record(&trace, argc-1, &argv[1]);
2855
2856	/* summary_only implies summary option, but don't overwrite summary if set */
2857	if (trace.summary_only)
2858		trace.summary = trace.summary_only;
2859
2860	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2861	    trace.evlist->nr_entries == 0 /* Was --event used? */) {
2862		pr_err("Please specify something to trace.\n");
2863		return -1;
2864	}
2865
2866	if (!trace.trace_syscalls && ev_qualifier_str) {
2867		pr_err("The -e option can't be used with --no-syscalls.\n");
2868		goto out;
2869	}
2870
2871	if (output_name != NULL) {
2872		err = trace__open_output(&trace, output_name);
2873		if (err < 0) {
2874			perror("failed to create output file");
2875			goto out;
2876		}
2877	}
2878
2879	trace.open_id = syscalltbl__id(trace.sctbl, "open");
2880
2881	if (ev_qualifier_str != NULL) {
2882		const char *s = ev_qualifier_str;
2883		struct strlist_config slist_config = {
2884			.dirname = system_path(STRACE_GROUPS_DIR),
2885		};
2886
2887		trace.not_ev_qualifier = *s == '!';
2888		if (trace.not_ev_qualifier)
2889			++s;
2890		trace.ev_qualifier = strlist__new(s, &slist_config);
2891		if (trace.ev_qualifier == NULL) {
2892			fputs("Not enough memory to parse event qualifier\n",
2893			      trace.output);
2894			err = -ENOMEM;
2895			goto out_close;
2896		}
2897
2898		err = trace__validate_ev_qualifier(&trace);
2899		if (err)
2900			goto out_close;
2901	}
2902
2903	err = target__validate(&trace.opts.target);
2904	if (err) {
2905		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2906		fprintf(trace.output, "%s", bf);
2907		goto out_close;
2908	}
2909
2910	err = target__parse_uid(&trace.opts.target);
2911	if (err) {
2912		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2913		fprintf(trace.output, "%s", bf);
2914		goto out_close;
2915	}
2916
2917	if (!argc && target__none(&trace.opts.target))
2918		trace.opts.target.system_wide = true;
2919
2920	if (input_name)
2921		err = trace__replay(&trace);
2922	else
2923		err = trace__run(&trace, argc, argv);
2924
2925out_close:
2926	if (output_name != NULL)
2927		fclose(trace.output);
2928out:
2929	return err;
2930}
v6.13.7
   1/*
   2 * builtin-trace.c
   3 *
   4 * Builtin 'trace' command:
   5 *
   6 * Display a continuously updated trace of any workload, CPU, specific PID,
   7 * system wide, etc.  Default format is loosely strace like, but any other
   8 * event may be specified using --event.
   9 *
  10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  11 *
  12 * Initially based on the 'trace' prototype by Thomas Gleixner:
  13 *
  14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 
 
  15 */
  16
  17#include "util/record.h"
  18#include <api/fs/tracing_path.h>
  19#ifdef HAVE_LIBBPF_SUPPORT
  20#include <bpf/bpf.h>
  21#include <bpf/libbpf.h>
  22#include <bpf/btf.h>
  23#ifdef HAVE_BPF_SKEL
  24#include "bpf_skel/augmented_raw_syscalls.skel.h"
  25#endif
  26#endif
  27#include "util/bpf_map.h"
  28#include "util/rlimit.h"
  29#include "builtin.h"
  30#include "util/cgroup.h"
  31#include "util/color.h"
  32#include "util/config.h"
  33#include "util/debug.h"
  34#include "util/dso.h"
  35#include "util/env.h"
  36#include "util/event.h"
  37#include "util/evsel.h"
  38#include "util/evsel_fprintf.h"
  39#include "util/synthetic-events.h"
  40#include "util/evlist.h"
  41#include "util/evswitch.h"
  42#include "util/mmap.h"
  43#include <subcmd/pager.h>
  44#include <subcmd/exec-cmd.h>
  45#include "util/machine.h"
  46#include "util/map.h"
  47#include "util/symbol.h"
  48#include "util/path.h"
  49#include "util/session.h"
  50#include "util/thread.h"
  51#include <subcmd/parse-options.h>
  52#include "util/strlist.h"
  53#include "util/intlist.h"
  54#include "util/thread_map.h"
  55#include "util/stat.h"
  56#include "util/tool.h"
  57#include "util/util.h"
  58#include "trace/beauty/beauty.h"
  59#include "trace-event.h"
  60#include "util/parse-events.h"
  61#include "util/tracepoint.h"
  62#include "callchain.h"
  63#include "print_binary.h"
  64#include "string2.h"
  65#include "syscalltbl.h"
  66#include "rb_resort.h"
  67#include "../perf.h"
  68#include "trace_augment.h"
  69
  70#include <errno.h>
  71#include <inttypes.h>
  72#include <poll.h>
  73#include <signal.h>
  74#include <stdlib.h>
  75#include <string.h>
  76#include <linux/err.h>
  77#include <linux/filter.h>
  78#include <linux/kernel.h>
  79#include <linux/list_sort.h>
  80#include <linux/random.h>
  81#include <linux/stringify.h>
  82#include <linux/time64.h>
  83#include <linux/zalloc.h>
  84#include <fcntl.h>
  85#include <sys/sysmacros.h>
  86
  87#include <linux/ctype.h>
  88#include <perf/mmap.h>
  89
  90#ifdef HAVE_LIBTRACEEVENT
  91#include <event-parse.h>
  92#endif
  93
  94#ifndef O_CLOEXEC
  95# define O_CLOEXEC		02000000
  96#endif
  97
  98#ifndef F_LINUX_SPECIFIC_BASE
  99# define F_LINUX_SPECIFIC_BASE	1024
 100#endif
 101
 102#define RAW_SYSCALL_ARGS_NUM	6
 103
 104/*
 105 * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100
 106 *
 107 * We have to explicitely mark the direction of the flow of data, if from the
 108 * kernel to user space or the other way around, since the BPF collector we
 109 * have so far copies only from user to kernel space, mark the arguments that
 110 * go that direction, so that we don´t end up collecting the previous contents
 111 * for syscall args that goes from kernel to user space.
 112 */
 113struct syscall_arg_fmt {
 114	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
 115	bool	   (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val);
 116	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
 117	void	   *parm;
 118	const char *name;
 119	u16	   nr_entries; // for arrays
 120	bool	   from_user;
 121	bool	   show_zero;
 122#ifdef HAVE_LIBBPF_SUPPORT
 123	const struct btf_type *type;
 124	int	   type_id; /* used in btf_dump */
 125#endif
 126};
 127
 128struct syscall_fmt {
 129	const char *name;
 130	const char *alias;
 131	struct {
 132		const char *sys_enter,
 133			   *sys_exit;
 134	}	   bpf_prog_name;
 135	struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM];
 136	u8	   nr_args;
 137	bool	   errpid;
 138	bool	   timeout;
 139	bool	   hexret;
 140};
 141
 142struct trace {
 143	struct perf_tool	tool;
 144	struct syscalltbl	*sctbl;
 145	struct {
 
 146		struct syscall  *table;
 147		struct {
 148			struct evsel *sys_enter,
 149				*sys_exit,
 150				*bpf_output;
 151		}		events;
 152	} syscalls;
 153#ifdef HAVE_BPF_SKEL
 154	struct augmented_raw_syscalls_bpf *skel;
 155#endif
 156#ifdef HAVE_LIBBPF_SUPPORT
 157	struct btf		*btf;
 158#endif
 159	struct record_opts	opts;
 160	struct evlist	*evlist;
 161	struct machine		*host;
 162	struct thread		*current;
 163	struct cgroup		*cgroup;
 164	u64			base_time;
 165	FILE			*output;
 166	unsigned long		nr_events;
 167	unsigned long		nr_events_printed;
 168	unsigned long		max_events;
 169	struct evswitch		evswitch;
 170	struct strlist		*ev_qualifier;
 171	struct {
 172		size_t		nr;
 173		int		*entries;
 174	}			ev_qualifier_ids;
 175	struct {
 176		size_t		nr;
 177		pid_t		*entries;
 178		struct bpf_map  *map;
 179	}			filter_pids;
 180	double			duration_filter;
 181	double			runtime_ms;
 182	struct {
 183		u64		vfs_getname,
 184				proc_getname;
 185	} stats;
 186	unsigned int		max_stack;
 187	unsigned int		min_stack;
 188	int			raw_augmented_syscalls_args_size;
 189	bool			raw_augmented_syscalls;
 190	bool			fd_path_disabled;
 191	bool			sort_events;
 192	bool			not_ev_qualifier;
 193	bool			live;
 194	bool			full_time;
 195	bool			sched;
 196	bool			multiple_threads;
 197	bool			summary;
 198	bool			summary_only;
 199	bool			errno_summary;
 200	bool			failure_only;
 201	bool			show_comm;
 202	bool			print_sample;
 203	bool			show_tool_stats;
 204	bool			trace_syscalls;
 205	bool			libtraceevent_print;
 206	bool			kernel_syscallchains;
 207	s16			args_alignment;
 208	bool			show_tstamp;
 209	bool			show_duration;
 210	bool			show_zeros;
 211	bool			show_arg_names;
 212	bool			show_string_prefix;
 213	bool			force;
 214	bool			vfs_getname;
 215	bool			force_btf;
 216	int			trace_pgfaults;
 217	char			*perfconfig_events;
 218	struct {
 219		struct ordered_events	data;
 220		u64			last;
 221	} oe;
 222};
 223
 224static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused)
 225{
 226#ifdef HAVE_LIBBPF_SUPPORT
 227	if (trace->btf != NULL)
 228		return;
 229
 230	trace->btf = btf__load_vmlinux_btf();
 231	if (verbose > 0) {
 232		fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" :
 233						    "Failed to load vmlinux BTF\n");
 234	}
 235#endif
 236}
 237
 238struct tp_field {
 239	int offset;
 240	union {
 241		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
 242		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
 243	};
 244};
 245
 246#define TP_UINT_FIELD(bits) \
 247static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
 248{ \
 249	u##bits value; \
 250	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 251	return value;  \
 252}
 253
 254TP_UINT_FIELD(8);
 255TP_UINT_FIELD(16);
 256TP_UINT_FIELD(32);
 257TP_UINT_FIELD(64);
 258
 259#define TP_UINT_FIELD__SWAPPED(bits) \
 260static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
 261{ \
 262	u##bits value; \
 263	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
 264	return bswap_##bits(value);\
 265}
 266
 267TP_UINT_FIELD__SWAPPED(16);
 268TP_UINT_FIELD__SWAPPED(32);
 269TP_UINT_FIELD__SWAPPED(64);
 270
 271static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
 
 
 272{
 273	field->offset = offset;
 274
 275	switch (size) {
 276	case 1:
 277		field->integer = tp_field__u8;
 278		break;
 279	case 2:
 280		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
 281		break;
 282	case 4:
 283		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
 284		break;
 285	case 8:
 286		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
 287		break;
 288	default:
 289		return -1;
 290	}
 291
 292	return 0;
 293}
 294
 295static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
 296{
 297	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
 298}
 299
 300static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
 301{
 302	return sample->raw_data + field->offset;
 303}
 304
 305static int __tp_field__init_ptr(struct tp_field *field, int offset)
 306{
 307	field->offset = offset;
 308	field->pointer = tp_field__ptr;
 309	return 0;
 310}
 311
 312static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
 313{
 314	return __tp_field__init_ptr(field, format_field->offset);
 315}
 316
 317struct syscall_tp {
 318	struct tp_field id;
 319	union {
 320		struct tp_field args, ret;
 321	};
 322};
 323
 324/*
 325 * The evsel->priv as used by 'perf trace'
 326 * sc:	for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
 327 * fmt: for all the other tracepoints
 328 */
 329struct evsel_trace {
 330	struct syscall_tp	sc;
 331	struct syscall_arg_fmt  *fmt;
 332};
 333
 334static struct evsel_trace *evsel_trace__new(void)
 335{
 336	return zalloc(sizeof(struct evsel_trace));
 337}
 338
 339static void evsel_trace__delete(struct evsel_trace *et)
 340{
 341	if (et == NULL)
 342		return;
 343
 344	zfree(&et->fmt);
 345	free(et);
 346}
 347
 348/*
 349 * Used with raw_syscalls:sys_{enter,exit} and with the
 350 * syscalls:sys_{enter,exit}_SYSCALL tracepoints
 351 */
 352static inline struct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
 353{
 354	struct evsel_trace *et = evsel->priv;
 355
 356	return &et->sc;
 357}
 358
 359static struct syscall_tp *evsel__syscall_tp(struct evsel *evsel)
 360{
 361	if (evsel->priv == NULL) {
 362		evsel->priv = evsel_trace__new();
 363		if (evsel->priv == NULL)
 364			return NULL;
 365	}
 366
 367	return __evsel__syscall_tp(evsel);
 368}
 369
 370/*
 371 * Used with all the other tracepoints.
 372 */
 373static inline struct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
 374{
 375	struct evsel_trace *et = evsel->priv;
 376
 377	return et->fmt;
 378}
 379
 380static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel)
 381{
 382	struct evsel_trace *et = evsel->priv;
 383
 384	if (evsel->priv == NULL) {
 385		et = evsel->priv = evsel_trace__new();
 386
 387		if (et == NULL)
 388			return NULL;
 389	}
 390
 391	if (et->fmt == NULL) {
 392		et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt));
 393		if (et->fmt == NULL)
 394			goto out_delete;
 395	}
 396
 397	return __evsel__syscall_arg_fmt(evsel);
 398
 399out_delete:
 400	evsel_trace__delete(evsel->priv);
 401	evsel->priv = NULL;
 402	return NULL;
 403}
 404
 405static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name)
 406{
 407	struct tep_format_field *format_field = evsel__field(evsel, name);
 408
 409	if (format_field == NULL)
 410		return -1;
 411
 412	return tp_field__init_uint(field, format_field, evsel->needs_swap);
 413}
 414
 415#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
 416	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
 417	   evsel__init_tp_uint_field(evsel, &sc->name, #name); })
 418
 419static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name)
 420{
 421	struct tep_format_field *format_field = evsel__field(evsel, name);
 422
 423	if (format_field == NULL)
 424		return -1;
 425
 426	return tp_field__init_ptr(field, format_field);
 427}
 428
 429#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
 430	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
 431	   evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
 432
 433static void evsel__delete_priv(struct evsel *evsel)
 434{
 435	zfree(&evsel->priv);
 436	evsel__delete(evsel);
 437}
 438
 439static int evsel__init_syscall_tp(struct evsel *evsel)
 440{
 441	struct syscall_tp *sc = evsel__syscall_tp(evsel);
 442
 443	if (sc != NULL) {
 444		if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
 445		    evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
 446			return -ENOENT;
 447
 448		return 0;
 449	}
 450
 451	return -ENOMEM;
 452}
 453
 454static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
 455{
 456	struct syscall_tp *sc = evsel__syscall_tp(evsel);
 457
 458	if (sc != NULL) {
 459		struct tep_format_field *syscall_id = evsel__field(tp, "id");
 460		if (syscall_id == NULL)
 461			syscall_id = evsel__field(tp, "__syscall_nr");
 462		if (syscall_id == NULL ||
 463		    __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
 464			return -EINVAL;
 465
 466		return 0;
 467	}
 468
 469	return -ENOMEM;
 470}
 471
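    /*
     * In the augmented syscall events the raw args payload (and, for exits, the
     * return value) starts right after the u64 syscall id:
     */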
 472static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
 473{
 474	struct syscall_tp *sc = __evsel__syscall_tp(evsel);
 475
 476	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
 477}
 478
 479static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
 480{
 481	struct syscall_tp *sc = __evsel__syscall_tp(evsel);
 482
 483	return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
 484}
 485
 486static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
 487{
 488	if (evsel__syscall_tp(evsel) != NULL) {
 489		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
 490			return -ENOENT;
 491
 492		evsel->handler = handler;
 493		return 0;
 494	}
 495
 496	return -ENOMEM;
 497}
 498
 499static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
 500{
 501	struct evsel *evsel = evsel__newtp("raw_syscalls", direction);
 502
 503	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
 504	if (IS_ERR(evsel))
 505		evsel = evsel__newtp("syscalls", direction);
 506
 507	if (IS_ERR(evsel))
 508		return NULL;
 509
 510	if (evsel__init_raw_syscall_tp(evsel, handler))
 511		goto out_delete;
 512
 513	return evsel;
 514
 515out_delete:
 516	evsel__delete_priv(evsel);
 517	return NULL;
 518}
 519
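    /* Fetch a syscall_tp field straight from a sample via the readers bound above: */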
 520#define perf_evsel__sc_tp_uint(evsel, name, sample) \
 521	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
 522	   fields->name.integer(&fields->name, sample); })
 523
 524#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
 525	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
 526	   fields->name.pointer(&fields->name, sample); })
 527
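    /*
     * strarray: map an integer value to its string name via the (val - offset)
     * index, falling back to the numeric format plus a "PREFIX???" hint when the
     * value is unknown, e.g.:
     *
     *   strarray__scnprintf(&strarray__itimers, bf, size, "%d", false, 0) -> "REAL"
     */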
 528size_t strarray__scnprintf_suffix(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_suffix, int val)
 529{
 530	int idx = val - sa->offset;
 531
 532	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
 533		size_t printed = scnprintf(bf, size, intfmt, val);
 534		if (show_suffix)
 535			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
 536		return printed;
 537	}
 538
 539	return scnprintf(bf, size, "%s%s", sa->entries[idx], show_suffix ? sa->prefix : "");
 540}
 541
 542size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 543{
 544	int idx = val - sa->offset;
 545
 546	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
 547		size_t printed = scnprintf(bf, size, intfmt, val);
 548		if (show_prefix)
 549			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
 550		return printed;
 551	}
 552
 553	return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
 554}
 555
 556static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 557						const char *intfmt,
 558					        struct syscall_arg *arg)
 559{
 560	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
 561}
 562
 563static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 564					      struct syscall_arg *arg)
 565{
 566	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
 567}
 568
 569#define SCA_STRARRAY syscall_arg__scnprintf_strarray
 570
 571bool syscall_arg__strtoul_strarray(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
 572{
 573	return strarray__strtoul(arg->parm, bf, size, ret);
 574}
 575
 576bool syscall_arg__strtoul_strarray_flags(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
 577{
 578	return strarray__strtoul_flags(arg->parm, bf, size, ret);
 579}
 580
 581bool syscall_arg__strtoul_strarrays(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
 582{
 583	return strarrays__strtoul(arg->parm, bf, size, ret);
 584}
 585
 586size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
 587{
 588	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
 589}
 590
 591size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 592{
 593	size_t printed;
 594	int i;
 595
 596	for (i = 0; i < sas->nr_entries; ++i) {
 597		struct strarray *sa = sas->entries[i];
 598		int idx = val - sa->offset;
 599
 600		if (idx >= 0 && idx < sa->nr_entries) {
 601			if (sa->entries[idx] == NULL)
 602				break;
 603			return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
 604		}
 605	}
 606
 607	printed = scnprintf(bf, size, intfmt, val);
 608	if (show_prefix)
 609		printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
 610	return printed;
 611}
 612
 613bool strarray__strtoul(struct strarray *sa, char *bf, size_t size, u64 *ret)
 614{
 615	int i;
 616
 617	for (i = 0; i < sa->nr_entries; ++i) {
 618		if (sa->entries[i] && strncmp(sa->entries[i], bf, size) == 0 && sa->entries[i][size] == '\0') {
 619			*ret = sa->offset + i;
 620			return true;
 621		}
 622	}
 623
 624	return false;
 625}
 626
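    /*
     * Parse a "NAME|OTHER_NAME|0x10"-style string back into a value, accepting
     * both symbolic strarray entries and numeric literals between the '|'
     * separators, e.g. "CLOEXEC" against strarray__fsmount_flags yields 0x1.
     */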
 627bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *ret)
 628{
 629	u64 val = 0;
 630	char *tok = bf, *sep, *end;
 631
 632	*ret = 0;
 633
 634	while (size != 0) {
 635		int toklen = size;
 636
 637		sep = memchr(tok, '|', size);
 638		if (sep != NULL) {
 639			size -= sep - tok + 1;
 640
 641			end = sep - 1;
 642			while (end > tok && isspace(*end))
 643				--end;
 644
 645			toklen = end - tok + 1;
 646		}
 647
 648		while (isspace(*tok))
 649			++tok;
 650
 651		if (isalpha(*tok) || *tok == '_') {
 652			if (!strarray__strtoul(sa, tok, toklen, &val))
 653				return false;
 654		} else
 655			val = strtoul(tok, NULL, 0);
 656
 657		*ret |= (1ULL << (val - 1)); /* val is the 1-based bit position in the strarray */
 658
 659		if (sep == NULL)
 660			break;
 661		tok = sep + 1;
 662	}
 663
 664	return true;
 665}
 666
 667bool strarrays__strtoul(struct strarrays *sas, char *bf, size_t size, u64 *ret)
 668{
 669	int i;
 670
 671	for (i = 0; i < sas->nr_entries; ++i) {
 672		struct strarray *sa = sas->entries[i];
 673
 674		if (strarray__strtoul(sa, bf, size, ret))
 675			return true;
 676	}
 677
 678	return false;
 679}
 680
 681size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
 682					struct syscall_arg *arg)
 683{
 684	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
 685}
 686
 687#ifndef AT_FDCWD
 688#define AT_FDCWD	-100
 689#endif
 690
 691static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
 692					   struct syscall_arg *arg)
 693{
 694	int fd = arg->val;
 695	const char *prefix = "AT_FD";
 696
 697	if (fd == AT_FDCWD)
 698		return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
 699
 700	return syscall_arg__scnprintf_fd(bf, size, arg);
 701}
 702
 703#define SCA_FDAT syscall_arg__scnprintf_fd_at
 704
 705static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 706					      struct syscall_arg *arg);
 707
 708#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 709
 710size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
 711{
 712	return scnprintf(bf, size, "%#lx", arg->val);
 713}
 714
 715size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
 716{
 717	if (arg->val == 0)
 718		return scnprintf(bf, size, "NULL");
 719	return syscall_arg__scnprintf_hex(bf, size, arg);
 720}
 721
 722size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 723{
 724	return scnprintf(bf, size, "%d", arg->val);
 725}
 726
 727size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
 728{
 729	return scnprintf(bf, size, "%ld", arg->val);
 730}
 731
 732static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
 733{
 734	// XXX Hey, maybe for sched:sched_switch prev/next comm fields we can
 735	//     fill missing comms using thread__set_comm()...
 736	//     here or in a special syscall_arg__scnprintf_pid_sched_tp...
 737	return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
 738}
 739
 740#define SCA_CHAR_ARRAY syscall_arg__scnprintf_char_array
 741
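    /* Keep in enum bpf_cmd order: the strarray maps value to name by index. */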
 742static const char *bpf_cmd[] = {
 743	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
 744	"MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH",
 745	"PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID",
 746	"PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD",
 747	"PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID",
 748	"TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE",
 749	"BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH",
 750	"MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE",
 751	"LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE",
 752	"LINK_DETACH", "PROG_BIND_MAP",
 753};
 754static DEFINE_STRARRAY(bpf_cmd, "BPF_");
 755
 756static const char *fsmount_flags[] = {
 757	[1] = "CLOEXEC",
 758};
 759static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");
 760
 761#include "trace/beauty/generated/fsconfig_arrays.c"
 762
 763static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");
 764
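    /* EPOLL_CTL_ADD is 1, not 0, hence the offset variant of DEFINE_STRARRAY: */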
 765static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
 766static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
 767
 768static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
 769static DEFINE_STRARRAY(itimers, "ITIMER_");
 770
 771static const char *keyctl_options[] = {
 772	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
 773	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
 774	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
 775	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
 776	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 777};
 778static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
 779
 780static const char *whences[] = { "SET", "CUR", "END",
 781#ifdef SEEK_DATA
 782"DATA",
 783#endif
 784#ifdef SEEK_HOLE
 785"HOLE",
 786#endif
 787};
 788static DEFINE_STRARRAY(whences, "SEEK_");
 789
 790static const char *fcntl_cmds[] = {
 791	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
 792	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
 793	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
 794	"GETOWNER_UIDS",
 795};
 796static DEFINE_STRARRAY(fcntl_cmds, "F_");
 797
 798static const char *fcntl_linux_specific_cmds[] = {
 799	"SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
 800	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
 801	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
 802};
 803
 804static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
 805
 806static struct strarray *fcntl_cmds_arrays[] = {
 807	&strarray__fcntl_cmds,
 808	&strarray__fcntl_linux_specific_cmds,
 809};
 810
 811static DEFINE_STRARRAYS(fcntl_cmds_arrays);
 812
 813static const char *rlimit_resources[] = {
 814	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 815	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
 816	"RTTIME",
 817};
 818static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
 819
 820static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
 821static DEFINE_STRARRAY(sighow, "SIG_");
 822
 823static const char *clockid[] = {
 824	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
 825	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
 826	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 827};
 828static DEFINE_STRARRAY(clockid, "CLOCK_");
 829
 830static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
 831						 struct syscall_arg *arg)
 832{
 833	bool show_prefix = arg->show_string_prefix;
 834	const char *suffix = "_OK";
 835	size_t printed = 0;
 836	int mode = arg->val;
 837
 838	if (mode == F_OK) /* 0 */
 839		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
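    	/* Print each known bit as {R,W,X}_OK and clear it, leftover bits get dumped in hex below: */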
 840#define	P_MODE(n) \
 841	if (mode & n##_OK) { \
 842		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
 843		mode &= ~n##_OK; \
 844	}
 845
 846	P_MODE(R);
 847	P_MODE(W);
 848	P_MODE(X);
 849#undef P_MODE
 850
 851	if (mode)
 852		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
 853
 854	return printed;
 855}
 856
 857#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
 858
 859static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 860					      struct syscall_arg *arg);
 861
 862#define SCA_FILENAME syscall_arg__scnprintf_filename
 863
 864// 'argname' is documentation only at this point, replacing the comment that previously carried that info
 865#define SCA_FILENAME_FROM_USER(argname) \
 866	  { .scnprintf	= SCA_FILENAME, \
 867	    .from_user	= true, }
 868
 869static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg);
 870
 871#define SCA_BUF syscall_arg__scnprintf_buf
 872
 873static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 874						struct syscall_arg *arg)
 875{
 876	bool show_prefix = arg->show_string_prefix;
 877	const char *prefix = "O_";
 878	int printed = 0, flags = arg->val;
 879
 880#define	P_FLAG(n) \
 881	if (flags & O_##n) { \
 882		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
 883		flags &= ~O_##n; \
 884	}
 885
 886	P_FLAG(CLOEXEC);
 887	P_FLAG(NONBLOCK);
 888#undef P_FLAG
 889
 890	if (flags)
 891		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 892
 893	return printed;
 894}
 895
 896#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 897
 898#ifndef GRND_NONBLOCK
 899#define GRND_NONBLOCK	0x0001
 900#endif
 901#ifndef GRND_RANDOM
 902#define GRND_RANDOM	0x0002
 903#endif
 904
 905static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 906						   struct syscall_arg *arg)
 907{
 908	bool show_prefix = arg->show_string_prefix;
 909	const char *prefix = "GRND_";
 910	int printed = 0, flags = arg->val;
 911
 912#define	P_FLAG(n) \
 913	if (flags & GRND_##n) { \
 914		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
 915		flags &= ~GRND_##n; \
 916	}
 917
 918	P_FLAG(RANDOM);
 919	P_FLAG(NONBLOCK);
 920#undef P_FLAG
 921
 922	if (flags)
 923		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
 924
 925	return printed;
 926}
 927
 928#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 929
 930#ifdef HAVE_LIBBPF_SUPPORT
 931static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
 932{
 933	int id;
 934
 935	type = strstr(type, "enum ");
 936	if (type == NULL)
 937		return;
 938
 939	type += 5; // skip "enum " to get the enumeration name
 940
 941	id = btf__find_by_name(btf, type);
 942	if (id < 0)
 943		return;
 944
 945	arg_fmt->type = btf__type_by_id(btf, id);
 946}
 947
 948static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
 949{
 950	const struct btf_type *bt = arg->fmt->type;
 951	struct btf *btf = arg->trace->btf;
 952	struct btf_enum *be = btf_enum(bt);
 953
 954	for (int i = 0; i < btf_vlen(bt); ++i, ++be) {
 955		const char *name = btf__name_by_offset(btf, be->name_off);
 956		int max_len = max(size, strlen(name));
 957
 958		if (strncmp(name, bf, max_len) == 0) {
 959			*val = be->val;
 960			return true;
 961		}
 962	}
 963
 964	return false;
 965}
 966
 967static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
 968{
 969	const struct btf_type *bt;
 970	char *type = arg->type_name;
 971	struct btf *btf;
 972
 973	trace__load_vmlinux_btf(arg->trace);
 974
 975	btf = arg->trace->btf;
 976	if (btf == NULL)
 977		return false;
 978
 979	if (arg->fmt->type == NULL) {
 980		// See if this is an enum
 981		syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type);
 982	}
 983
 984	// Now let's see if we have a BTF type resolved
 985	bt = arg->fmt->type;
 986	if (bt == NULL)
 987		return false;
 988
 989	// If it is an enum:
 990	if (btf_is_enum(arg->fmt->type))
 991		return syscall_arg__strtoul_btf_enum(bf, size, arg, val);
 992
 993	return false;
 994}
 995
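    /* Linear-scan the BTF enum's members for one whose value matches and print its name: */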
 996static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val)
 997{
 998	struct btf_enum *be = btf_enum(type);
 999	const int nr_entries = btf_vlen(type);
1000
1001	for (int i = 0; i < nr_entries; ++i, ++be) {
1002		if (be->val == val) {
1003			return scnprintf(bf, size, "%s",
1004					 btf__name_by_offset(btf, be->name_off));
1005		}
1006	}
1007
1008	return 0;
1009}
1010
1011struct trace_btf_dump_snprintf_ctx {
1012	char   *bf;
1013	size_t printed, size;
1014};
1015
1016static void trace__btf_dump_snprintf(void *vctx, const char *fmt, va_list args)
1017{
1018	struct trace_btf_dump_snprintf_ctx *ctx = vctx;
1019
1020	ctx->printed += vscnprintf(ctx->bf + ctx->printed, ctx->size - ctx->printed, fmt, args);
1021}
1022
1023static size_t btf_struct_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, struct syscall_arg *arg)
1024{
1025	struct trace_btf_dump_snprintf_ctx ctx = {
1026		.bf   = bf,
1027		.size = size,
1028	};
1029	struct augmented_arg *augmented_arg;
1030	int type_id, consumed;
1031	struct btf_dump *btf_dump;
1032
1033	LIBBPF_OPTS(btf_dump_opts, dump_opts);
1034	LIBBPF_OPTS(btf_dump_type_data_opts, dump_data_opts);
1035
1036	if (arg == NULL || arg->augmented.args == NULL)
1037		return 0;
1038
    	/* Only dereference 'arg' now that we know it is usable: */
    	augmented_arg = arg->augmented.args;
    	type_id = arg->fmt->type_id;
    
1039	dump_data_opts.compact	  = true;
1040	dump_data_opts.skip_names = !arg->trace->show_arg_names;
1041
1042	btf_dump = btf_dump__new(btf, trace__btf_dump_snprintf, &ctx, &dump_opts);
1043	if (btf_dump == NULL)
1044		return 0;
1045
1046	/* pretty print the struct data here */
1047	if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0) {
1048		btf_dump__free(btf_dump); /* don't leak the dumper on the failure path */
    		return 0;
    	}
1049
1050	consumed = sizeof(*augmented_arg) + augmented_arg->size;
1051	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1052	arg->augmented.size -= consumed;
1053
1054	btf_dump__free(btf_dump);
1055
1056	return ctx.printed;
1057}
1058
1059static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf,
1060				   size_t size, int val, char *type)
1061{
1062	struct syscall_arg_fmt *arg_fmt = arg->fmt;
1063
1064	if (trace->btf == NULL)
1065		return 0;
1066
1067	if (arg_fmt->type == NULL) {
1068		// Check if this is an enum and if we have the BTF type for it.
1069		syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type);
1070	}
1071
1072	// Did we manage to find a BTF type for the syscall/tracepoint argument?
1073	if (arg_fmt->type == NULL)
1074		return 0;
1075
1076	if (btf_is_enum(arg_fmt->type))
1077		return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val);
1078	else if (btf_is_struct(arg_fmt->type) || btf_is_union(arg_fmt->type))
1079		return btf_struct_scnprintf(arg_fmt->type, trace->btf, bf, size, arg);
1080
1081	return 0;
1082}
1083
1084#else // HAVE_LIBBPF_SUPPORT
1085static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused,
1086				   char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused,
1087				   char *type __maybe_unused)
1088{
1089	return 0;
1090}
1091
1092static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused,
1093					  struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused)
1094{
1095	return false;
1096}
1097#endif // HAVE_LIBBPF_SUPPORT
1098
1099#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type
1100
1101#define STRARRAY(name, array) \
1102	  { .scnprintf	= SCA_STRARRAY, \
1103	    .strtoul	= STUL_STRARRAY, \
1104	    .parm	= &strarray__##array, }
1105
1106#define STRARRAY_FLAGS(name, array) \
1107	  { .scnprintf	= SCA_STRARRAY_FLAGS, \
1108	    .strtoul	= STUL_STRARRAY_FLAGS, \
1109	    .parm	= &strarray__##array, }
1110
1111#include "trace/beauty/arch_errno_names.c"
1112#include "trace/beauty/eventfd.c"
1113#include "trace/beauty/futex_op.c"
1114#include "trace/beauty/futex_val3.c"
1115#include "trace/beauty/mmap.c"
1116#include "trace/beauty/mode_t.c"
1117#include "trace/beauty/msg_flags.c"
1118#include "trace/beauty/open_flags.c"
1119#include "trace/beauty/perf_event_open.c"
1120#include "trace/beauty/pid.c"
1121#include "trace/beauty/sched_policy.c"
1122#include "trace/beauty/seccomp.c"
1123#include "trace/beauty/signum.c"
1124#include "trace/beauty/socket_type.c"
1125#include "trace/beauty/waitid_options.c"
1126
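    /*
     * Keep this table sorted by syscall name: syscall_fmt__find() does a
     * bsearch() over it.
     */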
1127static const struct syscall_fmt syscall_fmts[] = {
1128	{ .name	    = "access",
1129	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
1130	{ .name	    = "arch_prctl",
1131	  .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
1132		   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
1133	{ .name	    = "bind",
1134	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
1135		   [1] = SCA_SOCKADDR_FROM_USER(umyaddr),
1136		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
1137	{ .name	    = "bpf",
1138	  .arg = { [0] = STRARRAY(cmd, bpf_cmd),
1139		   [1] = { .from_user = true /* attr */, }, } },
1140	{ .name	    = "brk",	    .hexret = true,
1141	  .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
1142	{ .name     = "clock_gettime",
1143	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
1144	{ .name	    = "clock_nanosleep",
1145	  .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, },
1146	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
1147	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
1148		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
1149		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
1150		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
1151		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
1152	{ .name	    = "close",
1153	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
1154	{ .name	    = "connect",
1155	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
1156		   [1] = SCA_SOCKADDR_FROM_USER(servaddr),
1157		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
1158	{ .name	    = "epoll_ctl",
1159	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
1160	{ .name	    = "eventfd2",
1161	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
1162	{ .name     = "faccessat",
1163	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dirfd */ },
1164		   [1] = SCA_FILENAME_FROM_USER(pathname),
1165		   [2] = { .scnprintf = SCA_ACCMODE,	  /* mode */ }, }, },
1166	{ .name     = "faccessat2",
1167	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dirfd */ },
1168		   [1] = SCA_FILENAME_FROM_USER(pathname),
1169		   [2] = { .scnprintf = SCA_ACCMODE,	  /* mode */ },
1170		   [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, },
1171	{ .name	    = "fchmodat",
1172	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1173	{ .name	    = "fchownat",
1174	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1175	{ .name	    = "fcntl",
1176	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD,  /* cmd */
1177			   .strtoul   = STUL_STRARRAYS,
1178			   .parm      = &strarrays__fcntl_cmds_arrays,
1179			   .show_zero = true, },
1180		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
1181	{ .name	    = "flock",
1182	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
1183	{ .name     = "fsconfig",
1184	  .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
1185	{ .name     = "fsmount",
1186	  .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
1187		   [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
1188	{ .name     = "fspick",
1189	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dfd */ },
1190		   [1] = SCA_FILENAME_FROM_USER(path),
1191		   [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
1192	{ .name	    = "fstat", .alias = "newfstat", },
1193	{ .name	    = "futex",
1194	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
1195		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
1196	{ .name	    = "futimesat",
1197	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1198	{ .name	    = "getitimer",
1199	  .arg = { [0] = STRARRAY(which, itimers), }, },
1200	{ .name	    = "getpid",	    .errpid = true, },
1201	{ .name	    = "getpgid",    .errpid = true, },
1202	{ .name	    = "getppid",    .errpid = true, },
1203	{ .name	    = "getrandom",
1204	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
1205	{ .name	    = "getrlimit",
1206	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
1207	{ .name	    = "getsockopt",
1208	  .arg = { [1] = STRARRAY(level, socket_level), }, },
1209	{ .name	    = "gettid",	    .errpid = true, },
1210	{ .name	    = "ioctl",
1211	  .arg = {
1212#if defined(__i386__) || defined(__x86_64__)
1213/*
1214 * FIXME: Make this available to all arches.
1215 */
1216		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
1217		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
1218#else
1219		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
1220#endif
1221	{ .name	    = "kcmp",	    .nr_args = 5,
1222	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
1223		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
1224		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
1225		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
1226		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
1227	{ .name	    = "keyctl",
1228	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
1229	{ .name	    = "kill",
1230	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1231	{ .name	    = "linkat",
1232	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1233	{ .name	    = "lseek",
1234	  .arg = { [2] = STRARRAY(whence, whences), }, },
1235	{ .name	    = "lstat", .alias = "newlstat", },
1236	{ .name     = "madvise",
1237	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
1238		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
1239	{ .name	    = "mkdirat",
1240	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1241	{ .name	    = "mknodat",
1242	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1243	{ .name	    = "mmap",	    .hexret = true,
1244/* The standard mmap maps to old_mmap on s390x */
1245#if defined(__s390x__)
1246	.alias = "old_mmap",
1247#endif
1248	  .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
1249		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */
1250			   .strtoul   = STUL_STRARRAY_FLAGS,
1251			   .parm      = &strarray__mmap_flags, },
1252		   [5] = { .scnprintf = SCA_HEX,	/* offset */ }, }, },
1253	{ .name	    = "mount",
1254	  .arg = { [0] = SCA_FILENAME_FROM_USER(devname),
1255		   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
1256			   .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
1257	{ .name	    = "move_mount",
1258	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* from_dfd */ },
1259		   [1] = SCA_FILENAME_FROM_USER(pathname),
1260		   [2] = { .scnprintf = SCA_FDAT,	/* to_dfd */ },
1261		   [3] = SCA_FILENAME_FROM_USER(pathname),
1262		   [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
1263	{ .name	    = "mprotect",
1264	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
1265		   [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, },
1266	{ .name	    = "mq_unlink",
1267	  .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, },
1268	{ .name	    = "mremap",	    .hexret = true,
1269	  .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
1270	{ .name	    = "name_to_handle_at",
1271	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1272	{ .name	    = "nanosleep",
1273	  .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, },
1274	{ .name	    = "newfstatat", .alias = "fstatat",
1275	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dirfd */ },
1276		   [1] = SCA_FILENAME_FROM_USER(pathname),
1277		   [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
1278	{ .name	    = "open",
1279	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1280	{ .name	    = "open_by_handle_at",
1281	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
1282		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1283	{ .name	    = "openat",
1284	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
1285		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1286	{ .name	    = "perf_event_open",
1287	  .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr),
1288		   [2] = { .scnprintf = SCA_INT,	/* cpu */ },
1289		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
1290		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
1291	{ .name	    = "pipe2",
1292	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
1293	{ .name	    = "pkey_alloc",
1294	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
1295	{ .name	    = "pkey_free",
1296	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
1297	{ .name	    = "pkey_mprotect",
1298	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
1299		   [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
1300		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
1301	{ .name	    = "poll", .timeout = true, },
1302	{ .name	    = "ppoll", .timeout = true, },
1303	{ .name	    = "prctl",
1304	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */
1305			   .strtoul   = STUL_STRARRAY,
1306			   .parm      = &strarray__prctl_options, },
1307		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
1308		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
1309	{ .name	    = "pread", .alias = "pread64", },
1310	{ .name	    = "preadv", .alias = "pread", },
1311	{ .name	    = "prlimit64",
1312	  .arg = { [1] = STRARRAY(resource, rlimit_resources),
1313		   [2] = { .from_user = true /* new_rlim */, }, }, },
1314	{ .name	    = "pwrite", .alias = "pwrite64", },
1315	{ .name	    = "readlinkat",
1316	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1317	{ .name	    = "recvfrom",
1318	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1319	{ .name	    = "recvmmsg",
1320	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1321	{ .name	    = "recvmsg",
1322	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1323	{ .name	    = "renameat",
1324	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1325		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
1326	{ .name	    = "renameat2",
1327	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1328		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
1329		   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
1330	{ .name	    = "rseq",	    .errpid = true,
1331	  .arg = { [0] = { .from_user = true /* rseq */, }, }, },
1332	{ .name	    = "rt_sigaction",
1333	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1334	{ .name	    = "rt_sigprocmask",
1335	  .arg = { [0] = STRARRAY(how, sighow), }, },
1336	{ .name	    = "rt_sigqueueinfo",
1337	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1338	{ .name	    = "rt_tgsigqueueinfo",
1339	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1340	{ .name	    = "sched_setscheduler",
1341	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
1342	{ .name	    = "seccomp",
1343	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
1344		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
1345	{ .name	    = "select", .timeout = true, },
1346	{ .name	    = "sendfile", .alias = "sendfile64", },
1347	{ .name	    = "sendmmsg",
1348	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1349	{ .name	    = "sendmsg",
1350	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1351	{ .name	    = "sendto",
1352	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
1353		   [4] = SCA_SOCKADDR_FROM_USER(addr), }, },
1354	{ .name	    = "set_robust_list",	    .errpid = true,
1355	  .arg = { [0] = { .from_user = true /* head */, }, }, },
1356	{ .name	    = "set_tid_address", .errpid = true, },
1357	{ .name	    = "setitimer",
1358	  .arg = { [0] = STRARRAY(which, itimers), }, },
1359	{ .name	    = "setrlimit",
1360	  .arg = { [0] = STRARRAY(resource, rlimit_resources),
1361		   [1] = { .from_user = true /* rlim */, }, }, },
1362	{ .name	    = "setsockopt",
1363	  .arg = { [1] = STRARRAY(level, socket_level), }, },
1364	{ .name	    = "socket",
1365	  .arg = { [0] = STRARRAY(family, socket_families),
1366		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1367		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1368	{ .name	    = "socketpair",
1369	  .arg = { [0] = STRARRAY(family, socket_families),
1370		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1371		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1372	{ .name	    = "stat", .alias = "newstat", },
1373	{ .name	    = "statx",
1374	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
1375		   [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } ,
1376		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
1377	{ .name	    = "swapoff",
1378	  .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
1379	{ .name	    = "swapon",
1380	  .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
1381	{ .name	    = "symlinkat",
1382	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1383	{ .name	    = "sync_file_range",
1384	  .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
1385	{ .name	    = "tgkill",
1386	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1387	{ .name	    = "tkill",
1388	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1389	{ .name     = "umount2", .alias = "umount",
1390	  .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, },
1391	{ .name	    = "uname", .alias = "newuname", },
1392	{ .name	    = "unlinkat",
1393	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dfd */ },
1394		   [1] = SCA_FILENAME_FROM_USER(pathname),
1395		   [2] = { .scnprintf = SCA_FS_AT_FLAGS,  /* flags */ }, }, },
1396	{ .name	    = "utimensat",
1397	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
1398	{ .name	    = "wait4",	    .errpid = true,
1399	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1400	{ .name	    = "waitid",	    .errpid = true,
1401	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1402	{ .name	    = "write",
1403	  .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
1404};
1405
1406static int syscall_fmt__cmp(const void *name, const void *fmtp)
1407{
1408	const struct syscall_fmt *fmt = fmtp;
1409	return strcmp(name, fmt->name);
1410}
1411
1412static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts,
1413						     const int nmemb,
1414						     const char *name)
1415{
1416	return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1417}
1418
1419static const struct syscall_fmt *syscall_fmt__find(const char *name)
1420{
1421	const int nmemb = ARRAY_SIZE(syscall_fmts);
1422	return __syscall_fmt__find(syscall_fmts, nmemb, name);
1423}
1424
1425static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts,
1426							      const int nmemb, const char *alias)
1427{
1428	int i;
1429
1430	for (i = 0; i < nmemb; ++i) {
1431		if (fmts[i].alias && strcmp(fmts[i].alias, alias) == 0)
1432			return &fmts[i];
1433	}
1434
1435	return NULL;
1436}
1437
1438static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
1439{
1440	const int nmemb = ARRAY_SIZE(syscall_fmts);
1441	return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
1442}
1443
1444/*
1445 * is_exit: is this "exit" or "exit_group"?
1446 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
1447 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
1448 * nonexistent: Just a hole in the syscall table, syscall id not allocated
1449 */
1450struct syscall {
1451	struct tep_event    *tp_format;
1452	int		    nr_args;
1453	int		    args_size;
1454	struct {
1455		struct bpf_program *sys_enter,
1456				   *sys_exit;
1457	}		    bpf_prog;
1458	bool		    is_exit;
1459	bool		    is_open;
1460	bool		    nonexistent;
1461	bool		    use_btf;
1462	struct tep_format_field *args;
1463	const char	    *name;
1464	const struct syscall_fmt  *fmt;
1465	struct syscall_arg_fmt *arg_fmt;
1466};
1467
1468/*
1469 * We need this 'calculated' boolean because in some cases we really don't
1470 * know the duration of a syscall, for instance, when we start a session and
1471 * some threads are already waiting for a syscall to finish, say 'poll'. In
1472 * that case all we can do is print "( ? )" for the duration and for the
1473 * start timestamp.
1474 */
1475static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
1476{
1477	double duration = (double)t / NSEC_PER_MSEC;
1478	size_t printed = fprintf(fp, "(");
1479
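    	/* Color by cost: >= 1ms in red, >= 0.01ms in yellow, anything faster uncolored: */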
1480	if (!calculated)
1481		printed += fprintf(fp, "         ");
1482	else if (duration >= 1.0)
1483		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1484	else if (duration >= 0.01)
1485		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1486	else
1487		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1488	return printed + fprintf(fp, "): ");
1489}
1490
1491/**
1492 * filename.ptr: The filename char pointer that will be vfs_getname'd
1493 * filename.entry_str_pos: Where to insert the string translated from
1494 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1495 * ret_scnprintf: syscall args may set this to a different syscall return
1496 *                formatter, for instance, fcntl may return fds, file flags, etc.
1497 */
1498struct thread_trace {
1499	u64		  entry_time;
1500	bool		  entry_pending;
1501	unsigned long	  nr_events;
1502	unsigned long	  pfmaj, pfmin;
1503	char		  *entry_str;
1504	double		  runtime_ms;
1505	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1506	struct {
1507		unsigned long ptr;
1508		short int     entry_str_pos;
1509		bool	      pending_open;
1510		unsigned int  namelen;
1511		char	      *name;
1512	} filename;
1513	struct {
1514		int	      max;
1515		struct file   *table;
1516	} files;
1517
1518	struct intlist *syscall_stats;
1519};
1520
1521static struct thread_trace *thread_trace__new(void)
1522{
1523	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1524
1525	if (ttrace) {
1526		ttrace->files.max = -1;
1527		ttrace->syscall_stats = intlist__new(NULL);
1528	}
1529
1530	return ttrace;
1531}
1532
1533static void thread_trace__free_files(struct thread_trace *ttrace);
1534
1535static void thread_trace__delete(void *pttrace)
1536{
1537	struct thread_trace *ttrace = pttrace;
1538
1539	if (!ttrace)
1540		return;
1541
1542	intlist__delete(ttrace->syscall_stats);
1543	ttrace->syscall_stats = NULL;
1544	thread_trace__free_files(ttrace);
1545	zfree(&ttrace->entry_str);
1546	free(ttrace);
1547}
1548
1549static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1550{
1551	struct thread_trace *ttrace;
1552
1553	if (thread == NULL)
1554		goto fail;
1555
1556	if (thread__priv(thread) == NULL)
1557		thread__set_priv(thread, thread_trace__new());
1558
1559	if (thread__priv(thread) == NULL)
1560		goto fail;
1561
1562	ttrace = thread__priv(thread);
1563	++ttrace->nr_events;
1564
1565	return ttrace;
1566fail:
1567	color_fprintf(fp, PERF_COLOR_RED,
1568		      "WARNING: not enough memory, dropping samples!\n");
1569	return NULL;
1570}
1571
1572
1573void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1574				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1575{
1576	struct thread_trace *ttrace = thread__priv(arg->thread);
1577
1578	ttrace->ret_scnprintf = ret_scnprintf;
1579}
1580
1581#define TRACE_PFMAJ		(1 << 0)
1582#define TRACE_PFMIN		(1 << 1)
1583
1584static const size_t trace__entry_str_size = 2048;
1585
1586static void thread_trace__free_files(struct thread_trace *ttrace)
1587{
1588	for (int i = 0; i < ttrace->files.max; ++i) {
1589		struct file *file = ttrace->files.table + i;
1590		zfree(&file->pathname);
1591	}
1592
1593	zfree(&ttrace->files.table);
1594	ttrace->files.max  = -1;
1595}
1596
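    /* Grow the per-thread fd -> struct file table on demand, zeroing any new slots: */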
1597static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1598{
1599	if (fd < 0)
1600		return NULL;
1601
1602	if (fd > ttrace->files.max) {
1603		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1604
1605		if (nfiles == NULL)
1606			return NULL;
1607
1608		if (ttrace->files.max != -1) {
1609			memset(nfiles + ttrace->files.max + 1, 0,
1610			       (fd - ttrace->files.max) * sizeof(struct file));
1611		} else {
1612			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1613		}
1614
1615		ttrace->files.table = nfiles;
1616		ttrace->files.max   = fd;
1617	}
1618
1619	return ttrace->files.table + fd;
1620}
1621
1622struct file *thread__files_entry(struct thread *thread, int fd)
1623{
1624	return thread_trace__files_entry(thread__priv(thread), fd);
1625}
1626
1627static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1628{
1629	struct thread_trace *ttrace = thread__priv(thread);
1630	struct file *file = thread_trace__files_entry(ttrace, fd);
1631
1632	if (file != NULL) {
1633		struct stat st;
1634		if (stat(pathname, &st) == 0)
1635			file->dev_maj = major(st.st_rdev);
1636		file->pathname = strdup(pathname);
1637		if (file->pathname)
1638			return 0;
1639	}
1640
1641	return -1;
1642}
1643
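    /*
     * Fall back to readlink("/proc/<pid>[/task/<tid>]/fd/<fd>") when we didn't
     * catch the open syscall that created this fd:
     */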
1644static int thread__read_fd_path(struct thread *thread, int fd)
1645{
1646	char linkname[PATH_MAX], pathname[PATH_MAX];
1647	struct stat st;
1648	int ret;
1649
1650	if (thread__pid(thread) == thread__tid(thread)) {
1651		scnprintf(linkname, sizeof(linkname),
1652			  "/proc/%d/fd/%d", thread__pid(thread), fd);
1653	} else {
1654		scnprintf(linkname, sizeof(linkname),
1655			  "/proc/%d/task/%d/fd/%d",
1656			  thread__pid(thread), thread__tid(thread), fd);
1657	}
1658
1659	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1660		return -1;
1661
1662	ret = readlink(linkname, pathname, sizeof(pathname));
1663
1664	if (ret < 0 || ret > st.st_size)
1665		return -1;
1666
1667	pathname[ret] = '\0';
1668	return trace__set_fd_pathname(thread, fd, pathname);
1669}
1670
1671static const char *thread__fd_path(struct thread *thread, int fd,
1672				   struct trace *trace)
1673{
1674	struct thread_trace *ttrace = thread__priv(thread);
1675
1676	if (ttrace == NULL || trace->fd_path_disabled)
1677		return NULL;
1678
1679	if (fd < 0)
1680		return NULL;
1681
1682	if (fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL) {
1683		if (!trace->live)
1684			return NULL;
1685		++trace->stats.proc_getname;
1686		if (thread__read_fd_path(thread, fd))
1687			return NULL;
1688	}
1689
1690	return ttrace->files.table[fd].pathname;
1691}
1692
1693size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1694{
1695	int fd = arg->val;
1696	size_t printed = scnprintf(bf, size, "%d", fd);
1697	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1698
1699	if (path)
1700		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1701
1702	return printed;
1703}
1704
1705size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1706{
1707	size_t printed = scnprintf(bf, size, "%d", fd);
1708	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1709
1710	if (thread) {
1711		const char *path = thread__fd_path(thread, fd, trace);
1712
1713		if (path)
1714			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1715
1716		thread__put(thread);
1717	}
1718
1719	return printed;
1720}
1721
1722static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1723					      struct syscall_arg *arg)
1724{
1725	int fd = arg->val;
1726	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1727	struct thread_trace *ttrace = thread__priv(arg->thread);
1728
1729	if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1730		zfree(&ttrace->files.table[fd].pathname);
1731
1732	return printed;
1733}
1734
1735static void thread__set_filename_pos(struct thread *thread, const char *bf,
1736				     unsigned long ptr)
1737{
1738	struct thread_trace *ttrace = thread__priv(thread);
1739
1740	ttrace->filename.ptr = ptr;
1741	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1742}
1743
1744static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1745{
1746	struct augmented_arg *augmented_arg = arg->augmented.args;
1747	size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1748	/*
1749	 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1750	 * we would have two strings, each prefixed by its size.
1751	 */
1752	int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1753
1754	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1755	arg->augmented.size -= consumed;
1756
1757	return printed;
1758}
1759
1760static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1761					      struct syscall_arg *arg)
1762{
1763	unsigned long ptr = arg->val;
1764
1765	if (arg->augmented.args)
1766		return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1767
1768	if (!arg->trace->vfs_getname)
1769		return scnprintf(bf, size, "%#x", ptr);
1770
1771	thread__set_filename_pos(arg->thread, bf, ptr);
1772	return 0;
1773}
1774
1775#define MAX_CONTROL_CHAR 31
1776#define MAX_ASCII 127
1777
1778static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg)
1779{
1780	struct augmented_arg *augmented_arg = arg->augmented.args;
1781	unsigned char *orig;
1782	size_t printed = 0;
1783	int consumed;
1784
1785	if (augmented_arg == NULL)
1786		return 0;
    
    	orig = (unsigned char *)augmented_arg->value; /* safe only after the NULL check above */
1787
1788	for (int j = 0; j < augmented_arg->size; ++j) {
1789		bool control_char = orig[j] <= MAX_CONTROL_CHAR || orig[j] >= MAX_ASCII;
1790		/* print control characters (0-31 and 127) and non-ASCII bytes as \(decimal) */
1791		printed += scnprintf(bf + printed, size - printed, control_char ? "\\%d" : "%c", (int)orig[j]);
1792	}
1793
1794	consumed = sizeof(*augmented_arg) + augmented_arg->size;
1795	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1796	arg->augmented.size -= consumed;
1797
1798	return printed;
1799}
1800
1801static bool trace__filter_duration(struct trace *trace, double t)
1802{
1803	return t < (trace->duration_filter * NSEC_PER_MSEC);
1804}
1805
1806static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1807{
1808	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1809
1810	return fprintf(fp, "%10.3f ", ts);
1811}
1812
1813/*
1814 * We treat tstamp=0 as an undefined tstamp, i.e. like when we are using
1815 * ttrace->entry_time for a thread that receives a sys_exit without first
1816 * having received a sys_enter (a "poll" issued before the tracing session
1817 * started, or a sys_enter lost to ring buffer overflow).
1818 */
1819static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1820{
1821	if (tstamp > 0)
1822		return __trace__fprintf_tstamp(trace, tstamp, fp);
1823
1824	return fprintf(fp, "         ? ");
1825}
1826
1827static pid_t workload_pid = -1;
1828static volatile sig_atomic_t done = false;
1829static volatile sig_atomic_t interrupted = false;
1830
1831static void sighandler_interrupt(int sig __maybe_unused)
1832{
1833	done = interrupted = true;
1834}
1835
1836static void sighandler_chld(int sig __maybe_unused, siginfo_t *info,
1837			    void *context __maybe_unused)
1838{
1839	if (info->si_pid == workload_pid)
1840		done = true;
1841}
1842
1843static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1844{
1845	size_t printed = 0;
1846
1847	if (trace->multiple_threads) {
1848		if (trace->show_comm)
1849			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1850		printed += fprintf(fp, "%d ", thread__tid(thread));
1851	}
1852
1853	return printed;
1854}
1855
1856static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1857					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1858{
1859	size_t printed = 0;
1860
1861	if (trace->show_tstamp)
1862		printed = trace__fprintf_tstamp(trace, tstamp, fp);
1863	if (trace->show_duration)
1864		printed += fprintf_duration(duration, duration_calculated, fp);
1865	return printed + trace__fprintf_comm_tid(trace, thread, fp);
1866}
1867
1868static int trace__process_event(struct trace *trace, struct machine *machine,
1869				union perf_event *event, struct perf_sample *sample)
1870{
1871	int ret = 0;
1872
1873	switch (event->header.type) {
1874	case PERF_RECORD_LOST:
1875		color_fprintf(trace->output, PERF_COLOR_RED,
1876			      "LOST %" PRIu64 " events!\n", (u64)event->lost.lost);
1877		ret = machine__process_lost_event(machine, event, sample);
1878		break;
1879	default:
1880		ret = machine__process_event(machine, event, sample);
1881		break;
1882	}
1883
1884	return ret;
1885}
1886
1887static int trace__tool_process(const struct perf_tool *tool,
1888			       union perf_event *event,
1889			       struct perf_sample *sample,
1890			       struct machine *machine)
1891{
1892	struct trace *trace = container_of(tool, struct trace, tool);
1893	return trace__process_event(trace, machine, event, sample);
1894}
1895
1896static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1897{
1898	struct machine *machine = vmachine;
1899
1900	if (machine->kptr_restrict_warned)
1901		return NULL;
1902
1903	if (symbol_conf.kptr_restrict) {
1904		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1905			   "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1906			   "Kernel samples will not be resolved.\n");
1907		machine->kptr_restrict_warned = true;
1908		return NULL;
1909	}
1910
1911	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1912}
1913
1914static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
1915{
1916	int err = symbol__init(NULL);
1917
1918	if (err)
1919		return err;
1920
1921	trace->host = machine__new_host();
1922	if (trace->host == NULL)
1923		return -ENOMEM;
1924
1925	thread__set_priv_destructor(thread_trace__delete);
1926
1927	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1928	if (err < 0)
1929		goto out;
1930
1931	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1932					    evlist->core.threads, trace__tool_process,
1933					    true, false, 1);
1934out:
1935	if (err)
1936		symbol__exit();
1937
1938	return err;
1939}
1940
1941static void trace__symbols__exit(struct trace *trace)
1942{
1943	machine__exit(trace->host);
1944	trace->host = NULL;
1945
1946	symbol__exit();
1947}
1948
1949static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1950{
1951	int idx;
1952
1953	if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0)
1954		nr_args = sc->fmt->nr_args;
1955
1956	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1957	if (sc->arg_fmt == NULL)
1958		return -1;
1959
1960	for (idx = 0; idx < nr_args; ++idx) {
1961		if (sc->fmt)
1962			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1963	}
1964
1965	sc->nr_args = nr_args;
1966	return 0;
1967}
1968
1969static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = {
1970	{ .name = "msr",	.scnprintf = SCA_X86_MSR,	  .strtoul = STUL_X86_MSR,	   },
1971	{ .name = "vector",	.scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, },
1972};
1973
1974static int syscall_arg_fmt__cmp(const void *name, const void *fmtp)
1975{
1976	const struct syscall_arg_fmt *fmt = fmtp;
1977	return strcmp(name, fmt->name);
1978}
1979
1980static const struct syscall_arg_fmt *
1981__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb,
1982				const char *name)
1983{
1984	return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp);
1985}
1986
1987static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name)
1988{
1989	const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name);
1990	return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
1991}
1992
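    /*
     * Heuristically pick a pretty-printer for each argument from its type and
     * name: filenames, pointers, pids, modes, fds, char arrays, BTF enums, etc.
     */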
1993static struct tep_format_field *
1994syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
1995			    bool *use_btf)
1996{
1997	struct tep_format_field *last_field = NULL;
1998	int len;
1999
2000	for (; field; field = field->next, ++arg) {
2001		last_field = field;
2002
2003		if (arg->scnprintf)
2004			continue;
2005
2006		len = strlen(field->name);
2007
2008		// As far as heuristics (or intention) go, this seems to hold true and makes sense!
2009		if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const "))
2010			arg->from_user = true;
2011
2012		if (strcmp(field->type, "const char *") == 0 &&
2013		    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
2014		     strstr(field->name, "path") != NULL)) {
2015			arg->scnprintf = SCA_FILENAME;
2016		} else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
2017			arg->scnprintf = SCA_PTR;
2018		else if (strcmp(field->type, "pid_t") == 0)
2019			arg->scnprintf = SCA_PID;
2020		else if (strcmp(field->type, "umode_t") == 0)
2021			arg->scnprintf = SCA_MODE_T;
2022		else if ((field->flags & TEP_FIELD_IS_ARRAY) && strstr(field->type, "char")) {
2023			arg->scnprintf = SCA_CHAR_ARRAY;
2024			arg->nr_entries = field->arraylen;
2025		} else if ((strcmp(field->type, "int") == 0 ||
2026			  strcmp(field->type, "unsigned int") == 0 ||
2027			  strcmp(field->type, "long") == 0) &&
2028			 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
2029			/*
2030			 * /sys/kernel/tracing/events/syscalls/sys_enter*
2031			 * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
2032			 * 65 int
2033			 * 23 unsigned int
2034			 * 7 unsigned long
2035			 */
2036			arg->scnprintf = SCA_FD;
2037		} else if (strstr(field->type, "enum") && use_btf != NULL) {
2038			*use_btf = true;
2039			arg->strtoul = STUL_BTF_TYPE;
2040		} else {
2041			const struct syscall_arg_fmt *fmt =
2042				syscall_arg_fmt__find_by_name(field->name);
2043
2044			if (fmt) {
2045				arg->scnprintf = fmt->scnprintf;
2046				arg->strtoul   = fmt->strtoul;
2047			}
2048		}
2049	}
2050
2051	return last_field;
2052}
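
/*
 * Illustrative outcome of the heuristics above for a hypothetical
 * "openat"-like tracepoint format:
 *
 *   field:int dfd;               -> SCA_FD       (int/uint/long + "fd" suffix)
 *   field:const char * filename; -> SCA_FILENAME ("const char *" + "name"/"path")
 *   field:umode_t mode;          -> SCA_MODE_T   (exact type match)
 *
 * Fields that already have a ->scnprintf set are left untouched.
 */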
2053
2054static int syscall__set_arg_fmts(struct syscall *sc)
2055{
2056	struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args,
2057									  &sc->use_btf);
2058
2059	if (last_field)
2060		sc->args_size = last_field->offset + last_field->size;
2061
2062	return 0;
2063}
2064
2065static int trace__read_syscall_info(struct trace *trace, int id)
2066{
2067	char tp_name[128];
2068	struct syscall *sc;
2069	const char *name = syscalltbl__name(trace->sctbl, id);
2070	int err;
2071
2072#ifdef HAVE_SYSCALL_TABLE_SUPPORT
2073	if (trace->syscalls.table == NULL) {
2074		trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
2075		if (trace->syscalls.table == NULL)
2076			return -ENOMEM;
2077	}
2078#else
2079	if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) {
2080		// When using libaudit we don't know beforehand what the max syscall id is
2081		struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
2082
2083		if (table == NULL)
2084			return -ENOMEM;
2085
2086		// Zero the whole table if brand new, otherwise just the entries past the old max_id
2087		if (trace->syscalls.table == NULL)
2088			memset(table, 0, (id + 1) * sizeof(*sc));
2089		else
2090			memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc));
2091
2092		trace->syscalls.table	      = table;
2093		trace->sctbl->syscalls.max_id = id;
2094	}
2095#endif
2096	sc = trace->syscalls.table + id;
2097	if (sc->nonexistent)
2098		return -EEXIST;
2099
2100	if (name == NULL) {
2101		sc->nonexistent = true;
2102		return -EEXIST;
2103	}
2104
2105	sc->name = name;
2106	sc->fmt  = syscall_fmt__find(sc->name);
2107
2108	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
2109	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2110
2111	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
2112		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
2113		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2114	}
2115
2116	/*
2117	 * If we fail to read the tracepoint format via its sysfs node, the
2118	 * tracepoint doesn't exist, so set the 'nonexistent' flag to true.
2119	 */
2120	if (IS_ERR(sc->tp_format)) {
2121		sc->nonexistent = true;
2122		return PTR_ERR(sc->tp_format);
2123	}
2124
2125	/*
2126	 * The tracepoint format contains __syscall_nr field, so it's one more
2127	 * than the actual number of syscall arguments.
2128	 */
2129	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
2130					RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields - 1))
2131		return -ENOMEM;
2132
2133	sc->args = sc->tp_format->format.fields;
2134	/*
2135	 * Check and discard the first field, '__syscall_nr' or 'nr' ('nr' is
2136	 * what older kernels use), which carries the syscall number and is
2137	 * needless here.
2138	 */
2139	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
2140		sc->args = sc->args->next;
2141		--sc->nr_args;
2142	}
2143
2144	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
2145	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
2146
2147	err = syscall__set_arg_fmts(sc);
2148
2149	/* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */
2150	if (sc->use_btf)
2151		trace__load_vmlinux_btf(trace);
2152
2153	return err;
2154}
2155
2156static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf)
2157{
2158	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
2159
2160	if (fmt != NULL) {
2161		syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, use_btf);
2162		return 0;
2163	}
2164
2165	return -ENOMEM;
2166}
2167
2168static int intcmp(const void *a, const void *b)
2169{
2170	const int *one = a, *another = b;
2171
2172	return *one - *another;
2173}
2174
2175static int trace__validate_ev_qualifier(struct trace *trace)
2176{
2177	int err = 0;
2178	bool printed_invalid_prefix = false;
2179	struct str_node *pos;
2180	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
2181
2182	trace->ev_qualifier_ids.entries = malloc(nr_allocated *
2183						 sizeof(trace->ev_qualifier_ids.entries[0]));
2184
2185	if (trace->ev_qualifier_ids.entries == NULL) {
2186		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
2187		       trace->output);
2188		err = -EINVAL;
2189		goto out;
2190	}
2191
2192	strlist__for_each_entry(pos, trace->ev_qualifier) {
2193		const char *sc = pos->s;
2194		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
2195
2196		if (id < 0) {
2197			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
2198			if (id >= 0)
2199				goto matches;
2200
2201			if (!printed_invalid_prefix) {
2202				pr_debug("Skipping unknown syscalls: ");
2203				printed_invalid_prefix = true;
2204			} else {
2205				pr_debug(", ");
2206			}
2207
2208			pr_debug("%s", sc);
2209			continue;
2210		}
2211matches:
2212		trace->ev_qualifier_ids.entries[nr_used++] = id;
2213		if (match_next == -1)
2214			continue;
2215
2216		while (1) {
2217			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
2218			if (id < 0)
2219				break;
2220			if (nr_allocated == nr_used) {
2221				void *entries;
2222
2223				nr_allocated += 8;
2224				entries = realloc(trace->ev_qualifier_ids.entries,
2225						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
2226				if (entries == NULL) {
2227					err = -ENOMEM;
2228					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
2229					goto out_free;
2230				}
2231				trace->ev_qualifier_ids.entries = entries;
2232			}
2233			trace->ev_qualifier_ids.entries[nr_used++] = id;
2234		}
2235	}
2236
2237	trace->ev_qualifier_ids.nr = nr_used;
2238	qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
2239out:
2240	if (printed_invalid_prefix)
2241		pr_debug("\n");
2242	return err;
2243out_free:
2244	zfree(&trace->ev_qualifier_ids.entries);
2245	trace->ev_qualifier_ids.nr = 0;
2246	goto out;
2247}
2248
2249static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
2250{
2251	bool in_ev_qualifier;
2252
2253	if (trace->ev_qualifier_ids.nr == 0)
2254		return true;
2255
2256	in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
2257				  trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
2258
2259	if (in_ev_qualifier)
2260	       return !trace->not_ev_qualifier;
2261
2262	return trace->not_ev_qualifier;
2263}
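
/*
 * Illustrative: with '-e open,close' only those two syscall ids make the
 * bsearch() above succeed, so just they are considered enabled; with the
 * qualifier negated (not_ev_qualifier == true) the result is inverted and
 * everything but those two is enabled.
 */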
2264
2265/*
2266 * args is to be interpreted as a series of longs but we need to handle
2267 * 8-byte unaligned accesses. args points to raw_data within the event
2268 * and raw_data is guaranteed not to be 8-byte aligned because it is
2269 * preceded by raw_size, which is a u32. So we need to copy args to a temp
2270 * variable to read it. Most notably this avoids extended load instructions
2271 * on unaligned addresses.
2272 */
2273unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
2274{
2275	unsigned long val;
2276	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
2277
2278	memcpy(&val, p, sizeof(val));
2279	return val;
2280}
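
/*
 * Illustrative: a direct dereference such as
 *
 *   val = ((unsigned long *)arg->args)[idx];
 *
 * could be compiled to loads that fault or trap on architectures requiring
 * naturally aligned accesses, hence the memcpy() above.
 */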
2281
2282static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
2283				      struct syscall_arg *arg)
2284{
2285	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
2286		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
2287
2288	return scnprintf(bf, size, "arg%d: ", arg->idx);
2289}
2290
2291/*
2292 * Check if the value is in fact zero, i.e. mask whatever needs masking,
2293 * such as the mount 'flags' argument, where some magic flag has to be
2294 * ignored, see the comment in tools/perf/trace/beauty/mount_flags.c
2295 */
2296static unsigned long syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsigned long val)
2297{
2298	if (fmt && fmt->mask_val)
2299		return fmt->mask_val(arg, val);
2300
2301	return val;
2302}
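
/*
 * Sketch of a ->mask_val callback (hypothetical, see
 * tools/perf/trace/beauty/mount_flags.c for the real mount 'flags' case):
 *
 *   static unsigned long example__mask_val(struct syscall_arg *arg __maybe_unused,
 *                                          unsigned long flags)
 *   {
 *           return flags & ~SOME_MAGIC_BIT; // drop a bit that isn't a real flag
 *   }
 */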
2303
2304static size_t syscall_arg_fmt__scnprintf_val(struct syscall_arg_fmt *fmt, char *bf, size_t size,
2305					     struct syscall_arg *arg, unsigned long val)
2306{
2307	if (fmt && fmt->scnprintf) {
2308		arg->val = val;
2309		if (fmt->parm)
2310			arg->parm = fmt->parm;
2311		return fmt->scnprintf(bf, size, arg);
2312	}
2313	return scnprintf(bf, size, "%ld", val);
2314}
2315
2316static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
2317				      unsigned char *args, void *augmented_args, int augmented_args_size,
2318				      struct trace *trace, struct thread *thread)
2319{
2320	size_t printed = 0, btf_printed;
2321	unsigned long val;
2322	u8 bit = 1;
2323	struct syscall_arg arg = {
2324		.args	= args,
2325		.augmented = {
2326			.size = augmented_args_size,
2327			.args = augmented_args,
2328		},
2329		.idx	= 0,
2330		.mask	= 0,
2331		.trace  = trace,
2332		.thread = thread,
2333		.show_string_prefix = trace->show_string_prefix,
2334	};
2335	struct thread_trace *ttrace = thread__priv(thread);
2336	void *default_scnprintf;
2337
2338	/*
2339	 * Things like fcntl will set this in its 'cmd' formatter to pick the
2340	 * right formatter for the return value (an fd? file flags?), which is
2341	 * not needed for syscalls that always return a given type, say an fd.
2342	 */
2343	ttrace->ret_scnprintf = NULL;
2344
2345	if (sc->args != NULL) {
2346		struct tep_format_field *field;
2347
2348		for (field = sc->args; field;
2349		     field = field->next, ++arg.idx, bit <<= 1) {
2350			if (arg.mask & bit)
2351				continue;
2352
2353			arg.fmt = &sc->arg_fmt[arg.idx];
2354			val = syscall_arg__val(&arg, arg.idx);
2355			/*
2356			 * Some syscall args need some mask, most don't and
2357			 * return val untouched.
2358			 */
2359			val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);
2360
2361			/*
2362			 * Suppress this argument if its value is zero and show_zero
2363			 * property isn't set.
2364			 *
2365			 * If it has a BTF type, then override the zero suppression knob
2366			 * as the common case is for zero in an enum to have an associated entry.
2367			 */
2368			if (val == 0 && !trace->show_zeros &&
2369			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) &&
2370			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE))
2371				continue;
2372
2373			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
2374
2375			if (trace->show_arg_names)
2376				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
2377
2378			default_scnprintf = sc->arg_fmt[arg.idx].scnprintf;
2379
2380			if (trace->force_btf || default_scnprintf == NULL || default_scnprintf == SCA_PTR) {
2381				btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed,
2382								   size - printed, val, field->type);
2383				if (btf_printed) {
2384					printed += btf_printed;
2385					continue;
2386				}
2387			}
2388
2389			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
2390								  bf + printed, size - printed, &arg, val);
2391		}
2392	} else if (IS_ERR(sc->tp_format)) {
2393		/*
2394		 * If we managed to read the tracepoint /format file, then we
2395		 * may end up not having any args, like with gettid(), so only
2396		 * print the raw args when we didn't manage to read it.
2397		 */
2398		while (arg.idx < sc->nr_args) {
2399			if (arg.mask & bit)
2400				goto next_arg;
2401			val = syscall_arg__val(&arg, arg.idx);
2402			if (printed)
2403				printed += scnprintf(bf + printed, size - printed, ", ");
2404			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
2405			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
2406next_arg:
2407			++arg.idx;
2408			bit <<= 1;
2409		}
2410	}
2411
2412	return printed;
2413}
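
/*
 * Illustrative output of the formatting above for an openat() call, with
 * trace->show_arg_names set (values are examples):
 *
 *   dfd: CWD, filename: "/etc/passwd", flags: RDONLY|CLOEXEC
 *
 * Zero-valued args are suppressed unless ->show_zero is set or the arg has
 * a BTF enum type.
 */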
2414
2415typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
2416				  union perf_event *event,
2417				  struct perf_sample *sample);
2418
2419static struct syscall *trace__syscall_info(struct trace *trace,
2420					   struct evsel *evsel, int id)
2421{
2422	int err = 0;
2423
2424	if (id < 0) {
2425
2426		/*
2427		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
2428		 * before that, leaving at a higher verbosity level till that is
2429		 * explained. Reproduced with plain ftrace with:
2430		 *
2431		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
2432		 * grep "NR -1 " /t/trace_pipe
2433		 *
2434		 * After generating some load on the machine.
2435		 */
2436		if (verbose > 1) {
2437			static u64 n;
2438			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
2439				id, evsel__name(evsel), ++n);
2440		}
2441		return NULL;
2442	}
2443
2444	err = -EINVAL;
2445
2446#ifdef HAVE_SYSCALL_TABLE_SUPPORT
2447	if (id > trace->sctbl->syscalls.max_id) {
2448#else
2449	if (id >= trace->sctbl->syscalls.max_id) {
2450		/*
2451		 * With libaudit we don't know beforehand what is the max_id,
2452		 * so we let trace__read_syscall_info() figure that out as we
2453		 * go on reading syscalls.
2454		 */
2455		err = trace__read_syscall_info(trace, id);
2456		if (err)
2457#endif
2458		goto out_cant_read;
2459	}
2460
2461	if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
2462	    (err = trace__read_syscall_info(trace, id)) != 0)
2463		goto out_cant_read;
2464
2465	if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
2466		goto out_cant_read;
2467
2468	return &trace->syscalls.table[id];
2469
2470out_cant_read:
2471	if (verbose > 0) {
2472		char sbuf[STRERR_BUFSIZE];
2473		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
2474		if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
2475			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
2476		fputs(" information\n", trace->output);
2477	}
2478	return NULL;
2479}
2480
2481struct syscall_stats {
2482	struct stats stats;
2483	u64	     nr_failures;
2484	int	     max_errno;
2485	u32	     *errnos;
2486};
2487
2488static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace,
2489				 int id, struct perf_sample *sample, long err, bool errno_summary)
2490{
2491	struct int_node *inode;
2492	struct syscall_stats *stats;
2493	u64 duration = 0;
2494
2495	inode = intlist__findnew(ttrace->syscall_stats, id);
2496	if (inode == NULL)
2497		return;
2498
2499	stats = inode->priv;
2500	if (stats == NULL) {
2501		stats = zalloc(sizeof(*stats));
2502		if (stats == NULL)
2503			return;
2504
2505		init_stats(&stats->stats);
2506		inode->priv = stats;
2507	}
2508
2509	if (ttrace->entry_time && sample->time > ttrace->entry_time)
2510		duration = sample->time - ttrace->entry_time;
2511
2512	update_stats(&stats->stats, duration);
2513
2514	if (err < 0) {
2515		++stats->nr_failures;
2516
2517		if (!errno_summary)
2518			return;
2519
2520		err = -err;
2521		if (err > stats->max_errno) {
2522			u32 *new_errnos = realloc(stats->errnos, err * sizeof(u32));
2523
2524			if (new_errnos) {
2525				memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32));
2526			} else {
2527				pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n",
2528					 thread__comm_str(thread), thread__pid(thread),
2529					 thread__tid(thread));
2530				return;
2531			}
2532
2533			stats->errnos = new_errnos;
2534			stats->max_errno = err;
2535		}
2536
2537		++stats->errnos[err - 1];
2538	}
2539}
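
/*
 * Illustrative: a syscall failing with -EAGAIN (errno 11) bumps
 * stats->errnos[10]; the array is grown lazily via realloc(), with
 * stats->max_errno tracking the largest errno seen so far for this
 * syscall on this thread.
 */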
2540
2541static int trace__printf_interrupted_entry(struct trace *trace)
2542{
2543	struct thread_trace *ttrace;
2544	size_t printed;
2545	int len;
2546
2547	if (trace->failure_only || trace->current == NULL)
2548		return 0;
2549
2550	ttrace = thread__priv(trace->current);
2551
2552	if (!ttrace->entry_pending)
2553		return 0;
2554
2555	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
2556	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
2557
2558	if (len < trace->args_alignment - 4)
2559		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
2560
2561	printed += fprintf(trace->output, " ...\n");
2562
2563	ttrace->entry_pending = false;
2564	++trace->nr_events_printed;
2565
2566	return printed;
2567}
2568
2569static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
2570				 struct perf_sample *sample, struct thread *thread)
2571{
2572	int printed = 0;
2573
2574	if (trace->print_sample) {
2575		double ts = (double)sample->time / NSEC_PER_MSEC;
2576
2577		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
2578				   evsel__name(evsel), ts,
2579				   thread__comm_str(thread),
2580				   sample->pid, sample->tid, sample->cpu);
2581	}
2582
2583	return printed;
2584}
2585
2586static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
2587{
2588	void *augmented_args = NULL;
2589	/*
2590	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
2591	 * and there we get all 6 syscall args plus the tracepoint common fields
2592	 * that get calculated at the start and the syscall_nr (another long).
2593	 * So we check if that is the case and if so don't look just past
2594	 * sc->args_size but always past the full raw_syscalls:sys_enter payload,
2595	 * which is fixed.
2596	 *
2597	 * We'll revisit this later to pass sc->args_size to the BPF augmenter
2598	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c), so that it
2599	 * copies only what we need for each syscall, like what happens when we
2600	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
2601	 * traffic to just what is needed for each syscall.
2602	 */
2603	int args_size = raw_augmented_args_size ?: sc->args_size;
2604
2605	*augmented_args_size = sample->raw_size - args_size;
2606	if (*augmented_args_size > 0)
2607		augmented_args = sample->raw_data + args_size;
2608
2609	return augmented_args;
2610}
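
/*
 * Illustrative sample->raw_data layout for an augmented sys_enter event:
 *
 *   [ tracepoint fields ... syscall args (args_size bytes) | augmented payload ]
 *                                                            ^ returned pointer
 *
 * *augmented_args_size = sample->raw_size - args_size, i.e. whatever the BPF
 * augmenter appended past the raw tracepoint payload.
 */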
2611
2612static void syscall__exit(struct syscall *sc)
2613{
2614	if (!sc)
2615		return;
2616
2617	zfree(&sc->arg_fmt);
2618}
2619
2620static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
2621			    union perf_event *event __maybe_unused,
2622			    struct perf_sample *sample)
2623{
2624	char *msg;
2625	void *args;
2626	int printed = 0;
2627	struct thread *thread;
2628	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2629	int augmented_args_size = 0;
2630	void *augmented_args = NULL;
2631	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2632	struct thread_trace *ttrace;
2633
2634	if (sc == NULL)
2635		return -1;
2636
2637	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2638	ttrace = thread__trace(thread, trace->output);
2639	if (ttrace == NULL)
2640		goto out_put;
2641
2642	trace__fprintf_sample(trace, evsel, sample, thread);
2643
2644	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
2645
2646	if (ttrace->entry_str == NULL) {
2647		ttrace->entry_str = malloc(trace__entry_str_size);
2648		if (!ttrace->entry_str)
2649			goto out_put;
2650	}
2651
2652	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
2653		trace__printf_interrupted_entry(trace);
2654	/*
2655	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
2656	 * arguments, even if the syscall being handled, say "openat", uses only 4.
2657	 * This breaks the syscall__augmented_args() check for augmented args, as we
2658	 * calculate syscall->args_size using each syscalls:sys_enter_NAME tracefs format
2659	 * file, so when handling, say, the openat syscall, we'd get 6 args for the
2660	 * raw_syscalls:sys_enter event when we expected just 4 and mistakenly think
2661	 * the extra 2 u64 args are the augmented filename. So just check here and
2662	 * avoid using augmented syscalls when the evsel is the raw_syscalls one.
2663	 */
2664	if (evsel != trace->syscalls.events.sys_enter)
2665		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
2666	ttrace->entry_time = sample->time;
2667	msg = ttrace->entry_str;
2668	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
2669
2670	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
2671					   args, augmented_args, augmented_args_size, trace, thread);
2672
2673	if (sc->is_exit) {
2674		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
2675			int alignment = 0;
2676
2677			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
2678			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
2679			if (trace->args_alignment > printed)
2680				alignment = trace->args_alignment - printed;
2681			fprintf(trace->output, "%*s= ?\n", alignment, " ");
2682		}
2683	} else {
2684		ttrace->entry_pending = true;
2685		/* See trace__vfs_getname & trace__sys_exit */
2686		ttrace->filename.pending_open = false;
2687	}
2688
2689	if (trace->current != thread) {
2690		thread__put(trace->current);
2691		trace->current = thread__get(thread);
2692	}
2693	err = 0;
2694out_put:
2695	thread__put(thread);
2696	return err;
2697}
2698
2699static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
2700				    struct perf_sample *sample)
2701{
2702	struct thread_trace *ttrace;
2703	struct thread *thread;
2704	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2705	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2706	char msg[1024];
2707	void *args, *augmented_args = NULL;
2708	int augmented_args_size;
2709	size_t printed = 0;
2710
2711	if (sc == NULL)
2712		return -1;
2713
2714	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2715	ttrace = thread__trace(thread, trace->output);
2716	/*
2717	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
2718	 * and the rest of the beautifiers access it via struct syscall_arg.
2719	 */
2720	if (ttrace == NULL)
2721		goto out_put;
2722
2723	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
2724	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
2725	printed += syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
2726	fprintf(trace->output, "%.*s", (int)printed, msg);
2727	err = 0;
2728out_put:
2729	thread__put(thread);
2730	return err;
2731}
2732
2733static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
2734				    struct perf_sample *sample,
2735				    struct callchain_cursor *cursor)
2736{
2737	struct addr_location al;
2738	int max_stack = evsel->core.attr.sample_max_stack ?
2739			evsel->core.attr.sample_max_stack :
2740			trace->max_stack;
2741	int err = -1;
2742
2743	addr_location__init(&al);
2744	if (machine__resolve(trace->host, &al, sample) < 0)
2745		goto out;
2746
2747	err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2748out:
2749	addr_location__exit(&al);
2750	return err;
2751}
2752
2753static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2754{
2755	/* TODO: user-configurable print_opts */
2756	const unsigned int print_opts = EVSEL__PRINT_SYM |
2757				        EVSEL__PRINT_DSO |
2758				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
2759
2760	return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
2761}
2762
2763static const char *errno_to_name(struct evsel *evsel, int err)
2764{
2765	struct perf_env *env = evsel__env(evsel);
2766
2767	return perf_env__arch_strerrno(env, err);
2768}
2769
2770static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
2771			   union perf_event *event __maybe_unused,
2772			   struct perf_sample *sample)
2773{
2774	long ret;
2775	u64 duration = 0;
2776	bool duration_calculated = false;
2777	struct thread *thread;
2778	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
2779	int alignment = trace->args_alignment;
2780	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2781	struct thread_trace *ttrace;
2782
2783	if (sc == NULL)
2784		return -1;
2785
2786	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2787	ttrace = thread__trace(thread, trace->output);
2788	if (ttrace == NULL)
2789		goto out_put;
2790
2791	trace__fprintf_sample(trace, evsel, sample, thread);
2792
2793	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2794
2795	if (trace->summary)
2796		thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary);
2797
2798	if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2799		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2800		ttrace->filename.pending_open = false;
2801		++trace->stats.vfs_getname;
2802	}
2803
2804	if (ttrace->entry_time) {
2805		duration = sample->time - ttrace->entry_time;
2806		if (trace__filter_duration(trace, duration))
2807			goto out;
2808		duration_calculated = true;
2809	} else if (trace->duration_filter)
2810		goto out;
2811
2812	if (sample->callchain) {
2813		struct callchain_cursor *cursor = get_tls_callchain_cursor();
2814
2815		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
2816		if (callchain_ret == 0) {
2817			if (cursor->nr < trace->min_stack)
2818				goto out;
2819			callchain_ret = 1;
2820		}
2821	}
2822
2823	if (trace->summary_only || (ret >= 0 && trace->failure_only))
2824		goto out;
2825
2826	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2827
2828	if (ttrace->entry_pending) {
2829		printed = fprintf(trace->output, "%s", ttrace->entry_str);
2830	} else {
2831		printed += fprintf(trace->output, " ... [");
2832		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2833		printed += 9;
2834		printed += fprintf(trace->output, "]: %s()", sc->name);
2835	}
2836
2837	printed++; /* the closing ')' */
2838
2839	if (alignment > printed)
2840		alignment -= printed;
2841	else
2842		alignment = 0;
2843
2844	fprintf(trace->output, ")%*s= ", alignment, " ");
2845
2846	if (sc->fmt == NULL) {
2847		if (ret < 0)
2848			goto errno_print;
2849signed_print:
2850		fprintf(trace->output, "%ld", ret);
2851	} else if (ret < 0) {
2852errno_print: {
2853		char bf[STRERR_BUFSIZE];
2854		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2855			   *e = errno_to_name(evsel, -ret);
2856
2857		fprintf(trace->output, "-1 %s (%s)", e, emsg);
2858	}
2859	} else if (ret == 0 && sc->fmt->timeout)
2860		fprintf(trace->output, "0 (Timeout)");
2861	else if (ttrace->ret_scnprintf) {
2862		char bf[1024];
2863		struct syscall_arg arg = {
2864			.val	= ret,
2865			.thread	= thread,
2866			.trace	= trace,
2867		};
2868		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2869		ttrace->ret_scnprintf = NULL;
2870		fprintf(trace->output, "%s", bf);
2871	} else if (sc->fmt->hexret)
2872		fprintf(trace->output, "%#lx", ret);
2873	else if (sc->fmt->errpid) {
2874		struct thread *child = machine__find_thread(trace->host, ret, ret);
2875
2876		if (child != NULL) {
2877			fprintf(trace->output, "%ld", ret);
2878			if (thread__comm_set(child))
2879				fprintf(trace->output, " (%s)", thread__comm_str(child));
2880			thread__put(child);
2881		}
2882	} else
2883		goto signed_print;
2884
2885	fputc('\n', trace->output);
2886
2887	/*
2888	 * For the sake of --max-events, we only consider as an 'event' a
2889	 * non-filtered sys_enter + sys_exit pair, and other tracepoint events.
2890	 */
2891	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
2892		interrupted = true;
2893
2894	if (callchain_ret > 0)
2895		trace__fprintf_callchain(trace, sample);
2896	else if (callchain_ret < 0)
2897		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
2898out:
2899	ttrace->entry_pending = false;
2900	err = 0;
2901out_put:
2902	thread__put(thread);
2903	return err;
2904}
2905
2906static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
2907			      union perf_event *event __maybe_unused,
2908			      struct perf_sample *sample)
2909{
2910	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2911	struct thread_trace *ttrace;
2912	size_t filename_len, entry_str_len, to_move;
2913	ssize_t remaining_space;
2914	char *pos;
2915	const char *filename = evsel__rawptr(evsel, sample, "pathname");
2916
2917	if (!thread)
2918		goto out;
2919
2920	ttrace = thread__priv(thread);
2921	if (!ttrace)
2922		goto out_put;
2923
2924	filename_len = strlen(filename);
2925	if (filename_len == 0)
2926		goto out_put;
2927
2928	if (ttrace->filename.namelen < filename_len) {
2929		char *f = realloc(ttrace->filename.name, filename_len + 1);
2930
2931		if (f == NULL)
2932			goto out_put;
2933
2934		ttrace->filename.namelen = filename_len;
2935		ttrace->filename.name = f;
2936	}
2937
2938	strcpy(ttrace->filename.name, filename);
2939	ttrace->filename.pending_open = true;
2940
2941	if (!ttrace->filename.ptr)
2942		goto out_put;
2943
2944	entry_str_len = strlen(ttrace->entry_str);
2945	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2946	if (remaining_space <= 0)
2947		goto out_put;
2948
2949	if (filename_len > (size_t)remaining_space) {
2950		filename += filename_len - remaining_space;
2951		filename_len = remaining_space;
2952	}
2953
2954	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2955	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2956	memmove(pos + filename_len, pos, to_move);
2957	memcpy(pos, filename, filename_len);
2958
2959	ttrace->filename.ptr = 0;
2960	ttrace->filename.entry_str_pos = 0;
2961out_put:
2962	thread__put(thread);
2963out:
2964	return 0;
2965}
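
/*
 * Illustrative: with a pending entry_str of 'openat(dfd: CWD, filename: '
 * and filename.entry_str_pos pointing right after 'filename: ', the
 * memmove()/memcpy() above splice the pathname collected by
 * probe:vfs_getname into the pending entry, to be printed at sys_exit time.
 */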
2966
2967static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
2968				     union perf_event *event __maybe_unused,
2969				     struct perf_sample *sample)
2970{
2971	u64 runtime = evsel__intval(evsel, sample, "runtime");
2972	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2973	struct thread *thread = machine__findnew_thread(trace->host,
2974							sample->pid,
2975							sample->tid);
2976	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2977
2978	if (ttrace == NULL)
2979		goto out_dump;
2980
2981	ttrace->runtime_ms += runtime_ms;
2982	trace->runtime_ms += runtime_ms;
2983out_put:
2984	thread__put(thread);
2985	return 0;
2986
2987out_dump:
2988	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2989	       evsel->name,
2990	       evsel__strval(evsel, sample, "comm"),
2991	       (pid_t)evsel__intval(evsel, sample, "pid"),
2992	       runtime,
2993	       evsel__intval(evsel, sample, "vruntime"));
2994	goto out_put;
2995}
2996
2997static int bpf_output__printer(enum binary_printer_ops op,
2998			       unsigned int val, void *extra __maybe_unused, FILE *fp)
2999{
3000	unsigned char ch = (unsigned char)val;
3001
3002	switch (op) {
3003	case BINARY_PRINT_CHAR_DATA:
3004		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
3005	case BINARY_PRINT_DATA_BEGIN:
3006	case BINARY_PRINT_LINE_BEGIN:
3007	case BINARY_PRINT_ADDR:
3008	case BINARY_PRINT_NUM_DATA:
3009	case BINARY_PRINT_NUM_PAD:
3010	case BINARY_PRINT_SEP:
3011	case BINARY_PRINT_CHAR_PAD:
3012	case BINARY_PRINT_LINE_END:
3013	case BINARY_PRINT_DATA_END:
3014	default:
3015		break;
3016	}
3017
3018	return 0;
3019}
3020
3021static void bpf_output__fprintf(struct trace *trace,
3022				struct perf_sample *sample)
3023{
3024	binary__fprintf(sample->raw_data, sample->raw_size, 8,
3025			bpf_output__printer, NULL, trace->output);
3026	++trace->nr_events_printed;
3027}
3028
3029static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, struct perf_sample *sample,
3030				       struct thread *thread, void *augmented_args, int augmented_args_size)
3031{
3032	char bf[2048];
3033	size_t size = sizeof(bf);
3034	struct tep_format_field *field = evsel->tp_format->format.fields;
3035	struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel);
3036	size_t printed = 0, btf_printed;
3037	unsigned long val;
3038	u8 bit = 1;
3039	struct syscall_arg syscall_arg = {
3040		.augmented = {
3041			.size = augmented_args_size,
3042			.args = augmented_args,
3043		},
3044		.idx	= 0,
3045		.mask	= 0,
3046		.trace  = trace,
3047		.thread = thread,
3048		.show_string_prefix = trace->show_string_prefix,
3049	};
3050
3051	for (; field && arg; field = field->next, ++syscall_arg.idx, bit <<= 1, ++arg) {
3052		if (syscall_arg.mask & bit)
3053			continue;
3054
3055		syscall_arg.len = 0;
3056		syscall_arg.fmt = arg;
3057		if (field->flags & TEP_FIELD_IS_ARRAY) {
3058			int offset = field->offset;
3059
3060			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
3061				offset = format_field__intval(field, sample, evsel->needs_swap);
3062				syscall_arg.len = offset >> 16;
3063				offset &= 0xffff;
3064				if (tep_field_is_relative(field->flags))
3065					offset += field->offset + field->size;
3066			}
3067
3068			val = (uintptr_t)(sample->raw_data + offset);
3069		} else
3070			val = format_field__intval(field, sample, evsel->needs_swap);
3071		/*
3072		 * Some syscall args need some mask, most don't and
3073		 * return val untouched.
3074		 */
3075		val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);
3076
3077		/* Suppress this argument if its value is zero and show_zero property isn't set. */
3078		if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE)
3079			continue;
3080
3081		printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
3082
3083		if (trace->show_arg_names)
3084			printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
3085
3086		btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type);
3087		if (btf_printed) {
3088			printed += btf_printed;
3089			continue;
3090		}
3091
3092		printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
3093	}
3094
3095	return printed + fprintf(trace->output, "%.*s", (int)printed, bf);
3096}
3097
3098static int trace__event_handler(struct trace *trace, struct evsel *evsel,
3099				union perf_event *event __maybe_unused,
3100				struct perf_sample *sample)
3101{
3102	struct thread *thread;
3103	int callchain_ret = 0;
3104
3105	if (evsel->nr_events_printed >= evsel->max_events)
3106		return 0;
3107
3108	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3109
3110	if (sample->callchain) {
3111		struct callchain_cursor *cursor = get_tls_callchain_cursor();
3112
3113		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
3114		if (callchain_ret == 0) {
3115			if (cursor->nr < trace->min_stack)
3116				goto out;
3117			callchain_ret = 1;
3118		}
3119	}
3120
3121	trace__printf_interrupted_entry(trace);
3122	trace__fprintf_tstamp(trace, sample->time, trace->output);
3123
3124	if (trace->trace_syscalls && trace->show_duration)
3125		fprintf(trace->output, "(         ): ");
3126
3127	if (thread)
3128		trace__fprintf_comm_tid(trace, thread, trace->output);
3129
3130	if (evsel == trace->syscalls.events.bpf_output) {
3131		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
3132		struct syscall *sc = trace__syscall_info(trace, evsel, id);
3133
3134		if (sc) {
3135			fprintf(trace->output, "%s(", sc->name);
3136			trace__fprintf_sys_enter(trace, evsel, sample);
3137			fputc(')', trace->output);
3138			goto newline;
3139		}
3140
3141		/*
3142		 * XXX: Not having the associated syscall info or not finding/adding
3143		 * 	the thread should never happen, but if it does...
3144		 * 	fall thru and print it as a bpf_output event.
3145		 */
3146	}
3147
3148	fprintf(trace->output, "%s(", evsel->name);
3149
3150	if (evsel__is_bpf_output(evsel)) {
3151		bpf_output__fprintf(trace, sample);
3152	} else if (evsel->tp_format) {
3153		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
3154		    trace__fprintf_sys_enter(trace, evsel, sample)) {
3155			if (trace->libtraceevent_print) {
3156				event_format__fprintf(evsel->tp_format, sample->cpu,
3157						      sample->raw_data, sample->raw_size,
3158						      trace->output);
3159			} else {
3160				trace__fprintf_tp_fields(trace, evsel, sample, thread, NULL, 0);
3161			}
3162		}
3163	}
3164
3165newline:
3166	fprintf(trace->output, ")\n");
3167
3168	if (callchain_ret > 0)
3169		trace__fprintf_callchain(trace, sample);
3170	else if (callchain_ret < 0)
3171		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
3172
3173	++trace->nr_events_printed;
3174
3175	if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
3176		evsel__disable(evsel);
3177		evsel__close(evsel);
3178	}
3179out:
3180	thread__put(thread);
3181	return 0;
3182}
3183
3184static void print_location(FILE *f, struct perf_sample *sample,
3185			   struct addr_location *al,
3186			   bool print_dso, bool print_sym)
3187{
3188
3189	if ((verbose > 0 || print_dso) && al->map)
3190		fprintf(f, "%s@", dso__long_name(map__dso(al->map)));
3191
3192	if ((verbose > 0 || print_sym) && al->sym)
3193		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
3194			al->addr - al->sym->start);
3195	else if (al->map)
3196		fprintf(f, "0x%" PRIx64, al->addr);
3197	else
3198		fprintf(f, "0x%" PRIx64, sample->addr);
3199}
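
/*
 * Example (illustrative) shapes of the output above:
 *
 *   /usr/lib64/libc.so.6@malloc+0x2a    (dso and symbol resolved)
 *   0x1d3000                            (map but no symbol: map-relative addr)
 *   0x7f2c4e1d3000                      (no map at all: raw sample->addr)
 */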
3200
3201static int trace__pgfault(struct trace *trace,
3202			  struct evsel *evsel,
3203			  union perf_event *event __maybe_unused,
3204			  struct perf_sample *sample)
3205{
3206	struct thread *thread;
3207	struct addr_location al;
3208	char map_type = 'd';
3209	struct thread_trace *ttrace;
3210	int err = -1;
3211	int callchain_ret = 0;
3212
3213	addr_location__init(&al);
3214	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3215
3216	if (sample->callchain) {
3217		struct callchain_cursor *cursor = get_tls_callchain_cursor();
3218
3219		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
3220		if (callchain_ret == 0) {
3221			if (cursor->nr < trace->min_stack)
3222				goto out_put;
3223			callchain_ret = 1;
3224		}
3225	}
3226
3227	ttrace = thread__trace(thread, trace->output);
3228	if (ttrace == NULL)
3229		goto out_put;
3230
3231	if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
3232		ttrace->pfmaj++;
3233	else
3234		ttrace->pfmin++;
3235
3236	if (trace->summary_only)
3237		goto out;
3238
3239	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
3240
3241	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
3242
3243	fprintf(trace->output, "%sfault [",
3244		evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
3245		"maj" : "min");
3246
3247	print_location(trace->output, sample, &al, false, true);
3248
3249	fprintf(trace->output, "] => ");
3250
3251	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
3252
3253	if (!al.map) {
3254		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
3255
3256		if (al.map)
3257			map_type = 'x';
3258		else
3259			map_type = '?';
3260	}
3261
3262	print_location(trace->output, sample, &al, true, false);
3263
3264	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
3265
3266	if (callchain_ret > 0)
3267		trace__fprintf_callchain(trace, sample);
3268	else if (callchain_ret < 0)
3269		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
3270
3271	++trace->nr_events_printed;
3272out:
3273	err = 0;
3274out_put:
3275	thread__put(thread);
3276	addr_location__exit(&al);
3277	return err;
3278}
3279
3280static void trace__set_base_time(struct trace *trace,
3281				 struct evsel *evsel,
3282				 struct perf_sample *sample)
3283{
3284	/*
3285	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
3286	 * and don't use sample->time unconditionally, we may end up having
3287	 * some other event in the future without PERF_SAMPLE_TIME for good
3288	 * reason, i.e. we may not be interested in its timestamps, just in
3289	 * it taking place, picking some piece of information when it
3290	 * appears in our event stream (vfs_getname comes to mind).
3291	 */
3292	if (trace->base_time == 0 && !trace->full_time &&
3293	    (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
3294		trace->base_time = sample->time;
3295}
3296
3297static int trace__process_sample(const struct perf_tool *tool,
3298				 union perf_event *event,
3299				 struct perf_sample *sample,
3300				 struct evsel *evsel,
3301				 struct machine *machine __maybe_unused)
3302{
3303	struct trace *trace = container_of(tool, struct trace, tool);
3304	struct thread *thread;
3305	int err = 0;
3306
3307	tracepoint_handler handler = evsel->handler;
3308
3309	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3310	if (thread && thread__is_filtered(thread))
3311		goto out;
3312
3313	trace__set_base_time(trace, evsel, sample);
3314
3315	if (handler) {
3316		++trace->nr_events;
3317		handler(trace, evsel, event, sample);
3318	}
3319out:
3320	thread__put(thread);
3321	return err;
3322}
3323
3324static int trace__record(struct trace *trace, int argc, const char **argv)
3325{
3326	unsigned int rec_argc, i, j;
3327	const char **rec_argv;
3328	const char * const record_args[] = {
3329		"record",
3330		"-R",
3331		"-m", "1024",
3332		"-c", "1",
3333	};
3334	pid_t pid = getpid();
3335	char *filter = asprintf__tp_filter_pids(1, &pid);
3336	const char * const sc_args[] = { "-e", };
3337	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
3338	const char * const majpf_args[] = { "-e", "major-faults" };
3339	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
3340	const char * const minpf_args[] = { "-e", "minor-faults" };
3341	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
3342	int err = -1;
3343
3344	/* +3 is for the event string below and the pid filter */
3345	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 3 +
3346		majpf_args_nr + minpf_args_nr + argc;
3347	rec_argv = calloc(rec_argc + 1, sizeof(char *));
3348
3349	if (rec_argv == NULL || filter == NULL)
3350		goto out_free;
3351
3352	j = 0;
3353	for (i = 0; i < ARRAY_SIZE(record_args); i++)
3354		rec_argv[j++] = record_args[i];
3355
3356	if (trace->trace_syscalls) {
3357		for (i = 0; i < sc_args_nr; i++)
3358			rec_argv[j++] = sc_args[i];
3359
3360		/* event string may be different for older kernels - e.g., RHEL6 */
3361		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
3362			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
3363		else if (is_valid_tracepoint("syscalls:sys_enter"))
3364			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
3365		else {
3366			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
3367			goto out_free;
3368		}
3369	}
3370
3371	rec_argv[j++] = "--filter";
3372	rec_argv[j++] = filter;
3373
3374	if (trace->trace_pgfaults & TRACE_PFMAJ)
3375		for (i = 0; i < majpf_args_nr; i++)
3376			rec_argv[j++] = majpf_args[i];
3377
3378	if (trace->trace_pgfaults & TRACE_PFMIN)
3379		for (i = 0; i < minpf_args_nr; i++)
3380			rec_argv[j++] = minpf_args[i];
3381
3382	for (i = 0; i < (unsigned int)argc; i++)
3383		rec_argv[j++] = argv[i];
3384
3385	err = cmd_record(j, rec_argv);
3386out_free:
3387	free(filter);
3388	free(rec_argv);
3389	return err;
3390}
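
/*
 * Illustrative command line assembled above (modulo the pid filter, which
 * excludes 'perf trace' itself):
 *
 *   perf record -R -m 1024 -c 1 \
 *        -e raw_syscalls:sys_enter,raw_syscalls:sys_exit --filter ... <argv>
 */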
3391
3392static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
3393
3394static bool evlist__add_vfs_getname(struct evlist *evlist)
3395{
3396	bool found = false;
3397	struct evsel *evsel, *tmp;
3398	struct parse_events_error err;
3399	int ret;
3400
3401	parse_events_error__init(&err);
3402	ret = parse_events(evlist, "probe:vfs_getname*", &err);
3403	parse_events_error__exit(&err);
3404	if (ret)
3405		return false;
3406
3407	evlist__for_each_entry_safe(evlist, evsel, tmp) {
3408		if (!strstarts(evsel__name(evsel), "probe:vfs_getname"))
3409			continue;
3410
3411		if (evsel__field(evsel, "pathname")) {
3412			evsel->handler = trace__vfs_getname;
3413			found = true;
3414			continue;
3415		}
3416
3417		list_del_init(&evsel->core.node);
3418		evsel->evlist = NULL;
3419		evsel__delete(evsel);
3420	}
3421
3422	return found;
3423}
3424
3425static struct evsel *evsel__new_pgfault(u64 config)
3426{
3427	struct evsel *evsel;
3428	struct perf_event_attr attr = {
3429		.type = PERF_TYPE_SOFTWARE,
3430		.mmap_data = 1,
3431	};
3432
3433	attr.config = config;
3434	attr.sample_period = 1;
3435
3436	event_attr_init(&attr);
3437
3438	evsel = evsel__new(&attr);
3439	if (evsel)
3440		evsel->handler = trace__pgfault;
3441
3442	return evsel;
3443}
3444
3445static void evlist__free_syscall_tp_fields(struct evlist *evlist)
3446{
3447	struct evsel *evsel;
3448
3449	evlist__for_each_entry(evlist, evsel) {
3450		evsel_trace__delete(evsel->priv);
3451		evsel->priv = NULL;
3452	}
3453}
3454
3455static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
3456{
3457	const u32 type = event->header.type;
3458	struct evsel *evsel;
3459
3460	if (type != PERF_RECORD_SAMPLE) {
3461		trace__process_event(trace, trace->host, event, sample);
3462		return;
3463	}
3464
3465	evsel = evlist__id2evsel(trace->evlist, sample->id);
3466	if (evsel == NULL) {
3467		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
3468		return;
3469	}
3470
3471	if (evswitch__discard(&trace->evswitch, evsel))
3472		return;
3473
3474	trace__set_base_time(trace, evsel, sample);
3475
3476	if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
3477	    sample->raw_data == NULL) {
3478		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
3479		       evsel__name(evsel), sample->tid,
3480		       sample->cpu, sample->raw_size);
3481	} else {
3482		tracepoint_handler handler = evsel->handler;
3483		handler(trace, evsel, event, sample);
3484	}
3485
3486	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
3487		interrupted = true;
3488}
3489
3490static int trace__add_syscall_newtp(struct trace *trace)
3491{
3492	int ret = -1;
3493	struct evlist *evlist = trace->evlist;
3494	struct evsel *sys_enter, *sys_exit;
3495
3496	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
3497	if (sys_enter == NULL)
3498		goto out;
3499
3500	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
3501		goto out_delete_sys_enter;
3502
3503	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
3504	if (sys_exit == NULL)
3505		goto out_delete_sys_enter;
3506
3507	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
3508		goto out_delete_sys_exit;
3509
3510	evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
3511	evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
3512
3513	evlist__add(evlist, sys_enter);
3514	evlist__add(evlist, sys_exit);
3515
3516	if (callchain_param.enabled && !trace->kernel_syscallchains) {
3517		/*
3518		 * We're interested only in the user space callchain
3519		 * leading to the syscall, allow overriding that for
3520		 * debugging reasons using --kernel_syscall_callchains
3521		 */
3522		sys_exit->core.attr.exclude_callchain_kernel = 1;
3523	}
3524
3525	trace->syscalls.events.sys_enter = sys_enter;
3526	trace->syscalls.events.sys_exit  = sys_exit;
3527
3528	ret = 0;
3529out:
3530	return ret;
3531
3532out_delete_sys_exit:
3533	evsel__delete_priv(sys_exit);
3534out_delete_sys_enter:
3535	evsel__delete_priv(sys_enter);
3536	goto out;
3537}
3538
3539static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
3540{
3541	int err = -1;
3542	struct evsel *sys_exit;
3543	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
3544						trace->ev_qualifier_ids.nr,
3545						trace->ev_qualifier_ids.entries);
3546
3547	if (filter == NULL)
3548		goto out_enomem;
3549
3550	if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) {
3551		sys_exit = trace->syscalls.events.sys_exit;
3552		err = evsel__append_tp_filter(sys_exit, filter);
3553	}
3554
3555	free(filter);
3556out:
3557	return err;
3558out_enomem:
3559	errno = ENOMEM;
3560	goto out;
3561}
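
/*
 * Illustrative tracepoint filters generated above (syscall ids are
 * examples): for '-e open,close' something like "id == 2 || id == 3",
 * and with not_ev_qualifier set, "id != 2 && id != 3".
 */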
3562
3563#ifdef HAVE_BPF_SKEL
3564static int syscall_arg_fmt__cache_btf_struct(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
3565{
3566	int id;
3567
3568	if (arg_fmt->type != NULL)
3569		return -1;
3570
3571	id = btf__find_by_name(btf, type);
3572	if (id < 0)
3573		return -1;
3574
3575	arg_fmt->type    = btf__type_by_id(btf, id);
3576	arg_fmt->type_id = id;
3577
3578	return 0;
3579}
3580
3581static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
3582{
3583	struct bpf_program *pos, *prog = NULL;
3584	const char *sec_name;
3585
3586	if (trace->skel->obj == NULL)
3587		return NULL;
3588
3589	bpf_object__for_each_program(pos, trace->skel->obj) {
3590		sec_name = bpf_program__section_name(pos);
3591		if (sec_name && !strcmp(sec_name, name)) {
3592			prog = pos;
3593			break;
3594		}
3595	}
3596
3597	return prog;
3598}
3599
3600static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
3601							const char *prog_name, const char *type)
3602{
3603	struct bpf_program *prog;
3604
3605	if (prog_name == NULL) {
3606		char default_prog_name[256];
3607		scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name);
3608		prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3609		if (prog != NULL)
3610			goto out_found;
3611		if (sc->fmt && sc->fmt->alias) {
3612			scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
3613			prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3614			if (prog != NULL)
3615				goto out_found;
3616		}
3617		goto out_unaugmented;
3618	}
3619
3620	prog = trace__find_bpf_program_by_title(trace, prog_name);
3621
3622	if (prog != NULL) {
3623out_found:
3624		return prog;
3625	}
3626
3627	pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
3628		 prog_name, type, sc->name);
3629out_unaugmented:
3630	return trace->skel->progs.syscall_unaugmented;
3631}
3632
3633static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
3634{
3635	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3636
3637	if (sc == NULL)
3638		return;
3639
3640	sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3641	sc->bpf_prog.sys_exit  = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit  : NULL,  "exit");
3642}
3643
3644static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
3645{
3646	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3647	return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3648}
3649
3650static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
3651{
3652	struct syscall *sc = trace__syscall_info(trace, NULL, id);
3653	return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3654}
3655
3656static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int key, unsigned int *beauty_array)
3657{
3658	struct tep_format_field *field;
3659	struct syscall *sc = trace__syscall_info(trace, NULL, key);
3660	const struct btf_type *bt;
3661	char *struct_offset, *tmp, name[32];
3662	bool can_augment = false;
3663	int i, cnt;
3664
3665	if (sc == NULL)
3666		return -1;
3667
3668	trace__load_vmlinux_btf(trace);
3669	if (trace->btf == NULL)
3670		return -1;
3671
3672	for (i = 0, field = sc->args; field; ++i, field = field->next) {
3673		// XXX We're only collecting pointer payloads _from_ user space
3674		if (!sc->arg_fmt[i].from_user)
3675			continue;
3676
3677		struct_offset = strstr(field->type, "struct ");
3678		if (struct_offset == NULL)
3679			struct_offset = strstr(field->type, "union ");
3680		else
3681			struct_offset++; // "union" is shorter
3682
3683		if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */
3684			struct_offset += 6;
3685
3686			/* for 'struct foo *', we only want 'foo' */
3687			for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) {
3688			}
3689
3690			strncpy(name, struct_offset, cnt);
3691			name[cnt] = '\0';
3692
3693			/* cache struct's btf_type and type_id */
3694			if (syscall_arg_fmt__cache_btf_struct(&sc->arg_fmt[i], trace->btf, name))
3695				continue;
3696
3697			bt = sc->arg_fmt[i].type;
3698			beauty_array[i] = bt->size;
3699			can_augment = true;
3700		} else if (field->flags & TEP_FIELD_IS_POINTER && /* string */
3701			   strcmp(field->type, "const char *") == 0 &&
3702			   (strstr(field->name, "name") ||
3703			    strstr(field->name, "path") ||
3704			    strstr(field->name, "file") ||
3705			    strstr(field->name, "root") ||
3706			    strstr(field->name, "key") ||
3707			    strstr(field->name, "special") ||
3708			    strstr(field->name, "type") ||
3709			    strstr(field->name, "description"))) {
3710			beauty_array[i] = 1;
3711			can_augment = true;
3712		} else if (field->flags & TEP_FIELD_IS_POINTER && /* buffer */
3713			   strstr(field->type, "char *") &&
3714			   (strstr(field->name, "buf") ||
3715			    strstr(field->name, "val") ||
3716			    strstr(field->name, "msg"))) {
3717			int j;
3718			struct tep_format_field *field_tmp;
3719
3720			/* find the size of the buffer that appears in pairs with buf */
3721			for (j = 0, field_tmp = sc->args; field_tmp; ++j, field_tmp = field_tmp->next) {
3722				if (!(field_tmp->flags & TEP_FIELD_IS_POINTER) && /* only integers */
3723				    (strstr(field_tmp->name, "count") ||
3724				     strstr(field_tmp->name, "siz") ||  /* size, bufsiz */
3725				     (strstr(field_tmp->name, "len") && strcmp(field_tmp->name, "filename")))) {
3726					 /* filename's got 'len' in it, we don't want that */
3727					beauty_array[i] = -(j + 1);
3728					can_augment = true;
3729					break;
3730				}
3731			}
3732		}
3733	}
3734
3735	if (can_augment)
3736		return 0;
3737
3738	return -1;
3739}
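
/*
 * Illustrative beauty map entry for write(fd, buf, count): 'buf' pairs
 * with 'count' at arg index 2, so beauty_array becomes
 * { 0, -3, 0, 0, 0, 0 } (-(j + 1) with j == 2), telling the BPF collector
 * to copy 'count' bytes from 'buf'.
 */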
3740
3741static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
3742{
3743	struct tep_format_field *field, *candidate_field;
3744	/*
3745	 * We're only interested in syscalls that have a pointer:
3746	 */
3747	for (field = sc->args; field; field = field->next) {
3748		if (field->flags & TEP_FIELD_IS_POINTER)
3749			goto try_to_find_pair;
3750	}
3751
3752	return NULL;
3753
3754try_to_find_pair:
3755	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3756		int id = syscalltbl__id_at_idx(trace->sctbl, i);
3757		struct syscall *pair = trace__syscall_info(trace, NULL, id);
3758		struct bpf_program *pair_prog;
3759		bool is_candidate = false;
3760
3761		if (pair == NULL || pair == sc ||
3762		    pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
3763			continue;
3764
3765		for (field = sc->args, candidate_field = pair->args;
3766		     field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
3767			bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
3768			     candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;
3769
3770			if (is_pointer) {
3771			       if (!candidate_is_pointer) {
3772					// The candidate just doesn't copy our pointer arg, but might copy other pointers we want.
3773					continue;
3774			       }
3775			} else {
3776				if (candidate_is_pointer) {
3777					// The candidate might copy a pointer we don't have, skip it.
3778					goto next_candidate;
3779				}
3780				continue;
3781			}
3782
3783			if (strcmp(field->type, candidate_field->type))
3784				goto next_candidate;
3785
3786			/*
3787			 * This is limited in the BPF program but sys_write
3788			 * uses "const char *" for its "buf" arg so we need to
3789			 * use some heuristic that is kinda future proof...
3790			 */
3791			if (strcmp(field->type, "const char *") == 0 &&
3792			    !(strstr(field->name, "name") ||
3793			      strstr(field->name, "path") ||
3794			      strstr(field->name, "file") ||
3795			      strstr(field->name, "root") ||
3796			      strstr(field->name, "description")))
3797				goto next_candidate;
3798
3799			is_candidate = true;
3800		}
3801
3802		if (!is_candidate)
3803			goto next_candidate;
3804
3805		/*
3806		 * Check if the tentative pair syscall augmenter has more pointers; if it has,
3807		 * then it may be collecting those too and we can't use it, as it would collect
3808		 * more than what is common to the two syscalls.
3809		 */
3810		if (candidate_field) {
3811			for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
3812				if (candidate_field->flags & TEP_FIELD_IS_POINTER)
3813					goto next_candidate;
3814		}
3815
3816		pair_prog = pair->bpf_prog.sys_enter;
3817		/*
3818		 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
3819		 * have been looked up yet, so look it up here: if it turns out to be
3820		 * the unaugmented one, ignore it, otherwise we get to reuse the BPF
3821		 * program of a filtered-out syscall for a non-filtered one.
3822		 *
3823		 * For instance, with "!syscalls:sys_enter_renameat" the renameat
3824		 * augmenter is still useful for "renameat2".
3825		 */
3826		if (pair_prog == NULL) {
3827			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3828			if (pair_prog == trace->skel->progs.syscall_unaugmented)
3829				goto next_candidate;
3830		}
3831
3832		pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
3833		return pair_prog;
3834	next_candidate:
3835		continue;
3836	}
3837
3838	return NULL;
3839}
3840
3841static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
3842{
3843	int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
3844	int map_exit_fd  = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
3845	int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter);
3846	int err = 0;
3847	unsigned int beauty_array[6];
3848
3849	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3850		int prog_fd, key = syscalltbl__id_at_idx(trace->sctbl, i);
3851
3852		if (!trace__syscall_enabled(trace, key))
3853			continue;
3854
3855		trace__init_syscall_bpf_progs(trace, key);
3856
3857		// It'll get at least the "!raw_syscalls:unaugmented" program
3858		prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
3859		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3860		if (err)
3861			break;
3862		prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
3863		err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
3864		if (err)
3865			break;
3866
3867		/* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */
3868		memset(beauty_array, 0, sizeof(beauty_array));
3869		err = trace__bpf_sys_enter_beauty_map(trace, key, (unsigned int *)beauty_array);
3870		if (err)
3871			continue;
3872		err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY);
3873		if (err)
3874			break;
3875	}
3876
3877	/*
3878	 * Now let's do a second pass looking for enabled syscalls without
3879	 * an augmenter that have a signature that is a superset of another
3880	 * syscall with an augmenter so that we can auto-reuse it.
3881	 *
3882	 * I.e. if we have an augmenter for the "open" syscall that has
3883	 * this signature:
3884	 *
3885	 *   int open(const char *pathname, int flags, mode_t mode);
3886	 *
3887	 * i.e. one that will collect just the first string argument, then we
3888	 * can reuse it for the 'creat' syscall, which has this signature:
3889	 *
3890	 *   int creat(const char *pathname, mode_t mode);
3891	 *
3892	 * and for:
3893	 *
3894	 *   int stat(const char *pathname, struct stat *statbuf);
3895	 *   int lstat(const char *pathname, struct stat *statbuf);
3896	 *
3897	 * Because the 'open' augmenter will collect the first arg as a string
3898	 * and leave all the other args alone, which already helps with
3899	 * beautifying the pathname arg of 'stat' and 'lstat'.
3900	 *
3901	 * Then, in time, when 'stat' gets an augmenter that collects both
3902	 * first and second args (this one via the raw_syscalls:sys_exit prog
3903	 * array tail call), then that one will be used.
3904	 */
3905	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3906		int key = syscalltbl__id_at_idx(trace->sctbl, i);
3907		struct syscall *sc = trace__syscall_info(trace, NULL, key);
3908		struct bpf_program *pair_prog;
3909		int prog_fd;
3910
3911		if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
3912			continue;
3913
3914		/*
3915		 * For now we're just reusing the sys_enter prog, and if it
3916		 * already has an augmenter, we don't need to find one.
3917		 */
3918		if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
3919			continue;
3920
3921		/*
3922		 * Look at all the other syscalls for one that has a signature
3923		 * that is close enough that we can share:
3924		 */
3925		pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
3926		if (pair_prog == NULL)
3927			continue;
3928
3929		sc->bpf_prog.sys_enter = pair_prog;
3930
3931		/*
3932		 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
3933		 * with the fd for the program we're reusing:
3934		 */
3935		prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
3936		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3937		if (err)
3938			break;
3939	}
3940
3941	return err;
3942}
3943#endif // HAVE_BPF_SKEL
3944
3945static int trace__set_ev_qualifier_filter(struct trace *trace)
3946{
3947	if (trace->syscalls.events.sys_enter)
3948		return trace__set_ev_qualifier_tp_filter(trace);
3949	return 0;
3950}
3951
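/*
 * Assuming the augmented_raw_syscalls skeleton is loaded, mirror the
 * pids into its "pids_filtered" map so that the BPF side can drop
 * events from those pids before they ever reach the ring buffer.
 */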
3952static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
3953				    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
3954{
3955	int err = 0;
3956#ifdef HAVE_LIBBPF_SUPPORT
3957	bool value = true;
3958	int map_fd = bpf_map__fd(map);
3959	size_t i;
3960
3961	for (i = 0; i < npids; ++i) {
3962		err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
3963		if (err)
3964			break;
3965	}
3966#endif
3967	return err;
3968}
3969
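/*
 * Filter out the tracer itself and, heuristically, the terminal
 * displaying its output: each event 'perf trace' writes to an
 * sshd-backed pty or a gnome-terminal generates more syscalls in those
 * processes, which would otherwise feed back into the trace as a loop.
 */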
3970static int trace__set_filter_loop_pids(struct trace *trace)
3971{
3972	unsigned int nr = 1, err;
3973	pid_t pids[32] = {
3974		getpid(),
3975	};
3976	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
3977
3978	while (thread && nr < ARRAY_SIZE(pids)) {
3979		struct thread *parent = machine__find_thread(trace->host,
3980							     thread__ppid(thread),
3981							     thread__ppid(thread));
3982
3983		if (parent == NULL)
3984			break;
3985
3986		if (!strcmp(thread__comm_str(parent), "sshd") ||
3987		    strstarts(thread__comm_str(parent), "gnome-terminal")) {
3988			pids[nr++] = thread__tid(parent);
3989			break;
3990		}
3991		thread = parent;
3992	}
3993
3994	err = evlist__append_tp_filter_pids(trace->evlist, nr, pids);
3995	if (!err && trace->filter_pids.map)
3996		err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
3997
3998	return err;
3999}
4000
4001static int trace__set_filter_pids(struct trace *trace)
4002{
4003	int err = 0;
4004	/*
4005	 * Better not use !target__has_task() here because we need to cover the
4006	 * case where no threads were specified in the command line, but a
4007	 * workload was, and in that case we will fill in the thread_map when
4008	 * we fork the workload in evlist__prepare_workload.
4009	 */
4010	if (trace->filter_pids.nr > 0) {
4011		err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
4012						    trace->filter_pids.entries);
4013		if (!err && trace->filter_pids.map) {
4014			err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
4015						       trace->filter_pids.entries);
4016		}
4017	} else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
4018		err = trace__set_filter_loop_pids(trace);
4019	}
4020
4021	return err;
4022}
4023
4024static int __trace__deliver_event(struct trace *trace, union perf_event *event)
4025{
4026	struct evlist *evlist = trace->evlist;
4027	struct perf_sample sample;
4028	int err = evlist__parse_sample(evlist, event, &sample);
4029
4030	if (err)
4031		fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
4032	else
4033		trace__handle_event(trace, event, &sample);
4034
4035	return 0;
4036}
4037
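/*
 * With --sort-events, keep roughly a one second reordering window:
 * only events older than (newest timestamp - NSEC_PER_SEC) get
 * flushed, so that late arrivals from other ring buffers can still be
 * sorted into place.
 */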
4038static int __trace__flush_events(struct trace *trace)
4039{
4040	u64 first = ordered_events__first_time(&trace->oe.data);
4041	u64 flush = trace->oe.last - NSEC_PER_SEC;
4042
4043	/* Is there something to flush? */
4044	if (first && first < flush)
4045		return ordered_events__flush_time(&trace->oe.data, flush);
4046
4047	return 0;
4048}
4049
4050static int trace__flush_events(struct trace *trace)
4051{
4052	return !trace->sort_events ? 0 : __trace__flush_events(trace);
4053}
4054
4055static int trace__deliver_event(struct trace *trace, union perf_event *event)
4056{
4057	int err;
4058
4059	if (!trace->sort_events)
4060		return __trace__deliver_event(trace, event);
4061
4062	err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
4063	if (err && err != -1)
4064		return err;
4065
4066	err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL);
4067	if (err)
4068		return err;
4069
4070	return trace__flush_events(trace);
4071}
4072
4073static int ordered_events__deliver_event(struct ordered_events *oe,
4074					 struct ordered_event *event)
4075{
4076	struct trace *trace = container_of(oe, struct trace, oe.data);
4077
4078	return __trace__deliver_event(trace, event->event);
4079}
4080
4081static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg,
4082								   char **type)
4083{
4084	struct tep_format_field *field;
4085	struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel);
4086
4087	if (evsel->tp_format == NULL || fmt == NULL)
4088		return NULL;
4089
4090	for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt)
4091		if (strcmp(field->name, arg) == 0) {
4092			*type = field->type;
4093			return fmt;
4094		}
4095
4096	return NULL;
4097}
4098
4099static int trace__expand_filter(struct trace *trace, struct evsel *evsel)
4100{
4101	char *tok, *left = evsel->filter, *new_filter = evsel->filter;
4102
4103	while ((tok = strpbrk(left, "=<>!")) != NULL) {
4104		char *right = tok + 1, *right_end;
4105
4106		if (*right == '=')
4107			++right;
4108
4109		while (isspace(*right))
4110			++right;
4111
4112		if (*right == '\0')
4113			break;
4114
4115		while (!isalpha(*left))
4116			if (++left == tok) {
4117				/*
4118				 * Bail out: we can't find the name of the argument being
4119				 * used in the filter, so let the filter be set as is, it will fail later.
4120				 */
4121				return 0;
4122			}
4123
4124		right_end = right + 1;
4125		while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
4126			++right_end;
4127
4128		if (isalpha(*right)) {
4129			struct syscall_arg_fmt *fmt;
4130			int left_size = tok - left,
4131			    right_size = right_end - right;
4132			char arg[128], *type;
4133
4134			while (isspace(left[left_size - 1]))
4135				--left_size;
4136
4137			scnprintf(arg, sizeof(arg), "%.*s", left_size, left);
4138
4139			fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type);
4140			if (fmt == NULL) {
4141				pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
4142				       arg, evsel->name, evsel->filter);
4143				return -1;
4144			}
4145
4146			pr_debug2("trying to expand \"%s\" \"%.*s\" \"%.*s\" -> ",
4147				 arg, (int)(right - tok), tok, right_size, right);
4148
4149			if (fmt->strtoul) {
4150				u64 val;
4151				struct syscall_arg syscall_arg = {
4152					.trace = trace,
4153					.fmt   = fmt,
4154					.type_name = type,
4155					.parm = fmt->parm,
4156				};
4157
4158				if (fmt->strtoul(right, right_size, &syscall_arg, &val)) {
4159					char *n, expansion[19];
4160					int expansion_length = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val);
4161					int expansion_offset = right - new_filter;
4162
4163					pr_debug("%s", expansion);
4164
4165					if (asprintf(&n, "%.*s%s%s", expansion_offset, new_filter, expansion, right_end) < 0) {
4166						pr_debug(" out of memory!\n");
4167						free(new_filter);
4168						return -1;
4169					}
4170					if (new_filter != evsel->filter)
4171						free(new_filter);
4172					left = n + expansion_offset + expansion_length;
4173					new_filter = n;
4174				} else {
4175					pr_err("\"%.*s\" not found for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4176					       right_size, right, arg, evsel->name, evsel->filter);
4177					return -1;
4178				}
4179			} else {
4180				pr_err("No resolver (strtoul) for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4181				       arg, evsel->name, evsel->filter);
4182				return -1;
4183			}
4184
4185			pr_debug("\n");
4186		} else {
4187			left = right_end;
4188		}
4189	}
4190
4191	if (new_filter != evsel->filter) {
4192		pr_debug("New filter for %s: %s\n", evsel->name, new_filter);
4193		evsel__set_filter(evsel, new_filter);
4194		free(new_filter);
4195	}
4196
4197	return 0;
4198}
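
/*
 * Illustrative example of the expansion above, assuming the argument
 * has a strtoul resolver wired up in its syscall_arg_fmt: a filter
 * like "flags==O_CLOEXEC" would be rewritten to "flags==0x80000"
 * before being handed to the kernel, which only understands numbers.
 */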
4199
4200static int trace__expand_filters(struct trace *trace, struct evsel **err_evsel)
4201{
4202	struct evlist *evlist = trace->evlist;
4203	struct evsel *evsel;
4204
4205	evlist__for_each_entry(evlist, evsel) {
4206		if (evsel->filter == NULL)
4207			continue;
4208
4209		if (trace__expand_filter(trace, evsel)) {
4210			*err_evsel = evsel;
4211			return -1;
4212		}
4213	}
4214
4215	return 0;
4216}
4217
4218static int trace__run(struct trace *trace, int argc, const char **argv)
4219{
4220	struct evlist *evlist = trace->evlist;
4221	struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
4222	int err = -1, i;
4223	unsigned long before;
4224	const bool forks = argc > 0;
4225	bool draining = false;
4226
4227	trace->live = true;
4228
4229	if (!trace->raw_augmented_syscalls) {
4230		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
4231			goto out_error_raw_syscalls;
4232
4233		if (trace->trace_syscalls)
4234			trace->vfs_getname = evlist__add_vfs_getname(evlist);
4235	}
4236
4237	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
4238		pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
4239		if (pgfault_maj == NULL)
4240			goto out_error_mem;
4241		evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
4242		evlist__add(evlist, pgfault_maj);
4243	}
4244
4245	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
4246		pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
4247		if (pgfault_min == NULL)
4248			goto out_error_mem;
4249		evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
4250		evlist__add(evlist, pgfault_min);
4251	}
4252
4253	/* Enable ignoring missing threads when -u/-p option is defined. */
4254	trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;
4255
4256	if (trace->sched &&
4257	    evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
 
4258		goto out_error_sched_stat_runtime;
4259	/*
4260	 * If a global cgroup was set, apply it to all the events without an
4261	 * explicit cgroup. I.e.:
4262	 *
4263	 * 	trace -G A -e sched:*switch
4264	 *
4265	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
4266	 * _and_ sched:sched_switch to the 'A' cgroup, while:
4267	 *
4268	 * trace -e sched:*switch -G A
4269	 *
4270	 * will only set the sched:sched_switch event to the 'A' cgroup; all the
4271	 * other events (raw_syscalls:sys_{enter,exit}, etc.) are left without
4272	 * a cgroup (on the root cgroup, system wide, etc.).
4273	 *
4274	 * Multiple cgroups:
4275	 *
4276	 * trace -G A -e sched:*switch -G B
4277	 *
4278	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
4279	 * to the 'B' cgroup.
4280	 *
4281	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
4282	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
4283	 */
4284	if (trace->cgroup)
4285		evlist__set_default_cgroup(trace->evlist, trace->cgroup);
4286
4287	err = evlist__create_maps(evlist, &trace->opts.target);
4288	if (err < 0) {
4289		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
4290		goto out_delete_evlist;
4291	}
4292
4293	err = trace__symbols_init(trace, evlist);
4294	if (err < 0) {
4295		fprintf(trace->output, "Problems initializing symbol libraries!\n");
4296		goto out_delete_evlist;
4297	}
4298
4299	evlist__config(evlist, &trace->opts, &callchain_param);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4300
4301	if (forks) {
4302		err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL);
 
4303		if (err < 0) {
4304			fprintf(trace->output, "Couldn't run the workload!\n");
4305			goto out_delete_evlist;
4306		}
4307		workload_pid = evlist->workload.pid;
4308	}
4309
4310	err = evlist__open(evlist);
4311	if (err < 0)
4312		goto out_error_open;
4313#ifdef HAVE_BPF_SKEL
4314	if (trace->syscalls.events.bpf_output) {
4315		struct perf_cpu cpu;
4316
4317		/*
4318		 * Set up the __augmented_syscalls__ BPF map to hold for each
4319		 * CPU the bpf-output event's file descriptor.
4320		 */
4321		perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
4322			bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
4323					&cpu.cpu, sizeof(int),
4324					xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
4325						       cpu.cpu, 0),
4326					sizeof(__u32), BPF_ANY);
4327		}
4328	}
4329
4330	if (trace->skel)
4331		trace->filter_pids.map = trace->skel->maps.pids_filtered;
4332#endif
4333	err = trace__set_filter_pids(trace);
 
 
 
 
 
 
 
4334	if (err < 0)
4335		goto out_error_mem;
4336
4337#ifdef HAVE_BPF_SKEL
4338	if (trace->skel && trace->skel->progs.sys_enter)
4339		trace__init_syscalls_bpf_prog_array_maps(trace);
4340#endif
4341
4342	if (trace->ev_qualifier_ids.nr > 0) {
4343		err = trace__set_ev_qualifier_filter(trace);
4344		if (err < 0)
4345			goto out_errno;
4346
4347		if (trace->syscalls.events.sys_exit) {
4348			pr_debug("event qualifier tracepoint filter: %s\n",
4349				 trace->syscalls.events.sys_exit->filter);
4350		}
4351	}
4352
4353	/*
4354	 * If the "close" syscall is not traced, then we will not have the
4355	 * opportunity to, in syscall_arg__scnprintf_close_fd(), invalidate the
4356	 * fd->pathname table, and we would end up showing the last value set by
4357	 * syscalls opening a pathname and associating it with a descriptor, or
4358	 * reading it from /proc/pid/fd/ in cases where that doesn't make
4359	 * sense.
4360	 *
4361	 * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
4362	 * not being traced.
4363	 */
4364	trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));
4365
4366	err = trace__expand_filters(trace, &evsel);
4367	if (err)
4368		goto out_delete_evlist;
4369	err = evlist__apply_filters(evlist, &evsel, &trace->opts.target);
4370	if (err < 0)
4371		goto out_error_apply_filters;
4372
4373	err = evlist__mmap(evlist, trace->opts.mmap_pages);
4374	if (err < 0)
4375		goto out_error_mmap;
4376
4377	if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
4378		evlist__enable(evlist);
4379
4380	if (forks)
4381		evlist__start_workload(evlist);
4382
4383	if (trace->opts.target.initial_delay) {
4384		usleep(trace->opts.target.initial_delay * 1000);
4385		evlist__enable(evlist);
4386	}
4387
4388	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
4389		perf_thread_map__nr(evlist->core.threads) > 1 ||
4390		evlist__first(evlist)->core.attr.inherit;
4391
4392	/*
4393	 * Now that we already used evsel->core.attr to ask the kernel to setup the
4394	 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
4395	 * trace__resolve_callchain(), allowing per-event max-stack settings
4396	 * to override an explicitly set --max-stack global setting.
4397	 */
4398	evlist__for_each_entry(evlist, evsel) {
4399		if (evsel__has_callchain(evsel) &&
4400		    evsel->core.attr.sample_max_stack == 0)
4401			evsel->core.attr.sample_max_stack = trace->max_stack;
4402	}
4403again:
4404	before = trace->nr_events;
4405
4406	for (i = 0; i < evlist->core.nr_mmaps; i++) {
4407		union perf_event *event;
4408		struct mmap *md;
4409
4410		md = &evlist->mmap[i];
4411		if (perf_mmap__read_init(&md->core) < 0)
4412			continue;
4413
4414		while ((event = perf_mmap__read_event(&md->core)) != NULL) {
4415			++trace->nr_events;
4416
4417			err = trace__deliver_event(trace, event);
4418			if (err)
4419				goto out_disable;
 
 
4420
4421			perf_mmap__consume(&md->core);
 
 
4422
4423			if (interrupted)
4424				goto out_disable;
4425
4426			if (done && !draining) {
4427				evlist__disable(evlist);
4428				draining = true;
4429			}
4430		}
4431		perf_mmap__read_done(&md->core);
4432	}
4433
4434	if (trace->nr_events == before) {
4435		int timeout = done ? 100 : -1;
4436
4437		if (!draining && evlist__poll(evlist, timeout) > 0) {
4438			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
4439				draining = true;
4440
4441			goto again;
4442		} else {
4443			if (trace__flush_events(trace))
4444				goto out_disable;
4445		}
4446	} else {
4447		goto again;
4448	}
4449
4450out_disable:
4451	thread__zput(trace->current);
4452
4453	evlist__disable(evlist);
4454
4455	if (trace->sort_events)
4456		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
4457
4458	if (!err) {
4459		if (trace->summary)
4460			trace__fprintf_thread_summary(trace, trace->output);
4461
4462		if (trace->show_tool_stats) {
4463			fprintf(trace->output, "Stats:\n "
4464					       " vfs_getname : %" PRIu64 "\n"
4465					       " proc_getname: %" PRIu64 "\n",
4466				trace->stats.vfs_getname,
4467				trace->stats.proc_getname);
4468		}
4469	}
4470
4471out_delete_evlist:
4472	trace__symbols__exit(trace);
4473	evlist__free_syscall_tp_fields(evlist);
4474	evlist__delete(evlist);
4475	cgroup__put(trace->cgroup);
4476	trace->evlist = NULL;
4477	trace->live = false;
4478	return err;
4479{
4480	char errbuf[BUFSIZ];
4481
4482out_error_sched_stat_runtime:
4483	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
4484	goto out_error;
4485
4486out_error_raw_syscalls:
4487	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
4488	goto out_error;
4489
4490out_error_mmap:
4491	evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
4492	goto out_error;
4493
4494out_error_open:
4495	evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
4496
4497out_error:
4498	fprintf(trace->output, "%s\n", errbuf);
4499	goto out_delete_evlist;
4500
4501out_error_apply_filters:
4502	fprintf(trace->output,
4503		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
4504		evsel->filter, evsel__name(evsel), errno,
4505		str_error_r(errno, errbuf, sizeof(errbuf)));
4506	goto out_delete_evlist;
4507}
4508out_error_mem:
4509	fprintf(trace->output, "Not enough memory to run!\n");
4510	goto out_delete_evlist;
4511
4512out_errno:
4513	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
4514	goto out_delete_evlist;
4515}
4516
4517static int trace__replay(struct trace *trace)
4518{
4519	const struct evsel_str_handler handlers[] = {
4520		{ "probe:vfs_getname",	     trace__vfs_getname, },
4521	};
4522	struct perf_data data = {
4523		.path  = input_name,
4524		.mode  = PERF_DATA_MODE_READ,
4525		.force = trace->force,
4526	};
4527	struct perf_session *session;
4528	struct evsel *evsel;
4529	int err = -1;
4530
4531	trace->tool.sample	  = trace__process_sample;
4532	trace->tool.mmap	  = perf_event__process_mmap;
4533	trace->tool.mmap2	  = perf_event__process_mmap2;
4534	trace->tool.comm	  = perf_event__process_comm;
4535	trace->tool.exit	  = perf_event__process_exit;
4536	trace->tool.fork	  = perf_event__process_fork;
4537	trace->tool.attr	  = perf_event__process_attr;
4538	trace->tool.tracing_data  = perf_event__process_tracing_data;
4539	trace->tool.build_id	  = perf_event__process_build_id;
4540	trace->tool.namespaces	  = perf_event__process_namespaces;
4541
4542	trace->tool.ordered_events = true;
4543	trace->tool.ordering_requires_timestamps = true;
4544
4545	/* add tid to output */
4546	trace->multiple_threads = true;
4547
4548	session = perf_session__new(&data, &trace->tool);
4549	if (IS_ERR(session))
4550		return PTR_ERR(session);
4551
4552	if (trace->opts.target.pid)
4553		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
4554
4555	if (trace->opts.target.tid)
4556		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
4557
4558	if (symbol__init(&session->header.env) < 0)
4559		goto out;
4560
4561	trace->host = &session->machines.host;
4562
4563	err = perf_session__set_tracepoints_handlers(session, handlers);
4564	if (err)
4565		goto out;
4566
4567	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter");
4568	trace->syscalls.events.sys_enter = evsel;
4569	/* older kernels have the syscalls tp instead of raw_syscalls */
4570	if (evsel == NULL)
4571		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter");
 
4572
4573	if (evsel &&
4574	    (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
4575	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
4576		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
4577		goto out;
4578	}
4579
4580	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit");
4581	trace->syscalls.events.sys_exit = evsel;
4582	if (evsel == NULL)
4583		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit");
 
4584	if (evsel &&
4585	    (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
4586	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
4587		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
4588		goto out;
4589	}
4590
4591	evlist__for_each_entry(session->evlist, evsel) {
4592		if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
4593		    (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
4594		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
4595		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
4596			evsel->handler = trace__pgfault;
4597	}
4598
4599	setup_pager();
4600
4601	err = perf_session__process_events(session);
4602	if (err)
4603		pr_err("Failed to process events, error %d\n", err);
4605	else if (trace->summary)
4606		trace__fprintf_thread_summary(trace, trace->output);
4607
4608out:
4609	perf_session__delete(session);
4610
4611	return err;
4612}
4613
4614static size_t trace__fprintf_threads_header(FILE *fp)
4615{
4616	size_t printed;
4617
4618	printed  = fprintf(fp, "\n Summary of events:\n\n");
4619
4620	return printed;
4621}
4622
4623DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
4624	struct syscall_stats *stats;
4625	double		     msecs;
4626	int		     syscall;
4627)
4628{
4629	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
4630	struct syscall_stats *stats = source->priv;
4631
4632	entry->syscall = source->i;
4633	entry->stats   = stats;
4634	entry->msecs   = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0;
4635}
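
/*
 * DEFINE_RESORT_RB (see rb_resort.h) generates helpers to re-sort the
 * per-thread syscall_stats intlist by the msecs key computed above, so
 * that thread__dump_stats() below prints the most time-consuming
 * syscalls first.
 */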
4636
4637static size_t thread__dump_stats(struct thread_trace *ttrace,
4638				 struct trace *trace, FILE *fp)
4639{
4640	size_t printed = 0;
4641	struct syscall *sc;
4642	struct rb_node *nd;
4643	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
4644
4645	if (syscall_stats == NULL)
4646		return 0;
4647
4648	printed += fprintf(fp, "\n");
4649
4650	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
4651	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
4652	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
4653
4654	resort_rb__for_each_entry(nd, syscall_stats) {
4655		struct syscall_stats *stats = syscall_stats_entry->stats;
4656		if (stats) {
4657			double min = (double)(stats->stats.min) / NSEC_PER_MSEC;
4658			double max = (double)(stats->stats.max) / NSEC_PER_MSEC;
4659			double avg = avg_stats(&stats->stats);
4660			double pct;
4661			u64 n = (u64)stats->stats.n;
4662
4663			pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
4664			avg /= NSEC_PER_MSEC;
4665
4666			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
4667			printed += fprintf(fp, "   %-15s", sc->name);
4668			printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
4669					   n, stats->nr_failures, syscall_stats_entry->msecs, min, avg);
4670			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
4671
4672			if (trace->errno_summary && stats->nr_failures) {
4673				int e;
4674
4675				for (e = 0; e < stats->max_errno; ++e) {
4676					if (stats->errnos[e] != 0)
4677						fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]);
4678				}
4679			}
4680		}
4681	}
4682
4683	resort_rb__delete(syscall_stats);
4684	printed += fprintf(fp, "\n\n");
4685
4686	return printed;
4687}
4688
4689static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
4690{
4691	size_t printed = 0;
4692	struct thread_trace *ttrace = thread__priv(thread);
4693	double ratio;
4694
4695	if (ttrace == NULL)
4696		return 0;
4697
4698	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
4699
4700	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread));
4701	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
4702	printed += fprintf(fp, "%.1f%%", ratio);
4703	if (ttrace->pfmaj)
4704		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
4705	if (ttrace->pfmin)
4706		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
4707	if (trace->sched)
4708		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
4709	else if (fputc('\n', fp) != EOF)
4710		++printed;
4711
4712	printed += thread__dump_stats(ttrace, trace, fp);
4713
4714	return printed;
4715}
4716
4717static unsigned long thread__nr_events(struct thread_trace *ttrace)
4718{
4719	return ttrace ? ttrace->nr_events : 0;
4720}
4721
4722static int trace_nr_events_cmp(void *priv __maybe_unused,
4723			       const struct list_head *la,
4724			       const struct list_head *lb)
4725{
4726	struct thread_list *a = list_entry(la, struct thread_list, list);
4727	struct thread_list *b = list_entry(lb, struct thread_list, list);
4728	unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread));
4729	unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread));
4730
4731	if (a_nr_events != b_nr_events)
4732		return a_nr_events < b_nr_events ? -1 : 1;
4733
4734	/* Identical number of events, place smaller tids first. */
4735	return thread__tid(a->thread) < thread__tid(b->thread)
4736		? -1
4737		: (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0);
4738}
4739
4740static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
4741{
 
4742	size_t printed = trace__fprintf_threads_header(fp);
4743	LIST_HEAD(threads);
 
 
 
 
 
4744
4745	if (machine__thread_list(trace->host, &threads) == 0) {
4746		struct thread_list *pos;
4747
4748		list_sort(NULL, &threads, trace_nr_events_cmp);
4749
4750		list_for_each_entry(pos, &threads, list)
4751			printed += trace__fprintf_thread(fp, pos->thread, trace);
4752	}
4753	thread_list__delete(&threads);
4754	return printed;
4755}
4756
4757static int trace__set_duration(const struct option *opt, const char *str,
4758			       int unset __maybe_unused)
4759{
4760	struct trace *trace = opt->value;
4761
4762	trace->duration_filter = atof(str);
4763	return 0;
4764}
4765
4766static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
4767					      int unset __maybe_unused)
4768{
4769	int ret = -1;
4770	size_t i;
4771	struct trace *trace = opt->value;
4772	/*
4773	 * FIXME: introduce an intarray class, just parse the csv and create a
4774	 * { int nr, int entries[] } struct...
4775	 */
4776	struct intlist *list = intlist__new(str);
4777
4778	if (list == NULL)
4779		return -1;
4780
4781	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
4782	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
4783
4784	if (trace->filter_pids.entries == NULL)
4785		goto out;
4786
4787	trace->filter_pids.entries[0] = getpid();
4788
4789	for (i = 1; i < trace->filter_pids.nr; ++i)
4790		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
4791
4792	intlist__delete(list);
4793	ret = 0;
4794out:
4795	return ret;
4796}
4797
4798static int trace__open_output(struct trace *trace, const char *filename)
4799{
4800	struct stat st;
4801
4802	if (!stat(filename, &st) && st.st_size) {
4803		char oldname[PATH_MAX];
4804
4805		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
4806		unlink(oldname);
4807		rename(filename, oldname);
4808	}
4809
4810	trace->output = fopen(filename, "w");
4811
4812	return trace->output == NULL ? -errno : 0;
4813}
4814
4815static int parse_pagefaults(const struct option *opt, const char *str,
4816			    int unset __maybe_unused)
4817{
4818	int *trace_pgfaults = opt->value;
4819
4820	if (strcmp(str, "all") == 0)
4821		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
4822	else if (strcmp(str, "maj") == 0)
4823		*trace_pgfaults |= TRACE_PFMAJ;
4824	else if (strcmp(str, "min") == 0)
4825		*trace_pgfaults |= TRACE_PFMIN;
4826	else
4827		return -1;
4828
4829	return 0;
4830}
4831
4832static void evlist__set_default_evsel_handler(struct evlist *evlist, void *handler)
4833{
4834	struct evsel *evsel;
4835
4836	evlist__for_each_entry(evlist, evsel) {
4837		if (evsel->handler == NULL)
4838			evsel->handler = handler;
4839	}
4840}
4841
4842static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name)
4843{
4844	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
4845
4846	if (fmt) {
4847		const struct syscall_fmt *scfmt = syscall_fmt__find(name);
4848
4849		if (scfmt) {
4850			int skip = 0;
4851
4852			if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 ||
4853			    strcmp(evsel->tp_format->format.fields->name, "nr") == 0)
4854				++skip;
4855
4856			memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt));
4857		}
4858	}
4859}
4860
4861static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf)
4862{
4863	struct evsel *evsel;
4864
4865	evlist__for_each_entry(evlist, evsel) {
4866		if (evsel->priv || !evsel->tp_format)
4867			continue;
4868
4869		if (strcmp(evsel->tp_format->system, "syscalls")) {
4870			evsel__init_tp_arg_scnprintf(evsel, use_btf);
4871			continue;
4872		}
4873
4874		if (evsel__init_syscall_tp(evsel))
4875			return -1;
4876
4877		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
4878			struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4879
4880			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
4881				return -1;
4882
4883			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1);
4884		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
4885			struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4886
4887			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
4888				return -1;
4889
4890			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1);
4891		}
4892	}
4893
4894	return 0;
4895}
4896
4897/*
4898 * XXX: Hackish, just splitting the combined -e+--event (syscalls
4899 * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc.)) to use
4900 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
4901 *
4902 * It'd be better to introduce a parse_options() variant that would return a
4903 * list with the terms it didn't match to an event...
4904 */
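/*
 * For example (illustrative): "-e openat,sched:sched_switch" ends up
 * with "openat" in lists[1], the syscall qualifier, while
 * "sched:sched_switch" stays in lists[0] and is handed to
 * parse_events_option() below.
 */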
4905static int trace__parse_events_option(const struct option *opt, const char *str,
4906				      int unset __maybe_unused)
4907{
4908	struct trace *trace = (struct trace *)opt->value;
4909	const char *s = str;
4910	char *sep = NULL, *lists[2] = { NULL, NULL, };
4911	int len = strlen(str) + 1, err = -1, list, idx;
4912	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
4913	char group_name[PATH_MAX];
4914	const struct syscall_fmt *fmt;
4915
4916	if (strace_groups_dir == NULL)
4917		return -1;
4918
4919	if (*s == '!') {
4920		++s;
4921		trace->not_ev_qualifier = true;
4922	}
4923
4924	while (1) {
4925		if ((sep = strchr(s, ',')) != NULL)
4926			*sep = '\0';
4927
4928		list = 0;
4929		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
4930		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
4931			list = 1;
4932			goto do_concat;
4933		}
4934
4935		fmt = syscall_fmt__find_by_alias(s);
4936		if (fmt != NULL) {
4937			list = 1;
4938			s = fmt->name;
4939		} else {
4940			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
4941			if (access(group_name, R_OK) == 0)
4942				list = 1;
4943		}
4944do_concat:
4945		if (lists[list]) {
4946			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
4947		} else {
4948			lists[list] = malloc(len);
4949			if (lists[list] == NULL)
4950				goto out;
4951			strcpy(lists[list], s);
4952		}
4953
4954		if (!sep)
4955			break;
4956
4957		*sep = ',';
4958		s = sep + 1;
4959	}
4960
4961	if (lists[1] != NULL) {
4962		struct strlist_config slist_config = {
4963			.dirname = strace_groups_dir,
4964		};
4965
4966		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
4967		if (trace->ev_qualifier == NULL) {
4968			fputs("Not enough memory to parse event qualifier\n", trace->output);
4969			goto out;
4970		}
4971
4972		if (trace__validate_ev_qualifier(trace))
4973			goto out;
4974		trace->trace_syscalls = true;
4975	}
4976
4977	err = 0;
4978
4979	if (lists[0]) {
4980		struct parse_events_option_args parse_events_option_args = {
4981			.evlistp = &trace->evlist,
4982		};
4983		struct option o = {
4984			.value = &parse_events_option_args,
4985		};
4986		err = parse_events_option(&o, lists[0], 0);
4987	}
4988out:
4989	free(strace_groups_dir);
4990	free(lists[0]);
4991	free(lists[1]);
4992	if (sep)
4993		*sep = ',';
4994
4995	return err;
4996}
4997
4998static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
4999{
5000	struct trace *trace = opt->value;
5001
5002	if (!list_empty(&trace->evlist->core.entries)) {
5003		struct option o = {
5004			.value = &trace->evlist,
5005		};
5006		return parse_cgroups(&o, str, unset);
5007	}
5008	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
5009
5010	return 0;
5011}
5012
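/*
 * An example ~/.perfconfig snippet (illustrative values) with the
 * knobs handled below:
 *
 *	[trace]
 *		add_events = probe:vfs_getname
 *		show_duration = no
 *		args_alignment = 40
 */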
5013static int trace__config(const char *var, const char *value, void *arg)
5014{
5015	struct trace *trace = arg;
5016	int err = 0;
5017
5018	if (!strcmp(var, "trace.add_events")) {
5019		trace->perfconfig_events = strdup(value);
5020		if (trace->perfconfig_events == NULL) {
5021			pr_err("Not enough memory for %s\n", "trace.add_events");
5022			return -1;
5023		}
5024	} else if (!strcmp(var, "trace.show_timestamp")) {
5025		trace->show_tstamp = perf_config_bool(var, value);
5026	} else if (!strcmp(var, "trace.show_duration")) {
5027		trace->show_duration = perf_config_bool(var, value);
5028	} else if (!strcmp(var, "trace.show_arg_names")) {
5029		trace->show_arg_names = perf_config_bool(var, value);
5030		if (!trace->show_arg_names)
5031			trace->show_zeros = true;
5032	} else if (!strcmp(var, "trace.show_zeros")) {
5033		bool new_show_zeros = perf_config_bool(var, value);
5034		if (!trace->show_arg_names && !new_show_zeros) {
5035			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
5036			goto out;
5037		}
5038		trace->show_zeros = new_show_zeros;
5039	} else if (!strcmp(var, "trace.show_prefix")) {
5040		trace->show_string_prefix = perf_config_bool(var, value);
5041	} else if (!strcmp(var, "trace.no_inherit")) {
5042		trace->opts.no_inherit = perf_config_bool(var, value);
5043	} else if (!strcmp(var, "trace.args_alignment")) {
5044		int args_alignment = 0;
5045		if (perf_config_int(&args_alignment, var, value) == 0)
5046			trace->args_alignment = args_alignment;
5047	} else if (!strcmp(var, "trace.tracepoint_beautifiers")) {
5048		if (strcasecmp(value, "libtraceevent") == 0)
5049			trace->libtraceevent_print = true;
5050		else if (strcasecmp(value, "libbeauty") == 0)
5051			trace->libtraceevent_print = false;
5052	}
5053out:
5054	return err;
5055}
5056
5057static void trace__exit(struct trace *trace)
5058{
5059	int i;
5060
5061	strlist__delete(trace->ev_qualifier);
5062	zfree(&trace->ev_qualifier_ids.entries);
5063	if (trace->syscalls.table) {
5064		for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
5065			syscall__exit(&trace->syscalls.table[i]);
5066		zfree(&trace->syscalls.table);
5067	}
5068	syscalltbl__delete(trace->sctbl);
5069	zfree(&trace->perfconfig_events);
5070}
5071
5072#ifdef HAVE_BPF_SKEL
5073static int bpf__setup_bpf_output(struct evlist *evlist)
5074{
5075	int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
5076
5077	if (err)
5078		pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");
5079
5080	return err;
5081}
5082#endif
5083
5084int cmd_trace(int argc, const char **argv)
5085{
5086	const char *trace_usage[] = {
5087		"perf trace [<options>] [<command>]",
5088		"perf trace [<options>] -- <command> [<options>]",
5089		"perf trace record [<options>] [<command>]",
5090		"perf trace record [<options>] -- <command> [<options>]",
5091		NULL
5092	};
5093	struct trace trace = {
 
 
 
5094		.opts = {
5095			.target = {
5096				.uid	   = UINT_MAX,
5097				.uses_mmap = true,
5098			},
5099			.user_freq     = UINT_MAX,
5100			.user_interval = ULLONG_MAX,
5101			.no_buffering  = true,
5102			.mmap_pages    = UINT_MAX,
 
5103		},
5104		.output = stderr,
5105		.show_comm = true,
5106		.show_tstamp = true,
5107		.show_duration = true,
5108		.show_arg_names = true,
5109		.args_alignment = 70,
5110		.trace_syscalls = false,
5111		.kernel_syscallchains = false,
5112		.max_stack = UINT_MAX,
5113		.max_events = ULONG_MAX,
5114	};
5115	const char *output_name = NULL;
 
5116	const struct option trace_options[] = {
5117	OPT_CALLBACK('e', "event", &trace, "event",
5118		     "event/syscall selector. use 'perf list' to list available events",
5119		     trace__parse_events_option),
5120	OPT_CALLBACK(0, "filter", &trace.evlist, "filter",
5121		     "event filter", parse_filter),
5122	OPT_BOOLEAN(0, "comm", &trace.show_comm,
5123		    "show the thread COMM next to its id"),
5124	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
5125	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
5126		     trace__parse_events_option),
5127	OPT_STRING('o', "output", &output_name, "file", "output file name"),
5128	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
5129	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
5130		    "trace events on existing process id"),
5131	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
5132		    "trace events on existing thread id"),
5133	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
5134		     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
5135	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
5136		    "system-wide collection from all CPUs"),
5137	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
5138		    "list of cpus to monitor"),
5139	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
5140		    "child tasks do not inherit counters"),
5141	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
5142		     "number of mmap data pages", evlist__parse_mmap_pages),
 
5143	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
5144		   "user to profile"),
5145	OPT_CALLBACK(0, "duration", &trace, "float",
5146		     "show only events with duration > N.M ms",
5147		     trace__set_duration),
5148	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
5149	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
5150	OPT_BOOLEAN('T', "time", &trace.full_time,
5151		    "Show full timestamp, not time relative to first start"),
5152	OPT_BOOLEAN(0, "failure", &trace.failure_only,
5153		    "Show only syscalls that failed"),
5154	OPT_BOOLEAN('s', "summary", &trace.summary_only,
5155		    "Show only syscall summary with statistics"),
5156	OPT_BOOLEAN('S', "with-summary", &trace.summary,
5157		    "Show all syscalls and summary with statistics"),
5158	OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
5159		    "Show errno stats per syscall, use with -s or -S"),
5160	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
5161		     "Trace pagefaults", parse_pagefaults, "maj"),
5162	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
5163	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
5164	OPT_CALLBACK(0, "call-graph", &trace.opts,
5165		     "record_mode[,record_size]", record_callchain_help,
5166		     &record_parse_callchain_opt),
5167	OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
5168		    "Use libtraceevent to print the tracepoint arguments."),
5169	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
5170		    "Show the kernel callchains on the syscall exit path"),
5171	OPT_ULONG(0, "max-events", &trace.max_events,
5172		"Set the maximum number of events to print, exit after that is reached."),
5173	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
5174		     "Set the minimum stack depth when parsing the callchain, "
5175		     "anything below the specified depth will be ignored."),
5176	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
5177		     "Set the maximum stack depth when parsing the callchain, "
5178		     "anything beyond the specified depth will be ignored. "
5179		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
5180	OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
5181			"Sort batch of events before processing, use if getting out of order events"),
5182	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
5183			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
5184	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
5185			"per thread proc mmap processing timeout in ms"),
5186	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
5187		     trace__parse_cgroups),
5188	OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay,
5189		     "ms to wait before starting measurement after program "
5190		     "start"),
5191	OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer "
5192		       "to customized ones"),
5193	OPTS_EVSWITCH(&trace.evswitch),
5194	OPT_END()
5195	};
5196	bool __maybe_unused max_stack_user_set = true;
5197	bool mmap_pages_user_set = true;
5198	struct evsel *evsel;
5199	const char * const trace_subcommands[] = { "record", NULL };
5200	int err = -1;
5201	char bf[BUFSIZ];
5202	struct sigaction sigchld_act;
5203
5204	signal(SIGSEGV, sighandler_dump_stack);
5205	signal(SIGFPE, sighandler_dump_stack);
5206	signal(SIGINT, sighandler_interrupt);
5207
5208	memset(&sigchld_act, 0, sizeof(sigchld_act));
5209	sigchld_act.sa_flags = SA_SIGINFO;
5210	sigchld_act.sa_sigaction = sighandler_chld;
5211	sigaction(SIGCHLD, &sigchld_act, NULL);
5212
5213	trace.evlist = evlist__new();
5214	trace.sctbl = syscalltbl__new();
5215
5216	if (trace.evlist == NULL || trace.sctbl == NULL) {
5217		pr_err("Not enough memory to run!\n");
5218		err = -ENOMEM;
5219		goto out;
5220	}
5221
5222	/*
5223	 * Parsing .perfconfig may entail creating a BPF event, that may need
5224	 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
5225	 * is too small. This affects just this process, not touching the
5226	 * global setting. If it fails we'll get something in 'perf trace -v'
5227	 * to help diagnose the problem.
5228	 */
5229	rlimit__bump_memlock();
5230
5231	err = perf_config(trace__config, &trace);
5232	if (err)
5233		goto out;
5234
5235	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
5236				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
5237
5238	/*
5239	 * Here we already passed thru trace__parse_events_option() and it has
5240	 * already figured out if -e syscall_name was used; if it wasn't, but
5241	 * --event foo:bar was, then the user is interested _just_ in those,
5242	 * say, tracepoint events, not in the strace-like syscall-name-based mode.
5243	 *
5244	 * This is important because we need to check if strace-like mode is
5245	 * needed to decide if we should filter out the eBPF
5246	 * __augmented_syscalls__ code, if it is in the mix, say, via
5247	 * .perfconfig trace.add_events.
5248	 */
5249	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
5250	    trace.evlist->core.nr_entries == 0 /* Was --event used? */) {
5251		trace.trace_syscalls = true;
5252	}
5253	/*
5254	 * Now that we have --verbose figured out, lets see if we need to parse
5255	 * events from .perfconfig, so that if those events fail parsing, say some
5256	 * BPF program fails, then we'll be able to use --verbose to see what went
5257	 * wrong in more detail.
5258	 */
5259	if (trace.perfconfig_events != NULL) {
5260		struct parse_events_error parse_err;
5261
5262		parse_events_error__init(&parse_err);
5263		err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err);
5264		if (err)
5265			parse_events_error__print(&parse_err, trace.perfconfig_events);
5266		parse_events_error__exit(&parse_err);
5267		if (err)
5268			goto out;
5269	}
5270
5271	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
5272		usage_with_options_msg(trace_usage, trace_options,
5273				       "cgroup monitoring only available in system-wide mode");
5274	}
5275
5276#ifdef HAVE_BPF_SKEL
5277	if (!trace.trace_syscalls)
5278		goto skip_augmentation;
5279
5280	if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) {
5281		pr_debug("Syscall augmentation fails with record, disabling augmentation\n");
5282		goto skip_augmentation;
5283	}
5284
5285	trace.skel = augmented_raw_syscalls_bpf__open();
5286	if (!trace.skel) {
5287		pr_debug("Failed to open augmented syscalls BPF skeleton\n");
5288	} else {
5289		/*
5290		 * Disable attaching the BPF programs except for sys_enter and
5291		 * sys_exit that tail call into this as necessary.
5292		 */
5293		struct bpf_program *prog;
5294
5295		bpf_object__for_each_program(prog, trace.skel->obj) {
5296			if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit)
5297				bpf_program__set_autoattach(prog, /*autoattach=*/false);
5298		}
5299
5300		err = augmented_raw_syscalls_bpf__load(trace.skel);
5301
5302		if (err < 0) {
5303			libbpf_strerror(err, bf, sizeof(bf));
5304			pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf);
5305		} else {
5306			augmented_raw_syscalls_bpf__attach(trace.skel);
5307			trace__add_syscall_newtp(&trace);
5308		}
5309	}
5310
5311	err = bpf__setup_bpf_output(trace.evlist);
5312	if (err) {
5313		libbpf_strerror(err, bf, sizeof(bf));
5314		pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
5315		goto out;
5316	}
5317	trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
5318	assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__"));
5319skip_augmentation:
5320#endif
5321	err = -1;
5322
5323	if (trace.trace_pgfaults) {
5324		trace.opts.sample_address = true;
5325		trace.opts.sample_time = true;
5326	}
5327
5328	if (trace.opts.mmap_pages == UINT_MAX)
5329		mmap_pages_user_set = false;
5330
5331	if (trace.max_stack == UINT_MAX) {
5332		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
5333		max_stack_user_set = false;
5334	}
5335
5336#ifdef HAVE_DWARF_UNWIND_SUPPORT
5337	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
5338		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
5339	}
5340#endif
5341
5342	if (callchain_param.enabled) {
5343		if (!mmap_pages_user_set && geteuid() == 0)
5344			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
5345
5346		symbol_conf.use_callchain = true;
5347	}
5348
5349	if (trace.evlist->core.nr_entries > 0) {
5350		bool use_btf = false;
5351
5352		evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
5353		if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) {
5354			perror("failed to set syscalls:* tracepoint fields");
5355			goto out;
5356		}
5357
5358		if (use_btf)
5359			trace__load_vmlinux_btf(&trace);
5360	}
5361
5362	if (trace.sort_events) {
5363		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
5364		ordered_events__set_copy_on_queue(&trace.oe.data, true);
5365	}
5366
5367	/*
5368	 * If we are augmenting syscalls, then combine what we put in the
5369	 * __augmented_syscalls__ BPF map with what is in the
5370	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
5371	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
5372	 *
5373	 * We'll switch to look at two BPF maps, one for sys_enter and the
5374	 * other for sys_exit when we start augmenting the sys_exit paths with
5375	 * buffers that are being copied from kernel to userspace, think 'read'
5376	 * syscall.
5377	 */
5378	if (trace.syscalls.events.bpf_output) {
5379		evlist__for_each_entry(trace.evlist, evsel) {
5380			bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit");
5381
5382			if (raw_syscalls_sys_exit) {
5383				trace.raw_augmented_syscalls = true;
5384				goto init_augmented_syscall_tp;
5385			}
5386
5387			if (trace.syscalls.events.bpf_output->priv == NULL &&
5388			    strstr(evsel__name(evsel), "syscalls:sys_enter")) {
5389				struct evsel *augmented = trace.syscalls.events.bpf_output;
5390				if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
5391				    evsel__init_augmented_syscall_tp_args(augmented))
5392					goto out;
5393				/*
5394				 * 'augmented' is the __augmented_syscalls__ BPF_OUTPUT event.
5395				 * Above we made sure we can get from its payload the tp fields
5396				 * that we get from the syscalls:sys_enter tracefs format file.
5397				 */
5398				augmented->handler = trace__sys_enter;
5399				/*
5400				 * Now we do the same for the *syscalls:sys_enter event so that
5401				 * if we handle it directly, i.e. if the BPF prog returns 0 so
5402				 * as not to filter it, then we'll handle it just like we would
5403				 * for the BPF_OUTPUT one:
5404				 */
5405				if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
5406				    evsel__init_augmented_syscall_tp_args(evsel))
5407					goto out;
5408				evsel->handler = trace__sys_enter;
5409			}
5410
5411			if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) {
5412				struct syscall_tp *sc;
5413init_augmented_syscall_tp:
5414				if (evsel__init_augmented_syscall_tp(evsel, evsel))
5415					goto out;
5416				sc = __evsel__syscall_tp(evsel);
5417				/*
5418				 * For now with BPF raw_augmented we hook into
5419				 * raw_syscalls:sys_enter and there we get all
5420				 * 6 syscall args plus the tracepoint common
5421				 * fields and the syscall_nr (another long).
5422				 * So we check if that is the case and, if so,
5423				 * don't use sc->args_size but
5424				 * always the full raw_syscalls:sys_enter
5425				 * payload size, which is fixed.
5426				 *
5427				 * We'll revisit this later to pass
5428				 * sc->args_size to the BPF augmenter (now
5429				 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
5430				 * so that it copies only what we need for each
5431				 * syscall, like what happens when we use
5432				 * syscalls:sys_enter_NAME, so that we reduce
5433				 * the kernel/userspace traffic to just what is
5434				 * needed for each syscall.
5435				 */
5436				if (trace.raw_augmented_syscalls)
5437					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
5438				evsel__init_augmented_syscall_tp_ret(evsel);
5439				evsel->handler = trace__sys_exit;
5440			}
5441		}
5442	}
5443
5444	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
5445		return trace__record(&trace, argc-1, &argv[1]);
5446
5447	/* Using just --errno-summary will trigger --summary */
5448	if (trace.errno_summary && !trace.summary && !trace.summary_only)
5449		trace.summary_only = true;
5450
5451	/* summary_only implies summary option, but don't overwrite summary if set */
5452	if (trace.summary_only)
5453		trace.summary = trace.summary_only;
5454
5455	/* Keep exited threads, otherwise information might be lost for summary */
5456	if (trace.summary)
5457		symbol_conf.keep_exited_threads = true;
 
 
 
 
 
 
 
5458
5459	if (output_name != NULL) {
5460		err = trace__open_output(&trace, output_name);
5461		if (err < 0) {
5462			perror("failed to create output file");
5463			goto out;
5464		}
5465	}
5466
5467	err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
5468	if (err)
5469		goto out_close;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5470
5471	err = target__validate(&trace.opts.target);
5472	if (err) {
5473		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5474		fprintf(trace.output, "%s", bf);
5475		goto out_close;
5476	}
5477
5478	err = target__parse_uid(&trace.opts.target);
5479	if (err) {
5480		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5481		fprintf(trace.output, "%s", bf);
5482		goto out_close;
5483	}
5484
5485	if (!argc && target__none(&trace.opts.target))
5486		trace.opts.target.system_wide = true;
5487
5488	if (input_name)
5489		err = trace__replay(&trace);
5490	else
5491		err = trace__run(&trace, argc, argv);
5492
5493out_close:
5494	if (output_name != NULL)
5495		fclose(trace.output);
5496out:
5497	trace__exit(&trace);
5498#ifdef HAVE_BPF_SKEL
5499	augmented_raw_syscalls_bpf__destroy(trace.skel);
5500#endif
5501	return err;
5502}