Loading...
1/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 *
16 * Released under the GPL v2. (and only v2, not any later version)
17 */
18
19#include <traceevent/event-parse.h>
20#include <api/fs/tracing_path.h>
21#include "builtin.h"
22#include "util/cgroup.h"
23#include "util/color.h"
24#include "util/debug.h"
25#include "util/env.h"
26#include "util/event.h"
27#include "util/evlist.h"
28#include <subcmd/exec-cmd.h>
29#include "util/machine.h"
30#include "util/path.h"
31#include "util/session.h"
32#include "util/thread.h"
33#include <subcmd/parse-options.h>
34#include "util/strlist.h"
35#include "util/intlist.h"
36#include "util/thread_map.h"
37#include "util/stat.h"
38#include "trace/beauty/beauty.h"
39#include "trace-event.h"
40#include "util/parse-events.h"
41#include "util/bpf-loader.h"
42#include "callchain.h"
43#include "print_binary.h"
44#include "string2.h"
45#include "syscalltbl.h"
46#include "rb_resort.h"
47
48#include <errno.h>
49#include <inttypes.h>
50#include <poll.h>
51#include <signal.h>
52#include <stdlib.h>
53#include <string.h>
54#include <linux/err.h>
55#include <linux/filter.h>
56#include <linux/kernel.h>
57#include <linux/random.h>
58#include <linux/stringify.h>
59#include <linux/time64.h>
60#include <fcntl.h>
61
62#include "sane_ctype.h"
63
64#ifndef O_CLOEXEC
65# define O_CLOEXEC 02000000
66#endif
67
68#ifndef F_LINUX_SPECIFIC_BASE
69# define F_LINUX_SPECIFIC_BASE 1024
70#endif
71
/*
 * Global state for one 'perf trace' session: tool callbacks, the syscall
 * table, the target/filters, output destination and accumulated stats.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall  *table;		/* indexed by syscall id */
		struct {	/* sys_enter/sys_exit tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;	/* subtracted to print relative timestamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names to trace (or skip) */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	} ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* NOTE(review): presumably pids filtered out — confirm against setup code */
	} filter_pids;
	double			duration_filter; /* in ms, see trace__filter_duration() */
	double			runtime_ms;
	struct {		/* tool self-stats, printed with show_tool_stats */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier; /* ev_qualifier is an exclude list */
	bool			live;		/* live session: may read /proc for fd paths */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* a vfs_getname probe is in place */
	int			trace_pgfaults;
	int			open_id;	/* NOTE(review): looks like the "open" syscall id — confirm */
};
126
127struct tp_field {
128 int offset;
129 union {
130 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
131 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
132 };
133};
134
/*
 * Generate tp_field__u{8,16,32,64}(): read an unsigned integer of the
 * given width from the tracepoint raw data at the field's offset.
 * memcpy is used because raw_data + offset may be unaligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
147
/*
 * Same as TP_UINT_FIELD(), but byte-swapping the value, for samples
 * recorded on a machine of the opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
159
160static int tp_field__init_uint(struct tp_field *field,
161 struct format_field *format_field,
162 bool needs_swap)
163{
164 field->offset = format_field->offset;
165
166 switch (format_field->size) {
167 case 1:
168 field->integer = tp_field__u8;
169 break;
170 case 2:
171 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
172 break;
173 case 4:
174 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
175 break;
176 case 8:
177 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
178 break;
179 default:
180 return -1;
181 }
182
183 return 0;
184}
185
/* Return a pointer into the tracepoint raw data at this field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
190
/* Bind the pointer accessor to 'field'; cannot fail, returns 0 for symmetry
 * with tp_field__init_uint(). */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
197
/*
 * Per-evsel private data: accessors for the common raw_syscalls fields.
 * An evsel is either sys_enter ('args') or sys_exit ('ret'), never both,
 * hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
204
205static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
206 struct tp_field *field,
207 const char *name)
208{
209 struct format_field *format_field = perf_evsel__field(evsel, name);
210
211 if (format_field == NULL)
212 return -1;
213
214 return tp_field__init_uint(field, format_field, evsel->needs_swap);
215}
216
/* Init the uint accessor for field 'name' in the evsel's private syscall_tp. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
220
/*
 * Look up tracepoint field 'name' on 'evsel' and set up a pointer
 * accessor for it. Returns 0 on success, -1 if the field is missing.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
232
/* Init the pointer accessor for field 'name' in the evsel's private syscall_tp. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
236
/* Free the evsel's private syscall_tp data, then delete the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
242
243static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
244{
245 evsel->priv = malloc(sizeof(struct syscall_tp));
246 if (evsel->priv != NULL) {
247 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
248 goto out_delete;
249
250 evsel->handler = handler;
251 return 0;
252 }
253
254 return -ENOMEM;
255
256out_delete:
257 zfree(&evsel->priv);
258 return -ENOENT;
259}
260
261static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
262{
263 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
264
265 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
266 if (IS_ERR(evsel))
267 evsel = perf_evsel__newtp("syscalls", direction);
268
269 if (IS_ERR(evsel))
270 return NULL;
271
272 if (perf_evsel__init_syscall_tp(evsel, handler))
273 goto out_delete;
274
275 return evsel;
276
277out_delete:
278 perf_evsel__delete_priv(evsel);
279 return NULL;
280}
281
/* Read tracepoint field 'name' from 'sample' via the accessors stored in
 * the evsel's private syscall_tp. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same, for pointer-typed (variable length) fields. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
289
290size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291{
292 int idx = val - sa->offset;
293
294 if (idx < 0 || idx >= sa->nr_entries)
295 return scnprintf(bf, size, intfmt, val);
296
297 return scnprintf(bf, size, "%s", sa->entries[idx]);
298}
299
/* Format arg->val via the strarray passed in arg->parm, falling back to
 * 'intfmt' for out-of-range values. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Strarray formatter with a plain "%d" fallback. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
312
313#define SCA_STRARRAY syscall_arg__scnprintf_strarray
314
/* An ordered set of strarrays tried one after another, for values drawn
 * from several ranges (e.g. fcntl's classic + linux-specific commands). */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
324
325size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
326 struct syscall_arg *arg)
327{
328 struct strarrays *sas = arg->parm;
329 int i;
330
331 for (i = 0; i < sas->nr_entries; ++i) {
332 struct strarray *sa = sas->entries[i];
333 int idx = arg->val - sa->offset;
334
335 if (idx >= 0 && idx < sa->nr_entries) {
336 if (sa->entries[idx] == NULL)
337 break;
338 return scnprintf(bf, size, "%s", sa->entries[idx]);
339 }
340 }
341
342 return scnprintf(bf, size, "%d", arg->val);
343}
344
345#ifndef AT_FDCWD
346#define AT_FDCWD -100
347#endif
348
349static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
350 struct syscall_arg *arg)
351{
352 int fd = arg->val;
353
354 if (fd == AT_FDCWD)
355 return scnprintf(bf, size, "CWD");
356
357 return syscall_arg__scnprintf_fd(bf, size, arg);
358}
359
360#define SCA_FDAT syscall_arg__scnprintf_fd_at
361
362static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
363 struct syscall_arg *arg);
364
365#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
366
/* Generic fallback formatters: hex, signed int and signed long. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* NOTE(review): "%d" narrows arg->val — fine for int-like args, verify no
 * 64-bit-only value is routed through SCA_INT. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
381
/*
 * String tables consumed by the SCA_STRARRAY formatters. The _OFFSET
 * variants start naming values at the given base instead of 0.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1); /* EPOLL_CTL_ADD starts at 1 */

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* fcntl cmds come from two ranges: the classic ones and F_LINUX_SPECIFIC_BASE+ */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
462
463static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
464 struct syscall_arg *arg)
465{
466 size_t printed = 0;
467 int mode = arg->val;
468
469 if (mode == F_OK) /* 0 */
470 return scnprintf(bf, size, "F");
471#define P_MODE(n) \
472 if (mode & n##_OK) { \
473 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
474 mode &= ~n##_OK; \
475 }
476
477 P_MODE(R);
478 P_MODE(W);
479 P_MODE(X);
480#undef P_MODE
481
482 if (mode)
483 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
484
485 return printed;
486}
487
488#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
489
490static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
491 struct syscall_arg *arg);
492
493#define SCA_FILENAME syscall_arg__scnprintf_filename
494
495static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
496 struct syscall_arg *arg)
497{
498 int printed = 0, flags = arg->val;
499
500#define P_FLAG(n) \
501 if (flags & O_##n) { \
502 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
503 flags &= ~O_##n; \
504 }
505
506 P_FLAG(CLOEXEC);
507 P_FLAG(NONBLOCK);
508#undef P_FLAG
509
510 if (flags)
511 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
512
513 return printed;
514}
515
516#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
517
518#ifndef GRND_NONBLOCK
519#define GRND_NONBLOCK 0x0001
520#endif
521#ifndef GRND_RANDOM
522#define GRND_RANDOM 0x0002
523#endif
524
/*
 * Decode getrandom(2) flags: GRND_RANDOM and/or GRND_NONBLOCK print by
 * name, '|'-separated; leftover unknown bits are appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
545
546#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547
548#define STRARRAY(name, array) \
549 { .scnprintf = SCA_STRARRAY, \
550 .parm = &strarray__##array, }
551
552#include "trace/beauty/arch_errno_names.c"
553#include "trace/beauty/eventfd.c"
554#include "trace/beauty/futex_op.c"
555#include "trace/beauty/futex_val3.c"
556#include "trace/beauty/mmap.c"
557#include "trace/beauty/mode_t.c"
558#include "trace/beauty/msg_flags.c"
559#include "trace/beauty/open_flags.c"
560#include "trace/beauty/perf_event_open.c"
561#include "trace/beauty/pid.c"
562#include "trace/beauty/sched_policy.c"
563#include "trace/beauty/seccomp.c"
564#include "trace/beauty/signum.c"
565#include "trace/beauty/socket_type.c"
566#include "trace/beauty/waitid_options.c"
567
/*
 * How to pretty-print one syscall argument: the formatter callback, its
 * optional parameter (e.g. a strarray), an override for the arg name,
 * and whether zero values should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
574
575static struct syscall_fmt {
576 const char *name;
577 const char *alias;
578 struct syscall_arg_fmt arg[6];
579 u8 nr_args;
580 bool errpid;
581 bool timeout;
582 bool hexret;
583} syscall_fmts[] = {
584 { .name = "access",
585 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
586 { .name = "bpf",
587 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
588 { .name = "brk", .hexret = true,
589 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
590 { .name = "clock_gettime",
591 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
592 { .name = "clone", .errpid = true, .nr_args = 5,
593 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
594 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
595 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
596 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
597 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
598 { .name = "close",
599 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
600 { .name = "epoll_ctl",
601 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
602 { .name = "eventfd2",
603 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
604 { .name = "fchmodat",
605 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
606 { .name = "fchownat",
607 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
608 { .name = "fcntl",
609 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
610 .parm = &strarrays__fcntl_cmds_arrays,
611 .show_zero = true, },
612 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
613 { .name = "flock",
614 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
615 { .name = "fstat", .alias = "newfstat", },
616 { .name = "fstatat", .alias = "newfstatat", },
617 { .name = "futex",
618 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
619 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
620 { .name = "futimesat",
621 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
622 { .name = "getitimer",
623 .arg = { [0] = STRARRAY(which, itimers), }, },
624 { .name = "getpid", .errpid = true, },
625 { .name = "getpgid", .errpid = true, },
626 { .name = "getppid", .errpid = true, },
627 { .name = "getrandom",
628 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
629 { .name = "getrlimit",
630 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
631 { .name = "gettid", .errpid = true, },
632 { .name = "ioctl",
633 .arg = {
634#if defined(__i386__) || defined(__x86_64__)
635/*
636 * FIXME: Make this available to all arches.
637 */
638 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
639 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
640#else
641 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
642#endif
643 { .name = "kcmp", .nr_args = 5,
644 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
645 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
646 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
647 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
648 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
649 { .name = "keyctl",
650 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
651 { .name = "kill",
652 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
653 { .name = "linkat",
654 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
655 { .name = "lseek",
656 .arg = { [2] = STRARRAY(whence, whences), }, },
657 { .name = "lstat", .alias = "newlstat", },
658 { .name = "madvise",
659 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
660 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
661 { .name = "mkdirat",
662 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
663 { .name = "mknodat",
664 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
665 { .name = "mlock",
666 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
667 { .name = "mlockall",
668 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
669 { .name = "mmap", .hexret = true,
670/* The standard mmap maps to old_mmap on s390x */
671#if defined(__s390x__)
672 .alias = "old_mmap",
673#endif
674 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
675 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
676 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
677 { .name = "mprotect",
678 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
679 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
680 { .name = "mq_unlink",
681 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
682 { .name = "mremap", .hexret = true,
683 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
684 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
685 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
686 { .name = "munlock",
687 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
688 { .name = "munmap",
689 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690 { .name = "name_to_handle_at",
691 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
692 { .name = "newfstatat",
693 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
694 { .name = "open",
695 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696 { .name = "open_by_handle_at",
697 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
698 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699 { .name = "openat",
700 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
701 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
702 { .name = "perf_event_open",
703 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
704 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
705 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
706 { .name = "pipe2",
707 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
708 { .name = "pkey_alloc",
709 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
710 { .name = "pkey_free",
711 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
712 { .name = "pkey_mprotect",
713 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
714 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
715 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
716 { .name = "poll", .timeout = true, },
717 { .name = "ppoll", .timeout = true, },
718 { .name = "prctl", .alias = "arch_prctl",
719 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
720 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
721 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
722 { .name = "pread", .alias = "pread64", },
723 { .name = "preadv", .alias = "pread", },
724 { .name = "prlimit64",
725 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
726 { .name = "pwrite", .alias = "pwrite64", },
727 { .name = "readlinkat",
728 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
729 { .name = "recvfrom",
730 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
731 { .name = "recvmmsg",
732 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
733 { .name = "recvmsg",
734 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
735 { .name = "renameat",
736 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
737 { .name = "rt_sigaction",
738 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
739 { .name = "rt_sigprocmask",
740 .arg = { [0] = STRARRAY(how, sighow), }, },
741 { .name = "rt_sigqueueinfo",
742 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
743 { .name = "rt_tgsigqueueinfo",
744 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
745 { .name = "sched_setscheduler",
746 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
747 { .name = "seccomp",
748 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
749 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
750 { .name = "select", .timeout = true, },
751 { .name = "sendmmsg",
752 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
753 { .name = "sendmsg",
754 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
755 { .name = "sendto",
756 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
757 { .name = "set_tid_address", .errpid = true, },
758 { .name = "setitimer",
759 .arg = { [0] = STRARRAY(which, itimers), }, },
760 { .name = "setrlimit",
761 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
762 { .name = "socket",
763 .arg = { [0] = STRARRAY(family, socket_families),
764 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
765 { .name = "socketpair",
766 .arg = { [0] = STRARRAY(family, socket_families),
767 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
768 { .name = "stat", .alias = "newstat", },
769 { .name = "statx",
770 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
771 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
772 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
773 { .name = "swapoff",
774 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
775 { .name = "swapon",
776 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
777 { .name = "symlinkat",
778 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
779 { .name = "tgkill",
780 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
781 { .name = "tkill",
782 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
783 { .name = "uname", .alias = "newuname", },
784 { .name = "unlinkat",
785 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
786 { .name = "utimensat",
787 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
788 { .name = "wait4", .errpid = true,
789 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
790 { .name = "waitid", .errpid = true,
791 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
792};
793
/* bsearch() comparator: 'name' is the search key, 'fmtp' a syscall_fmts[] entry. */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}
799
/* Binary search: syscall_fmts[] must stay sorted by name in strcmp() order. */
static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
805
/* Per-syscall descriptor, built from the tracefs format plus syscall_fmts[]. */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* tracepoint argument fields */
	const char	    *name;
	bool		    is_exit;	/* NOTE(review): presumably syscalls that never return — confirm */
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};
815
816/*
817 * We need to have this 'calculated' boolean because in some cases we really
818 * don't know what is the duration of a syscall, for instance, when we start
819 * a session and some threads are waiting for a syscall to finish, say 'poll',
820 * in which case all we can do is to print "( ? ) for duration and for the
821 * start timestamp.
822 */
823static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
824{
825 double duration = (double)t / NSEC_PER_MSEC;
826 size_t printed = fprintf(fp, "(");
827
828 if (!calculated)
829 printed += fprintf(fp, " ");
830 else if (duration >= 1.0)
831 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832 else if (duration >= 0.01)
833 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834 else
835 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836 return printed + fprintf(fp, "): ");
837}
838
/**
 * Per-thread state kept in thread->priv while tracing.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending; /* sys_enter printed, awaiting sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line, completed at exit */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {		/* fd -> path cache, grown on demand */
		int	  max;	/* highest cached fd, -1 when empty */
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
868
869static struct thread_trace *thread_trace__new(void)
870{
871 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
872
873 if (ttrace)
874 ttrace->paths.max = -1;
875
876 ttrace->syscall_stats = intlist__new(NULL);
877
878 return ttrace;
879}
880
881static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
882{
883 struct thread_trace *ttrace;
884
885 if (thread == NULL)
886 goto fail;
887
888 if (thread__priv(thread) == NULL)
889 thread__set_priv(thread, thread_trace__new());
890
891 if (thread__priv(thread) == NULL)
892 goto fail;
893
894 ttrace = thread__priv(thread);
895 ++ttrace->nr_events;
896
897 return ttrace;
898fail:
899 color_fprintf(fp, PERF_COLOR_RED,
900 "WARNING: not enough memory, dropping samples!\n");
901 return NULL;
902}
903
904
/*
 * Let an argument formatter override how this syscall's return value is
 * printed (e.g. fcntl may return an fd, file flags, etc).
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
912
913#define TRACE_PFMAJ (1 << 0)
914#define TRACE_PFMIN (1 << 1)
915
916static const size_t trace__entry_str_size = 2048;
917
918static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
919{
920 struct thread_trace *ttrace = thread__priv(thread);
921
922 if (fd > ttrace->paths.max) {
923 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
924
925 if (npath == NULL)
926 return -1;
927
928 if (ttrace->paths.max != -1) {
929 memset(npath + ttrace->paths.max + 1, 0,
930 (fd - ttrace->paths.max) * sizeof(char *));
931 } else {
932 memset(npath, 0, (fd + 1) * sizeof(char *));
933 }
934
935 ttrace->paths.table = npath;
936 ttrace->paths.max = fd;
937 }
938
939 ttrace->paths.table[fd] = strdup(pathname);
940
941 return ttrace->paths.table[fd] != NULL ? 0 : -1;
942}
943
/*
 * Resolve 'fd' to a path by reading the /proc/<pid>(/task/<tid>)/fd/<fd>
 * symlink, then cache it via trace__set_fd_pathname().
 * Returns 0 on success, -1 on any failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	/* for the thread group leader the short /proc/<pid>/fd form suffices */
	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* st.st_size is the link target length; make sure it fits our buffer */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* NOTE(review): the link may change between lstat() and readlink();
	 * ret > st.st_size then rejects the (now stale) length check. */
	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';	/* readlink() does not NUL-terminate */
	return trace__set_fd_pathname(thread, fd, pathname);
}
969
970static const char *thread__fd_path(struct thread *thread, int fd,
971 struct trace *trace)
972{
973 struct thread_trace *ttrace = thread__priv(thread);
974
975 if (ttrace == NULL)
976 return NULL;
977
978 if (fd < 0)
979 return NULL;
980
981 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
982 if (!trace->live)
983 return NULL;
984 ++trace->stats.proc_getname;
985 if (thread__read_fd_path(thread, fd))
986 return NULL;
987 }
988
989 return ttrace->paths.table[fd];
990}
991
992size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
993{
994 int fd = arg->val;
995 size_t printed = scnprintf(bf, size, "%d", fd);
996 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
997
998 if (path)
999 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1000
1001 return printed;
1002}
1003
1004size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1005{
1006 size_t printed = scnprintf(bf, size, "%d", fd);
1007 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1008
1009 if (thread) {
1010 const char *path = thread__fd_path(thread, fd, trace);
1011
1012 if (path)
1013 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1014
1015 thread__put(thread);
1016 }
1017
1018 return printed;
1019}
1020
1021static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1022 struct syscall_arg *arg)
1023{
1024 int fd = arg->val;
1025 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1026 struct thread_trace *ttrace = thread__priv(arg->thread);
1027
1028 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1029 zfree(&ttrace->paths.table[fd]);
1030
1031 return printed;
1032}
1033
1034static void thread__set_filename_pos(struct thread *thread, const char *bf,
1035 unsigned long ptr)
1036{
1037 struct thread_trace *ttrace = thread__priv(thread);
1038
1039 ttrace->filename.ptr = ptr;
1040 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1041}
1042
1043static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1044 struct syscall_arg *arg)
1045{
1046 unsigned long ptr = arg->val;
1047
1048 if (!arg->trace->vfs_getname)
1049 return scnprintf(bf, size, "%#x", ptr);
1050
1051 thread__set_filename_pos(arg->thread, bf, ptr);
1052 return 0;
1053}
1054
1055static bool trace__filter_duration(struct trace *trace, double t)
1056{
1057 return t < (trace->duration_filter * NSEC_PER_MSEC);
1058}
1059
1060static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061{
1062 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1063
1064 return fprintf(fp, "%10.3f ", ts);
1065}
1066
1067/*
1068 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1069 * using ttrace->entry_time for a thread that receives a sys_exit without
1070 * first having received a sys_enter ("poll" issued before tracing session
1071 * starts, lost sys_enter exit due to ring buffer overflow).
1072 */
1073static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1074{
1075 if (tstamp > 0)
1076 return __trace__fprintf_tstamp(trace, tstamp, fp);
1077
1078 return fprintf(fp, " ? ");
1079}
1080
/*
 * Written from sig_handler() and read by the main loop. volatile
 * sig_atomic_t is the only object type the C standard guarantees may be
 * safely written from a signal handler and observed afterwards; a plain
 * bool is a (theoretical) data race. Reads like "!done" keep working.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1089
1090static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1091 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1092{
1093 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1094 printed += fprintf_duration(duration, duration_calculated, fp);
1095
1096 if (trace->multiple_threads) {
1097 if (trace->show_comm)
1098 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1099 printed += fprintf(fp, "%d ", thread->tid);
1100 }
1101
1102 return printed;
1103}
1104
1105static int trace__process_event(struct trace *trace, struct machine *machine,
1106 union perf_event *event, struct perf_sample *sample)
1107{
1108 int ret = 0;
1109
1110 switch (event->header.type) {
1111 case PERF_RECORD_LOST:
1112 color_fprintf(trace->output, PERF_COLOR_RED,
1113 "LOST %" PRIu64 " events!\n", event->lost.lost);
1114 ret = machine__process_lost_event(machine, event, sample);
1115 break;
1116 default:
1117 ret = machine__process_event(machine, event, sample);
1118 break;
1119 }
1120
1121 return ret;
1122}
1123
1124static int trace__tool_process(struct perf_tool *tool,
1125 union perf_event *event,
1126 struct perf_sample *sample,
1127 struct machine *machine)
1128{
1129 struct trace *trace = container_of(tool, struct trace, tool);
1130 return trace__process_event(trace, machine, event, sample);
1131}
1132
1133static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1134{
1135 struct machine *machine = vmachine;
1136
1137 if (machine->kptr_restrict_warned)
1138 return NULL;
1139
1140 if (symbol_conf.kptr_restrict) {
1141 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1142 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1143 "Kernel samples will not be resolved.\n");
1144 machine->kptr_restrict_warned = true;
1145 return NULL;
1146 }
1147
1148 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1149}
1150
1151static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1152{
1153 int err = symbol__init(NULL);
1154
1155 if (err)
1156 return err;
1157
1158 trace->host = machine__new_host();
1159 if (trace->host == NULL)
1160 return -ENOMEM;
1161
1162 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1163 if (err < 0)
1164 goto out;
1165
1166 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1167 evlist->threads, trace__tool_process, false,
1168 trace->opts.proc_map_timeout, 1);
1169out:
1170 if (err)
1171 symbol__exit();
1172
1173 return err;
1174}
1175
/*
 * Undo trace__symbols_init(): drop the host machine object and shut
 * down the symbol subsystem.
 */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1183
1184static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1185{
1186 int idx;
1187
1188 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1189 nr_args = sc->fmt->nr_args;
1190
1191 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1192 if (sc->arg_fmt == NULL)
1193 return -1;
1194
1195 for (idx = 0; idx < nr_args; ++idx) {
1196 if (sc->fmt)
1197 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1198 }
1199
1200 sc->nr_args = nr_args;
1201 return 0;
1202}
1203
/*
 * Walk the tracepoint format fields of a syscall and pick a default
 * pretty-printer for each argument that syscall_fmts[] did not already
 * assign one. The heuristics are order-sensitive: path-like
 * "const char *" names first, then raw pointers, pid_t, umode_t, and
 * finally integer fields whose name ends in "fd". Always returns 0.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* A formatter hand-picked in syscall_fmts[] always wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
		    (strcmp(field->name, "filename") == 0 ||
		     strcmp(field->name, "path") == 0 ||
		     strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1242
/*
 * Lazily fill trace->syscalls.table[id] with everything needed to
 * pretty-print syscall @id: its name, static format entry, tracepoint
 * format and per-argument formatters, growing the table on demand.
 * Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table to cover @id, zeroing the newly exposed slots. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation (max == -1): clear everything. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are traced under an alias, retry with that. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* No tp_format: assume the worst case of 6 arguments. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1303
/*
 * Turn the -e/--expr strlist of syscall names (exact names or globs)
 * into the id array in trace->ev_qualifier_ids. All unknown names are
 * accumulated into a single "Invalid syscall" error line. The array
 * starts sized at one id per entry and grows in chunks of 8 as glob
 * patterns expand to multiple syscalls.
 * Returns 0 on success, -EINVAL/-ENOMEM on failure (ids freed).
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name, try it as a glob pattern. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/*
			 * Completely unknown: accumulate into one error
			 * line; on error the (bogus) id stored below is
			 * harmless, the whole array is freed at out_free.
			 */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Glob matched once: collect every further match too. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1379
1380/*
1381 * args is to be interpreted as a series of longs but we need to handle
1382 * 8-byte unaligned accesses. args points to raw_data within the event
1383 * and raw_data is guaranteed to be 8-byte unaligned because it is
1384 * preceded by raw_size which is a u32. So we need to copy args to a temp
1385 * variable to read it. Most notably this avoids extended load instructions
1386 * on unaligned addresses
1387 */
1388unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1389{
1390 unsigned long val;
1391 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1392
1393 memcpy(&val, p, sizeof(val));
1394 return val;
1395}
1396
1397static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1398 struct syscall_arg *arg)
1399{
1400 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1401 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1402
1403 return scnprintf(bf, size, "arg%d: ", arg->idx);
1404}
1405
1406static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1407 struct syscall_arg *arg, unsigned long val)
1408{
1409 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1410 arg->val = val;
1411 if (sc->arg_fmt[arg->idx].parm)
1412 arg->parm = sc->arg_fmt[arg->idx].parm;
1413 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1414 }
1415 return scnprintf(bf, size, "%ld", val);
1416}
1417
/*
 * Format all arguments of @sc into @bf. With a readable tracepoint
 * format (sc->args != NULL, the common case) each named field is
 * printed, skipping args already consumed via arg.mask and zero values
 * with no string representation; without one, raw "argN: val" pairs
 * are printed for the assumed sc->nr_args arguments.
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* advances in lockstep with arg.idx for arg.mask tests */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1490
1491typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1492 union perf_event *event,
1493 struct perf_sample *sample);
1494
/*
 * Look up (lazily loading, via trace__read_syscall_info()) the syscall
 * descriptor for @id. Returns NULL for invalid ids (seen as -1 from the
 * raw_syscalls tracepoints, see comment below) and when the syscall
 * info cannot be read, printing a diagnostic in the latter case.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: the slot must really be populated after the load above. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1537
1538static void thread__update_stats(struct thread_trace *ttrace,
1539 int id, struct perf_sample *sample)
1540{
1541 struct int_node *inode;
1542 struct stats *stats;
1543 u64 duration = 0;
1544
1545 inode = intlist__findnew(ttrace->syscall_stats, id);
1546 if (inode == NULL)
1547 return;
1548
1549 stats = inode->priv;
1550 if (stats == NULL) {
1551 stats = malloc(sizeof(struct stats));
1552 if (stats == NULL)
1553 return;
1554 init_stats(stats);
1555 inode->priv = stats;
1556 }
1557
1558 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1559 duration = sample->time - ttrace->entry_time;
1560
1561 update_stats(stats, duration);
1562}
1563
1564static int trace__printf_interrupted_entry(struct trace *trace)
1565{
1566 struct thread_trace *ttrace;
1567 size_t printed;
1568
1569 if (trace->failure_only || trace->current == NULL)
1570 return 0;
1571
1572 ttrace = thread__priv(trace->current);
1573
1574 if (!ttrace->entry_pending)
1575 return 0;
1576
1577 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1578 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1579 ttrace->entry_pending = false;
1580
1581 return printed;
1582}
1583
1584static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1585 struct perf_sample *sample, struct thread *thread)
1586{
1587 int printed = 0;
1588
1589 if (trace->print_sample) {
1590 double ts = (double)sample->time / NSEC_PER_MSEC;
1591
1592 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1593 perf_evsel__name(evsel), ts,
1594 thread__comm_str(thread),
1595 sample->pid, sample->tid, sample->cpu);
1596 }
1597
1598 return printed;
1599}
1600
/*
 * raw_syscalls:sys_enter handler: format the syscall name and arguments
 * into the per-thread entry_str buffer. For most syscalls the printing
 * is deferred to trace__sys_exit() so duration and return value land on
 * the same line; syscalls that never return (exit, exit_group) are
 * printed right away. Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the line buffer reused for every entry. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* This syscall never returns: print the full line now. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1661
1662static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1663 struct perf_sample *sample,
1664 struct callchain_cursor *cursor)
1665{
1666 struct addr_location al;
1667 int max_stack = evsel->attr.sample_max_stack ?
1668 evsel->attr.sample_max_stack :
1669 trace->max_stack;
1670
1671 if (machine__resolve(trace->host, &al, sample) < 0 ||
1672 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1673 return -1;
1674
1675 return 0;
1676}
1677
1678static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1679{
1680 /* TODO: user-configurable print_opts */
1681 const unsigned int print_opts = EVSEL__PRINT_SYM |
1682 EVSEL__PRINT_DSO |
1683 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1684
1685 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1686}
1687
/*
 * Translate an errno value into its symbolic name ("ENOENT", ...) using
 * the architecture the events were recorded on, not the host's.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);

	return arch_syscalls__strerrno(perf_env__arch(env), err);
}
1695
/*
 * raw_syscalls:sys_exit handler: compute the syscall duration, complete
 * the line started by trace__sys_enter() (or print a "continued" stub
 * when that entry was flushed or lost) and pretty-print the return
 * value: errno name for failures, plus per-syscall hexret / errpid /
 * timeout / one-shot custom formatting.
 * Returns 0 on success, -1 on failure.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* A successful open-like syscall: cache fd -> pathname. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below --min-stack: suppress this event entirely. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The matching sys_enter line was already flushed. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value formatting. Note the goto weave: syscalls without
	 * a fmt entry share the errno/signed paths via the labels below.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter set up by an arg beautifier (fcntl). */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (wait4, ...): decorate with its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1811
/*
 * probe:vfs_getname handler: capture the pathname the kernel resolved
 * for an in-flight open-like syscall. The name is both cached (so the
 * resulting fd can be annotated at sys_exit) and spliced into the
 * pending entry_str at the position recorded by
 * syscall_arg__scnprintf_filename(), truncating its head when it does
 * not fit. Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread name cache if this path is the longest yet. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No position recorded: nothing to splice into the entry line. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the path, the most informative part. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1872
1873static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1874 union perf_event *event __maybe_unused,
1875 struct perf_sample *sample)
1876{
1877 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1878 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1879 struct thread *thread = machine__findnew_thread(trace->host,
1880 sample->pid,
1881 sample->tid);
1882 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1883
1884 if (ttrace == NULL)
1885 goto out_dump;
1886
1887 ttrace->runtime_ms += runtime_ms;
1888 trace->runtime_ms += runtime_ms;
1889out_put:
1890 thread__put(thread);
1891 return 0;
1892
1893out_dump:
1894 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1895 evsel->name,
1896 perf_evsel__strval(evsel, sample, "comm"),
1897 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1898 runtime,
1899 perf_evsel__intval(evsel, sample, "vruntime"));
1900 goto out_put;
1901}
1902
1903static int bpf_output__printer(enum binary_printer_ops op,
1904 unsigned int val, void *extra __maybe_unused, FILE *fp)
1905{
1906 unsigned char ch = (unsigned char)val;
1907
1908 switch (op) {
1909 case BINARY_PRINT_CHAR_DATA:
1910 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911 case BINARY_PRINT_DATA_BEGIN:
1912 case BINARY_PRINT_LINE_BEGIN:
1913 case BINARY_PRINT_ADDR:
1914 case BINARY_PRINT_NUM_DATA:
1915 case BINARY_PRINT_NUM_PAD:
1916 case BINARY_PRINT_SEP:
1917 case BINARY_PRINT_CHAR_PAD:
1918 case BINARY_PRINT_LINE_END:
1919 case BINARY_PRINT_DATA_END:
1920 default:
1921 break;
1922 }
1923
1924 return 0;
1925}
1926
/*
 * Hex+ASCII dump of a BPF output event's raw payload, 8 bytes per line.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
1933
1934static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1935 union perf_event *event __maybe_unused,
1936 struct perf_sample *sample)
1937{
1938 int callchain_ret = 0;
1939
1940 if (sample->callchain) {
1941 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1942 if (callchain_ret == 0) {
1943 if (callchain_cursor.nr < trace->min_stack)
1944 goto out;
1945 callchain_ret = 1;
1946 }
1947 }
1948
1949 trace__printf_interrupted_entry(trace);
1950 trace__fprintf_tstamp(trace, sample->time, trace->output);
1951
1952 if (trace->trace_syscalls)
1953 fprintf(trace->output, "( ): ");
1954
1955 fprintf(trace->output, "%s:", evsel->name);
1956
1957 if (perf_evsel__is_bpf_output(evsel)) {
1958 bpf_output__fprintf(trace, sample);
1959 } else if (evsel->tp_format) {
1960 event_format__fprintf(evsel->tp_format, sample->cpu,
1961 sample->raw_data, sample->raw_size,
1962 trace->output);
1963 }
1964
1965 fprintf(trace->output, "\n");
1966
1967 if (callchain_ret > 0)
1968 trace__fprintf_callchain(trace, sample);
1969 else if (callchain_ret < 0)
1970 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1971out:
1972 return 0;
1973}
1974
1975static void print_location(FILE *f, struct perf_sample *sample,
1976 struct addr_location *al,
1977 bool print_dso, bool print_sym)
1978{
1979
1980 if ((verbose > 0 || print_dso) && al->map)
1981 fprintf(f, "%s@", al->map->dso->long_name);
1982
1983 if ((verbose > 0 || print_sym) && al->sym)
1984 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985 al->addr - al->sym->start);
1986 else if (al->map)
1987 fprintf(f, "0x%" PRIx64, al->addr);
1988 else
1989 fprintf(f, "0x%" PRIx64, sample->addr);
1990}
1991
/*
 * Page fault (major/minor software event) handler: count the fault in
 * the per-thread stats and, unless summary-only, print the faulting IP
 * and target address, both resolved to symbols when possible. The
 * target is looked up first as data ('d'), then as executable ('x'),
 * else marked '?'. Returns 0 on success, -1 on failure.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below --min-stack: suppress, but count nothing either. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
				   sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not in a data map: retry as a code address. */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2067
2068static void trace__set_base_time(struct trace *trace,
2069 struct perf_evsel *evsel,
2070 struct perf_sample *sample)
2071{
2072 /*
2073 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2074 * and don't use sample->time unconditionally, we may end up having
2075 * some other event in the future without PERF_SAMPLE_TIME for good
2076 * reason, i.e. we may not be interested in its timestamps, just in
2077 * it taking place, picking some piece of information when it
2078 * appears in our event stream (vfs_getname comes to mind).
2079 */
2080 if (trace->base_time == 0 && !trace->full_time &&
2081 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2082 trace->base_time = sample->time;
2083}
2084
2085static int trace__process_sample(struct perf_tool *tool,
2086 union perf_event *event,
2087 struct perf_sample *sample,
2088 struct perf_evsel *evsel,
2089 struct machine *machine __maybe_unused)
2090{
2091 struct trace *trace = container_of(tool, struct trace, tool);
2092 struct thread *thread;
2093 int err = 0;
2094
2095 tracepoint_handler handler = evsel->handler;
2096
2097 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098 if (thread && thread__is_filtered(thread))
2099 goto out;
2100
2101 trace__set_base_time(trace, evsel, sample);
2102
2103 if (handler) {
2104 ++trace->nr_events;
2105 handler(trace, evsel, event, sample);
2106 }
2107out:
2108 thread__put(thread);
2109 return err;
2110}
2111
2112static int trace__record(struct trace *trace, int argc, const char **argv)
2113{
2114 unsigned int rec_argc, i, j;
2115 const char **rec_argv;
2116 const char * const record_args[] = {
2117 "record",
2118 "-R",
2119 "-m", "1024",
2120 "-c", "1",
2121 };
2122
2123 const char * const sc_args[] = { "-e", };
2124 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2125 const char * const majpf_args[] = { "-e", "major-faults" };
2126 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2127 const char * const minpf_args[] = { "-e", "minor-faults" };
2128 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2129
2130 /* +1 is for the event string below */
2131 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2132 majpf_args_nr + minpf_args_nr + argc;
2133 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2134
2135 if (rec_argv == NULL)
2136 return -ENOMEM;
2137
2138 j = 0;
2139 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2140 rec_argv[j++] = record_args[i];
2141
2142 if (trace->trace_syscalls) {
2143 for (i = 0; i < sc_args_nr; i++)
2144 rec_argv[j++] = sc_args[i];
2145
2146 /* event string may be different for older kernels - e.g., RHEL6 */
2147 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2148 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2149 else if (is_valid_tracepoint("syscalls:sys_enter"))
2150 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2151 else {
2152 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2153 free(rec_argv);
2154 return -1;
2155 }
2156 }
2157
2158 if (trace->trace_pgfaults & TRACE_PFMAJ)
2159 for (i = 0; i < majpf_args_nr; i++)
2160 rec_argv[j++] = majpf_args[i];
2161
2162 if (trace->trace_pgfaults & TRACE_PFMIN)
2163 for (i = 0; i < minpf_args_nr; i++)
2164 rec_argv[j++] = minpf_args[i];
2165
2166 for (i = 0; i < (unsigned int)argc; i++)
2167 rec_argv[j++] = argv[i];
2168
2169 return cmd_record(j, rec_argv);
2170}
2171
2172static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2173
2174static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2175{
2176 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2177
2178 if (IS_ERR(evsel))
2179 return false;
2180
2181 if (perf_evsel__field(evsel, "pathname") == NULL) {
2182 perf_evsel__delete(evsel);
2183 return false;
2184 }
2185
2186 evsel->handler = trace__vfs_getname;
2187 perf_evlist__add(evlist, evsel);
2188 return true;
2189}
2190
2191static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2192{
2193 struct perf_evsel *evsel;
2194 struct perf_event_attr attr = {
2195 .type = PERF_TYPE_SOFTWARE,
2196 .mmap_data = 1,
2197 };
2198
2199 attr.config = config;
2200 attr.sample_period = 1;
2201
2202 event_attr_init(&attr);
2203
2204 evsel = perf_evsel__new(&attr);
2205 if (evsel)
2206 evsel->handler = trace__pgfault;
2207
2208 return evsel;
2209}
2210
2211static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2212{
2213 const u32 type = event->header.type;
2214 struct perf_evsel *evsel;
2215
2216 if (type != PERF_RECORD_SAMPLE) {
2217 trace__process_event(trace, trace->host, event, sample);
2218 return;
2219 }
2220
2221 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2222 if (evsel == NULL) {
2223 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2224 return;
2225 }
2226
2227 trace__set_base_time(trace, evsel, sample);
2228
2229 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2230 sample->raw_data == NULL) {
2231 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2232 perf_evsel__name(evsel), sample->tid,
2233 sample->cpu, sample->raw_size);
2234 } else {
2235 tracepoint_handler handler = evsel->handler;
2236 handler(trace, evsel, event, sample);
2237 }
2238}
2239
2240static int trace__add_syscall_newtp(struct trace *trace)
2241{
2242 int ret = -1;
2243 struct perf_evlist *evlist = trace->evlist;
2244 struct perf_evsel *sys_enter, *sys_exit;
2245
2246 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2247 if (sys_enter == NULL)
2248 goto out;
2249
2250 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2251 goto out_delete_sys_enter;
2252
2253 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2254 if (sys_exit == NULL)
2255 goto out_delete_sys_enter;
2256
2257 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2258 goto out_delete_sys_exit;
2259
2260 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2261 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2262
2263 perf_evlist__add(evlist, sys_enter);
2264 perf_evlist__add(evlist, sys_exit);
2265
2266 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2267 /*
2268 * We're interested only in the user space callchain
2269 * leading to the syscall, allow overriding that for
2270 * debugging reasons using --kernel_syscall_callchains
2271 */
2272 sys_exit->attr.exclude_callchain_kernel = 1;
2273 }
2274
2275 trace->syscalls.events.sys_enter = sys_enter;
2276 trace->syscalls.events.sys_exit = sys_exit;
2277
2278 ret = 0;
2279out:
2280 return ret;
2281
2282out_delete_sys_exit:
2283 perf_evsel__delete_priv(sys_exit);
2284out_delete_sys_enter:
2285 perf_evsel__delete_priv(sys_enter);
2286 goto out;
2287}
2288
2289static int trace__set_ev_qualifier_filter(struct trace *trace)
2290{
2291 int err = -1;
2292 struct perf_evsel *sys_exit;
2293 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2294 trace->ev_qualifier_ids.nr,
2295 trace->ev_qualifier_ids.entries);
2296
2297 if (filter == NULL)
2298 goto out_enomem;
2299
2300 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2301 filter)) {
2302 sys_exit = trace->syscalls.events.sys_exit;
2303 err = perf_evsel__append_tp_filter(sys_exit, filter);
2304 }
2305
2306 free(filter);
2307out:
2308 return err;
2309out_enomem:
2310 errno = ENOMEM;
2311 goto out;
2312}
2313
2314static int trace__set_filter_loop_pids(struct trace *trace)
2315{
2316 unsigned int nr = 1;
2317 pid_t pids[32] = {
2318 getpid(),
2319 };
2320 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2321
2322 while (thread && nr < ARRAY_SIZE(pids)) {
2323 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2324
2325 if (parent == NULL)
2326 break;
2327
2328 if (!strcmp(thread__comm_str(parent), "sshd")) {
2329 pids[nr++] = parent->tid;
2330 break;
2331 }
2332 thread = parent;
2333 }
2334
2335 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2336}
2337
/*
 * Main live-tracing loop: create the requested events (syscall tracepoints,
 * page faults, vfs_getname, sched_stat_runtime), apply cgroups/filters,
 * fork or attach to the target, then drain the mmap'ed ring buffers until
 * interrupted or the workload finishes, optionally printing per-thread
 * summaries. Returns 0 on success, negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* extra argv means "run this workload" */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		/* Workload is forked stopped; started later after enabling events. */
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	/* With --delay, events are enabled only after the workload starts. */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
/* Drain every per-cpu ring buffer; loop for as long as new events arrive. */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* On SIGCHLD/SIGINT, stop producing but finish consuming. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		/* Nothing new: poll (bounded when 'done') before retrying. */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The block below exists only to give the error labels a shared errbuf;
 * control never falls into it, it is reached exclusively via goto.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2614
/*
 * Replay mode ('perf trace -i perf.data'): wire up the perf_tool callbacks,
 * open the recorded session and push its events through the same syscall /
 * page-fault handlers used live. Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Any software page-fault event in the recording gets our handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2715
/* Print the banner that precedes the per-thread summary table. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2724
/*
 * Resorted rb_tree of per-syscall statistics ordered by total time in
 * msecs, descending. The body below fills one resort entry from an
 * int_node of the per-thread syscall_stats intlist, whose priv holds the
 * accumulated struct stats for that syscall id (may be NULL).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	/* total = n samples * average duration, converted ns -> ms */
	entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2738
/*
 * Print one thread's per-syscall statistics table (calls, total, min, avg,
 * max, stddev), syscalls sorted by total time via the syscall_stats resort
 * rb_tree defined above. Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* declares and fills 'syscall_stats' + the 'syscall_stats_entry' cursor */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are accumulated in ns; report in msec */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2781
2782static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2783{
2784 size_t printed = 0;
2785 struct thread_trace *ttrace = thread__priv(thread);
2786 double ratio;
2787
2788 if (ttrace == NULL)
2789 return 0;
2790
2791 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2792
2793 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2794 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2795 printed += fprintf(fp, "%.1f%%", ratio);
2796 if (ttrace->pfmaj)
2797 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2798 if (ttrace->pfmin)
2799 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2800 if (trace->sched)
2801 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2802 else if (fputc('\n', fp) != EOF)
2803 ++printed;
2804
2805 printed += thread__dump_stats(ttrace, trace, fp);
2806
2807 return printed;
2808}
2809
2810static unsigned long thread__nr_events(struct thread_trace *ttrace)
2811{
2812 return ttrace ? ttrace->nr_events : 0;
2813}
2814
/*
 * Resorted rb_tree of threads ordered by event count, ascending, so the
 * busiest threads are printed last by trace__fprintf_thread_summary().
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2821
/*
 * Print the end-of-run per-thread summary: walk every bucket of the
 * machine's threads hash table, resort each bucket by event count using
 * the 'threads' resort rb_tree above, and print each thread's summary.
 * Returns characters printed, or 0 if sorting a bucket failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		/* declares and fills 'threads' + the 'threads_entry' cursor */
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2843
2844static int trace__set_duration(const struct option *opt, const char *str,
2845 int unset __maybe_unused)
2846{
2847 struct trace *trace = opt->value;
2848
2849 trace->duration_filter = atof(str);
2850 return 0;
2851}
2852
2853static int trace__set_filter_pids(const struct option *opt, const char *str,
2854 int unset __maybe_unused)
2855{
2856 int ret = -1;
2857 size_t i;
2858 struct trace *trace = opt->value;
2859 /*
2860 * FIXME: introduce a intarray class, plain parse csv and create a
2861 * { int nr, int entries[] } struct...
2862 */
2863 struct intlist *list = intlist__new(str);
2864
2865 if (list == NULL)
2866 return -1;
2867
2868 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2869 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2870
2871 if (trace->filter_pids.entries == NULL)
2872 goto out;
2873
2874 trace->filter_pids.entries[0] = getpid();
2875
2876 for (i = 1; i < trace->filter_pids.nr; ++i)
2877 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2878
2879 intlist__delete(list);
2880 ret = 0;
2881out:
2882 return ret;
2883}
2884
2885static int trace__open_output(struct trace *trace, const char *filename)
2886{
2887 struct stat st;
2888
2889 if (!stat(filename, &st) && st.st_size) {
2890 char oldname[PATH_MAX];
2891
2892 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2893 unlink(oldname);
2894 rename(filename, oldname);
2895 }
2896
2897 trace->output = fopen(filename, "w");
2898
2899 return trace->output == NULL ? -errno : 0;
2900}
2901
2902static int parse_pagefaults(const struct option *opt, const char *str,
2903 int unset __maybe_unused)
2904{
2905 int *trace_pgfaults = opt->value;
2906
2907 if (strcmp(str, "all") == 0)
2908 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2909 else if (strcmp(str, "maj") == 0)
2910 *trace_pgfaults |= TRACE_PFMAJ;
2911 else if (strcmp(str, "min") == 0)
2912 *trace_pgfaults |= TRACE_PFMIN;
2913 else
2914 return -1;
2915
2916 return 0;
2917}
2918
/* Attach the same sample handler to every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2926
2927/*
2928 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2929 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2930 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2931 *
2932 * It'd be better to introduce a parse_options() variant that would return a
2933 * list with the terms it didn't match to an event...
2934 */
/*
 * -e/--event callback: split the comma-separated list into two strings,
 * lists[1] for syscall names/globs and strace group files (the event
 * qualifier) and lists[0] for everything else, which is re-fed to the
 * stock parse_events_option(). A leading '!' negates the qualifier.
 * Returns 0 on success, -1 on failure.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	/* len bounds each malloc below: no term is longer than the whole str */
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/*
		 * NOTE(review): temporarily NUL-terminates the current term
		 * in place, writing through the const 'str' — assumes the
		 * option string is writable (argv is), restored before
		 * returning.
		 */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		/* list 1 = syscalls/strace groups, list 0 = other events */
		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
	}

	err = 0;

	if (lists[0]) {
		/* hand the non-syscall terms to the regular -e parser */
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* undo the last in-place split of 'str', if any */
	if (sep)
		*sep = ',';

	return err;
}
3012
3013static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3014{
3015 struct trace *trace = opt->value;
3016
3017 if (!list_empty(&trace->evlist->entries))
3018 return parse_cgroups(opt, str, unset);
3019
3020 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3021
3022 return 0;
3023}
3024
3025int cmd_trace(int argc, const char **argv)
3026{
3027 const char *trace_usage[] = {
3028 "perf trace [<options>] [<command>]",
3029 "perf trace [<options>] -- <command> [<options>]",
3030 "perf trace record [<options>] [<command>]",
3031 "perf trace record [<options>] -- <command> [<options>]",
3032 NULL
3033 };
3034 struct trace trace = {
3035 .syscalls = {
3036 . max = -1,
3037 },
3038 .opts = {
3039 .target = {
3040 .uid = UINT_MAX,
3041 .uses_mmap = true,
3042 },
3043 .user_freq = UINT_MAX,
3044 .user_interval = ULLONG_MAX,
3045 .no_buffering = true,
3046 .mmap_pages = UINT_MAX,
3047 .proc_map_timeout = 500,
3048 },
3049 .output = stderr,
3050 .show_comm = true,
3051 .trace_syscalls = true,
3052 .kernel_syscallchains = false,
3053 .max_stack = UINT_MAX,
3054 };
3055 const char *output_name = NULL;
3056 const struct option trace_options[] = {
3057 OPT_CALLBACK('e', "event", &trace, "event",
3058 "event/syscall selector. use 'perf list' to list available events",
3059 trace__parse_events_option),
3060 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3061 "show the thread COMM next to its id"),
3062 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3063 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3064 trace__parse_events_option),
3065 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3066 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3067 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3068 "trace events on existing process id"),
3069 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3070 "trace events on existing thread id"),
3071 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3072 "pids to filter (by the kernel)", trace__set_filter_pids),
3073 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3074 "system-wide collection from all CPUs"),
3075 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3076 "list of cpus to monitor"),
3077 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3078 "child tasks do not inherit counters"),
3079 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3080 "number of mmap data pages",
3081 perf_evlist__parse_mmap_pages),
3082 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3083 "user to profile"),
3084 OPT_CALLBACK(0, "duration", &trace, "float",
3085 "show only events with duration > N.M ms",
3086 trace__set_duration),
3087 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3088 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3089 OPT_BOOLEAN('T', "time", &trace.full_time,
3090 "Show full timestamp, not time relative to first start"),
3091 OPT_BOOLEAN(0, "failure", &trace.failure_only,
3092 "Show only syscalls that failed"),
3093 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3094 "Show only syscall summary with statistics"),
3095 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3096 "Show all syscalls and summary with statistics"),
3097 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3098 "Trace pagefaults", parse_pagefaults, "maj"),
3099 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3100 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3101 OPT_CALLBACK(0, "call-graph", &trace.opts,
3102 "record_mode[,record_size]", record_callchain_help,
3103 &record_parse_callchain_opt),
3104 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3105 "Show the kernel callchains on the syscall exit path"),
3106 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3107 "Set the minimum stack depth when parsing the callchain, "
3108 "anything below the specified depth will be ignored."),
3109 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3110 "Set the maximum stack depth when parsing the callchain, "
3111 "anything beyond the specified depth will be ignored. "
3112 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3113 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3114 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3115 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3116 "per thread proc mmap processing timeout in ms"),
3117 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3118 trace__parse_cgroups),
3119 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3120 "ms to wait before starting measurement after program "
3121 "start"),
3122 OPT_END()
3123 };
3124 bool __maybe_unused max_stack_user_set = true;
3125 bool mmap_pages_user_set = true;
3126 const char * const trace_subcommands[] = { "record", NULL };
3127 int err;
3128 char bf[BUFSIZ];
3129
3130 signal(SIGSEGV, sighandler_dump_stack);
3131 signal(SIGFPE, sighandler_dump_stack);
3132
3133 trace.evlist = perf_evlist__new();
3134 trace.sctbl = syscalltbl__new();
3135
3136 if (trace.evlist == NULL || trace.sctbl == NULL) {
3137 pr_err("Not enough memory to run!\n");
3138 err = -ENOMEM;
3139 goto out;
3140 }
3141
3142 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3143 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3144
3145 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3146 usage_with_options_msg(trace_usage, trace_options,
3147 "cgroup monitoring only available in system-wide mode");
3148 }
3149
3150 err = bpf__setup_stdout(trace.evlist);
3151 if (err) {
3152 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3153 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3154 goto out;
3155 }
3156
3157 err = -1;
3158
3159 if (trace.trace_pgfaults) {
3160 trace.opts.sample_address = true;
3161 trace.opts.sample_time = true;
3162 }
3163
3164 if (trace.opts.mmap_pages == UINT_MAX)
3165 mmap_pages_user_set = false;
3166
3167 if (trace.max_stack == UINT_MAX) {
3168 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3169 max_stack_user_set = false;
3170 }
3171
3172#ifdef HAVE_DWARF_UNWIND_SUPPORT
3173 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3174 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3175 }
3176#endif
3177
3178 if (callchain_param.enabled) {
3179 if (!mmap_pages_user_set && geteuid() == 0)
3180 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3181
3182 symbol_conf.use_callchain = true;
3183 }
3184
3185 if (trace.evlist->nr_entries > 0)
3186 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3187
3188 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3189 return trace__record(&trace, argc-1, &argv[1]);
3190
3191 /* summary_only implies summary option, but don't overwrite summary if set */
3192 if (trace.summary_only)
3193 trace.summary = trace.summary_only;
3194
3195 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3196 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3197 pr_err("Please specify something to trace.\n");
3198 return -1;
3199 }
3200
3201 if (!trace.trace_syscalls && trace.ev_qualifier) {
3202 pr_err("The -e option can't be used with --no-syscalls.\n");
3203 goto out;
3204 }
3205
3206 if (output_name != NULL) {
3207 err = trace__open_output(&trace, output_name);
3208 if (err < 0) {
3209 perror("failed to create output file");
3210 goto out;
3211 }
3212 }
3213
3214 trace.open_id = syscalltbl__id(trace.sctbl, "open");
3215
3216 err = target__validate(&trace.opts.target);
3217 if (err) {
3218 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3219 fprintf(trace.output, "%s", bf);
3220 goto out_close;
3221 }
3222
3223 err = target__parse_uid(&trace.opts.target);
3224 if (err) {
3225 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3226 fprintf(trace.output, "%s", bf);
3227 goto out_close;
3228 }
3229
3230 if (!argc && target__none(&trace.opts.target))
3231 trace.opts.target.system_wide = true;
3232
3233 if (input_name)
3234 err = trace__replay(&trace);
3235 else
3236 err = trace__run(&trace, argc, argv);
3237
3238out_close:
3239 if (output_name != NULL)
3240 fclose(trace.output);
3241out:
3242 return err;
3243}
1/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 */
16
17#include "util/record.h"
18#include <api/fs/tracing_path.h>
19#ifdef HAVE_LIBBPF_SUPPORT
20#include <bpf/bpf.h>
21#include <bpf/libbpf.h>
22#include <bpf/btf.h>
23#ifdef HAVE_BPF_SKEL
24#include "bpf_skel/augmented_raw_syscalls.skel.h"
25#endif
26#endif
27#include "util/bpf_map.h"
28#include "util/rlimit.h"
29#include "builtin.h"
30#include "util/cgroup.h"
31#include "util/color.h"
32#include "util/config.h"
33#include "util/debug.h"
34#include "util/dso.h"
35#include "util/env.h"
36#include "util/event.h"
37#include "util/evsel.h"
38#include "util/evsel_fprintf.h"
39#include "util/synthetic-events.h"
40#include "util/evlist.h"
41#include "util/evswitch.h"
42#include "util/mmap.h"
43#include <subcmd/pager.h>
44#include <subcmd/exec-cmd.h>
45#include "util/machine.h"
46#include "util/map.h"
47#include "util/symbol.h"
48#include "util/path.h"
49#include "util/session.h"
50#include "util/thread.h"
51#include <subcmd/parse-options.h>
52#include "util/strlist.h"
53#include "util/intlist.h"
54#include "util/thread_map.h"
55#include "util/stat.h"
56#include "util/tool.h"
57#include "util/util.h"
58#include "trace/beauty/beauty.h"
59#include "trace-event.h"
60#include "util/parse-events.h"
61#include "util/tracepoint.h"
62#include "callchain.h"
63#include "print_binary.h"
64#include "string2.h"
65#include "syscalltbl.h"
66#include "rb_resort.h"
67#include "../perf.h"
68#include "trace_augment.h"
69
70#include <errno.h>
71#include <inttypes.h>
72#include <poll.h>
73#include <signal.h>
74#include <stdlib.h>
75#include <string.h>
76#include <linux/err.h>
77#include <linux/filter.h>
78#include <linux/kernel.h>
79#include <linux/list_sort.h>
80#include <linux/random.h>
81#include <linux/stringify.h>
82#include <linux/time64.h>
83#include <linux/zalloc.h>
84#include <fcntl.h>
85#include <sys/sysmacros.h>
86
87#include <linux/ctype.h>
88#include <perf/mmap.h>
89
90#ifdef HAVE_LIBTRACEEVENT
91#include <event-parse.h>
92#endif
93
94#ifndef O_CLOEXEC
95# define O_CLOEXEC 02000000
96#endif
97
98#ifndef F_LINUX_SPECIFIC_BASE
99# define F_LINUX_SPECIFIC_BASE 1024
100#endif
101
102#define RAW_SYSCALL_ARGS_NUM 6
103
104/*
105 * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100
106 *
107 * We have to explicitely mark the direction of the flow of data, if from the
108 * kernel to user space or the other way around, since the BPF collector we
109 * have so far copies only from user to kernel space, mark the arguments that
110 * go that direction, so that we don´t end up collecting the previous contents
111 * for syscall args that goes from kernel to user space.
112 */
/* Per-argument pretty-printing hooks and metadata for one syscall argument. */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); /* value -> string */
	bool	   (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val); /* string -> value */
	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
	void	   *parm;	/* opaque parameter for the hooks, e.g. a strarray */
	const char *name;
	u16	   nr_entries; // for arrays
	bool	   from_user;	/* data flows from user to kernel space (see comment above) */
	bool	   show_zero;	/* print the argument even when its value is zero */
#ifdef HAVE_LIBBPF_SUPPORT
	const struct btf_type *type;	/* cached BTF type, resolved lazily */
	int	   type_id; /* used in btf_dump */
#endif
};
127
/*
 * Static formatting description for one syscall: its (alias) name, optional
 * BPF augmenter program names, and per-argument formatters.
 */
struct syscall_fmt {
	const char *name;
	const char *alias;	/* alternate name to look the syscall up by */
	struct {
		const char *sys_enter,
			   *sys_exit;
	} bpf_prog_name;
	struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM];
	u8	   nr_args;
	bool	   errpid;	/* NOTE(review): presumably "return is a pid, negative is -errno" — confirm at use sites */
	bool	   timeout;
	bool	   hexret;	/* print the return value in hexadecimal */
};
141
/*
 * Global state for one 'perf trace' session: the event list, the syscall
 * table, output/formatting knobs and per-run bookkeeping.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		struct syscall  *table;
		struct {	/* raw_syscalls tracepoints + BPF output event */
			struct evsel *sys_enter,
				     *sys_exit,
				     *bpf_output;
		} events;
	} syscalls;
#ifdef HAVE_BPF_SKEL
	struct augmented_raw_syscalls_bpf *skel;
#endif
#ifdef HAVE_LIBBPF_SUPPORT
	struct btf		*btf;	/* vmlinux BTF, loaded lazily by trace__load_vmlinux_btf() */
#endif
	struct record_opts	opts;
	struct evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;	/* timestamp of the first event, for relative times */
	FILE			*output;
	unsigned long		nr_events;
	unsigned long		nr_events_printed;
	unsigned long		max_events;
	struct evswitch		evswitch;
	struct strlist		*ev_qualifier;	/* the -e syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	} ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
		struct bpf_map  *map;
	} filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;	/* tool stats for filename resolution */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	int			raw_augmented_syscalls_args_size;
	bool			raw_augmented_syscalls;
	bool			fd_path_disabled;
	bool			sort_events;
	bool			not_ev_qualifier;	/* -e was negated (!name) */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			errno_summary;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			libtraceevent_print;
	bool			kernel_syscallchains;
	s16			args_alignment;
	bool			show_tstamp;
	bool			show_duration;
	bool			show_zeros;
	bool			show_arg_names;
	bool			show_string_prefix;
	bool			force;
	bool			vfs_getname;
	bool			force_btf;
	int			trace_pgfaults;
	char			*perfconfig_events;	/* events picked up from perfconfig */
	struct {
		struct ordered_events	data;	/* used when sort_events is on */
		u64			last;
	} oe;
};
223
224static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused)
225{
226#ifdef HAVE_LIBBPF_SUPPORT
227 if (trace->btf != NULL)
228 return;
229
230 trace->btf = btf__load_vmlinux_btf();
231 if (verbose > 0) {
232 fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" :
233 "Failed to load vmlinux BTF\n");
234 }
235#endif
236}
237
/*
 * Accessor for one tracepoint field inside a sample's raw_data: 'offset'
 * locates it, and either the integer reader or the raw-pointer reader is
 * used, depending on how the field was initialized.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
245
/*
 * Generate tp_field__uN() readers that memcpy an N-bit unsigned integer out
 * of the sample's raw data at the field's offset (memcpy sidesteps unaligned
 * access problems).
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/* Same as TP_UINT_FIELD(), but byte-swapping, for cross-endian samples. */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
270
271static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
272{
273 field->offset = offset;
274
275 switch (size) {
276 case 1:
277 field->integer = tp_field__u8;
278 break;
279 case 2:
280 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
281 break;
282 case 4:
283 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
284 break;
285 case 8:
286 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
287 break;
288 default:
289 return -1;
290 }
291
292 return 0;
293}
294
295static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
296{
297 return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
298}
299
/* Raw reader: return the address of the field inside the sample's raw data. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
304
305static int __tp_field__init_ptr(struct tp_field *field, int offset)
306{
307 field->offset = offset;
308 field->pointer = tp_field__ptr;
309 return 0;
310}
311
312static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
313{
314 return __tp_field__init_ptr(field, format_field->offset);
315}
316
/* Field accessors for a syscall tracepoint: id plus either args or ret. */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;	/* sys_enter has args, sys_exit has ret */
	};
};
323
324/*
325 * The evsel->priv as used by 'perf trace'
326 * sc: for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
327 * fmt: for all the other tracepoints
328 */
329struct evsel_trace {
330 struct syscall_tp sc;
331 struct syscall_arg_fmt *fmt;
332};
333
334static struct evsel_trace *evsel_trace__new(void)
335{
336 return zalloc(sizeof(struct evsel_trace));
337}
338
339static void evsel_trace__delete(struct evsel_trace *et)
340{
341 if (et == NULL)
342 return;
343
344 zfree(&et->fmt);
345 free(et);
346}
347
348/*
349 * Used with raw_syscalls:sys_{enter,exit} and with the
350 * syscalls:sys_{enter,exit}_SYSCALL tracepoints
351 */
352static inline struct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
353{
354 struct evsel_trace *et = evsel->priv;
355
356 return &et->sc;
357}
358
359static struct syscall_tp *evsel__syscall_tp(struct evsel *evsel)
360{
361 if (evsel->priv == NULL) {
362 evsel->priv = evsel_trace__new();
363 if (evsel->priv == NULL)
364 return NULL;
365 }
366
367 return __evsel__syscall_tp(evsel);
368}
369
370/*
371 * Used with all the other tracepoints.
372 */
373static inline struct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
374{
375 struct evsel_trace *et = evsel->priv;
376
377 return et->fmt;
378}
379
/*
 * Lazily allocate evsel->priv and its per-field formatter array, sized to
 * the tracepoint's number of fields. On allocation failure the whole priv
 * is torn down and NULL returned.
 */
static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel)
{
	struct evsel_trace *et = evsel->priv;

	if (evsel->priv == NULL) {
		et = evsel->priv = evsel_trace__new();

		if (et == NULL)
			return NULL;
	}

	if (et->fmt == NULL) {
		/* NOTE(review): assumes evsel->tp_format is non-NULL here — confirm callers guarantee that */
		et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt));
		if (et->fmt == NULL)
			goto out_delete;
	}

	return __evsel__syscall_arg_fmt(evsel);

out_delete:
	/* also discards a previously valid ->sc; freeing keeps priv consistent */
	evsel_trace__delete(evsel->priv);
	evsel->priv = NULL;
	return NULL;
}
404
405static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name)
406{
407 struct tep_format_field *format_field = evsel__field(evsel, name);
408
409 if (format_field == NULL)
410 return -1;
411
412 return tp_field__init_uint(field, format_field, evsel->needs_swap);
413}
414
415#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
416 ({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
417 evsel__init_tp_uint_field(evsel, &sc->name, #name); })
418
/* Look up tracepoint field @name and bind a raw-pointer reader to @field. */
static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name)
{
	struct tep_format_field *format_field = evsel__field(evsel, name);

	return format_field ? tp_field__init_ptr(field, format_field) : -1;
}
428
429#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
430 ({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
431 evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
432
/* Free evsel->priv (must happen before the evsel itself), then the evsel. */
static void evsel__delete_priv(struct evsel *evsel)
{
	zfree(&evsel->priv);
	evsel__delete(evsel);
}
438
439static int evsel__init_syscall_tp(struct evsel *evsel)
440{
441 struct syscall_tp *sc = evsel__syscall_tp(evsel);
442
443 if (sc != NULL) {
444 if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
445 evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
446 return -ENOENT;
447
448 return 0;
449 }
450
451 return -ENOMEM;
452}
453
454static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
455{
456 struct syscall_tp *sc = evsel__syscall_tp(evsel);
457
458 if (sc != NULL) {
459 struct tep_format_field *syscall_id = evsel__field(tp, "id");
460 if (syscall_id == NULL)
461 syscall_id = evsel__field(tp, "__syscall_nr");
462 if (syscall_id == NULL ||
463 __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
464 return -EINVAL;
465
466 return 0;
467 }
468
469 return -ENOMEM;
470}
471
/* The args payload sits right after the u64 syscall id in the augmented record. */
static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
{
	struct syscall_tp *sc = __evsel__syscall_tp(evsel);

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}
478
/* The u64 return value sits right after the u64 syscall id in the augmented record. */
static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
{
	struct syscall_tp *sc = __evsel__syscall_tp(evsel);

	return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
}
485
486static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
487{
488 if (evsel__syscall_tp(evsel) != NULL) {
489 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
490 return -ENOENT;
491
492 evsel->handler = handler;
493 return 0;
494 }
495
496 return -ENOMEM;
497}
498
499static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
500{
501 struct evsel *evsel = evsel__newtp("raw_syscalls", direction);
502
503 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
504 if (IS_ERR(evsel))
505 evsel = evsel__newtp("syscalls", direction);
506
507 if (IS_ERR(evsel))
508 return NULL;
509
510 if (evsel__init_raw_syscall_tp(evsel, handler))
511 goto out_delete;
512
513 return evsel;
514
515out_delete:
516 evsel__delete_priv(evsel);
517 return NULL;
518}
519
520#define perf_evsel__sc_tp_uint(evsel, name, sample) \
521 ({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
522 fields->name.integer(&fields->name, sample); })
523
524#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
525 ({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
526 fields->name.pointer(&fields->name, sample); })
527
528size_t strarray__scnprintf_suffix(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_suffix, int val)
529{
530 int idx = val - sa->offset;
531
532 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
533 size_t printed = scnprintf(bf, size, intfmt, val);
534 if (show_suffix)
535 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
536 return printed;
537 }
538
539 return scnprintf(bf, size, "%s%s", sa->entries[idx], show_suffix ? sa->prefix : "");
540}
541
542size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
543{
544 int idx = val - sa->offset;
545
546 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
547 size_t printed = scnprintf(bf, size, intfmt, val);
548 if (show_prefix)
549 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
550 return printed;
551 }
552
553 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
554}
555
/* scnprintf hook: format the arg via the strarray stashed in arg->parm. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
}
562
/* Like the above, with unknown values printed in decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
568
569#define SCA_STRARRAY syscall_arg__scnprintf_strarray
570
/* strtoul hook: resolve a name to a value via the strarray in arg->parm. */
bool syscall_arg__strtoul_strarray(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarray__strtoul(arg->parm, bf, size, ret);
}
575
/* strtoul hook: resolve a '|'-separated flag list via the strarray in arg->parm. */
bool syscall_arg__strtoul_strarray_flags(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarray__strtoul_flags(arg->parm, bf, size, ret);
}
580
/* strtoul hook: resolve a name via the set of strarrays in arg->parm. */
bool syscall_arg__strtoul_strarrays(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarrays__strtoul(arg->parm, bf, size, ret);
}
585
/* scnprintf hook: format a flags bitmask via the strarray in arg->parm. */
size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
{
	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
}
590
591size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
592{
593 size_t printed;
594 int i;
595
596 for (i = 0; i < sas->nr_entries; ++i) {
597 struct strarray *sa = sas->entries[i];
598 int idx = val - sa->offset;
599
600 if (idx >= 0 && idx < sa->nr_entries) {
601 if (sa->entries[idx] == NULL)
602 break;
603 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
604 }
605 }
606
607 printed = scnprintf(bf, size, intfmt, val);
608 if (show_prefix)
609 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
610 return printed;
611}
612
613bool strarray__strtoul(struct strarray *sa, char *bf, size_t size, u64 *ret)
614{
615 int i;
616
617 for (i = 0; i < sa->nr_entries; ++i) {
618 if (sa->entries[i] && strncmp(sa->entries[i], bf, size) == 0 && sa->entries[i][size] == '\0') {
619 *ret = sa->offset + i;
620 return true;
621 }
622 }
623
624 return false;
625}
626
627bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *ret)
628{
629 u64 val = 0;
630 char *tok = bf, *sep, *end;
631
632 *ret = 0;
633
634 while (size != 0) {
635 int toklen = size;
636
637 sep = memchr(tok, '|', size);
638 if (sep != NULL) {
639 size -= sep - tok + 1;
640
641 end = sep - 1;
642 while (end > tok && isspace(*end))
643 --end;
644
645 toklen = end - tok + 1;
646 }
647
648 while (isspace(*tok))
649 ++tok;
650
651 if (isalpha(*tok) || *tok == '_') {
652 if (!strarray__strtoul(sa, tok, toklen, &val))
653 return false;
654 } else
655 val = strtoul(tok, NULL, 0);
656
657 *ret |= (1 << (val - 1));
658
659 if (sep == NULL)
660 break;
661 tok = sep + 1;
662 }
663
664 return true;
665}
666
667bool strarrays__strtoul(struct strarrays *sas, char *bf, size_t size, u64 *ret)
668{
669 int i;
670
671 for (i = 0; i < sas->nr_entries; ++i) {
672 struct strarray *sa = sas->entries[i];
673
674 if (strarray__strtoul(sa, bf, size, ret))
675 return true;
676 }
677
678 return false;
679}
680
/* scnprintf hook: format the arg via the set of strarrays in arg->parm. */
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
}
686
687#ifndef AT_FDCWD
688#define AT_FDCWD -100
689#endif
690
691static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
692 struct syscall_arg *arg)
693{
694 int fd = arg->val;
695 const char *prefix = "AT_FD";
696
697 if (fd == AT_FDCWD)
698 return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
699
700 return syscall_arg__scnprintf_fd(bf, size, arg);
701}
702
703#define SCA_FDAT syscall_arg__scnprintf_fd_at
704
705static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
706 struct syscall_arg *arg);
707
708#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
709
/* Print the raw argument value in hexadecimal. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
714
715size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
716{
717 if (arg->val == 0)
718 return scnprintf(bf, size, "NULL");
719 return syscall_arg__scnprintf_hex(bf, size, arg);
720}
721
/* Print the argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}
726
/* Print the argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
731
/* Print a fixed-size char array argument as a quoted, length-limited string. */
static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
{
	// XXX Hey, maybe for sched:sched_switch prev/next comm fields we can
	// fill missing comms using thread__set_comm()...
	// here or in a special syscall_arg__scnprintf_pid_sched_tp...
	return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
}
739
740#define SCA_CHAR_ARRAY syscall_arg__scnprintf_char_array
741
742static const char *bpf_cmd[] = {
743 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
744 "MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH",
745 "PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID",
746 "PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD",
747 "PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID",
748 "TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE",
749 "BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH",
750 "MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE",
751 "LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE",
752 "LINK_DETACH", "PROG_BIND_MAP",
753};
754static DEFINE_STRARRAY(bpf_cmd, "BPF_");
755
756static const char *fsmount_flags[] = {
757 [1] = "CLOEXEC",
758};
759static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");
760
761#include "trace/beauty/generated/fsconfig_arrays.c"
762
763static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");
764
765static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
766static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
767
768static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
769static DEFINE_STRARRAY(itimers, "ITIMER_");
770
771static const char *keyctl_options[] = {
772 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
773 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
774 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
775 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
776 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
777};
778static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
779
780static const char *whences[] = { "SET", "CUR", "END",
781#ifdef SEEK_DATA
782"DATA",
783#endif
784#ifdef SEEK_HOLE
785"HOLE",
786#endif
787};
788static DEFINE_STRARRAY(whences, "SEEK_");
789
790static const char *fcntl_cmds[] = {
791 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
792 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
793 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
794 "GETOWNER_UIDS",
795};
796static DEFINE_STRARRAY(fcntl_cmds, "F_");
797
798static const char *fcntl_linux_specific_cmds[] = {
799 "SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
800 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
801 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
802};
803
804static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
805
806static struct strarray *fcntl_cmds_arrays[] = {
807 &strarray__fcntl_cmds,
808 &strarray__fcntl_linux_specific_cmds,
809};
810
811static DEFINE_STRARRAYS(fcntl_cmds_arrays);
812
813static const char *rlimit_resources[] = {
814 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
815 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
816 "RTTIME",
817};
818static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
819
820static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
821static DEFINE_STRARRAY(sighow, "SIG_");
822
823static const char *clockid[] = {
824 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
825 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
826 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
827};
828static DEFINE_STRARRAY(clockid, "CLOCK_");
829
/* Decode the access(2) mode argument into F_OK or a R/W/X_OK combination. */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *suffix = "_OK";
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* any bits not covered above are printed numerically */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
856
857#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
858
859static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
860 struct syscall_arg *arg);
861
862#define SCA_FILENAME syscall_arg__scnprintf_filename
863
864// 'argname' is just documentational at this point, to remove the previous comment with that info
865#define SCA_FILENAME_FROM_USER(argname) \
866 { .scnprintf = SCA_FILENAME, \
867 .from_user = true, }
868
869static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg);
870
871#define SCA_BUF syscall_arg__scnprintf_buf
872
/* Decode the pipe2(2) flags argument (O_CLOEXEC, O_NONBLOCK). */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "O_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* any bits not covered above are printed numerically */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
895
896#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
897
898#ifndef GRND_NONBLOCK
899#define GRND_NONBLOCK 0x0001
900#endif
901#ifndef GRND_RANDOM
902#define GRND_RANDOM 0x0002
903#endif
904
/* Decode the getrandom(2) flags argument (GRND_RANDOM, GRND_NONBLOCK). */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "GRND_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* any bits not covered above are printed numerically */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
927
928#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
929
930#ifdef HAVE_LIBBPF_SUPPORT
931static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
932{
933 int id;
934
935 type = strstr(type, "enum ");
936 if (type == NULL)
937 return;
938
939 type += 5; // skip "enum " to get the enumeration name
940
941 id = btf__find_by_name(btf, type);
942 if (id < 0)
943 return;
944
945 arg_fmt->type = btf__type_by_id(btf, id);
946}
947
/*
 * Resolve an enumerator name in @bf to its value using the BTF enum type
 * cached on this argument. Returns true and sets *val on a match.
 */
static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
{
	const struct btf_type *bt = arg->fmt->type;
	struct btf *btf = arg->trace->btf;
	struct btf_enum *be = btf_enum(bt);

	for (int i = 0; i < btf_vlen(bt); ++i, ++be) {
		const char *name = btf__name_by_offset(btf, be->name_off);
		/*
		 * NOTE(review): comparing up to max(size, strlen(name)) assumes
		 * bf is NUL-terminated when shorter than name — confirm callers
		 * guarantee that, otherwise strncmp() may read past the token.
		 */
		int max_len = max(size, strlen(name));

		if (strncmp(name, bf, max_len) == 0) {
			*val = be->val;
			return true;
		}
	}

	return false;
}
966
/*
 * Generic BTF-based strtoul hook: lazily load vmlinux BTF, resolve this
 * argument's type name, and — currently for enums only — translate the
 * name in @bf into its numeric value.
 */
static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
{
	const struct btf_type *bt;
	char *type = arg->type_name;
	struct btf *btf;

	trace__load_vmlinux_btf(arg->trace);

	btf = arg->trace->btf;
	if (btf == NULL)
		return false;

	if (arg->fmt->type == NULL) {
		// See if this is an enum
		syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type);
	}

	// Now let's see if we have a BTF type resolved
	bt = arg->fmt->type;
	if (bt == NULL)
		return false;

	// If it is an enum:
	if (btf_is_enum(arg->fmt->type))
		return syscall_arg__strtoul_btf_enum(bf, size, arg, val);

	return false;
}
995
996static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val)
997{
998 struct btf_enum *be = btf_enum(type);
999 const int nr_entries = btf_vlen(type);
1000
1001 for (int i = 0; i < nr_entries; ++i, ++be) {
1002 if (be->val == val) {
1003 return scnprintf(bf, size, "%s",
1004 btf__name_by_offset(btf, be->name_off));
1005 }
1006 }
1007
1008 return 0;
1009}
1010
/* Accumulator passed to btf_dump: appends each callback chunk into one buffer. */
struct trace_btf_dump_snprintf_ctx {
	char *bf;			/* destination buffer */
	size_t printed, size;		/* bytes written so far / capacity */
};

/* btf_dump printf callback: appends formatted output into the ctx buffer. */
static void trace__btf_dump_snprintf(void *vctx, const char *fmt, va_list args)
{
	struct trace_btf_dump_snprintf_ctx *ctx = vctx;

	ctx->printed += vscnprintf(ctx->bf + ctx->printed, ctx->size - ctx->printed, fmt, args);
}
1022
1023static size_t btf_struct_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, struct syscall_arg *arg)
1024{
1025 struct trace_btf_dump_snprintf_ctx ctx = {
1026 .bf = bf,
1027 .size = size,
1028 };
1029 struct augmented_arg *augmented_arg = arg->augmented.args;
1030 int type_id = arg->fmt->type_id, consumed;
1031 struct btf_dump *btf_dump;
1032
1033 LIBBPF_OPTS(btf_dump_opts, dump_opts);
1034 LIBBPF_OPTS(btf_dump_type_data_opts, dump_data_opts);
1035
1036 if (arg == NULL || arg->augmented.args == NULL)
1037 return 0;
1038
1039 dump_data_opts.compact = true;
1040 dump_data_opts.skip_names = !arg->trace->show_arg_names;
1041
1042 btf_dump = btf_dump__new(btf, trace__btf_dump_snprintf, &ctx, &dump_opts);
1043 if (btf_dump == NULL)
1044 return 0;
1045
1046 /* pretty print the struct data here */
1047 if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0)
1048 return 0;
1049
1050 consumed = sizeof(*augmented_arg) + augmented_arg->size;
1051 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1052 arg->augmented.size -= consumed;
1053
1054 btf_dump__free(btf_dump);
1055
1056 return ctx.printed;
1057}
1058
1059static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf,
1060 size_t size, int val, char *type)
1061{
1062 struct syscall_arg_fmt *arg_fmt = arg->fmt;
1063
1064 if (trace->btf == NULL)
1065 return 0;
1066
1067 if (arg_fmt->type == NULL) {
1068 // Check if this is an enum and if we have the BTF type for it.
1069 syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type);
1070 }
1071
1072 // Did we manage to find a BTF type for the syscall/tracepoint argument?
1073 if (arg_fmt->type == NULL)
1074 return 0;
1075
1076 if (btf_is_enum(arg_fmt->type))
1077 return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val);
1078 else if (btf_is_struct(arg_fmt->type) || btf_is_union(arg_fmt->type))
1079 return btf_struct_scnprintf(arg_fmt->type, trace->btf, bf, size, arg);
1080
1081 return 0;
1082}
1083
#else // HAVE_LIBBPF_SUPPORT
/* Without libbpf there is no BTF: these fallbacks format/parse nothing. */
static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused,
				   char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused,
				   char *type __maybe_unused)
{
	return 0;
}

static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused,
					  struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused)
{
	return false;
}
#endif // HAVE_LIBBPF_SUPPORT
1098
#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type

/*
 * Shorthand initializers for syscall_arg_fmt entries backed by a strarray
 * lookup table; the 'name' parameter is documentation only (unused).
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .strtoul	= STUL_STRARRAY, \
	    .parm	= &strarray__##array, }

#define STRARRAY_FLAGS(name, array) \
	  { .scnprintf	= SCA_STRARRAY_FLAGS, \
	    .strtoul	= STUL_STRARRAY_FLAGS, \
	    .parm	= &strarray__##array, }
1110
1111#include "trace/beauty/arch_errno_names.c"
1112#include "trace/beauty/eventfd.c"
1113#include "trace/beauty/futex_op.c"
1114#include "trace/beauty/futex_val3.c"
1115#include "trace/beauty/mmap.c"
1116#include "trace/beauty/mode_t.c"
1117#include "trace/beauty/msg_flags.c"
1118#include "trace/beauty/open_flags.c"
1119#include "trace/beauty/perf_event_open.c"
1120#include "trace/beauty/pid.c"
1121#include "trace/beauty/sched_policy.c"
1122#include "trace/beauty/seccomp.c"
1123#include "trace/beauty/signum.c"
1124#include "trace/beauty/socket_type.c"
1125#include "trace/beauty/waitid_options.c"
1126
/*
 * Per-syscall formatting overrides: argument beautifiers (.scnprintf),
 * string->value parsers (.strtoul) and return-value handling (.hexret,
 * .errpid, .timeout). Looked up with bsearch() via syscall_fmt__cmp(), so
 * entries MUST be kept sorted by .name.
 */
static const struct syscall_fmt syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,	/* mode */ }, }, },
	{ .name	    = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
		   [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
	{ .name	    = "bind",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
		   [1] = SCA_SOCKADDR_FROM_USER(umyaddr),
		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd),
		   [1] = { .from_user = true /* attr */, }, } },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name     = "clock_nanosleep",
	  .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "connect",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
		   [1] = SCA_SOCKADDR_FROM_USER(servaddr),
		   [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name     = "faccessat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dirfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_ACCMODE,	/* mode */ }, }, },
	{ .name     = "faccessat2",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dirfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_ACCMODE,	/* mode */ },
		   [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD,  /* cmd */
			   .strtoul   = STUL_STRARRAYS,
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name     = "fsconfig",
	  .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
	{ .name     = "fsmount",
	  .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
		   [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
	{ .name     = "fspick",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	  /* dfd */ },
		   [1] = SCA_FILENAME_FROM_USER(path),
		   [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "getsockopt",
	  .arg = { [1] = STRARRAY(level, socket_level), }, },
	{ .name	    = "gettid",	    .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */
			   .strtoul   = STUL_STRARRAY_FLAGS,
			   .parm      = &strarray__mmap_flags, },
		   [5] = { .scnprintf = SCA_HEX,	/* offset */ }, }, },
	{ .name	    = "mount",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(devname),
		   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
			   .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name	    = "move_mount",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* from_dfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_FDAT,	/* to_dfd */ },
		   [3] = SCA_FILENAME_FROM_USER(pathname),
		   [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "nanosleep",
	  .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, },
	{ .name	    = "newfstatat", .alias = "fstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr),
		   [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */
			   .strtoul   = STUL_STRARRAY,
			   .parm      = &strarray__prctl_options, },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources),
		   [2] = { .from_user = true /* new_rlim */, }, }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
	{ .name	    = "renameat2",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
		   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
	{ .name	    = "rseq",	    .errpid = true,
	  .arg = { [0] = { .from_user = true /* rseq */, }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendfile", .alias = "sendfile64", },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
		   [4] = SCA_SOCKADDR_FROM_USER(addr), }, },
	{ .name	    = "set_robust_list", .errpid = true,
	  .arg = { [0] = { .from_user = true /* head */, }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources),
		   [1] = { .from_user = true /* rlim */, }, }, },
	{ .name	    = "setsockopt",
	  .arg = { [1] = STRARRAY(level, socket_level), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
	{ .name	    = "swapon",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "sync_file_range",
	  .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name     = "umount2", .alias = "umount",
	  .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
		   [1] = SCA_FILENAME_FROM_USER(pathname),
		   [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "write",
	  .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
};
1405
1406static int syscall_fmt__cmp(const void *name, const void *fmtp)
1407{
1408 const struct syscall_fmt *fmt = fmtp;
1409 return strcmp(name, fmt->name);
1410}
1411
1412static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts,
1413 const int nmemb,
1414 const char *name)
1415{
1416 return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1417}
1418
1419static const struct syscall_fmt *syscall_fmt__find(const char *name)
1420{
1421 const int nmemb = ARRAY_SIZE(syscall_fmts);
1422 return __syscall_fmt__find(syscall_fmts, nmemb, name);
1423}
1424
1425static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts,
1426 const int nmemb, const char *alias)
1427{
1428 int i;
1429
1430 for (i = 0; i < nmemb; ++i) {
1431 if (fmts[i].alias && strcmp(fmts[i].alias, alias) == 0)
1432 return &fmts[i];
1433 }
1434
1435 return NULL;
1436}
1437
1438static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
1439{
1440 const int nmemb = ARRAY_SIZE(syscall_fmts);
1441 return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
1442}
1443
/*
 * Per-syscall descriptor, built from the tracefs event format plus the
 * static syscall_fmts[] overrides (if any).
 *
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
 * nonexistent: Just a hole in the syscall table, syscall id not allocated
 */
struct syscall {
	struct tep_event *tp_format;	/* sys_enter tracepoint format */
	int		  nr_args;
	int		  args_size;
	struct {
		struct bpf_program *sys_enter,	/* BPF augmenter programs, when attached */
				   *sys_exit;
	} bpf_prog;
	bool		  is_exit;
	bool		  is_open;
	bool		  nonexistent;
	bool		  use_btf;	/* presumably: format args via vmlinux BTF — confirm at init site */
	struct tep_format_field *args;	/* linked list of tracepoint arg fields */
	const char	 *name;
	const struct syscall_fmt *fmt;	/* static overrides from syscall_fmts[], may be NULL */
	struct syscall_arg_fmt *arg_fmt;
};
1467
/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for duration and for the
 * start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;	/* t is in nanoseconds */
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, " ");	/* blank column: duration unknown */
	else if (duration >= 1.0)	/* >= 1ms: highlight as slow */
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)	/* >= 10us */
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}
1490
/**
 * struct thread_trace - per-thread state, attached via thread__set_priv().
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;	/* presumably sys_enter timestamp — confirm at the enter handler */
	bool		  entry_pending;
	unsigned long	  nr_events;	/* bumped on every event, see thread__trace() */
	unsigned long	  pfmaj, pfmin;	/* presumably major/minor page fault counts */
	char		  *entry_str;	/* buffer of trace__entry_str_size bytes */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long	ptr;
		short int	entry_str_pos;
		bool		pending_open;
		unsigned int	namelen;
		char		*name;
	} filename;
	struct {
		int		max;	/* highest valid index in table (inclusive), -1 when empty */
		struct file	*table;	/* fd-indexed, grown on demand */
	} files;

	struct intlist *syscall_stats;
};
1520
1521static struct thread_trace *thread_trace__new(void)
1522{
1523 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1524
1525 if (ttrace) {
1526 ttrace->files.max = -1;
1527 ttrace->syscall_stats = intlist__new(NULL);
1528 }
1529
1530 return ttrace;
1531}
1532
1533static void thread_trace__free_files(struct thread_trace *ttrace);
1534
1535static void thread_trace__delete(void *pttrace)
1536{
1537 struct thread_trace *ttrace = pttrace;
1538
1539 if (!ttrace)
1540 return;
1541
1542 intlist__delete(ttrace->syscall_stats);
1543 ttrace->syscall_stats = NULL;
1544 thread_trace__free_files(ttrace);
1545 zfree(&ttrace->entry_str);
1546 free(ttrace);
1547}
1548
1549static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1550{
1551 struct thread_trace *ttrace;
1552
1553 if (thread == NULL)
1554 goto fail;
1555
1556 if (thread__priv(thread) == NULL)
1557 thread__set_priv(thread, thread_trace__new());
1558
1559 if (thread__priv(thread) == NULL)
1560 goto fail;
1561
1562 ttrace = thread__priv(thread);
1563 ++ttrace->nr_events;
1564
1565 return ttrace;
1566fail:
1567 color_fprintf(fp, PERF_COLOR_RED,
1568 "WARNING: not enough memory, dropping samples!\n");
1569 return NULL;
1570}
1571
1572
1573void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1574 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1575{
1576 struct thread_trace *ttrace = thread__priv(arg->thread);
1577
1578 ttrace->ret_scnprintf = ret_scnprintf;
1579}
1580
/* Page-fault tracing selection bits — presumably set from the --pf option; confirm at the option parser. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of thread_trace::entry_str, the buffer the sys_enter line is built in. */
static const size_t trace__entry_str_size = 2048;
1585
1586static void thread_trace__free_files(struct thread_trace *ttrace)
1587{
1588 for (int i = 0; i < ttrace->files.max; ++i) {
1589 struct file *file = ttrace->files.table + i;
1590 zfree(&file->pathname);
1591 }
1592
1593 zfree(&ttrace->files.table);
1594 ttrace->files.max = -1;
1595}
1596
/*
 * Return the fd-indexed cell for 'fd', growing the table on demand.
 * files.max tracks the highest valid index (inclusive); newly covered
 * entries are zero-filled. Returns NULL on bad fd or allocation failure.
 */
static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
{
	if (fd < 0)
		return NULL;

	if (fd > ttrace->files.max) {
		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));

		if (nfiles == NULL)	/* old table is still valid on failure */
			return NULL;

		if (ttrace->files.max != -1) {
			/* zero only the entries beyond the previous max */
			memset(nfiles + ttrace->files.max + 1, 0,
			       (fd - ttrace->files.max) * sizeof(struct file));
		} else {
			/* first allocation: zero everything */
			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
		}

		ttrace->files.table = nfiles;
		ttrace->files.max   = fd;
	}

	return ttrace->files.table + fd;
}
1621
/* Public wrapper: fd table cell for a thread, via its private trace state. */
struct file *thread__files_entry(struct thread *thread, int fd)
{
	struct thread_trace *ttrace = thread__priv(thread);

	return thread_trace__files_entry(ttrace, fd);
}
1626
1627static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1628{
1629 struct thread_trace *ttrace = thread__priv(thread);
1630 struct file *file = thread_trace__files_entry(ttrace, fd);
1631
1632 if (file != NULL) {
1633 struct stat st;
1634 if (stat(pathname, &st) == 0)
1635 file->dev_maj = major(st.st_rdev);
1636 file->pathname = strdup(pathname);
1637 if (file->pathname)
1638 return 0;
1639 }
1640
1641 return -1;
1642}
1643
/*
 * Resolve 'fd' to a pathname by readlink()ing /proc/<pid>/fd/<fd> (or the
 * per-task variant for non-leader threads) and cache the result via
 * trace__set_fd_pathname(). Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread__pid(thread) == thread__tid(thread)) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread__pid(thread), fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d",
			  thread__pid(thread), thread__tid(thread), fd);
	}

	/* For symlinks, st_size is the length of the target path. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* ret > st.st_size: the link changed between lstat() and readlink() */
	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';	/* readlink() does not NUL-terminate */
	return trace__set_fd_pathname(thread, fd, pathname);
}
1670
1671static const char *thread__fd_path(struct thread *thread, int fd,
1672 struct trace *trace)
1673{
1674 struct thread_trace *ttrace = thread__priv(thread);
1675
1676 if (ttrace == NULL || trace->fd_path_disabled)
1677 return NULL;
1678
1679 if (fd < 0)
1680 return NULL;
1681
1682 if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1683 if (!trace->live)
1684 return NULL;
1685 ++trace->stats.proc_getname;
1686 if (thread__read_fd_path(thread, fd))
1687 return NULL;
1688 }
1689
1690 return ttrace->files.table[fd].pathname;
1691}
1692
1693size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1694{
1695 int fd = arg->val;
1696 size_t printed = scnprintf(bf, size, "%d", fd);
1697 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1698
1699 if (path)
1700 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1701
1702 return printed;
1703}
1704
1705size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1706{
1707 size_t printed = scnprintf(bf, size, "%d", fd);
1708 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1709
1710 if (thread) {
1711 const char *path = thread__fd_path(thread, fd, trace);
1712
1713 if (path)
1714 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1715
1716 thread__put(thread);
1717 }
1718
1719 return printed;
1720}
1721
1722static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1723 struct syscall_arg *arg)
1724{
1725 int fd = arg->val;
1726 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1727 struct thread_trace *ttrace = thread__priv(arg->thread);
1728
1729 if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1730 zfree(&ttrace->files.table[fd].pathname);
1731
1732 return printed;
1733}
1734
1735static void thread__set_filename_pos(struct thread *thread, const char *bf,
1736 unsigned long ptr)
1737{
1738 struct thread_trace *ttrace = thread__priv(thread);
1739
1740 ttrace->filename.ptr = ptr;
1741 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1742}
1743
/* Print a string copied from userspace by the BPF augmenter, quoted. */
static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
{
	struct augmented_arg *augmented_arg = arg->augmented.args;
	size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
	/*
	 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
	 * we would have two strings, each prefixed by its size.
	 */
	int consumed = sizeof(*augmented_arg) + augmented_arg->size;

	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
	arg->augmented.size -= consumed;

	return printed;
}
1759
1760static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1761 struct syscall_arg *arg)
1762{
1763 unsigned long ptr = arg->val;
1764
1765 if (arg->augmented.args)
1766 return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1767
1768 if (!arg->trace->vfs_getname)
1769 return scnprintf(bf, size, "%#x", ptr);
1770
1771 thread__set_filename_pos(arg->thread, bf, ptr);
1772 return 0;
1773}
1774
1775#define MAX_CONTROL_CHAR 31
1776#define MAX_ASCII 127
1777
1778static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg)
1779{
1780 struct augmented_arg *augmented_arg = arg->augmented.args;
1781 unsigned char *orig = (unsigned char *)augmented_arg->value;
1782 size_t printed = 0;
1783 int consumed;
1784
1785 if (augmented_arg == NULL)
1786 return 0;
1787
1788 for (int j = 0; j < augmented_arg->size; ++j) {
1789 bool control_char = orig[j] <= MAX_CONTROL_CHAR || orig[j] >= MAX_ASCII;
1790 /* print control characters (0~31 and 127), and non-ascii characters in \(digits) */
1791 printed += scnprintf(bf + printed, size - printed, control_char ? "\\%d" : "%c", (int)orig[j]);
1792 }
1793
1794 consumed = sizeof(*augmented_arg) + augmented_arg->size;
1795 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1796 arg->augmented.size -= consumed;
1797
1798 return printed;
1799}
1800
1801static bool trace__filter_duration(struct trace *trace, double t)
1802{
1803 return t < (trace->duration_filter * NSEC_PER_MSEC);
1804}
1805
1806static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1807{
1808 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1809
1810 return fprintf(fp, "%10.3f ", ts);
1811}
1812
/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before tracing session
 * starts, lost sys_enter exit due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	/* Unknown start time: print a placeholder column instead. */
	return fprintf(fp, " ? ");
}
1826
static pid_t workload_pid = -1;
static volatile sig_atomic_t done = false;		/* main loop exit flag */
static volatile sig_atomic_t interrupted = false;	/* distinguishes user interrupt from workload exit */

/* Presumably installed for SIGINT — confirm at the sigaction() call site. */
static void sighandler_interrupt(int sig __maybe_unused)
{
	done = interrupted = true;
}

/* SIGCHLD: stop only when the child that exited is our forked workload. */
static void sighandler_chld(int sig __maybe_unused, siginfo_t *info,
			    void *context __maybe_unused)
{
	if (info->si_pid == workload_pid)
		done = true;
}
1842
1843static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1844{
1845 size_t printed = 0;
1846
1847 if (trace->multiple_threads) {
1848 if (trace->show_comm)
1849 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1850 printed += fprintf(fp, "%d ", thread__tid(thread));
1851 }
1852
1853 return printed;
1854}
1855
1856static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1857 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1858{
1859 size_t printed = 0;
1860
1861 if (trace->show_tstamp)
1862 printed = trace__fprintf_tstamp(trace, tstamp, fp);
1863 if (trace->show_duration)
1864 printed += fprintf_duration(duration, duration_calculated, fp);
1865 return printed + trace__fprintf_comm_tid(trace, thread, fp);
1866}
1867
1868static int trace__process_event(struct trace *trace, struct machine *machine,
1869 union perf_event *event, struct perf_sample *sample)
1870{
1871 int ret = 0;
1872
1873 switch (event->header.type) {
1874 case PERF_RECORD_LOST:
1875 color_fprintf(trace->output, PERF_COLOR_RED,
1876 "LOST %" PRIu64 " events!\n", (u64)event->lost.lost);
1877 ret = machine__process_lost_event(machine, event, sample);
1878 break;
1879 default:
1880 ret = machine__process_event(machine, event, sample);
1881 break;
1882 }
1883
1884 return ret;
1885}
1886
/*
 * perf_tool callback (used e.g. while synthesizing pre-existing threads):
 * recover the enclosing struct trace from the embedded tool member and
 * forward the event to trace__process_event().
 */
static int trace__tool_process(const struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
1895
/*
 * Kernel address resolver wrapper that, when kptr_restrict prevents symbol
 * resolution, warns the user once per session instead of failing silently
 * on every sample. Returns NULL when resolution is not possible.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* Only nag the user a single time. */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1913
/*
 * Initialize the symbol machinery and the host machine representation:
 * registers the kptr_restrict-aware kernel address resolver and
 * synthesizes events for threads that were already running when the
 * session started. Returns 0 on success, a negative errno otherwise
 * (symbol state is torn down again on failure).
 */
static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Free each thread's thread_trace together with its struct thread. */
	thread__set_priv_destructor(thread_trace__delete);

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->core.threads, trace__tool_process,
					    true, false, 1);
out:
	if (err)
		symbol__exit();

	return err;
}
1940
/* Tear down what trace__symbols_init() set up: the machine, then symbols. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1948
1949static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1950{
1951 int idx;
1952
1953 if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0)
1954 nr_args = sc->fmt->nr_args;
1955
1956 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1957 if (sc->arg_fmt == NULL)
1958 return -1;
1959
1960 for (idx = 0; idx < nr_args; ++idx) {
1961 if (sc->fmt)
1962 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1963 }
1964
1965 sc->nr_args = nr_args;
1966 return 0;
1967}
1968
/*
 * Formatters looked up by argument *name* rather than per syscall. Keep
 * this table sorted by .name: syscall_arg_fmt__find_by_name() bsearch()es it.
 */
static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = {
	{ .name = "msr", .scnprintf = SCA_X86_MSR, .strtoul = STUL_X86_MSR, },
	{ .name = "vector", .scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, },
};
1973
1974static int syscall_arg_fmt__cmp(const void *name, const void *fmtp)
1975{
1976 const struct syscall_arg_fmt *fmt = fmtp;
1977 return strcmp(name, fmt->name);
1978}
1979
1980static const struct syscall_arg_fmt *
1981__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb,
1982 const char *name)
1983{
1984 return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp);
1985}
1986
1987static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name)
1988{
1989 const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name);
1990 return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
1991}
1992
/*
 * Walk the tracepoint format fields and pick a formatter for each argument
 * that doesn't already have one, using a chain of type/name heuristics
 * (filenames, pointers, pids, modes, fds, char arrays, enums via BTF) and
 * finally the by-name table. The order of the heuristics matters: earlier,
 * more specific matches win. Returns the last field visited, so the caller
 * can compute the total raw args payload size.
 */
static struct tep_format_field *
syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
			    bool *use_btf)
{
	struct tep_format_field *last_field = NULL;
	int len;

	for (; field; field = field->next, ++arg) {
		last_field = field;

		/* Respect a formatter already set (e.g. from the static fmt table). */
		if (arg->scnprintf)
			continue;

		len = strlen(field->name);

		// As far as heuristics (or intention) goes this seems to hold true, and makes sense!
		if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const "))
			arg->from_user = true;

		/* "const char *" named *name or containing "path": a filename. */
		if (strcmp(field->type, "const char *") == 0 &&
		    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
		     strstr(field->name, "path") != NULL)) {
			arg->scnprintf = SCA_FILENAME;
		} else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
			arg->scnprintf = SCA_PTR;
		else if (strcmp(field->type, "pid_t") == 0)
			arg->scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			arg->scnprintf = SCA_MODE_T;
		else if ((field->flags & TEP_FIELD_IS_ARRAY) && strstr(field->type, "char")) {
			arg->scnprintf = SCA_CHAR_ARRAY;
			arg->nr_entries = field->arraylen;
		} else if ((strcmp(field->type, "int") == 0 ||
			    strcmp(field->type, "unsigned int") == 0 ||
			    strcmp(field->type, "long") == 0) &&
			   len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			arg->scnprintf = SCA_FD;
		} else if (strstr(field->type, "enum") && use_btf != NULL) {
			/* Defer enum pretty-printing to BTF; caller loads vmlinux BTF. */
			*use_btf = true;
			arg->strtoul = STUL_BTF_TYPE;
		} else {
			/* Last resort: match the argument by its name alone. */
			const struct syscall_arg_fmt *fmt =
				syscall_arg_fmt__find_by_name(field->name);

			if (fmt) {
				arg->scnprintf = fmt->scnprintf;
				arg->strtoul = fmt->strtoul;
			}
		}
	}

	return last_field;
}
2053
2054static int syscall__set_arg_fmts(struct syscall *sc)
2055{
2056 struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args,
2057 &sc->use_btf);
2058
2059 if (last_field)
2060 sc->args_size = last_field->offset + last_field->size;
2061
2062 return 0;
2063}
2064
2065static int trace__read_syscall_info(struct trace *trace, int id)
2066{
2067 char tp_name[128];
2068 struct syscall *sc;
2069 const char *name = syscalltbl__name(trace->sctbl, id);
2070 int err;
2071
2072#ifdef HAVE_SYSCALL_TABLE_SUPPORT
2073 if (trace->syscalls.table == NULL) {
2074 trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
2075 if (trace->syscalls.table == NULL)
2076 return -ENOMEM;
2077 }
2078#else
2079 if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) {
2080 // When using libaudit we don't know beforehand what is the max syscall id
2081 struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
2082
2083 if (table == NULL)
2084 return -ENOMEM;
2085
2086 // Need to memset from offset 0 and +1 members if brand new
2087 if (trace->syscalls.table == NULL)
2088 memset(table, 0, (id + 1) * sizeof(*sc));
2089 else
2090 memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc));
2091
2092 trace->syscalls.table = table;
2093 trace->sctbl->syscalls.max_id = id;
2094 }
2095#endif
2096 sc = trace->syscalls.table + id;
2097 if (sc->nonexistent)
2098 return -EEXIST;
2099
2100 if (name == NULL) {
2101 sc->nonexistent = true;
2102 return -EEXIST;
2103 }
2104
2105 sc->name = name;
2106 sc->fmt = syscall_fmt__find(sc->name);
2107
2108 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
2109 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2110
2111 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
2112 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
2113 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2114 }
2115
2116 /*
2117 * Fails to read trace point format via sysfs node, so the trace point
2118 * doesn't exist. Set the 'nonexistent' flag as true.
2119 */
2120 if (IS_ERR(sc->tp_format)) {
2121 sc->nonexistent = true;
2122 return PTR_ERR(sc->tp_format);
2123 }
2124
2125 /*
2126 * The tracepoint format contains __syscall_nr field, so it's one more
2127 * than the actual number of syscall arguments.
2128 */
2129 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
2130 RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields - 1))
2131 return -ENOMEM;
2132
2133 sc->args = sc->tp_format->format.fields;
2134 /*
2135 * We need to check and discard the first variable '__syscall_nr'
2136 * or 'nr' that mean the syscall number. It is needless here.
2137 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
2138 */
2139 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
2140 sc->args = sc->args->next;
2141 --sc->nr_args;
2142 }
2143
2144 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
2145 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
2146
2147 err = syscall__set_arg_fmts(sc);
2148
2149 /* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */
2150 if (sc->use_btf)
2151 trace__load_vmlinux_btf(trace);
2152
2153 return err;
2154}
2155
2156static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf)
2157{
2158 struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
2159
2160 if (fmt != NULL) {
2161 syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, use_btf);
2162 return 0;
2163 }
2164
2165 return -ENOMEM;
2166}
2167
/*
 * qsort()/bsearch() comparator for ints. Uses the compare idiom instead of
 * '*one - *another': the subtraction invokes signed-overflow undefined
 * behavior (and returns a wrong sign) for operands of large opposite
 * magnitude, e.g. INT_MIN vs 1.
 */
static int intcmp(const void *a, const void *b)
{
	const int *one = a, *another = b;

	return (*one > *another) - (*one < *another);
}
2174
/*
 * Turn the strlist of syscall names in trace->ev_qualifier (glob patterns
 * allowed) into the sorted array of syscall ids in trace->ev_qualifier_ids.
 * Unknown names are reported at debug verbosity and skipped. Returns 0 on
 * success, -EINVAL/-ENOMEM on allocation failure (entries are freed and
 * nr reset on the realloc failure path).
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0;
	bool printed_invalid_prefix = false;
	struct str_node *pos;
	/* Start with one slot per qualifier entry; glob matches may need more. */
	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);

	trace->ev_qualifier_ids.entries = malloc(nr_allocated *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name; try it as a glob pattern. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (!printed_invalid_prefix) {
				pr_debug("Skipping unknown syscalls: ");
				printed_invalid_prefix = true;
			} else {
				pr_debug(", ");
			}

			pr_debug("%s", sc);
			continue;
		}
matches:
		trace->ev_qualifier_ids.entries[nr_used++] = id;
		/* match_next >= 0 only when the glob path above matched. */
		if (match_next == -1)
			continue;

		/* Collect the remaining glob matches, growing the array as needed. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == nr_used) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.entries[nr_used++] = id;
		}
	}

	trace->ev_qualifier_ids.nr = nr_used;
	/* Sorted so trace__syscall_enabled() can bsearch() it. */
	qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
out:
	if (printed_invalid_prefix)
		pr_debug("\n");
	return err;
out_free:
	zfree(&trace->ev_qualifier_ids.entries);
	trace->ev_qualifier_ids.nr = 0;
	goto out;
}
2248
2249static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
2250{
2251 bool in_ev_qualifier;
2252
2253 if (trace->ev_qualifier_ids.nr == 0)
2254 return true;
2255
2256 in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
2257 trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
2258
2259 if (in_ev_qualifier)
2260 return !trace->not_ev_qualifier;
2261
2262 return trace->not_ev_qualifier;
2263}
2264
2265/*
2266 * args is to be interpreted as a series of longs but we need to handle
2267 * 8-byte unaligned accesses. args points to raw_data within the event
2268 * and raw_data is guaranteed to be 8-byte unaligned because it is
2269 * preceded by raw_size which is a u32. So we need to copy args to a temp
2270 * variable to read it. Most notably this avoids extended load instructions
2271 * on unaligned addresses
2272 */
2273unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
2274{
2275 unsigned long val;
2276 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
2277
2278 memcpy(&val, p, sizeof(val));
2279 return val;
2280}
2281
2282static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
2283 struct syscall_arg *arg)
2284{
2285 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
2286 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
2287
2288 return scnprintf(bf, size, "arg%d: ", arg->idx);
2289}
2290
2291/*
2292 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
2293 * as mount 'flags' argument that needs ignoring some magic flag, see comment
2294 * in tools/perf/trace/beauty/mount_flags.c
2295 */
2296static unsigned long syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsigned long val)
2297{
2298 if (fmt && fmt->mask_val)
2299 return fmt->mask_val(arg, val);
2300
2301 return val;
2302}
2303
2304static size_t syscall_arg_fmt__scnprintf_val(struct syscall_arg_fmt *fmt, char *bf, size_t size,
2305 struct syscall_arg *arg, unsigned long val)
2306{
2307 if (fmt && fmt->scnprintf) {
2308 arg->val = val;
2309 if (fmt->parm)
2310 arg->parm = fmt->parm;
2311 return fmt->scnprintf(bf, size, arg);
2312 }
2313 return scnprintf(bf, size, "%ld", val);
2314}
2315
/*
 * Format all arguments of 'sc' into 'bf'. The raw values come from 'args'
 * (unaligned longs inside the sample's raw_data); beautifiers that need
 * more (e.g. filename contents) read it from the augmented payload a BPF
 * augmenter appended after the raw args. Zero-valued arguments are
 * suppressed unless show_zeros, the per-arg show_zero flag, or a BTF enum
 * type applies. Returns the number of characters written to 'bf'.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, void *augmented_args, int augmented_args_size,
				      struct trace *trace, struct thread *thread)
{
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;	/* tracks arg.idx as a bitmask against arg.mask */
	struct syscall_arg arg = {
		.args	= args,
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};
	struct thread_trace *ttrace = thread__priv(thread);
	void *default_scnprintf;

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct tep_format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may consume several args and mask the extras. */
			if (arg.mask & bit)
				continue;

			arg.fmt = &sc->arg_fmt[arg.idx];
			val = syscall_arg__val(&arg, arg.idx);
			/*
			 * Some syscall args need some mask, most don't and
			 * return val untouched.
			 */
			val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);

			/*
			 * Suppress this argument if its value is zero and show_zero
			 * property isn't set.
			 *
			 * If it has a BTF type, then override the zero suppression knob
			 * as the common case is for zero in an enum to have an associated entry.
			 */
			if (val == 0 && !trace->show_zeros &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE))
				continue;

			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

			if (trace->show_arg_names)
				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

			default_scnprintf = sc->arg_fmt[arg.idx].scnprintf;

			/* Try BTF pretty-printing first when forced or when no/generic formatter. */
			if (trace->force_btf || default_scnprintf == NULL || default_scnprintf == SCA_PTR) {
				btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed,
								   size - printed, val, field->type);
				if (btf_printed) {
					printed += btf_printed;
					continue;
				}
			}

			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
								  bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
2414
/* Per-event sample handler signature (e.g. trace__sys_enter/trace__sys_exit). */
typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
2418
/*
 * Return the struct syscall for 'id', lazily reading its info on first
 * use. Returns NULL for invalid ids (-1 has been observed on sys_exit),
 * for nonexistent syscalls, or when reading the syscall info fails.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct evsel *evsel, int id)
{
	int err = 0;

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, evsel__name(evsel), ++n);
		}
		return NULL;
	}

	err = -EINVAL;

	/*
	 * Careful: each preprocessor branch below opens the 'if' whose body
	 * ends at the shared 'goto'/closing brace after the #endif. With
	 * libaudit the table grows on demand via trace__read_syscall_info().
	 */
#ifdef HAVE_SYSCALL_TABLE_SUPPORT
	if (id > trace->sctbl->syscalls.max_id) {
#else
	if (id >= trace->sctbl->syscalls.max_id) {
		/*
		 * With libaudit we don't know beforehand what is the max_id,
		 * so we let trace__read_syscall_info() figure that out as we
		 * go on reading syscalls.
		 */
		err = trace__read_syscall_info(trace, id);
		if (err)
#endif
		goto out_cant_read;
	}

	if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
	    (err = trace__read_syscall_info(trace, id)) != 0)
		goto out_cant_read;

	if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		char sbuf[STRERR_BUFSIZE];
		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
		/*
		 * NOTE(review): this assumes syscalls.table is non-NULL whenever
		 * id <= max_id; if the table allocation itself failed that may
		 * not hold — verify.
		 */
		if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
2480
/* Per-thread, per-syscall statistics accumulated for the summary output. */
struct syscall_stats {
	struct stats stats;	/* duration statistics (fed via update_stats()) */
	u64 nr_failures;	/* number of calls that returned an error */
	int max_errno;		/* highest errno seen; sizes the 'errnos' array */
	u32 *errnos;		/* per-errno hit counters, indexed by errno - 1 */
};
2487
/*
 * Fold one completed syscall into the thread's summary statistics: call
 * duration (when the matching sys_enter was seen) and, with errno_summary,
 * a per-errno failure counter. Allocation failures degrade gracefully to
 * incomplete stats rather than erroring out.
 */
static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace,
				 int id, struct perf_sample *sample, long err, bool errno_summary)
{
	struct int_node *inode;
	struct syscall_stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	/* Lazily allocate the per-syscall stats on first hit. */
	stats = inode->priv;
	if (stats == NULL) {
		stats = zalloc(sizeof(*stats));
		if (stats == NULL)
			return;

		init_stats(&stats->stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(&stats->stats, duration);

	if (err < 0) {
		++stats->nr_failures;

		if (!errno_summary)
			return;

		err = -err;
		/* Grow the errno counter array to cover this errno value. */
		if (err > stats->max_errno) {
			u32 *new_errnos = realloc(stats->errnos, err * sizeof(u32));

			if (new_errnos) {
				/* Zero only the newly added tail of the array. */
				memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32));
			} else {
				pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n",
					 thread__comm_str(thread), thread__pid(thread),
					 thread__tid(thread));
				return;
			}

			stats->errnos = new_errnos;
			stats->max_errno = err;
		}

		/* Counters are indexed by errno - 1 (errno values start at 1). */
		++stats->errnos[err - 1];
	}
}
2540
/*
 * A new event arrived while a sys_enter line was still pending for the
 * current thread: finish that line with " ..." (padded to the usual args
 * alignment) so the new event can start on a fresh line. Returns the
 * number of characters printed, 0 if there was nothing pending.
 */
static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;
	int len;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);

	/* Pad to args_alignment, leaving room for the " ..." suffix. */
	if (len < trace->args_alignment - 4)
		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");

	printed += fprintf(trace->output, " ...\n");

	ttrace->entry_pending = false;
	/* This counts as a printed event for --max-events purposes. */
	++trace->nr_events_printed;

	return printed;
}
2568
2569static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
2570 struct perf_sample *sample, struct thread *thread)
2571{
2572 int printed = 0;
2573
2574 if (trace->print_sample) {
2575 double ts = (double)sample->time / NSEC_PER_MSEC;
2576
2577 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
2578 evsel__name(evsel), ts,
2579 thread__comm_str(thread),
2580 sample->pid, sample->tid, sample->cpu);
2581 }
2582
2583 return printed;
2584}
2585
/*
 * Locate the augmented (BPF-collected) payload that follows the raw
 * syscall args in the sample's raw_data, if any: sets *augmented_args_size
 * to its length and returns a pointer to it, or NULL when nothing was
 * appended beyond the raw args.
 */
static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
{
	void *augmented_args = NULL;
	/*
	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
	 * and there we get all 6 syscall args plus the tracepoint common fields
	 * that gets calculated at the start and the syscall_nr (another long).
	 * So we check if that is the case and if so don't look after the
	 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
	 * which is fixed.
	 *
	 * We'll revisit this later to pass s->args_size to the BPF augmenter
	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
	 * copies only what we need for each syscall, like what happens when we
	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
	 * traffic to just what is needed for each syscall.
	 */
	int args_size = raw_augmented_args_size ?: sc->args_size;

	*augmented_args_size = sample->raw_size - args_size;
	if (*augmented_args_size > 0)
		augmented_args = sample->raw_data + args_size;

	return augmented_args;
}
2611
2612static void syscall__exit(struct syscall *sc)
2613{
2614 if (!sc)
2615 return;
2616
2617 zfree(&sc->arg_fmt);
2618}
2619
/*
 * sys_enter handler: format the syscall name and arguments into
 * ttrace->entry_str. For syscalls that never return (exit/exit_group) the
 * line is printed immediately with "= ?"; otherwise it is left pending so
 * trace__sys_exit() can complete it with the return value and duration.
 * Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	int printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	int augmented_args_size = 0;
	void *augmented_args = NULL;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread entry line buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);
	/*
	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
	 * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
	 * this breaks syscall__augmented_args() check for augmented args, as we calculate
	 * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
	 * so when handling, say the openat syscall, we end up getting 6 args for the
	 * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
	 * thinking that the extra 2 u64 args are the augmented filename, so just check
	 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
	 */
	if (evsel != trace->syscalls.events.sys_enter)
		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, augmented_args, augmented_args_size, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() never return: print right away with "= ?". */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			int alignment = 0;

			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
			if (trace->args_alignment > printed)
				alignment = trace->args_alignment - printed;
			fprintf(trace->output, "%*s= ?\n", alignment, " ");
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Remember the current thread for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2698
/*
 * Format and immediately print just the argument list of a sys_enter
 * sample (no pending-entry bookkeeping, no head/tail decoration).
 * Returns 0 on success, -1 on failure.
 */
static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args, *augmented_args = NULL;
	int augmented_args_size;
	size_t printed = 0;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	printed += syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
	fprintf(trace->output, "%.*s", (int)printed, msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2732
2733static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
2734 struct perf_sample *sample,
2735 struct callchain_cursor *cursor)
2736{
2737 struct addr_location al;
2738 int max_stack = evsel->core.attr.sample_max_stack ?
2739 evsel->core.attr.sample_max_stack :
2740 trace->max_stack;
2741 int err = -1;
2742
2743 addr_location__init(&al);
2744 if (machine__resolve(trace->host, &al, sample) < 0)
2745 goto out;
2746
2747 err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2748out:
2749 addr_location__exit(&al);
2750 return err;
2751}
2752
2753static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2754{
2755 /* TODO: user-configurable print_opts */
2756 const unsigned int print_opts = EVSEL__PRINT_SYM |
2757 EVSEL__PRINT_DSO |
2758 EVSEL__PRINT_UNKNOWN_AS_ADDR;
2759
2760 return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
2761}
2762
/* Map a positive errno value to its symbolic name for the evsel's arch. */
static const char *errno_to_name(struct evsel *evsel, int err)
{
	return perf_env__arch_strerrno(evsel__env(evsel), err);
}
2769
/*
 * sys_exit handler: complete the line started by trace__sys_enter() (or
 * print "... [continued]" when the entry wasn't seen), appending the
 * beautified return value. Also feeds the summary stats, resolves fd
 * pathnames for successful open()/openat(), applies the duration filter
 * and optionally prints the callchain. Returns 0 on success, -1 on
 * failure.
 */
static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
	int alignment = trace->args_alignment;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	if (trace->summary)
		thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary);

	/* Successful open/openat: record fd -> pathname from the pending getname. */
	if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0 means the matching sys_enter wasn't seen. */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			/* --min-stack filter: skip shallow callchains entirely. */
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		printed = fprintf(trace->output, "%s", ttrace->entry_str);
	} else {
		/* The sys_enter wasn't seen (e.g. session started mid-syscall). */
		printed += fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		printed += 9;
		printed += fprintf(trace->output, "]: %s()", sc->name);
	}

	printed++; /* the closing ')' */

	if (alignment > printed)
		alignment -= printed;
	else
		alignment = 0;

	fprintf(trace->output, ")%*s= ", alignment, " ");

	/* Beautify the return value; errno_print is also entered via goto above. */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, "%ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, "-1 %s (%s)", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, "0 (Timeout)");
	else if (ttrace->ret_scnprintf) {
		/* One-shot return formatter set by an argument beautifier (e.g. fcntl). */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, "%s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, "%#lx", ret);
	else if (sc->fmt->errpid) {
		/* The return value is a pid (e.g. wait4): show its comm too. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, "%ld", ret);
			if (thread__comm_set(child))
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	/*
	 * We only consider an 'event' for the sake of --max-events a non-filtered
	 * sys_enter + sys_exit and other tracepoint events.
	 */
	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2905
/*
 * probe:vfs_getname handler: caches the pathname being resolved for the
 * current thread so the matching open-like syscall can print it and, when a
 * sys_enter line is pending, splices the name into that entry string at the
 * position recorded for the pointer argument.
 */
static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread name buffer if this pathname doesn't fit. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	/* Consumed by trace__sys_exit() to associate the name with the returned fd. */
	ttrace->filename.pending_open = true;

	/* No pointer arg position recorded in a pending sys_enter: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* If the name doesn't fit, keep its tail (drop leading characters). */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
2966
2967static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
2968 union perf_event *event __maybe_unused,
2969 struct perf_sample *sample)
2970{
2971 u64 runtime = evsel__intval(evsel, sample, "runtime");
2972 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2973 struct thread *thread = machine__findnew_thread(trace->host,
2974 sample->pid,
2975 sample->tid);
2976 struct thread_trace *ttrace = thread__trace(thread, trace->output);
2977
2978 if (ttrace == NULL)
2979 goto out_dump;
2980
2981 ttrace->runtime_ms += runtime_ms;
2982 trace->runtime_ms += runtime_ms;
2983out_put:
2984 thread__put(thread);
2985 return 0;
2986
2987out_dump:
2988 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2989 evsel->name,
2990 evsel__strval(evsel, sample, "comm"),
2991 (pid_t)evsel__intval(evsel, sample, "pid"),
2992 runtime,
2993 evsel__intval(evsel, sample, "vruntime"));
2994 goto out_put;
2995}
2996
2997static int bpf_output__printer(enum binary_printer_ops op,
2998 unsigned int val, void *extra __maybe_unused, FILE *fp)
2999{
3000 unsigned char ch = (unsigned char)val;
3001
3002 switch (op) {
3003 case BINARY_PRINT_CHAR_DATA:
3004 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
3005 case BINARY_PRINT_DATA_BEGIN:
3006 case BINARY_PRINT_LINE_BEGIN:
3007 case BINARY_PRINT_ADDR:
3008 case BINARY_PRINT_NUM_DATA:
3009 case BINARY_PRINT_NUM_PAD:
3010 case BINARY_PRINT_SEP:
3011 case BINARY_PRINT_CHAR_PAD:
3012 case BINARY_PRINT_LINE_END:
3013 case BINARY_PRINT_DATA_END:
3014 default:
3015 break;
3016 }
3017
3018 return 0;
3019}
3020
/*
 * Print a bpf-output event's raw payload, 8 bytes per line, using
 * bpf_output__printer() to render the character column.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
	/* Counts towards the --max-events budget. */
	++trace->nr_events_printed;
}
3028
/*
 * Format a tracepoint's payload as "name: value" pairs into a local buffer,
 * reusing the syscall argument beautifiers (syscall_arg_fmt), then flush it
 * to trace->output.  Returns the number of characters produced.
 */
static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, struct perf_sample *sample,
				       struct thread *thread, void *augmented_args, int augmented_args_size)
{
	char bf[2048];
	size_t size = sizeof(bf);
	struct tep_format_field *field = evsel->tp_format->format.fields;
	struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel);
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg syscall_arg = {
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};

	for (; field && arg; field = field->next, ++syscall_arg.idx, bit <<= 1, ++arg) {
		/* A beautifier may have already consumed this arg via its mask. */
		if (syscall_arg.mask & bit)
			continue;

		syscall_arg.len = 0;
		syscall_arg.fmt = arg;
		if (field->flags & TEP_FIELD_IS_ARRAY) {
			int offset = field->offset;

			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
				/* Dynamic fields pack the length in the high 16 bits, the offset in the low 16. */
				offset = format_field__intval(field, sample, evsel->needs_swap);
				syscall_arg.len = offset >> 16;
				offset &= 0xffff;
				if (tep_field_is_relative(field->flags))
					offset += field->offset + field->size;
			}

			val = (uintptr_t)(sample->raw_data + offset);
		} else
			val = format_field__intval(field, sample, evsel->needs_swap);
		/*
		 * Some syscall args need some mask, most don't and
		 * return val untouched.
		 */
		val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);

		/* Suppress this argument if its value is zero and show_zero property isn't set. */
		if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE)
			continue;

		printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

		if (trace->show_arg_names)
			printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

		/* Prefer a BTF-based pretty-print of the value when one is available. */
		btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type);
		if (btf_printed) {
			printed += btf_printed;
			continue;
		}

		printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
	}

	return printed + fprintf(trace->output, "%.*s", (int)printed, bf);
}
3097
/*
 * Handler for non-raw_syscalls events: prints a strace-like line with
 * timestamp and comm/tid, then either the beautified syscall args (for the
 * bpf_output augmenter event), the raw bpf-output payload, or the
 * tracepoint's fields.
 */
static int trace__event_handler(struct trace *trace, struct evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	struct thread *thread;
	int callchain_ret = 0;

	/* Per-evsel print budget already exhausted? */
	if (evsel->nr_events_printed >= evsel->max_events)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than the requested minimum: drop the event. */
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Empty placeholder in the column where syscall lines show a duration. */
	if (trace->trace_syscalls && trace->show_duration)
		fprintf(trace->output, "( ): ");

	if (thread)
		trace__fprintf_comm_tid(trace, thread, trace->output);

	if (evsel == trace->syscalls.events.bpf_output) {
		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
		struct syscall *sc = trace__syscall_info(trace, evsel, id);

		if (sc) {
			fprintf(trace->output, "%s(", sc->name);
			trace__fprintf_sys_enter(trace, evsel, sample);
			fputc(')', trace->output);
			goto newline;
		}

		/*
		 * XXX: Not having the associated syscall info or not finding/adding
		 * the thread should never happen, but if it does...
		 * fall thru and print it as a bpf_output event.
		 */
	}

	fprintf(trace->output, "%s(", evsel->name);

	if (evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/* sys_enter_* tracepoints get the syscall beautifiers first. */
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			if (trace->libtraceevent_print) {
				event_format__fprintf(evsel->tp_format, sample->cpu,
						      sample->raw_data, sample->raw_size,
						      trace->output);
			} else {
				trace__fprintf_tp_fields(trace, evsel, sample, thread, NULL, 0);
			}
		}
	}

newline:
	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));

	++trace->nr_events_printed;

	/* Budget for this evsel is now exhausted: stop collecting it. */
	if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
		evsel__disable(evsel);
		evsel__close(evsel);
	}
out:
	thread__put(thread);
	return 0;
}
3183
3184static void print_location(FILE *f, struct perf_sample *sample,
3185 struct addr_location *al,
3186 bool print_dso, bool print_sym)
3187{
3188
3189 if ((verbose > 0 || print_dso) && al->map)
3190 fprintf(f, "%s@", dso__long_name(map__dso(al->map)));
3191
3192 if ((verbose > 0 || print_sym) && al->sym)
3193 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
3194 al->addr - al->sym->start);
3195 else if (al->map)
3196 fprintf(f, "0x%" PRIx64, al->addr);
3197 else
3198 fprintf(f, "0x%" PRIx64, sample->addr);
3199}
3200
3201static int trace__pgfault(struct trace *trace,
3202 struct evsel *evsel,
3203 union perf_event *event __maybe_unused,
3204 struct perf_sample *sample)
3205{
3206 struct thread *thread;
3207 struct addr_location al;
3208 char map_type = 'd';
3209 struct thread_trace *ttrace;
3210 int err = -1;
3211 int callchain_ret = 0;
3212
3213 addr_location__init(&al);
3214 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3215
3216 if (sample->callchain) {
3217 struct callchain_cursor *cursor = get_tls_callchain_cursor();
3218
3219 callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
3220 if (callchain_ret == 0) {
3221 if (cursor->nr < trace->min_stack)
3222 goto out_put;
3223 callchain_ret = 1;
3224 }
3225 }
3226
3227 ttrace = thread__trace(thread, trace->output);
3228 if (ttrace == NULL)
3229 goto out_put;
3230
3231 if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
3232 ttrace->pfmaj++;
3233 else
3234 ttrace->pfmin++;
3235
3236 if (trace->summary_only)
3237 goto out;
3238
3239 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
3240
3241 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
3242
3243 fprintf(trace->output, "%sfault [",
3244 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
3245 "maj" : "min");
3246
3247 print_location(trace->output, sample, &al, false, true);
3248
3249 fprintf(trace->output, "] => ");
3250
3251 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
3252
3253 if (!al.map) {
3254 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
3255
3256 if (al.map)
3257 map_type = 'x';
3258 else
3259 map_type = '?';
3260 }
3261
3262 print_location(trace->output, sample, &al, true, false);
3263
3264 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
3265
3266 if (callchain_ret > 0)
3267 trace__fprintf_callchain(trace, sample);
3268 else if (callchain_ret < 0)
3269 pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
3270
3271 ++trace->nr_events_printed;
3272out:
3273 err = 0;
3274out_put:
3275 thread__put(thread);
3276 addr_location__exit(&al);
3277 return err;
3278}
3279
3280static void trace__set_base_time(struct trace *trace,
3281 struct evsel *evsel,
3282 struct perf_sample *sample)
3283{
3284 /*
3285 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
3286 * and don't use sample->time unconditionally, we may end up having
3287 * some other event in the future without PERF_SAMPLE_TIME for good
3288 * reason, i.e. we may not be interested in its timestamps, just in
3289 * it taking place, picking some piece of information when it
3290 * appears in our event stream (vfs_getname comes to mind).
3291 */
3292 if (trace->base_time == 0 && !trace->full_time &&
3293 (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
3294 trace->base_time = sample->time;
3295}
3296
3297static int trace__process_sample(const struct perf_tool *tool,
3298 union perf_event *event,
3299 struct perf_sample *sample,
3300 struct evsel *evsel,
3301 struct machine *machine __maybe_unused)
3302{
3303 struct trace *trace = container_of(tool, struct trace, tool);
3304 struct thread *thread;
3305 int err = 0;
3306
3307 tracepoint_handler handler = evsel->handler;
3308
3309 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3310 if (thread && thread__is_filtered(thread))
3311 goto out;
3312
3313 trace__set_base_time(trace, evsel, sample);
3314
3315 if (handler) {
3316 ++trace->nr_events;
3317 handler(trace, evsel, event, sample);
3318 }
3319out:
3320 thread__put(thread);
3321 return err;
3322}
3323
/*
 * 'perf trace record': synthesize a 'perf record' command line equivalent
 * to the current trace options (raw syscall tracepoints, page fault events,
 * plus any user-supplied arguments) with a tracepoint filter built from
 * perf's own pid, then hand off to cmd_record().
 */
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};
	pid_t pid = getpid();
	/* Filter built from our own pid, so we don't trace ourselves. */
	char *filter = asprintf__tp_filter_pids(1, &pid);
	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
	int err = -1;

	/* +3 is for the event string below and the pid filter */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 3 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL || filter == NULL)
		goto out_free;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			goto out_free;
		}
	}

	rec_argv[j++] = "--filter";
	rec_argv[j++] = filter;

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	/* Pass through the remaining user-supplied arguments untouched. */
	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	err = cmd_record(j, rec_argv);
out_free:
	free(filter);
	free(rec_argv);
	return err;
}
3391
3392static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
3393
/*
 * Hook up the probe:vfs_getname* probe event(s) used to collect the
 * pathname being resolved by open-like syscalls.  Variants without a
 * "pathname" field are removed from the evlist and deleted; the usable
 * ones get trace__vfs_getname() as their handler.  Returns true when at
 * least one usable variant was found.
 */
static bool evlist__add_vfs_getname(struct evlist *evlist)
{
	bool found = false;
	struct evsel *evsel, *tmp;
	struct parse_events_error err;
	int ret;

	parse_events_error__init(&err);
	ret = parse_events(evlist, "probe:vfs_getname*", &err);
	parse_events_error__exit(&err);
	if (ret)
		return false;

	evlist__for_each_entry_safe(evlist, evsel, tmp) {
		if (!strstarts(evsel__name(evsel), "probe:vfs_getname"))
			continue;

		if (evsel__field(evsel, "pathname")) {
			evsel->handler = trace__vfs_getname;
			found = true;
			continue;
		}

		/* No "pathname" field: this variant is useless to us, drop it. */
		list_del_init(&evsel->core.node);
		evsel->evlist = NULL;
		evsel__delete(evsel);
	}

	return found;
}
3424
3425static struct evsel *evsel__new_pgfault(u64 config)
3426{
3427 struct evsel *evsel;
3428 struct perf_event_attr attr = {
3429 .type = PERF_TYPE_SOFTWARE,
3430 .mmap_data = 1,
3431 };
3432
3433 attr.config = config;
3434 attr.sample_period = 1;
3435
3436 event_attr_init(&attr);
3437
3438 evsel = evsel__new(&attr);
3439 if (evsel)
3440 evsel->handler = trace__pgfault;
3441
3442 return evsel;
3443}
3444
/* Release the per-evsel syscall tracepoint private data for the whole evlist. */
static void evlist__free_syscall_tp_fields(struct evlist *evlist)
{
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		evsel_trace__delete(evsel->priv);
		evsel->priv = NULL;
	}
}
3454
/*
 * Top-level dispatch for one event: non-sample records go to the generic
 * machinery, samples are routed to the handler hooked on their evsel after
 * base-time setup and evswitch filtering.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	if (evswitch__discard(&trace->evswitch, evsel))
		return;

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample must carry a raw payload; anything else is malformed. */
	if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}

	/* --max-events: stop the session once the global print budget is used up. */
	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;
}
3489
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, resolve the
 * tp fields their handlers need ('args' pointer and 'ret'), configure
 * callchain collection and add both to the evlist.  Returns 0 on success,
 * -1 on any setup failure (partially created evsels are deleted).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct evlist *evlist = trace->evlist;
	struct evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	evlist__add(evlist, sys_enter);
	evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->core.attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	evsel__delete_priv(sys_enter);
	goto out;
}
3538
3539static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
3540{
3541 int err = -1;
3542 struct evsel *sys_exit;
3543 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
3544 trace->ev_qualifier_ids.nr,
3545 trace->ev_qualifier_ids.entries);
3546
3547 if (filter == NULL)
3548 goto out_enomem;
3549
3550 if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) {
3551 sys_exit = trace->syscalls.events.sys_exit;
3552 err = evsel__append_tp_filter(sys_exit, filter);
3553 }
3554
3555 free(filter);
3556out:
3557 return err;
3558out_enomem:
3559 errno = ENOMEM;
3560 goto out;
3561}
3562
3563#ifdef HAVE_BPF_SKEL
3564static int syscall_arg_fmt__cache_btf_struct(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
3565{
3566 int id;
3567
3568 if (arg_fmt->type != NULL)
3569 return -1;
3570
3571 id = btf__find_by_name(btf, type);
3572 if (id < 0)
3573 return -1;
3574
3575 arg_fmt->type = btf__type_by_id(btf, id);
3576 arg_fmt->type_id = id;
3577
3578 return 0;
3579}
3580
3581static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
3582{
3583 struct bpf_program *pos, *prog = NULL;
3584 const char *sec_name;
3585
3586 if (trace->skel->obj == NULL)
3587 return NULL;
3588
3589 bpf_object__for_each_program(pos, trace->skel->obj) {
3590 sec_name = bpf_program__section_name(pos);
3591 if (sec_name && !strcmp(sec_name, name)) {
3592 prog = pos;
3593 break;
3594 }
3595 }
3596
3597 return prog;
3598}
3599
/*
 * Pick the BPF augmenter for one side (type = "enter"/"exit") of syscall
 * 'sc': the explicitly named program when the fmt table provides one,
 * otherwise the "tp/syscalls/sys_<type>_<name>" convention (also tried with
 * the syscall's alias), falling back to the generic unaugmented program.
 */
static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
							const char *prog_name, const char *type)
{
	struct bpf_program *prog;

	if (prog_name == NULL) {
		char default_prog_name[256];
		scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name);
		prog = trace__find_bpf_program_by_title(trace, default_prog_name);
		if (prog != NULL)
			goto out_found;
		if (sc->fmt && sc->fmt->alias) {
			scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
			prog = trace__find_bpf_program_by_title(trace, default_prog_name);
			if (prog != NULL)
				goto out_found;
		}
		goto out_unaugmented;
	}

	prog = trace__find_bpf_program_by_title(trace, prog_name);

	if (prog != NULL) {
out_found:
		return prog;
	}

	pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
		 prog_name, type, sc->name);
out_unaugmented:
	return trace->skel->progs.syscall_unaugmented;
}
3632
3633static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
3634{
3635 struct syscall *sc = trace__syscall_info(trace, NULL, id);
3636
3637 if (sc == NULL)
3638 return;
3639
3640 sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3641 sc->bpf_prog.sys_exit = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit : NULL, "exit");
3642}
3643
3644static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
3645{
3646 struct syscall *sc = trace__syscall_info(trace, NULL, id);
3647 return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3648}
3649
3650static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
3651{
3652 struct syscall *sc = trace__syscall_info(trace, NULL, id);
3653 return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3654}
3655
3656static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int key, unsigned int *beauty_array)
3657{
3658 struct tep_format_field *field;
3659 struct syscall *sc = trace__syscall_info(trace, NULL, key);
3660 const struct btf_type *bt;
3661 char *struct_offset, *tmp, name[32];
3662 bool can_augment = false;
3663 int i, cnt;
3664
3665 if (sc == NULL)
3666 return -1;
3667
3668 trace__load_vmlinux_btf(trace);
3669 if (trace->btf == NULL)
3670 return -1;
3671
3672 for (i = 0, field = sc->args; field; ++i, field = field->next) {
3673 // XXX We're only collecting pointer payloads _from_ user space
3674 if (!sc->arg_fmt[i].from_user)
3675 continue;
3676
3677 struct_offset = strstr(field->type, "struct ");
3678 if (struct_offset == NULL)
3679 struct_offset = strstr(field->type, "union ");
3680 else
3681 struct_offset++; // "union" is shorter
3682
3683 if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */
3684 struct_offset += 6;
3685
3686 /* for 'struct foo *', we only want 'foo' */
3687 for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) {
3688 }
3689
3690 strncpy(name, struct_offset, cnt);
3691 name[cnt] = '\0';
3692
3693 /* cache struct's btf_type and type_id */
3694 if (syscall_arg_fmt__cache_btf_struct(&sc->arg_fmt[i], trace->btf, name))
3695 continue;
3696
3697 bt = sc->arg_fmt[i].type;
3698 beauty_array[i] = bt->size;
3699 can_augment = true;
3700 } else if (field->flags & TEP_FIELD_IS_POINTER && /* string */
3701 strcmp(field->type, "const char *") == 0 &&
3702 (strstr(field->name, "name") ||
3703 strstr(field->name, "path") ||
3704 strstr(field->name, "file") ||
3705 strstr(field->name, "root") ||
3706 strstr(field->name, "key") ||
3707 strstr(field->name, "special") ||
3708 strstr(field->name, "type") ||
3709 strstr(field->name, "description"))) {
3710 beauty_array[i] = 1;
3711 can_augment = true;
3712 } else if (field->flags & TEP_FIELD_IS_POINTER && /* buffer */
3713 strstr(field->type, "char *") &&
3714 (strstr(field->name, "buf") ||
3715 strstr(field->name, "val") ||
3716 strstr(field->name, "msg"))) {
3717 int j;
3718 struct tep_format_field *field_tmp;
3719
3720 /* find the size of the buffer that appears in pairs with buf */
3721 for (j = 0, field_tmp = sc->args; field_tmp; ++j, field_tmp = field_tmp->next) {
3722 if (!(field_tmp->flags & TEP_FIELD_IS_POINTER) && /* only integers */
3723 (strstr(field_tmp->name, "count") ||
3724 strstr(field_tmp->name, "siz") || /* size, bufsiz */
3725 (strstr(field_tmp->name, "len") && strcmp(field_tmp->name, "filename")))) {
3726 /* filename's got 'len' in it, we don't want that */
3727 beauty_array[i] = -(j + 1);
3728 can_augment = true;
3729 break;
3730 }
3731 }
3732 }
3733 }
3734
3735 if (can_augment)
3736 return 0;
3737
3738 return -1;
3739}
3740
/*
 * Scan the syscall table for another syscall whose signature is compatible
 * with 'sc' (same argument types, pointers in the same positions, no extra
 * trailing pointers) and that already has a real sys_enter augmenter, so
 * that its BPF program can be reused for 'sc'.  Returns NULL when no
 * suitable pair exists.
 */
static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
{
	struct tep_format_field *field, *candidate_field;
	/*
	 * We're only interested in syscalls that have a pointer:
	 */
	for (field = sc->args; field; field = field->next) {
		if (field->flags & TEP_FIELD_IS_POINTER)
			goto try_to_find_pair;
	}

	return NULL;

try_to_find_pair:
	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
		int id = syscalltbl__id_at_idx(trace->sctbl, i);
		struct syscall *pair = trace__syscall_info(trace, NULL, id);
		struct bpf_program *pair_prog;
		bool is_candidate = false;

		/* Skip ourselves and syscalls stuck with the unaugmented prog. */
		if (pair == NULL || pair == sc ||
		    pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
			continue;

		/* Compare the two signatures argument by argument. */
		for (field = sc->args, candidate_field = pair->args;
		     field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
			bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
			     candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;

			if (is_pointer) {
				if (!candidate_is_pointer) {
					// The candidate just doesn't copies our pointer arg, might copy other pointers we want.
					continue;
				}
			} else {
				if (candidate_is_pointer) {
					// The candidate might copy a pointer we don't have, skip it.
					goto next_candidate;
				}
				continue;
			}

			if (strcmp(field->type, candidate_field->type))
				goto next_candidate;

			/*
			 * This is limited in the BPF program but sys_write
			 * uses "const char *" for its "buf" arg so we need to
			 * use some heuristic that is kinda future proof...
			 */
			if (strcmp(field->type, "const char *") == 0 &&
			    !(strstr(field->name, "name") ||
			      strstr(field->name, "path") ||
			      strstr(field->name, "file") ||
			      strstr(field->name, "root") ||
			      strstr(field->name, "description")))
				goto next_candidate;

			is_candidate = true;
		}

		if (!is_candidate)
			goto next_candidate;

		/*
		 * Check if the tentative pair syscall augmenter has more pointers, if it has,
		 * then it may be collecting that and we then can't use it, as it would collect
		 * more than what is common to the two syscalls.
		 */
		if (candidate_field) {
			for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
				if (candidate_field->flags & TEP_FIELD_IS_POINTER)
					goto next_candidate;
		}

		pair_prog = pair->bpf_prog.sys_enter;
		/*
		 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
		 * have been searched for, so search it here and if it returns the
		 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
		 * program for a filtered syscall on a non-filtered one.
		 *
		 * For instance, we have "!syscalls:sys_enter_renameat" and that is
		 * useful for "renameat2".
		 */
		if (pair_prog == NULL) {
			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
			if (pair_prog == trace->skel->progs.syscall_unaugmented)
				goto next_candidate;
		}

		pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
		return pair_prog;
	next_candidate:
		continue;
	}

	return NULL;
}
3840
3841static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
3842{
3843 int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
3844 int map_exit_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
3845 int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter);
3846 int err = 0;
3847 unsigned int beauty_array[6];
3848
3849 for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3850 int prog_fd, key = syscalltbl__id_at_idx(trace->sctbl, i);
3851
3852 if (!trace__syscall_enabled(trace, key))
3853 continue;
3854
3855 trace__init_syscall_bpf_progs(trace, key);
3856
3857 // It'll get at least the "!raw_syscalls:unaugmented"
3858 prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
3859 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3860 if (err)
3861 break;
3862 prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
3863 err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
3864 if (err)
3865 break;
3866
3867 /* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */
3868 memset(beauty_array, 0, sizeof(beauty_array));
3869 err = trace__bpf_sys_enter_beauty_map(trace, key, (unsigned int *)beauty_array);
3870 if (err)
3871 continue;
3872 err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY);
3873 if (err)
3874 break;
3875 }
3876
3877 /*
3878 * Now lets do a second pass looking for enabled syscalls without
3879 * an augmenter that have a signature that is a superset of another
3880 * syscall with an augmenter so that we can auto-reuse it.
3881 *
3882 * I.e. if we have an augmenter for the "open" syscall that has
3883 * this signature:
3884 *
3885 * int open(const char *pathname, int flags, mode_t mode);
3886 *
3887 * I.e. that will collect just the first string argument, then we
3888 * can reuse it for the 'creat' syscall, that has this signature:
3889 *
3890 * int creat(const char *pathname, mode_t mode);
3891 *
3892 * and for:
3893 *
3894 * int stat(const char *pathname, struct stat *statbuf);
3895 * int lstat(const char *pathname, struct stat *statbuf);
3896 *
3897 * Because the 'open' augmenter will collect the first arg as a string,
3898 * and leave alone all the other args, which already helps with
3899 * beautifying 'stat' and 'lstat''s pathname arg.
3900 *
3901 * Then, in time, when 'stat' gets an augmenter that collects both
3902 * first and second arg (this one on the raw_syscalls:sys_exit prog
3903 * array tail call, then that one will be used.
3904 */
3905 for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3906 int key = syscalltbl__id_at_idx(trace->sctbl, i);
3907 struct syscall *sc = trace__syscall_info(trace, NULL, key);
3908 struct bpf_program *pair_prog;
3909 int prog_fd;
3910
3911 if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
3912 continue;
3913
3914 /*
3915 * For now we're just reusing the sys_enter prog, and if it
3916 * already has an augmenter, we don't need to find one.
3917 */
3918 if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
3919 continue;
3920
3921 /*
3922 * Look at all the other syscalls for one that has a signature
3923 * that is close enough that we can share:
3924 */
3925 pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
3926 if (pair_prog == NULL)
3927 continue;
3928
3929 sc->bpf_prog.sys_enter = pair_prog;
3930
3931 /*
3932 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
3933 * with the fd for the program we're reusing:
3934 */
3935 prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
3936 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3937 if (err)
3938 break;
3939 }
3940
3941 return err;
3942}
3943#endif // HAVE_BPF_SKEL
3944
3945static int trace__set_ev_qualifier_filter(struct trace *trace)
3946{
3947 if (trace->syscalls.events.sys_enter)
3948 return trace__set_ev_qualifier_tp_filter(trace);
3949 return 0;
3950}
3951
3952static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
3953 size_t npids __maybe_unused, pid_t *pids __maybe_unused)
3954{
3955 int err = 0;
3956#ifdef HAVE_LIBBPF_SUPPORT
3957 bool value = true;
3958 int map_fd = bpf_map__fd(map);
3959 size_t i;
3960
3961 for (i = 0; i < npids; ++i) {
3962 err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
3963 if (err)
3964 break;
3965 }
3966#endif
3967 return err;
3968}
3969
3970static int trace__set_filter_loop_pids(struct trace *trace)
3971{
3972 unsigned int nr = 1, err;
3973 pid_t pids[32] = {
3974 getpid(),
3975 };
3976 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
3977
3978 while (thread && nr < ARRAY_SIZE(pids)) {
3979 struct thread *parent = machine__find_thread(trace->host,
3980 thread__ppid(thread),
3981 thread__ppid(thread));
3982
3983 if (parent == NULL)
3984 break;
3985
3986 if (!strcmp(thread__comm_str(parent), "sshd") ||
3987 strstarts(thread__comm_str(parent), "gnome-terminal")) {
3988 pids[nr++] = thread__tid(parent);
3989 break;
3990 }
3991 thread = parent;
3992 }
3993
3994 err = evlist__append_tp_filter_pids(trace->evlist, nr, pids);
3995 if (!err && trace->filter_pids.map)
3996 err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
3997
3998 return err;
3999}
4000
4001static int trace__set_filter_pids(struct trace *trace)
4002{
4003 int err = 0;
4004 /*
4005 * Better not use !target__has_task() here because we need to cover the
4006 * case where no threads were specified in the command line, but a
4007 * workload was, and in that case we will fill in the thread_map when
4008 * we fork the workload in evlist__prepare_workload.
4009 */
4010 if (trace->filter_pids.nr > 0) {
4011 err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
4012 trace->filter_pids.entries);
4013 if (!err && trace->filter_pids.map) {
4014 err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
4015 trace->filter_pids.entries);
4016 }
4017 } else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
4018 err = trace__set_filter_loop_pids(trace);
4019 }
4020
4021 return err;
4022}
4023
4024static int __trace__deliver_event(struct trace *trace, union perf_event *event)
4025{
4026 struct evlist *evlist = trace->evlist;
4027 struct perf_sample sample;
4028 int err = evlist__parse_sample(evlist, event, &sample);
4029
4030 if (err)
4031 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
4032 else
4033 trace__handle_event(trace, event, &sample);
4034
4035 return 0;
4036}
4037
4038static int __trace__flush_events(struct trace *trace)
4039{
4040 u64 first = ordered_events__first_time(&trace->oe.data);
4041 u64 flush = trace->oe.last - NSEC_PER_SEC;
4042
4043 /* Is there some thing to flush.. */
4044 if (first && first < flush)
4045 return ordered_events__flush_time(&trace->oe.data, flush);
4046
4047 return 0;
4048}
4049
4050static int trace__flush_events(struct trace *trace)
4051{
4052 return !trace->sort_events ? 0 : __trace__flush_events(trace);
4053}
4054
4055static int trace__deliver_event(struct trace *trace, union perf_event *event)
4056{
4057 int err;
4058
4059 if (!trace->sort_events)
4060 return __trace__deliver_event(trace, event);
4061
4062 err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
4063 if (err && err != -1)
4064 return err;
4065
4066 err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL);
4067 if (err)
4068 return err;
4069
4070 return trace__flush_events(trace);
4071}
4072
/*
 * Callback used by the ordered_events machinery to deliver a queued
 * event: recover the enclosing struct trace from the embedded oe.data
 * member and hand the raw event to __trace__deliver_event().
 */
static int ordered_events__deliver_event(struct ordered_events *oe,
					 struct ordered_event *event)
{
	struct trace *trace = container_of(oe, struct trace, oe.data);

	return __trace__deliver_event(trace, event->event);
}
4080
4081static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg,
4082 char **type)
4083{
4084 struct tep_format_field *field;
4085 struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel);
4086
4087 if (evsel->tp_format == NULL || fmt == NULL)
4088 return NULL;
4089
4090 for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt)
4091 if (strcmp(field->name, arg) == 0) {
4092 *type = field->type;
4093 return fmt;
4094 }
4095
4096 return NULL;
4097}
4098
4099static int trace__expand_filter(struct trace *trace, struct evsel *evsel)
4100{
4101 char *tok, *left = evsel->filter, *new_filter = evsel->filter;
4102
4103 while ((tok = strpbrk(left, "=<>!")) != NULL) {
4104 char *right = tok + 1, *right_end;
4105
4106 if (*right == '=')
4107 ++right;
4108
4109 while (isspace(*right))
4110 ++right;
4111
4112 if (*right == '\0')
4113 break;
4114
4115 while (!isalpha(*left))
4116 if (++left == tok) {
4117 /*
4118 * Bail out, can't find the name of the argument that is being
4119 * used in the filter, let it try to set this filter, will fail later.
4120 */
4121 return 0;
4122 }
4123
4124 right_end = right + 1;
4125 while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
4126 ++right_end;
4127
4128 if (isalpha(*right)) {
4129 struct syscall_arg_fmt *fmt;
4130 int left_size = tok - left,
4131 right_size = right_end - right;
4132 char arg[128], *type;
4133
4134 while (isspace(left[left_size - 1]))
4135 --left_size;
4136
4137 scnprintf(arg, sizeof(arg), "%.*s", left_size, left);
4138
4139 fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type);
4140 if (fmt == NULL) {
4141 pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
4142 arg, evsel->name, evsel->filter);
4143 return -1;
4144 }
4145
4146 pr_debug2("trying to expand \"%s\" \"%.*s\" \"%.*s\" -> ",
4147 arg, (int)(right - tok), tok, right_size, right);
4148
4149 if (fmt->strtoul) {
4150 u64 val;
4151 struct syscall_arg syscall_arg = {
4152 .trace = trace,
4153 .fmt = fmt,
4154 .type_name = type,
4155 .parm = fmt->parm,
4156 };
4157
4158 if (fmt->strtoul(right, right_size, &syscall_arg, &val)) {
4159 char *n, expansion[19];
4160 int expansion_lenght = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val);
4161 int expansion_offset = right - new_filter;
4162
4163 pr_debug("%s", expansion);
4164
4165 if (asprintf(&n, "%.*s%s%s", expansion_offset, new_filter, expansion, right_end) < 0) {
4166 pr_debug(" out of memory!\n");
4167 free(new_filter);
4168 return -1;
4169 }
4170 if (new_filter != evsel->filter)
4171 free(new_filter);
4172 left = n + expansion_offset + expansion_lenght;
4173 new_filter = n;
4174 } else {
4175 pr_err("\"%.*s\" not found for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4176 right_size, right, arg, evsel->name, evsel->filter);
4177 return -1;
4178 }
4179 } else {
4180 pr_err("No resolver (strtoul) for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4181 arg, evsel->name, evsel->filter);
4182 return -1;
4183 }
4184
4185 pr_debug("\n");
4186 } else {
4187 left = right_end;
4188 }
4189 }
4190
4191 if (new_filter != evsel->filter) {
4192 pr_debug("New filter for %s: %s\n", evsel->name, new_filter);
4193 evsel__set_filter(evsel, new_filter);
4194 free(new_filter);
4195 }
4196
4197 return 0;
4198}
4199
4200static int trace__expand_filters(struct trace *trace, struct evsel **err_evsel)
4201{
4202 struct evlist *evlist = trace->evlist;
4203 struct evsel *evsel;
4204
4205 evlist__for_each_entry(evlist, evsel) {
4206 if (evsel->filter == NULL)
4207 continue;
4208
4209 if (trace__expand_filter(trace, evsel)) {
4210 *err_evsel = evsel;
4211 return -1;
4212 }
4213 }
4214
4215 return 0;
4216}
4217
/*
 * Live-tracing main loop: set up the requested evsels (syscall
 * tracepoints, page faults, sched_stat_runtime, vfs_getname), create the
 * target maps, optionally fork the workload, open/mmap the events, wire
 * up the BPF augmenter maps and pid filters, then read the ring buffers
 * until interrupted or the workload finishes.
 *
 * Returns 0 on success or a negative error; human-readable diagnostics
 * go to trace->output via the out_error_* labels at the bottom.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct evlist *evlist = trace->evlist;
	struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (!trace->raw_augmented_syscalls) {
		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
			goto out_error_raw_syscalls;

		if (trace->trace_syscalls)
			trace->vfs_getname = evlist__add_vfs_getname(evlist);
	}

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		evlist__add(evlist, pgfault_min);
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;

	if (trace->sched &&
	    evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;
	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * 	trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	evlist__config(evlist, &trace->opts, &callchain_param);

	if (forks) {
		/* The remaining argv is the workload command line to fork+exec. */
		err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
		workload_pid = evlist->workload.pid;
	}

	err = evlist__open(evlist);
	if (err < 0)
		goto out_error_open;
#ifdef HAVE_BPF_SKEL
	if (trace->syscalls.events.bpf_output) {
		struct perf_cpu cpu;

		/*
		 * Set up the __augmented_syscalls__ BPF map to hold for each
		 * CPU the bpf-output event's file descriptor.
		 */
		perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
			bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
					&cpu.cpu, sizeof(int),
					xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
						       cpu.cpu, 0),
					sizeof(__u32), BPF_ANY);
		}
	}

	if (trace->skel)
		trace->filter_pids.map = trace->skel->maps.pids_filtered;
#endif
	err = trace__set_filter_pids(trace);
	if (err < 0)
		goto out_error_mem;

#ifdef HAVE_BPF_SKEL
	if (trace->skel && trace->skel->progs.sys_enter)
		trace__init_syscalls_bpf_prog_array_maps(trace);
#endif

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		if (trace->syscalls.events.sys_exit) {
			pr_debug("event qualifier tracepoint filter: %s\n",
				 trace->syscalls.events.sys_exit->filter);
		}
	}

	/*
	 * If the "close" syscall is not traced, then we will not have the
	 * opportunity to, in syscall_arg__scnprintf_close_fd() invalidate the
	 * fd->pathname table and were ending up showing the last value set by
	 * syscalls opening a pathname and associating it with a descriptor or
	 * reading it from /proc/pid/fd/ in cases where that doesn't make
	 * sense.
	 *
	 *  So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
	 *  not in use.
	 */
	trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));

	err = trace__expand_filters(trace, &evsel);
	if (err)
		goto out_delete_evlist;
	err = evlist__apply_filters(evlist, &evsel, &trace->opts.target);
	if (err < 0)
		goto out_error_apply_filters;

	err = evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
		evlist__enable(evlist);

	if (forks)
		evlist__start_workload(evlist);

	/* --delay: let the workload warm up before counting events. */
	if (trace->opts.target.initial_delay) {
		usleep(trace->opts.target.initial_delay * 1000);
		evlist__enable(evlist);
	}

	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
		perf_thread_map__nr(evlist->core.threads) > 1 ||
		evlist__first(evlist)->core.attr.inherit;

	/*
	 * Now that we already used evsel->core.attr to ask the kernel to setup the
	 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->core.attr.sample_max_stack == 0)
			evsel->core.attr.sample_max_stack = trace->max_stack;
	}
again:
	/* Remember the event count so we can tell whether this pass made progress. */
	before = trace->nr_events;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		union perf_event *event;
		struct mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(&md->core) < 0)
			continue;

		while ((event = perf_mmap__read_event(&md->core)) != NULL) {
			++trace->nr_events;

			err = trace__deliver_event(trace, event);
			if (err)
				goto out_disable;

			perf_mmap__consume(&md->core);

			if (interrupted)
				goto out_disable;

			/* On SIGINT/SIGTERM: stop producing, keep draining what's buffered. */
			if (done && !draining) {
				evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(&md->core);
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && evlist__poll(evlist, timeout) > 0) {
			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
				draining = true;

			goto again;
		} else {
			if (trace__flush_events(trace))
				goto out_disable;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	evlist__disable(evlist);

	if (trace->sort_events)
		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);
	evlist__free_syscall_tp_fields(evlist);
	evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error exits: a separate scope so that errbuf is only live on these
 * paths; each label formats a message and jumps back to the common
 * cleanup at out_delete_evlist above.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
4516
/*
 * Replay mode (perf trace -i perf.data): process a previously recorded
 * session instead of tracing live.  Wires up the perf_tool callbacks,
 * finds the raw_syscalls (or older syscalls) sys_enter/sys_exit
 * tracepoints and the page fault software events in the recorded evlist,
 * then runs the session through the normal event handlers.
 */
static int trace__replay(struct trace *trace)
{
	const struct evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, &trace->tool);
	if (IS_ERR(session))
		return PTR_ERR(session);

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter");
	trace->syscalls.events.sys_enter = evsel;
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter");

	if (evsel &&
	    (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit");
	trace->syscalls.events.sys_exit = evsel;
	if (evsel == NULL)
		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit");
	if (evsel &&
	    (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
4613
/* Print the per-thread summary banner, returning the chars written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
4622
/*
 * Build a resorted view of a thread's per-syscall stats rbtree, ordered
 * by total time (msecs) descending.  The body below is the per-node
 * initializer run by the DEFINE_RESORT_RB machinery: 'nd' is the source
 * intlist rbtree node, 'entry' is the resorted entry being filled in.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct syscall_stats *stats;
	double		     msecs;
	int		     syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct syscall_stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* Total time = number of calls * average duration, scaled to msecs. */
	entry->msecs   = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0;
}
4636
/*
 * Print the per-syscall statistics table for one thread (calls, errors,
 * total/min/avg/max times and stddev), sorted by total time descending
 * via the syscall_stats resort rbtree.  With --errno-summary, also print
 * a breakdown of the error counts per errno.  Returns chars written.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

	/* syscall_stats_entry is introduced by the resort_rb machinery. */
	resort_rb__for_each_entry(nd, syscall_stats) {
		struct syscall_stats *stats = syscall_stats_entry->stats;
		if (stats) {
			double min = (double)(stats->stats.min) / NSEC_PER_MSEC;
			double max = (double)(stats->stats.max) / NSEC_PER_MSEC;
			double avg = avg_stats(&stats->stats);
			double pct;
			u64 n = (u64)stats->stats.n;

			/* Relative stddev as a percentage of the average. */
			pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
					   n, stats->nr_failures, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);

			if (trace->errno_summary && stats->nr_failures) {
				int e;

				/* errnos[] is indexed by errno - 1. */
				for (e = 0; e < stats->max_errno; ++e) {
					if (stats->errnos[e] != 0)
						fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]);
				}
			}
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
4688
4689static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
4690{
4691 size_t printed = 0;
4692 struct thread_trace *ttrace = thread__priv(thread);
4693 double ratio;
4694
4695 if (ttrace == NULL)
4696 return 0;
4697
4698 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
4699
4700 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread));
4701 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
4702 printed += fprintf(fp, "%.1f%%", ratio);
4703 if (ttrace->pfmaj)
4704 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
4705 if (ttrace->pfmin)
4706 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
4707 if (trace->sched)
4708 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
4709 else if (fputc('\n', fp) != EOF)
4710 ++printed;
4711
4712 printed += thread__dump_stats(ttrace, trace, fp);
4713
4714 return printed;
4715}
4716
4717static unsigned long thread__nr_events(struct thread_trace *ttrace)
4718{
4719 return ttrace ? ttrace->nr_events : 0;
4720}
4721
4722static int trace_nr_events_cmp(void *priv __maybe_unused,
4723 const struct list_head *la,
4724 const struct list_head *lb)
4725{
4726 struct thread_list *a = list_entry(la, struct thread_list, list);
4727 struct thread_list *b = list_entry(lb, struct thread_list, list);
4728 unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread));
4729 unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread));
4730
4731 if (a_nr_events != b_nr_events)
4732 return a_nr_events < b_nr_events ? -1 : 1;
4733
4734 /* Identical number of threads, place smaller tids first. */
4735 return thread__tid(a->thread) < thread__tid(b->thread)
4736 ? -1
4737 : (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0);
4738}
4739
4740static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
4741{
4742 size_t printed = trace__fprintf_threads_header(fp);
4743 LIST_HEAD(threads);
4744
4745 if (machine__thread_list(trace->host, &threads) == 0) {
4746 struct thread_list *pos;
4747
4748 list_sort(NULL, &threads, trace_nr_events_cmp);
4749
4750 list_for_each_entry(pos, &threads, list)
4751 printed += trace__fprintf_thread(fp, pos->thread, trace);
4752 }
4753 thread_list__delete(&threads);
4754 return printed;
4755}
4756
4757static int trace__set_duration(const struct option *opt, const char *str,
4758 int unset __maybe_unused)
4759{
4760 struct trace *trace = opt->value;
4761
4762 trace->duration_filter = atof(str);
4763 return 0;
4764}
4765
4766static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
4767 int unset __maybe_unused)
4768{
4769 int ret = -1;
4770 size_t i;
4771 struct trace *trace = opt->value;
4772 /*
4773 * FIXME: introduce a intarray class, plain parse csv and create a
4774 * { int nr, int entries[] } struct...
4775 */
4776 struct intlist *list = intlist__new(str);
4777
4778 if (list == NULL)
4779 return -1;
4780
4781 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
4782 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
4783
4784 if (trace->filter_pids.entries == NULL)
4785 goto out;
4786
4787 trace->filter_pids.entries[0] = getpid();
4788
4789 for (i = 1; i < trace->filter_pids.nr; ++i)
4790 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
4791
4792 intlist__delete(list);
4793 ret = 0;
4794out:
4795 return ret;
4796}
4797
4798static int trace__open_output(struct trace *trace, const char *filename)
4799{
4800 struct stat st;
4801
4802 if (!stat(filename, &st) && st.st_size) {
4803 char oldname[PATH_MAX];
4804
4805 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
4806 unlink(oldname);
4807 rename(filename, oldname);
4808 }
4809
4810 trace->output = fopen(filename, "w");
4811
4812 return trace->output == NULL ? -errno : 0;
4813}
4814
4815static int parse_pagefaults(const struct option *opt, const char *str,
4816 int unset __maybe_unused)
4817{
4818 int *trace_pgfaults = opt->value;
4819
4820 if (strcmp(str, "all") == 0)
4821 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
4822 else if (strcmp(str, "maj") == 0)
4823 *trace_pgfaults |= TRACE_PFMAJ;
4824 else if (strcmp(str, "min") == 0)
4825 *trace_pgfaults |= TRACE_PFMIN;
4826 else
4827 return -1;
4828
4829 return 0;
4830}
4831
4832static void evlist__set_default_evsel_handler(struct evlist *evlist, void *handler)
4833{
4834 struct evsel *evsel;
4835
4836 evlist__for_each_entry(evlist, evsel) {
4837 if (evsel->handler == NULL)
4838 evsel->handler = handler;
4839 }
4840}
4841
4842static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name)
4843{
4844 struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
4845
4846 if (fmt) {
4847 const struct syscall_fmt *scfmt = syscall_fmt__find(name);
4848
4849 if (scfmt) {
4850 int skip = 0;
4851
4852 if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 ||
4853 strcmp(evsel->tp_format->format.fields->name, "nr") == 0)
4854 ++skip;
4855
4856 memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt));
4857 }
4858 }
4859}
4860
4861static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf)
4862{
4863 struct evsel *evsel;
4864
4865 evlist__for_each_entry(evlist, evsel) {
4866 if (evsel->priv || !evsel->tp_format)
4867 continue;
4868
4869 if (strcmp(evsel->tp_format->system, "syscalls")) {
4870 evsel__init_tp_arg_scnprintf(evsel, use_btf);
4871 continue;
4872 }
4873
4874 if (evsel__init_syscall_tp(evsel))
4875 return -1;
4876
4877 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
4878 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4879
4880 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
4881 return -1;
4882
4883 evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1);
4884 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
4885 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4886
4887 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
4888 return -1;
4889
4890 evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1);
4891 }
4892 }
4893
4894 return 0;
4895}
4896
4897/*
4898 * XXX: Hackish, just splitting the combined -e+--event (syscalls
4899 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
4900 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
4901 *
4902 * It'd be better to introduce a parse_options() variant that would return a
4903 * list with the terms it didn't match to an event...
4904 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	/*
	 * lists[1] collects syscall names/globs/aliases/strace groups (the
	 * event qualifier), lists[0] collects everything else, which is
	 * handed to parse_events_option() as regular perf events.
	 */
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];
	const struct syscall_fmt *fmt;

	if (strace_groups_dir == NULL)
		return -1;

	/* A leading '!' negates the whole qualifier. */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	/*
	 * Walk the comma separated terms, temporarily NUL-terminating each
	 * one in place (the ',' is restored at the bottom of the loop).
	 */
	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		/* Exact syscall name or a glob matching at least one syscall? */
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
			goto do_concat;
		}

		/* An strace-style alias (e.g. "umount" -> "umount2")? */
		fmt = syscall_fmt__find_by_alias(s);
		if (fmt != NULL) {
			list = 1;
			s = fmt->name;
		} else {
			/* Or a readable strace group file (e.g. "file", "net")? */
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}
do_concat:
		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			/* len covers the whole original str, so no overflow here. */
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct parse_events_option_args parse_events_option_args = {
			.evlistp = &trace->evlist,
		};
		struct option o = {
			.value = &parse_events_option_args,
		};
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	free(strace_groups_dir);
	free(lists[0]);
	free(lists[1]);
	/* Restore the last ',' we may have clobbered in the caller's string. */
	if (sep)
		*sep = ',';

	return err;
}
4997
4998static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
4999{
5000 struct trace *trace = opt->value;
5001
5002 if (!list_empty(&trace->evlist->core.entries)) {
5003 struct option o = {
5004 .value = &trace->evlist,
5005 };
5006 return parse_cgroups(&o, str, unset);
5007 }
5008 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
5009
5010 return 0;
5011}
5012
/*
 * perf_config() callback: apply the trace.* variables from .perfconfig to
 * the struct trace being set up.  Unknown variables are silently ignored.
 * Returns 0 except when copying trace.add_events fails.
 */
static int trace__config(const char *var, const char *value, void *arg)
{
	struct trace *trace = arg;
	int err = 0;

	if (!strcmp(var, "trace.add_events")) {
		/* Extra events parsed later in cmd_trace(), after --verbose is known. */
		trace->perfconfig_events = strdup(value);
		if (trace->perfconfig_events == NULL) {
			pr_err("Not enough memory for %s\n", "trace.add_events");
			return -1;
		}
	} else if (!strcmp(var, "trace.show_timestamp")) {
		trace->show_tstamp = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.show_duration")) {
		trace->show_duration = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.show_arg_names")) {
		trace->show_arg_names = perf_config_bool(var, value);
		/* Without arg names, zero args must be printed to keep positions clear. */
		if (!trace->show_arg_names)
			trace->show_zeros = true;
	} else if (!strcmp(var, "trace.show_zeros")) {
		bool new_show_zeros = perf_config_bool(var, value);
		/* Refuse show_zeros=no when arg names are off, for the reason above. */
		if (!trace->show_arg_names && !new_show_zeros) {
			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
			goto out;
		}
		trace->show_zeros = new_show_zeros;
	} else if (!strcmp(var, "trace.show_prefix")) {
		trace->show_string_prefix = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.no_inherit")) {
		trace->opts.no_inherit = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.args_alignment")) {
		/* Keep the built-in default on parse failure. */
		int args_alignment = 0;
		if (perf_config_int(&args_alignment, var, value) == 0)
			trace->args_alignment = args_alignment;
	} else if (!strcmp(var, "trace.tracepoint_beautifiers")) {
		/* Choose between libtraceevent and libbeauty tracepoint printing. */
		if (strcasecmp(value, "libtraceevent") == 0)
			trace->libtraceevent_print = true;
		else if (strcasecmp(value, "libbeauty") == 0)
			trace->libtraceevent_print = false;
	}
out:
	return err;
}
5056
5057static void trace__exit(struct trace *trace)
5058{
5059 int i;
5060
5061 strlist__delete(trace->ev_qualifier);
5062 zfree(&trace->ev_qualifier_ids.entries);
5063 if (trace->syscalls.table) {
5064 for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
5065 syscall__exit(&trace->syscalls.table[i]);
5066 zfree(&trace->syscalls.table);
5067 }
5068 syscalltbl__delete(trace->sctbl);
5069 zfree(&trace->perfconfig_events);
5070}
5071
5072#ifdef HAVE_BPF_SKEL
/*
 * Add the "__augmented_syscalls__" bpf-output event to the evlist; it is the
 * channel through which the BPF augmenter delivers syscall payloads.
 * Returns the parse_event() result, 0 on success.
 */
static int bpf__setup_bpf_output(struct evlist *evlist)
{
	int ret;

	ret = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
	if (ret != 0)
		pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");

	return ret;
}
5082#endif
5083
/*
 * Entry point for 'perf trace': parse options (and the 'record'
 * subcommand), optionally set up BPF-based syscall augmentation, wire up
 * tracepoint handlers and then either replay a perf.data file (-i) or run
 * a live trace.  Returns 0 on success, negative on error.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/* Defaults; UINT_MAX/ULLONG_MAX mean "not set by the user". */
	struct trace trace = {
		.opts = {
			.target = {
				.uid = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering = true,
			.mmap_pages = UINT_MAX,
		},
		.output = stderr,
		.show_comm = true,
		.show_tstamp = true,
		.show_duration = true,
		.show_arg_names = true,
		.args_alignment = 70,
		.trace_syscalls = false,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
		.max_events = ULONG_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_CALLBACK(0, "filter", &trace.evlist, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages", evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN(0, "failure", &trace.failure_only,
		    "Show only syscalls that failed"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
		    "Show errno stats per syscall, use with -s or -S"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
		    "Use libtraceevent to print the tracepoint arguments."),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_ULONG(0, "max-events", &trace.max_events,
		"Set the maximum number of events to print, exit after that is reached. "),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
			"Sort batch of events before processing, use if getting out of order events"),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
	OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
		       "to customized ones"),
	OPTS_EVSWITCH(&trace.evswitch),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	struct evsel *evsel;
	const char * const trace_subcommands[] = { "record", NULL };
	int err = -1;
	char bf[BUFSIZ];
	struct sigaction sigchld_act;

	/* Dump a stack trace on hard faults; SIGINT is handled separately. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);
	signal(SIGINT, sighandler_interrupt);

	/* SA_SIGINFO so the SIGCHLD handler gets details about the child. */
	memset(&sigchld_act, 0, sizeof(sigchld_act));
	sigchld_act.sa_flags = SA_SIGINFO;
	sigchld_act.sa_sigaction = sighandler_chld;
	sigaction(SIGCHLD, &sigchld_act, NULL);

	trace.evlist = evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Parsing .perfconfig may entail creating a BPF event, that may need
	 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
	 * is too small. This affects just this process, not touching the
	 * global setting. If it fails we'll get something in 'perf trace -v'
	 * to help diagnose the problem.
	 */
	rlimit__bump_memlock();

	/* Apply trace.* settings from .perfconfig via trace__config(). */
	err = perf_config(trace__config, &trace);
	if (err)
		goto out;

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/*
	 * Here we already passed thru trace__parse_events_option() and it has
	 * already figured out if -e syscall_name, if not but if --event
	 * foo:bar was used, the user is interested _just_ in those, say,
	 * tracepoint events, not in the strace-like syscall-name-based mode.
	 *
	 * This is important because we need to check if strace-like mode is
	 * needed to decided if we should filter out the eBPF
	 * __augmented_syscalls__ code, if it is in the mix, say, via
	 * .perfconfig trace.add_events, and filter those out.
	 */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->core.nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}
	/*
	 * Now that we have --verbose figured out, lets see if we need to parse
	 * events from .perfconfig, so that if those events fail parsing, say some
	 * BPF program fails, then we'll be able to use --verbose to see what went
	 * wrong in more detail.
	 */
	if (trace.perfconfig_events != NULL) {
		struct parse_events_error parse_err;

		parse_events_error__init(&parse_err);
		err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err);
		if (err)
			parse_events_error__print(&parse_err, trace.perfconfig_events);
		parse_events_error__exit(&parse_err);
		if (err)
			goto out;
	}

	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

#ifdef HAVE_BPF_SKEL
	/* Syscall augmentation: load the BPF skeleton that copies syscall args. */
	if (!trace.trace_syscalls)
		goto skip_augmentation;

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) {
		pr_debug("Syscall augmentation fails with record, disabling augmentation");
		goto skip_augmentation;
	}

	trace.skel = augmented_raw_syscalls_bpf__open();
	if (!trace.skel) {
		pr_debug("Failed to open augmented syscalls BPF skeleton");
	} else {
		/*
		 * Disable attaching the BPF programs except for sys_enter and
		 * sys_exit that tail call into this as necessary.
		 */
		struct bpf_program *prog;

		bpf_object__for_each_program(prog, trace.skel->obj) {
			if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit)
				bpf_program__set_autoattach(prog, /*autoattach=*/false);
		}

		err = augmented_raw_syscalls_bpf__load(trace.skel);

		if (err < 0) {
			libbpf_strerror(err, bf, sizeof(bf));
			pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf);
		} else {
			augmented_raw_syscalls_bpf__attach(trace.skel);
			trace__add_syscall_newtp(&trace);
		}
	}

	/* The bpf-output event through which augmented payloads arrive. */
	err = bpf__setup_bpf_output(trace.evlist);
	if (err) {
		libbpf_strerror(err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
		goto out;
	}
	trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
	assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__"));
skip_augmentation:
#endif
	err = -1;

	/* Page fault samples need address and timestamp info. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Stack-depth options imply callchains; default to DWARF unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/* Callchains need bigger mmap buffers; only root can bump them here. */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	/* Hook up handlers and payload accessors for the --event tracepoints. */
	if (trace.evlist->core.nr_entries > 0) {
		bool use_btf = false;

		evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
		if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) {
			perror("failed to set syscalls:* tracepoint fields");
			goto out;
		}

		if (use_btf)
			trace__load_vmlinux_btf(&trace);
	}

	if (trace.sort_events) {
		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
		ordered_events__set_copy_on_queue(&trace.oe.data, true);
	}

	/*
	 * If we are augmenting syscalls, then combine what we put in the
	 * __augmented_syscalls__ BPF map with what is in the
	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
	 *
	 * We'll switch to look at two BPF maps, one for sys_enter and the
	 * other for sys_exit when we start augmenting the sys_exit paths with
	 * buffers that are being copied from kernel to userspace, think 'read'
	 * syscall.
	 */
	if (trace.syscalls.events.bpf_output) {
		evlist__for_each_entry(trace.evlist, evsel) {
			bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit");

			if (raw_syscalls_sys_exit) {
				trace.raw_augmented_syscalls = true;
				goto init_augmented_syscall_tp;
			}

			if (trace.syscalls.events.bpf_output->priv == NULL &&
			    strstr(evsel__name(evsel), "syscalls:sys_enter")) {
				struct evsel *augmented = trace.syscalls.events.bpf_output;
				if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
				    evsel__init_augmented_syscall_tp_args(augmented))
					goto out;
				/*
				 * Augmented is __augmented_syscalls__ BPF_OUTPUT event
				 * Above we made sure we can get from the payload the tp fields
				 * that we get from syscalls:sys_enter tracefs format file.
				 */
				augmented->handler = trace__sys_enter;
				/*
				 * Now we do the same for the *syscalls:sys_enter event so that
				 * if we handle it directly, i.e. if the BPF prog returns 0 so
				 * as not to filter it, then we'll handle it just like we would
				 * for the BPF_OUTPUT one:
				 */
				if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
				    evsel__init_augmented_syscall_tp_args(evsel))
					goto out;
				evsel->handler = trace__sys_enter;
			}

			if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) {
				struct syscall_tp *sc;
init_augmented_syscall_tp:
				if (evsel__init_augmented_syscall_tp(evsel, evsel))
					goto out;
				sc = __evsel__syscall_tp(evsel);
				/*
				 * For now with BPF raw_augmented we hook into
				 * raw_syscalls:sys_enter and there we get all
				 * 6 syscall args plus the tracepoint common
				 * fields and the syscall_nr (another long).
				 * So we check if that is the case and if so
				 * don't look after the sc->args_size but
				 * always after the full raw_syscalls:sys_enter
				 * payload, which is fixed.
				 *
				 * We'll revisit this later to pass
				 * s->args_size to the BPF augmenter (now
				 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
				 * so that it copies only what we need for each
				 * syscall, like what happens when we use
				 * syscalls:sys_enter_NAME, so that we reduce
				 * the kernel/userspace traffic to just what is
				 * needed for each syscall.
				 */
				if (trace.raw_augmented_syscalls)
					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
				evsel__init_augmented_syscall_tp_ret(evsel);
				evsel->handler = trace__sys_exit;
			}
		}
	}

	/* 'perf trace record' delegates to perf record with syscall events. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* Using just --errno-summary will trigger --summary */
	if (trace.errno_summary && !trace.summary && !trace.summary_only)
		trace.summary_only = true;

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* Keep exited threads, otherwise information might be lost for summary */
	if (trace.summary)
		symbol_conf.keep_exited_threads = true;

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
	if (err)
		goto out_close;

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target specified: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	trace__exit(&trace);
#ifdef HAVE_BPF_SKEL
	augmented_raw_syscalls_bpf__destroy(trace.skel);
#endif
	return err;
}