Loading...
#include <traceevent/event-parse.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/evlist.h"
#include "util/machine.h"
#include "util/session.h"
#include "util/thread.h"
#include "util/parse-options.h"
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace-event.h"
#include "util/parse-events.h"

#include <libaudit.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <linux/futex.h>

22
/*
 * Fallback values for constants that the system headers of older distros
 * may not provide.  Each one is only defined when the headers included
 * above did not already define it.
 */
#if !defined(MAP_STACK)
# define MAP_STACK		0x20000
#endif

#if !defined(MADV_HWPOISON)
# define MADV_HWPOISON		100
#endif

#if !defined(MADV_MERGEABLE)
# define MADV_MERGEABLE		12
#endif

#if !defined(MADV_UNMERGEABLE)
# define MADV_UNMERGEABLE	13
#endif

#if !defined(EFD_SEMAPHORE)
# define EFD_SEMAPHORE		1
#endif
43
44struct tp_field {
45 int offset;
46 union {
47 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
48 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
49 };
50};
51
52#define TP_UINT_FIELD(bits) \
53static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
54{ \
55 return *(u##bits *)(sample->raw_data + field->offset); \
56}
57
58TP_UINT_FIELD(8);
59TP_UINT_FIELD(16);
60TP_UINT_FIELD(32);
61TP_UINT_FIELD(64);
62
63#define TP_UINT_FIELD__SWAPPED(bits) \
64static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
65{ \
66 u##bits value = *(u##bits *)(sample->raw_data + field->offset); \
67 return bswap_##bits(value);\
68}
69
70TP_UINT_FIELD__SWAPPED(16);
71TP_UINT_FIELD__SWAPPED(32);
72TP_UINT_FIELD__SWAPPED(64);
73
74static int tp_field__init_uint(struct tp_field *field,
75 struct format_field *format_field,
76 bool needs_swap)
77{
78 field->offset = format_field->offset;
79
80 switch (format_field->size) {
81 case 1:
82 field->integer = tp_field__u8;
83 break;
84 case 2:
85 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
86 break;
87 case 4:
88 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
89 break;
90 case 8:
91 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
92 break;
93 default:
94 return -1;
95 }
96
97 return 0;
98}
99
100static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
101{
102 return sample->raw_data + field->offset;
103}
104
105static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
106{
107 field->offset = format_field->offset;
108 field->pointer = tp_field__ptr;
109 return 0;
110}
111
112struct syscall_tp {
113 struct tp_field id;
114 union {
115 struct tp_field args, ret;
116 };
117};
118
119static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
120 struct tp_field *field,
121 const char *name)
122{
123 struct format_field *format_field = perf_evsel__field(evsel, name);
124
125 if (format_field == NULL)
126 return -1;
127
128 return tp_field__init_uint(field, format_field, evsel->needs_swap);
129}
130
/*
 * Initialize the 'name' member of the struct syscall_tp stored in
 * evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *tp_ = evsel->priv; \
	   perf_evsel__init_tp_uint_field(evsel, &tp_->name, #name); })
134
135static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
136 struct tp_field *field,
137 const char *name)
138{
139 struct format_field *format_field = perf_evsel__field(evsel, name);
140
141 if (format_field == NULL)
142 return -1;
143
144 return tp_field__init_ptr(field, format_field);
145}
146
/*
 * Pointer flavour of perf_evsel__init_sc_tp_uint_field(): initialize the
 * 'name' member of the struct syscall_tp stored in evsel->priv.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *tp_ = evsel->priv; \
	   perf_evsel__init_tp_ptr_field(evsel, &tp_->name, #name); })
150
151static void perf_evsel__delete_priv(struct perf_evsel *evsel)
152{
153 zfree(&evsel->priv);
154 perf_evsel__delete(evsel);
155}
156
157static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
158{
159 evsel->priv = malloc(sizeof(struct syscall_tp));
160 if (evsel->priv != NULL) {
161 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
162 goto out_delete;
163
164 evsel->handler = handler;
165 return 0;
166 }
167
168 return -ENOMEM;
169
170out_delete:
171 zfree(&evsel->priv);
172 return -ENOENT;
173}
174
/*
 * Create the evsel for the "raw_syscalls:<direction>" tracepoint, falling
 * back to "syscalls:<direction>", and prepare it with
 * perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL when neither tracepoint could be created
 * or the initialization failed (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (evsel == NULL)
		evsel = perf_evsel__newtp("syscalls", direction);

	if (evsel == NULL)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
194
/*
 * Read member 'name' of the struct syscall_tp stored in evsel->priv out of
 * 'sample' through its integer reader.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *tp_ = evsel->priv; \
	   tp_->name.integer(&tp_->name, sample); })

/* Same as above, but through the pointer accessor. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *tp_ = evsel->priv; \
	   tp_->name.pointer(&tp_->name, sample); })
202
203static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
204 void *sys_enter_handler,
205 void *sys_exit_handler)
206{
207 int ret = -1;
208 struct perf_evsel *sys_enter, *sys_exit;
209
210 sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
211 if (sys_enter == NULL)
212 goto out;
213
214 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
215 goto out_delete_sys_enter;
216
217 sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
218 if (sys_exit == NULL)
219 goto out_delete_sys_enter;
220
221 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
222 goto out_delete_sys_exit;
223
224 perf_evlist__add(evlist, sys_enter);
225 perf_evlist__add(evlist, sys_exit);
226
227 ret = 0;
228out:
229 return ret;
230
231out_delete_sys_exit:
232 perf_evsel__delete_priv(sys_exit);
233out_delete_sys_enter:
234 perf_evsel__delete_priv(sys_enter);
235 goto out;
236}
237
238
239struct syscall_arg {
240 unsigned long val;
241 struct thread *thread;
242 struct trace *trace;
243 void *parm;
244 u8 idx;
245 u8 mask;
246};
247
/*
 * Table of names for a run of integer values: value 'v' is described by
 * entries[v - offset], as long as that index is within nr_entries.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};
253
/*
 * Define 'strarray__<array>' wrapping the string table 'array'.  The
 * offset member is left to its default of zero.
 */
#define DEFINE_STRARRAY(array)					\
	struct strarray strarray__##array = {			\
		.nr_entries = ARRAY_SIZE(array),		\
		.entries = array,				\
	}

/* Same, for tables whose first entry describes the value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off)			\
	struct strarray strarray__##array = {			\
		.offset	    = off,				\
		.nr_entries = ARRAY_SIZE(array),		\
		.entries = array,				\
	}
264
265static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
266 const char *intfmt,
267 struct syscall_arg *arg)
268{
269 struct strarray *sa = arg->parm;
270 int idx = arg->val - sa->offset;
271
272 if (idx < 0 || idx >= sa->nr_entries)
273 return scnprintf(bf, size, intfmt, arg->val);
274
275 return scnprintf(bf, size, "%s", sa->entries[idx]);
276}
277
/* strarray printer that shows values without a name in decimal ("%d"). */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	static const char decimal_fmt[] = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, decimal_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
285
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 *
 * strarray printer that shows values without a name in hex ("%#x").
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	static const char hex_fmt[] = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, hex_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
299
300static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
301 struct syscall_arg *arg);
302
303#define SCA_FD syscall_arg__scnprintf_fd
304
305static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
306 struct syscall_arg *arg)
307{
308 int fd = arg->val;
309
310 if (fd == AT_FDCWD)
311 return scnprintf(bf, size, "CWD");
312
313 return syscall_arg__scnprintf_fd(bf, size, arg);
314}
315
316#define SCA_FDAT syscall_arg__scnprintf_fd_at
317
318static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
319 struct syscall_arg *arg);
320
321#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
322
323static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
324 struct syscall_arg *arg)
325{
326 return scnprintf(bf, size, "%#lx", arg->val);
327}
328
329#define SCA_HEX syscall_arg__scnprintf_hex
330
331static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
332 struct syscall_arg *arg)
333{
334 int printed = 0, prot = arg->val;
335
336 if (prot == PROT_NONE)
337 return scnprintf(bf, size, "NONE");
338#define P_MMAP_PROT(n) \
339 if (prot & PROT_##n) { \
340 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
341 prot &= ~PROT_##n; \
342 }
343
344 P_MMAP_PROT(EXEC);
345 P_MMAP_PROT(READ);
346 P_MMAP_PROT(WRITE);
347#ifdef PROT_SEM
348 P_MMAP_PROT(SEM);
349#endif
350 P_MMAP_PROT(GROWSDOWN);
351 P_MMAP_PROT(GROWSUP);
352#undef P_MMAP_PROT
353
354 if (prot)
355 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
356
357 return printed;
358}
359
360#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
361
362static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
363 struct syscall_arg *arg)
364{
365 int printed = 0, flags = arg->val;
366
367#define P_MMAP_FLAG(n) \
368 if (flags & MAP_##n) { \
369 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
370 flags &= ~MAP_##n; \
371 }
372
373 P_MMAP_FLAG(SHARED);
374 P_MMAP_FLAG(PRIVATE);
375#ifdef MAP_32BIT
376 P_MMAP_FLAG(32BIT);
377#endif
378 P_MMAP_FLAG(ANONYMOUS);
379 P_MMAP_FLAG(DENYWRITE);
380 P_MMAP_FLAG(EXECUTABLE);
381 P_MMAP_FLAG(FILE);
382 P_MMAP_FLAG(FIXED);
383 P_MMAP_FLAG(GROWSDOWN);
384#ifdef MAP_HUGETLB
385 P_MMAP_FLAG(HUGETLB);
386#endif
387 P_MMAP_FLAG(LOCKED);
388 P_MMAP_FLAG(NONBLOCK);
389 P_MMAP_FLAG(NORESERVE);
390 P_MMAP_FLAG(POPULATE);
391 P_MMAP_FLAG(STACK);
392#ifdef MAP_UNINITIALIZED
393 P_MMAP_FLAG(UNINITIALIZED);
394#endif
395#undef P_MMAP_FLAG
396
397 if (flags)
398 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
399
400 return printed;
401}
402
403#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
404
405static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
406 struct syscall_arg *arg)
407{
408 int behavior = arg->val;
409
410 switch (behavior) {
411#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
412 P_MADV_BHV(NORMAL);
413 P_MADV_BHV(RANDOM);
414 P_MADV_BHV(SEQUENTIAL);
415 P_MADV_BHV(WILLNEED);
416 P_MADV_BHV(DONTNEED);
417 P_MADV_BHV(REMOVE);
418 P_MADV_BHV(DONTFORK);
419 P_MADV_BHV(DOFORK);
420 P_MADV_BHV(HWPOISON);
421#ifdef MADV_SOFT_OFFLINE
422 P_MADV_BHV(SOFT_OFFLINE);
423#endif
424 P_MADV_BHV(MERGEABLE);
425 P_MADV_BHV(UNMERGEABLE);
426#ifdef MADV_HUGEPAGE
427 P_MADV_BHV(HUGEPAGE);
428#endif
429#ifdef MADV_NOHUGEPAGE
430 P_MADV_BHV(NOHUGEPAGE);
431#endif
432#ifdef MADV_DONTDUMP
433 P_MADV_BHV(DONTDUMP);
434#endif
435#ifdef MADV_DODUMP
436 P_MADV_BHV(DODUMP);
437#endif
438#undef P_MADV_PHV
439 default: break;
440 }
441
442 return scnprintf(bf, size, "%#x", behavior);
443}
444
445#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
446
447static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
448 struct syscall_arg *arg)
449{
450 int printed = 0, op = arg->val;
451
452 if (op == 0)
453 return scnprintf(bf, size, "NONE");
454#define P_CMD(cmd) \
455 if ((op & LOCK_##cmd) == LOCK_##cmd) { \
456 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
457 op &= ~LOCK_##cmd; \
458 }
459
460 P_CMD(SH);
461 P_CMD(EX);
462 P_CMD(NB);
463 P_CMD(UN);
464 P_CMD(MAND);
465 P_CMD(RW);
466 P_CMD(READ);
467 P_CMD(WRITE);
468#undef P_OP
469
470 if (op)
471 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
472
473 return printed;
474}
475
476#define SCA_FLOCK syscall_arg__scnprintf_flock
477
478static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
479{
480 enum syscall_futex_args {
481 SCF_UADDR = (1 << 0),
482 SCF_OP = (1 << 1),
483 SCF_VAL = (1 << 2),
484 SCF_TIMEOUT = (1 << 3),
485 SCF_UADDR2 = (1 << 4),
486 SCF_VAL3 = (1 << 5),
487 };
488 int op = arg->val;
489 int cmd = op & FUTEX_CMD_MASK;
490 size_t printed = 0;
491
492 switch (cmd) {
493#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
494 P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
495 P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
496 P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
497 P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break;
498 P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break;
499 P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break;
500 P_FUTEX_OP(WAKE_OP); break;
501 P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
502 P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
503 P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
504 P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break;
505 P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break;
506 P_FUTEX_OP(WAIT_REQUEUE_PI); break;
507 default: printed = scnprintf(bf, size, "%#x", cmd); break;
508 }
509
510 if (op & FUTEX_PRIVATE_FLAG)
511 printed += scnprintf(bf + printed, size - printed, "|PRIV");
512
513 if (op & FUTEX_CLOCK_REALTIME)
514 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
515
516 return printed;
517}
518
519#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op
520
521static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
522static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
523
524static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
525static DEFINE_STRARRAY(itimers);
526
/*
 * Names for the lseek 'whence' argument.  "DATA" and "HOLE" are only
 * present when the headers define SEEK_DATA / SEEK_HOLE.
 */
static const char *whences[] = {
	"SET",
	"CUR",
	"END",
#ifdef SEEK_DATA
	"DATA",
#endif
#ifdef SEEK_HOLE
	"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
536
/* Names for the fcntl 'cmd' argument, indexed by the command value. */
static const char *fcntl_cmds[] = {
	[0]  = "DUPFD",
	[1]  = "GETFD",
	[2]  = "SETFD",
	[3]  = "GETFL",
	[4]  = "SETFL",
	[5]  = "GETLK",
	[6]  = "SETLK",
	[7]  = "SETLKW",
	[8]  = "SETOWN",
	[9]  = "GETOWN",
	[10] = "SETSIG",
	[11] = "GETSIG",
	[12] = "F_GETLK64",
	[13] = "F_SETLK64",
	[14] = "F_SETLKW64",
	[15] = "F_SETOWN_EX",
	[16] = "F_GETOWN_EX",
	[17] = "F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);
544
/* Names for the 'resource' argument of {get,set}rlimit and prlimit64. */
static const char *rlimit_resources[] = {
	[0]  = "CPU",
	[1]  = "FSIZE",
	[2]  = "DATA",
	[3]  = "STACK",
	[4]  = "CORE",
	[5]  = "RSS",
	[6]  = "NPROC",
	[7]  = "NOFILE",
	[8]  = "MEMLOCK",
	[9]  = "AS",
	[10] = "LOCKS",
	[11] = "SIGPENDING",
	[12] = "MSGQUEUE",
	[13] = "NICE",
	[14] = "RTPRIO",
	[15] = "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);
551
/* Names for the rt_sigprocmask 'how' argument. */
static const char *sighow[] = {
	[0] = "BLOCK",
	[1] = "UNBLOCK",
	[2] = "SETMASK",
};
static DEFINE_STRARRAY(sighow);

/* Names for the clock_gettime 'clk_id' argument. */
static const char *clockid[] = {
	[0] = "REALTIME",
	[1] = "MONOTONIC",
	[2] = "PROCESS_CPUTIME_ID",
	[3] = "THREAD_CPUTIME_ID",
	[4] = "MONOTONIC_RAW",
	[5] = "REALTIME_COARSE",
	[6] = "MONOTONIC_COARSE",
};
static DEFINE_STRARRAY(clockid);
560
/*
 * Names for the socket/socketpair 'family' argument, indexed directly by
 * the AF_* value (the table is used without an offset).
 *
 * Every value up to the last one needs a slot: leaving 28 out put "CAN"
 * at index 28 although AF_CAN is 29, so it and every family after it was
 * printed with the name of its successor.  28 is AF_MPLS on Linux.
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

/* Fallback for headers that do not provide SOCK_TYPE_MASK. */
#if !defined(SOCK_TYPE_MASK)
# define SOCK_TYPE_MASK 0xf
#endif
574
575static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
576 struct syscall_arg *arg)
577{
578 size_t printed;
579 int type = arg->val,
580 flags = type & ~SOCK_TYPE_MASK;
581
582 type &= SOCK_TYPE_MASK;
583 /*
584 * Can't use a strarray, MIPS may override for ABI reasons.
585 */
586 switch (type) {
587#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
588 P_SK_TYPE(STREAM);
589 P_SK_TYPE(DGRAM);
590 P_SK_TYPE(RAW);
591 P_SK_TYPE(RDM);
592 P_SK_TYPE(SEQPACKET);
593 P_SK_TYPE(DCCP);
594 P_SK_TYPE(PACKET);
595#undef P_SK_TYPE
596 default:
597 printed = scnprintf(bf, size, "%#x", type);
598 }
599
600#define P_SK_FLAG(n) \
601 if (flags & SOCK_##n) { \
602 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
603 flags &= ~SOCK_##n; \
604 }
605
606 P_SK_FLAG(CLOEXEC);
607 P_SK_FLAG(NONBLOCK);
608#undef P_SK_FLAG
609
610 if (flags)
611 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
612
613 return printed;
614}
615
616#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
617
/* Fallbacks for MSG_* flags that the system headers may not provide. */
#if !defined(MSG_PROBE)
#define MSG_PROBE		0x10
#endif
#if !defined(MSG_WAITFORONE)
#define MSG_WAITFORONE		0x10000
#endif
#if !defined(MSG_SENDPAGE_NOTLAST)
#define MSG_SENDPAGE_NOTLAST	0x20000
#endif
#if !defined(MSG_FASTOPEN)
#define MSG_FASTOPEN		0x20000000
#endif
630
631static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
632 struct syscall_arg *arg)
633{
634 int printed = 0, flags = arg->val;
635
636 if (flags == 0)
637 return scnprintf(bf, size, "NONE");
638#define P_MSG_FLAG(n) \
639 if (flags & MSG_##n) { \
640 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
641 flags &= ~MSG_##n; \
642 }
643
644 P_MSG_FLAG(OOB);
645 P_MSG_FLAG(PEEK);
646 P_MSG_FLAG(DONTROUTE);
647 P_MSG_FLAG(TRYHARD);
648 P_MSG_FLAG(CTRUNC);
649 P_MSG_FLAG(PROBE);
650 P_MSG_FLAG(TRUNC);
651 P_MSG_FLAG(DONTWAIT);
652 P_MSG_FLAG(EOR);
653 P_MSG_FLAG(WAITALL);
654 P_MSG_FLAG(FIN);
655 P_MSG_FLAG(SYN);
656 P_MSG_FLAG(CONFIRM);
657 P_MSG_FLAG(RST);
658 P_MSG_FLAG(ERRQUEUE);
659 P_MSG_FLAG(NOSIGNAL);
660 P_MSG_FLAG(MORE);
661 P_MSG_FLAG(WAITFORONE);
662 P_MSG_FLAG(SENDPAGE_NOTLAST);
663 P_MSG_FLAG(FASTOPEN);
664 P_MSG_FLAG(CMSG_CLOEXEC);
665#undef P_MSG_FLAG
666
667 if (flags)
668 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
669
670 return printed;
671}
672
673#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
674
675static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
676 struct syscall_arg *arg)
677{
678 size_t printed = 0;
679 int mode = arg->val;
680
681 if (mode == F_OK) /* 0 */
682 return scnprintf(bf, size, "F");
683#define P_MODE(n) \
684 if (mode & n##_OK) { \
685 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
686 mode &= ~n##_OK; \
687 }
688
689 P_MODE(R);
690 P_MODE(W);
691 P_MODE(X);
692#undef P_MODE
693
694 if (mode)
695 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
696
697 return printed;
698}
699
700#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
701
702static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
703 struct syscall_arg *arg)
704{
705 int printed = 0, flags = arg->val;
706
707 if (!(flags & O_CREAT))
708 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
709
710 if (flags == 0)
711 return scnprintf(bf, size, "RDONLY");
712#define P_FLAG(n) \
713 if (flags & O_##n) { \
714 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
715 flags &= ~O_##n; \
716 }
717
718 P_FLAG(APPEND);
719 P_FLAG(ASYNC);
720 P_FLAG(CLOEXEC);
721 P_FLAG(CREAT);
722 P_FLAG(DIRECT);
723 P_FLAG(DIRECTORY);
724 P_FLAG(EXCL);
725 P_FLAG(LARGEFILE);
726 P_FLAG(NOATIME);
727 P_FLAG(NOCTTY);
728#ifdef O_NONBLOCK
729 P_FLAG(NONBLOCK);
730#elif O_NDELAY
731 P_FLAG(NDELAY);
732#endif
733#ifdef O_PATH
734 P_FLAG(PATH);
735#endif
736 P_FLAG(RDWR);
737#ifdef O_DSYNC
738 if ((flags & O_SYNC) == O_SYNC)
739 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
740 else {
741 P_FLAG(DSYNC);
742 }
743#else
744 P_FLAG(SYNC);
745#endif
746 P_FLAG(TRUNC);
747 P_FLAG(WRONLY);
748#undef P_FLAG
749
750 if (flags)
751 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
752
753 return printed;
754}
755
756#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
757
758static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
759 struct syscall_arg *arg)
760{
761 int printed = 0, flags = arg->val;
762
763 if (flags == 0)
764 return scnprintf(bf, size, "NONE");
765#define P_FLAG(n) \
766 if (flags & EFD_##n) { \
767 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
768 flags &= ~EFD_##n; \
769 }
770
771 P_FLAG(SEMAPHORE);
772 P_FLAG(CLOEXEC);
773 P_FLAG(NONBLOCK);
774#undef P_FLAG
775
776 if (flags)
777 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
778
779 return printed;
780}
781
782#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
783
784static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
785 struct syscall_arg *arg)
786{
787 int printed = 0, flags = arg->val;
788
789#define P_FLAG(n) \
790 if (flags & O_##n) { \
791 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
792 flags &= ~O_##n; \
793 }
794
795 P_FLAG(CLOEXEC);
796 P_FLAG(NONBLOCK);
797#undef P_FLAG
798
799 if (flags)
800 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
801
802 return printed;
803}
804
805#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
806
807static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
808{
809 int sig = arg->val;
810
811 switch (sig) {
812#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
813 P_SIGNUM(HUP);
814 P_SIGNUM(INT);
815 P_SIGNUM(QUIT);
816 P_SIGNUM(ILL);
817 P_SIGNUM(TRAP);
818 P_SIGNUM(ABRT);
819 P_SIGNUM(BUS);
820 P_SIGNUM(FPE);
821 P_SIGNUM(KILL);
822 P_SIGNUM(USR1);
823 P_SIGNUM(SEGV);
824 P_SIGNUM(USR2);
825 P_SIGNUM(PIPE);
826 P_SIGNUM(ALRM);
827 P_SIGNUM(TERM);
828 P_SIGNUM(CHLD);
829 P_SIGNUM(CONT);
830 P_SIGNUM(STOP);
831 P_SIGNUM(TSTP);
832 P_SIGNUM(TTIN);
833 P_SIGNUM(TTOU);
834 P_SIGNUM(URG);
835 P_SIGNUM(XCPU);
836 P_SIGNUM(XFSZ);
837 P_SIGNUM(VTALRM);
838 P_SIGNUM(PROF);
839 P_SIGNUM(WINCH);
840 P_SIGNUM(IO);
841 P_SIGNUM(PWR);
842 P_SIGNUM(SYS);
843#ifdef SIGEMT
844 P_SIGNUM(EMT);
845#endif
846#ifdef SIGSTKFLT
847 P_SIGNUM(STKFLT);
848#endif
849#ifdef SIGSWI
850 P_SIGNUM(SWI);
851#endif
852 default: break;
853 }
854
855 return scnprintf(bf, size, "%#x", sig);
856}
857
858#define SCA_SIGNUM syscall_arg__scnprintf_signum
859
860#if defined(__i386__) || defined(__x86_64__)
861/*
862 * FIXME: Make this available to all arches.
863 */
864#define TCGETS 0x5401
865
/*
 * Names for the tty ioctl 'cmd' values.
 *
 * The table is wrapped with an offset of 0x5401 (TCGETS), so a command is
 * found at index 'cmd - 0x5401'.  The designators that skip over unused
 * command numbers therefore have to be written relative to 0x5401 as
 * well: using the bare low byte of the command (0x27, 0x50, 0x60) put
 * every name after a gap one slot too high.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP",
	[0x5427 - 0x5401] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x5450 - 0x5401] = "FIONCLEX",
	"FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT",
	[0x5460 - 0x5401] = "FIOQSIZE",
};
883
884static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
885#endif /* defined(__i386__) || defined(__x86_64__) */
886
/*
 * Shorthand for a syscall_fmts[] entry whose argument number 'arg' is
 * printed through SCA_STRARRAY using strarray__<array>.  'name' is not
 * used by the expansion, it only labels the argument at the call site.
 */
#define STRARRAY(arg, name, array)				\
	.arg_scnprintf = { [arg] = SCA_STRARRAY, },		\
	.arg_parm      = { [arg] = &strarray__##array, }
890
891static struct syscall_fmt {
892 const char *name;
893 const char *alias;
894 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
895 void *arg_parm[6];
896 bool errmsg;
897 bool timeout;
898 bool hexret;
899} syscall_fmts[] = {
900 { .name = "access", .errmsg = true,
901 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
902 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
903 { .name = "brk", .hexret = true,
904 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
905 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
906 { .name = "close", .errmsg = true,
907 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
908 { .name = "connect", .errmsg = true, },
909 { .name = "dup", .errmsg = true,
910 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
911 { .name = "dup2", .errmsg = true,
912 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
913 { .name = "dup3", .errmsg = true,
914 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
915 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
916 { .name = "eventfd2", .errmsg = true,
917 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
918 { .name = "faccessat", .errmsg = true,
919 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
920 { .name = "fadvise64", .errmsg = true,
921 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
922 { .name = "fallocate", .errmsg = true,
923 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
924 { .name = "fchdir", .errmsg = true,
925 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
926 { .name = "fchmod", .errmsg = true,
927 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
928 { .name = "fchmodat", .errmsg = true,
929 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
930 { .name = "fchown", .errmsg = true,
931 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
932 { .name = "fchownat", .errmsg = true,
933 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
934 { .name = "fcntl", .errmsg = true,
935 .arg_scnprintf = { [0] = SCA_FD, /* fd */
936 [1] = SCA_STRARRAY, /* cmd */ },
937 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
938 { .name = "fdatasync", .errmsg = true,
939 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
940 { .name = "flock", .errmsg = true,
941 .arg_scnprintf = { [0] = SCA_FD, /* fd */
942 [1] = SCA_FLOCK, /* cmd */ }, },
943 { .name = "fsetxattr", .errmsg = true,
944 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
945 { .name = "fstat", .errmsg = true, .alias = "newfstat",
946 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
947 { .name = "fstatat", .errmsg = true, .alias = "newfstatat",
948 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
949 { .name = "fstatfs", .errmsg = true,
950 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
951 { .name = "fsync", .errmsg = true,
952 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
953 { .name = "ftruncate", .errmsg = true,
954 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
955 { .name = "futex", .errmsg = true,
956 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
957 { .name = "futimesat", .errmsg = true,
958 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
959 { .name = "getdents", .errmsg = true,
960 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
961 { .name = "getdents64", .errmsg = true,
962 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
963 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
964 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
965 { .name = "ioctl", .errmsg = true,
966 .arg_scnprintf = { [0] = SCA_FD, /* fd */
967#if defined(__i386__) || defined(__x86_64__)
968/*
969 * FIXME: Make this available to all arches.
970 */
971 [1] = SCA_STRHEXARRAY, /* cmd */
972 [2] = SCA_HEX, /* arg */ },
973 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
974#else
975 [2] = SCA_HEX, /* arg */ }, },
976#endif
977 { .name = "kill", .errmsg = true,
978 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
979 { .name = "linkat", .errmsg = true,
980 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
981 { .name = "lseek", .errmsg = true,
982 .arg_scnprintf = { [0] = SCA_FD, /* fd */
983 [2] = SCA_STRARRAY, /* whence */ },
984 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
985 { .name = "lstat", .errmsg = true, .alias = "newlstat", },
986 { .name = "madvise", .errmsg = true,
987 .arg_scnprintf = { [0] = SCA_HEX, /* start */
988 [2] = SCA_MADV_BHV, /* behavior */ }, },
989 { .name = "mkdirat", .errmsg = true,
990 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
991 { .name = "mknodat", .errmsg = true,
992 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
993 { .name = "mlock", .errmsg = true,
994 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
995 { .name = "mlockall", .errmsg = true,
996 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
997 { .name = "mmap", .hexret = true,
998 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
999 [2] = SCA_MMAP_PROT, /* prot */
1000 [3] = SCA_MMAP_FLAGS, /* flags */
1001 [4] = SCA_FD, /* fd */ }, },
1002 { .name = "mprotect", .errmsg = true,
1003 .arg_scnprintf = { [0] = SCA_HEX, /* start */
1004 [2] = SCA_MMAP_PROT, /* prot */ }, },
1005 { .name = "mremap", .hexret = true,
1006 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1007 [4] = SCA_HEX, /* new_addr */ }, },
1008 { .name = "munlock", .errmsg = true,
1009 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1010 { .name = "munmap", .errmsg = true,
1011 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1012 { .name = "name_to_handle_at", .errmsg = true,
1013 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1014 { .name = "newfstatat", .errmsg = true,
1015 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1016 { .name = "open", .errmsg = true,
1017 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1018 { .name = "open_by_handle_at", .errmsg = true,
1019 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1020 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1021 { .name = "openat", .errmsg = true,
1022 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1023 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1024 { .name = "pipe2", .errmsg = true,
1025 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1026 { .name = "poll", .errmsg = true, .timeout = true, },
1027 { .name = "ppoll", .errmsg = true, .timeout = true, },
1028 { .name = "pread", .errmsg = true, .alias = "pread64",
1029 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1030 { .name = "preadv", .errmsg = true, .alias = "pread",
1031 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1032 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1033 { .name = "pwrite", .errmsg = true, .alias = "pwrite64",
1034 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1035 { .name = "pwritev", .errmsg = true,
1036 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1037 { .name = "read", .errmsg = true,
1038 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1039 { .name = "readlinkat", .errmsg = true,
1040 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1041 { .name = "readv", .errmsg = true,
1042 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1043 { .name = "recvfrom", .errmsg = true,
1044 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1045 { .name = "recvmmsg", .errmsg = true,
1046 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1047 { .name = "recvmsg", .errmsg = true,
1048 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1049 { .name = "renameat", .errmsg = true,
1050 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1051 { .name = "rt_sigaction", .errmsg = true,
1052 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1053 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
1054 { .name = "rt_sigqueueinfo", .errmsg = true,
1055 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1056 { .name = "rt_tgsigqueueinfo", .errmsg = true,
1057 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1058 { .name = "select", .errmsg = true, .timeout = true, },
1059 { .name = "sendmmsg", .errmsg = true,
1060 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1061 { .name = "sendmsg", .errmsg = true,
1062 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1063 { .name = "sendto", .errmsg = true,
1064 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1065 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
1066 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1067 { .name = "shutdown", .errmsg = true,
1068 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1069 { .name = "socket", .errmsg = true,
1070 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1071 [1] = SCA_SK_TYPE, /* type */ },
1072 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
1073 { .name = "socketpair", .errmsg = true,
1074 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1075 [1] = SCA_SK_TYPE, /* type */ },
1076 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
1077 { .name = "stat", .errmsg = true, .alias = "newstat", },
1078 { .name = "symlinkat", .errmsg = true,
1079 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1080 { .name = "tgkill", .errmsg = true,
1081 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1082 { .name = "tkill", .errmsg = true,
1083 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1084 { .name = "uname", .errmsg = true, .alias = "newuname", },
1085 { .name = "unlinkat", .errmsg = true,
1086 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1087 { .name = "utimensat", .errmsg = true,
1088 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1089 { .name = "write", .errmsg = true,
1090 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1091 { .name = "writev", .errmsg = true,
1092 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1093};
1094
1095static int syscall_fmt__cmp(const void *name, const void *fmtp)
1096{
1097 const struct syscall_fmt *fmt = fmtp;
1098 return strcmp(name, fmt->name);
1099}
1100
1101static struct syscall_fmt *syscall_fmt__find(const char *name)
1102{
1103 const int nmemb = ARRAY_SIZE(syscall_fmts);
1104 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1105}
1106
/*
 * Per-syscall state kept in trace->syscalls.table (indexed by syscall id) and
 * filled in on first use by trace__read_syscall_info().
 */
struct syscall {
	/* format of the syscalls:sys_enter_<name> tracepoint; may be NULL */
	struct event_format *tp_format;
	/* syscall name as returned by audit_syscall_to_name() */
	const char *name;
	/* true when the event qualifier list excludes this syscall */
	bool filtered;
	/* matching syscall_fmts entry, NULL when there is none */
	struct syscall_fmt *fmt;
	/* one formatter per argument; a NULL slot is printed with "%ld" */
	size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* per-argument formatter parameters, taken from fmt->arg_parm */
	void **arg_parm;
};
1115
1116static size_t fprintf_duration(unsigned long t, FILE *fp)
1117{
1118 double duration = (double)t / NSEC_PER_MSEC;
1119 size_t printed = fprintf(fp, "(");
1120
1121 if (duration >= 1.0)
1122 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1123 else if (duration >= 0.01)
1124 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1125 else
1126 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1127 return printed + fprintf(fp, "): ");
1128}
1129
1130struct thread_trace {
1131 u64 entry_time;
1132 u64 exit_time;
1133 bool entry_pending;
1134 unsigned long nr_events;
1135 char *entry_str;
1136 double runtime_ms;
1137 struct {
1138 int max;
1139 char **table;
1140 } paths;
1141
1142 struct intlist *syscall_stats;
1143};
1144
1145static struct thread_trace *thread_trace__new(void)
1146{
1147 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1148
1149 if (ttrace)
1150 ttrace->paths.max = -1;
1151
1152 ttrace->syscall_stats = intlist__new(NULL);
1153
1154 return ttrace;
1155}
1156
1157static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1158{
1159 struct thread_trace *ttrace;
1160
1161 if (thread == NULL)
1162 goto fail;
1163
1164 if (thread->priv == NULL)
1165 thread->priv = thread_trace__new();
1166
1167 if (thread->priv == NULL)
1168 goto fail;
1169
1170 ttrace = thread->priv;
1171 ++ttrace->nr_events;
1172
1173 return ttrace;
1174fail:
1175 color_fprintf(fp, PERF_COLOR_RED,
1176 "WARNING: not enough memory, dropping samples!\n");
1177 return NULL;
1178}
1179
1180struct trace {
1181 struct perf_tool tool;
1182 struct {
1183 int machine;
1184 int open_id;
1185 } audit;
1186 struct {
1187 int max;
1188 struct syscall *table;
1189 } syscalls;
1190 struct record_opts opts;
1191 struct machine *host;
1192 u64 base_time;
1193 FILE *output;
1194 unsigned long nr_events;
1195 struct strlist *ev_qualifier;
1196 const char *last_vfs_getname;
1197 struct intlist *tid_list;
1198 struct intlist *pid_list;
1199 double duration_filter;
1200 double runtime_ms;
1201 struct {
1202 u64 vfs_getname,
1203 proc_getname;
1204 } stats;
1205 bool not_ev_qualifier;
1206 bool live;
1207 bool full_time;
1208 bool sched;
1209 bool multiple_threads;
1210 bool summary;
1211 bool summary_only;
1212 bool show_comm;
1213 bool show_tool_stats;
1214};
1215
1216static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1217{
1218 struct thread_trace *ttrace = thread->priv;
1219
1220 if (fd > ttrace->paths.max) {
1221 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1222
1223 if (npath == NULL)
1224 return -1;
1225
1226 if (ttrace->paths.max != -1) {
1227 memset(npath + ttrace->paths.max + 1, 0,
1228 (fd - ttrace->paths.max) * sizeof(char *));
1229 } else {
1230 memset(npath, 0, (fd + 1) * sizeof(char *));
1231 }
1232
1233 ttrace->paths.table = npath;
1234 ttrace->paths.max = fd;
1235 }
1236
1237 ttrace->paths.table[fd] = strdup(pathname);
1238
1239 return ttrace->paths.table[fd] != NULL ? 0 : -1;
1240}
1241
1242static int thread__read_fd_path(struct thread *thread, int fd)
1243{
1244 char linkname[PATH_MAX], pathname[PATH_MAX];
1245 struct stat st;
1246 int ret;
1247
1248 if (thread->pid_ == thread->tid) {
1249 scnprintf(linkname, sizeof(linkname),
1250 "/proc/%d/fd/%d", thread->pid_, fd);
1251 } else {
1252 scnprintf(linkname, sizeof(linkname),
1253 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1254 }
1255
1256 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1257 return -1;
1258
1259 ret = readlink(linkname, pathname, sizeof(pathname));
1260
1261 if (ret < 0 || ret > st.st_size)
1262 return -1;
1263
1264 pathname[ret] = '\0';
1265 return trace__set_fd_pathname(thread, fd, pathname);
1266}
1267
1268static const char *thread__fd_path(struct thread *thread, int fd,
1269 struct trace *trace)
1270{
1271 struct thread_trace *ttrace = thread->priv;
1272
1273 if (ttrace == NULL)
1274 return NULL;
1275
1276 if (fd < 0)
1277 return NULL;
1278
1279 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL))
1280 if (!trace->live)
1281 return NULL;
1282 ++trace->stats.proc_getname;
1283 if (thread__read_fd_path(thread, fd)) {
1284 return NULL;
1285 }
1286
1287 return ttrace->paths.table[fd];
1288}
1289
1290static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1291 struct syscall_arg *arg)
1292{
1293 int fd = arg->val;
1294 size_t printed = scnprintf(bf, size, "%d", fd);
1295 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1296
1297 if (path)
1298 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1299
1300 return printed;
1301}
1302
1303static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1304 struct syscall_arg *arg)
1305{
1306 int fd = arg->val;
1307 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1308 struct thread_trace *ttrace = arg->thread->priv;
1309
1310 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1311 zfree(&ttrace->paths.table[fd]);
1312
1313 return printed;
1314}
1315
1316static bool trace__filter_duration(struct trace *trace, double t)
1317{
1318 return t < (trace->duration_filter * NSEC_PER_MSEC);
1319}
1320
1321static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1322{
1323 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1324
1325 return fprintf(fp, "%10.3f ", ts);
1326}
1327
/*
 * Flags set from the signal handler and polled by the event loop. They are
 * volatile sig_atomic_t because that is the object type C guarantees can be
 * safely written from an asynchronous signal handler and read elsewhere.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Handler installed for SIGCHLD and SIGINT: any signal asks the event loop to
 * wind down; SIGINT additionally marks the run as interrupted (and any other
 * signal clears that mark).
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1336
1337static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1338 u64 duration, u64 tstamp, FILE *fp)
1339{
1340 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1341 printed += fprintf_duration(duration, fp);
1342
1343 if (trace->multiple_threads) {
1344 if (trace->show_comm)
1345 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1346 printed += fprintf(fp, "%d ", thread->tid);
1347 }
1348
1349 return printed;
1350}
1351
1352static int trace__process_event(struct trace *trace, struct machine *machine,
1353 union perf_event *event, struct perf_sample *sample)
1354{
1355 int ret = 0;
1356
1357 switch (event->header.type) {
1358 case PERF_RECORD_LOST:
1359 color_fprintf(trace->output, PERF_COLOR_RED,
1360 "LOST %" PRIu64 " events!\n", event->lost.lost);
1361 ret = machine__process_lost_event(machine, event, sample);
1362 default:
1363 ret = machine__process_event(machine, event, sample);
1364 break;
1365 }
1366
1367 return ret;
1368}
1369
1370static int trace__tool_process(struct perf_tool *tool,
1371 union perf_event *event,
1372 struct perf_sample *sample,
1373 struct machine *machine)
1374{
1375 struct trace *trace = container_of(tool, struct trace, tool);
1376 return trace__process_event(trace, machine, event, sample);
1377}
1378
1379static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1380{
1381 int err = symbol__init();
1382
1383 if (err)
1384 return err;
1385
1386 trace->host = machine__new_host();
1387 if (trace->host == NULL)
1388 return -ENOMEM;
1389
1390 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1391 evlist->threads, trace__tool_process, false);
1392 if (err)
1393 symbol__exit();
1394
1395 return err;
1396}
1397
1398static int syscall__set_arg_fmts(struct syscall *sc)
1399{
1400 struct format_field *field;
1401 int idx = 0;
1402
1403 sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *));
1404 if (sc->arg_scnprintf == NULL)
1405 return -1;
1406
1407 if (sc->fmt)
1408 sc->arg_parm = sc->fmt->arg_parm;
1409
1410 for (field = sc->tp_format->format.fields->next; field; field = field->next) {
1411 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1412 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1413 else if (field->flags & FIELD_IS_POINTER)
1414 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1415 ++idx;
1416 }
1417
1418 return 0;
1419}
1420
1421static int trace__read_syscall_info(struct trace *trace, int id)
1422{
1423 char tp_name[128];
1424 struct syscall *sc;
1425 const char *name = audit_syscall_to_name(id, trace->audit.machine);
1426
1427 if (name == NULL)
1428 return -1;
1429
1430 if (id > trace->syscalls.max) {
1431 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1432
1433 if (nsyscalls == NULL)
1434 return -1;
1435
1436 if (trace->syscalls.max != -1) {
1437 memset(nsyscalls + trace->syscalls.max + 1, 0,
1438 (id - trace->syscalls.max) * sizeof(*sc));
1439 } else {
1440 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1441 }
1442
1443 trace->syscalls.table = nsyscalls;
1444 trace->syscalls.max = id;
1445 }
1446
1447 sc = trace->syscalls.table + id;
1448 sc->name = name;
1449
1450 if (trace->ev_qualifier) {
1451 bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1452
1453 if (!(in ^ trace->not_ev_qualifier)) {
1454 sc->filtered = true;
1455 /*
1456 * No need to do read tracepoint information since this will be
1457 * filtered out.
1458 */
1459 return 0;
1460 }
1461 }
1462
1463 sc->fmt = syscall_fmt__find(sc->name);
1464
1465 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1466 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1467
1468 if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1469 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1470 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1471 }
1472
1473 if (sc->tp_format == NULL)
1474 return -1;
1475
1476 return syscall__set_arg_fmts(sc);
1477}
1478
1479static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1480 unsigned long *args, struct trace *trace,
1481 struct thread *thread)
1482{
1483 size_t printed = 0;
1484
1485 if (sc->tp_format != NULL) {
1486 struct format_field *field;
1487 u8 bit = 1;
1488 struct syscall_arg arg = {
1489 .idx = 0,
1490 .mask = 0,
1491 .trace = trace,
1492 .thread = thread,
1493 };
1494
1495 for (field = sc->tp_format->format.fields->next; field;
1496 field = field->next, ++arg.idx, bit <<= 1) {
1497 if (arg.mask & bit)
1498 continue;
1499 /*
1500 * Suppress this argument if its value is zero and
1501 * and we don't have a string associated in an
1502 * strarray for it.
1503 */
1504 if (args[arg.idx] == 0 &&
1505 !(sc->arg_scnprintf &&
1506 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1507 sc->arg_parm[arg.idx]))
1508 continue;
1509
1510 printed += scnprintf(bf + printed, size - printed,
1511 "%s%s: ", printed ? ", " : "", field->name);
1512 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1513 arg.val = args[arg.idx];
1514 if (sc->arg_parm)
1515 arg.parm = sc->arg_parm[arg.idx];
1516 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1517 size - printed, &arg);
1518 } else {
1519 printed += scnprintf(bf + printed, size - printed,
1520 "%ld", args[arg.idx]);
1521 }
1522 }
1523 } else {
1524 int i = 0;
1525
1526 while (i < 6) {
1527 printed += scnprintf(bf + printed, size - printed,
1528 "%sarg%d: %ld",
1529 printed ? ", " : "", i, args[i]);
1530 ++i;
1531 }
1532 }
1533
1534 return printed;
1535}
1536
/*
 * Signature of the per-tracepoint sample handlers in this file. A handler is
 * stored in evsel->handler and called with the owning trace, the evsel the
 * sample belongs to and the parsed sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample);
1539
1540static struct syscall *trace__syscall_info(struct trace *trace,
1541 struct perf_evsel *evsel, int id)
1542{
1543
1544 if (id < 0) {
1545
1546 /*
1547 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1548 * before that, leaving at a higher verbosity level till that is
1549 * explained. Reproduced with plain ftrace with:
1550 *
1551 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1552 * grep "NR -1 " /t/trace_pipe
1553 *
1554 * After generating some load on the machine.
1555 */
1556 if (verbose > 1) {
1557 static u64 n;
1558 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1559 id, perf_evsel__name(evsel), ++n);
1560 }
1561 return NULL;
1562 }
1563
1564 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1565 trace__read_syscall_info(trace, id))
1566 goto out_cant_read;
1567
1568 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1569 goto out_cant_read;
1570
1571 return &trace->syscalls.table[id];
1572
1573out_cant_read:
1574 if (verbose) {
1575 fprintf(trace->output, "Problems reading syscall %d", id);
1576 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1577 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1578 fputs(" information\n", trace->output);
1579 }
1580 return NULL;
1581}
1582
1583static void thread__update_stats(struct thread_trace *ttrace,
1584 int id, struct perf_sample *sample)
1585{
1586 struct int_node *inode;
1587 struct stats *stats;
1588 u64 duration = 0;
1589
1590 inode = intlist__findnew(ttrace->syscall_stats, id);
1591 if (inode == NULL)
1592 return;
1593
1594 stats = inode->priv;
1595 if (stats == NULL) {
1596 stats = malloc(sizeof(struct stats));
1597 if (stats == NULL)
1598 return;
1599 init_stats(stats);
1600 inode->priv = stats;
1601 }
1602
1603 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1604 duration = sample->time - ttrace->entry_time;
1605
1606 update_stats(stats, duration);
1607}
1608
1609static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1610 struct perf_sample *sample)
1611{
1612 char *msg;
1613 void *args;
1614 size_t printed = 0;
1615 struct thread *thread;
1616 int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1617 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1618 struct thread_trace *ttrace;
1619
1620 if (sc == NULL)
1621 return -1;
1622
1623 if (sc->filtered)
1624 return 0;
1625
1626 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1627 ttrace = thread__trace(thread, trace->output);
1628 if (ttrace == NULL)
1629 return -1;
1630
1631 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1632 ttrace = thread->priv;
1633
1634 if (ttrace->entry_str == NULL) {
1635 ttrace->entry_str = malloc(1024);
1636 if (!ttrace->entry_str)
1637 return -1;
1638 }
1639
1640 ttrace->entry_time = sample->time;
1641 msg = ttrace->entry_str;
1642 printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1643
1644 printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1645 args, trace, thread);
1646
1647 if (!strcmp(sc->name, "exit_group") || !strcmp(sc->name, "exit")) {
1648 if (!trace->duration_filter && !trace->summary_only) {
1649 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1650 fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1651 }
1652 } else
1653 ttrace->entry_pending = true;
1654
1655 return 0;
1656}
1657
1658static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1659 struct perf_sample *sample)
1660{
1661 int ret;
1662 u64 duration = 0;
1663 struct thread *thread;
1664 int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1665 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1666 struct thread_trace *ttrace;
1667
1668 if (sc == NULL)
1669 return -1;
1670
1671 if (sc->filtered)
1672 return 0;
1673
1674 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1675 ttrace = thread__trace(thread, trace->output);
1676 if (ttrace == NULL)
1677 return -1;
1678
1679 if (trace->summary)
1680 thread__update_stats(ttrace, id, sample);
1681
1682 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1683
1684 if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1685 trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1686 trace->last_vfs_getname = NULL;
1687 ++trace->stats.vfs_getname;
1688 }
1689
1690 ttrace = thread->priv;
1691
1692 ttrace->exit_time = sample->time;
1693
1694 if (ttrace->entry_time) {
1695 duration = sample->time - ttrace->entry_time;
1696 if (trace__filter_duration(trace, duration))
1697 goto out;
1698 } else if (trace->duration_filter)
1699 goto out;
1700
1701 if (trace->summary_only)
1702 goto out;
1703
1704 trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1705
1706 if (ttrace->entry_pending) {
1707 fprintf(trace->output, "%-70s", ttrace->entry_str);
1708 } else {
1709 fprintf(trace->output, " ... [");
1710 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1711 fprintf(trace->output, "]: %s()", sc->name);
1712 }
1713
1714 if (sc->fmt == NULL) {
1715signed_print:
1716 fprintf(trace->output, ") = %d", ret);
1717 } else if (ret < 0 && sc->fmt->errmsg) {
1718 char bf[256];
1719 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1720 *e = audit_errno_to_name(-ret);
1721
1722 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1723 } else if (ret == 0 && sc->fmt->timeout)
1724 fprintf(trace->output, ") = 0 Timeout");
1725 else if (sc->fmt->hexret)
1726 fprintf(trace->output, ") = %#x", ret);
1727 else
1728 goto signed_print;
1729
1730 fputc('\n', trace->output);
1731out:
1732 ttrace->entry_pending = false;
1733
1734 return 0;
1735}
1736
1737static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1738 struct perf_sample *sample)
1739{
1740 trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1741 return 0;
1742}
1743
1744static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1745 struct perf_sample *sample)
1746{
1747 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1748 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1749 struct thread *thread = machine__findnew_thread(trace->host,
1750 sample->pid,
1751 sample->tid);
1752 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1753
1754 if (ttrace == NULL)
1755 goto out_dump;
1756
1757 ttrace->runtime_ms += runtime_ms;
1758 trace->runtime_ms += runtime_ms;
1759 return 0;
1760
1761out_dump:
1762 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1763 evsel->name,
1764 perf_evsel__strval(evsel, sample, "comm"),
1765 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1766 runtime,
1767 perf_evsel__intval(evsel, sample, "vruntime"));
1768 return 0;
1769}
1770
1771static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1772{
1773 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1774 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1775 return false;
1776
1777 if (trace->pid_list || trace->tid_list)
1778 return true;
1779
1780 return false;
1781}
1782
1783static int trace__process_sample(struct perf_tool *tool,
1784 union perf_event *event __maybe_unused,
1785 struct perf_sample *sample,
1786 struct perf_evsel *evsel,
1787 struct machine *machine __maybe_unused)
1788{
1789 struct trace *trace = container_of(tool, struct trace, tool);
1790 int err = 0;
1791
1792 tracepoint_handler handler = evsel->handler;
1793
1794 if (skip_sample(trace, sample))
1795 return 0;
1796
1797 if (!trace->full_time && trace->base_time == 0)
1798 trace->base_time = sample->time;
1799
1800 if (handler) {
1801 ++trace->nr_events;
1802 handler(trace, evsel, sample);
1803 }
1804
1805 return err;
1806}
1807
1808static int parse_target_str(struct trace *trace)
1809{
1810 if (trace->opts.target.pid) {
1811 trace->pid_list = intlist__new(trace->opts.target.pid);
1812 if (trace->pid_list == NULL) {
1813 pr_err("Error parsing process id string\n");
1814 return -EINVAL;
1815 }
1816 }
1817
1818 if (trace->opts.target.tid) {
1819 trace->tid_list = intlist__new(trace->opts.target.tid);
1820 if (trace->tid_list == NULL) {
1821 pr_err("Error parsing thread id string\n");
1822 return -EINVAL;
1823 }
1824 }
1825
1826 return 0;
1827}
1828
/*
 * Implement 'perf trace record': build an argv for 'perf record' that
 * enables the syscall enter/exit tracepoints, append the user's arguments
 * and hand it to cmd_record().
 *
 * Returns cmd_record()'s result, -ENOMEM if the argv cannot be allocated, or
 * -1 when neither tracepoint flavour exists. The argv is now freed on that
 * last path; it used to leak.
 */
static int trace__record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e",
	};

	/* +1 is for the event string below */
	rec_argc = ARRAY_SIZE(record_args) + 1 + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = record_args[i];

	/* event string may be different for older kernels - e.g., RHEL6 */
	if (is_valid_tracepoint("raw_syscalls:sys_enter"))
		rec_argv[i] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
	else if (is_valid_tracepoint("syscalls:sys_enter"))
		rec_argv[i] = "syscalls:sys_enter,syscalls:sys_exit";
	else {
		pr_err("Neither raw_syscalls nor syscalls events exist.\n");
		free(rec_argv);
		return -1;
	}
	i++;

	for (j = 0; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	return cmd_record(i, rec_argv, NULL);
}
1867
1868static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
1869
1870static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
1871{
1872 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
1873 if (evsel == NULL)
1874 return;
1875
1876 if (perf_evsel__field(evsel, "pathname") == NULL) {
1877 perf_evsel__delete(evsel);
1878 return;
1879 }
1880
1881 evsel->handler = trace__vfs_getname;
1882 perf_evlist__add(evlist, evsel);
1883}
1884
1885static int trace__run(struct trace *trace, int argc, const char **argv)
1886{
1887 struct perf_evlist *evlist = perf_evlist__new();
1888 struct perf_evsel *evsel;
1889 int err = -1, i;
1890 unsigned long before;
1891 const bool forks = argc > 0;
1892
1893 trace->live = true;
1894
1895 if (evlist == NULL) {
1896 fprintf(trace->output, "Not enough memory to run!\n");
1897 goto out;
1898 }
1899
1900 if (perf_evlist__add_syscall_newtp(evlist, trace__sys_enter, trace__sys_exit))
1901 goto out_error_tp;
1902
1903 perf_evlist__add_vfs_getname(evlist);
1904
1905 if (trace->sched &&
1906 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
1907 trace__sched_stat_runtime))
1908 goto out_error_tp;
1909
1910 err = perf_evlist__create_maps(evlist, &trace->opts.target);
1911 if (err < 0) {
1912 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
1913 goto out_delete_evlist;
1914 }
1915
1916 err = trace__symbols_init(trace, evlist);
1917 if (err < 0) {
1918 fprintf(trace->output, "Problems initializing symbol libraries!\n");
1919 goto out_delete_evlist;
1920 }
1921
1922 perf_evlist__config(evlist, &trace->opts);
1923
1924 signal(SIGCHLD, sig_handler);
1925 signal(SIGINT, sig_handler);
1926
1927 if (forks) {
1928 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
1929 argv, false, NULL);
1930 if (err < 0) {
1931 fprintf(trace->output, "Couldn't run the workload!\n");
1932 goto out_delete_evlist;
1933 }
1934 }
1935
1936 err = perf_evlist__open(evlist);
1937 if (err < 0)
1938 goto out_error_open;
1939
1940 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
1941 if (err < 0) {
1942 fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
1943 goto out_delete_evlist;
1944 }
1945
1946 perf_evlist__enable(evlist);
1947
1948 if (forks)
1949 perf_evlist__start_workload(evlist);
1950
1951 trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1;
1952again:
1953 before = trace->nr_events;
1954
1955 for (i = 0; i < evlist->nr_mmaps; i++) {
1956 union perf_event *event;
1957
1958 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
1959 const u32 type = event->header.type;
1960 tracepoint_handler handler;
1961 struct perf_sample sample;
1962
1963 ++trace->nr_events;
1964
1965 err = perf_evlist__parse_sample(evlist, event, &sample);
1966 if (err) {
1967 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
1968 goto next_event;
1969 }
1970
1971 if (!trace->full_time && trace->base_time == 0)
1972 trace->base_time = sample.time;
1973
1974 if (type != PERF_RECORD_SAMPLE) {
1975 trace__process_event(trace, trace->host, event, &sample);
1976 continue;
1977 }
1978
1979 evsel = perf_evlist__id2evsel(evlist, sample.id);
1980 if (evsel == NULL) {
1981 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id);
1982 goto next_event;
1983 }
1984
1985 if (sample.raw_data == NULL) {
1986 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
1987 perf_evsel__name(evsel), sample.tid,
1988 sample.cpu, sample.raw_size);
1989 goto next_event;
1990 }
1991
1992 handler = evsel->handler;
1993 handler(trace, evsel, &sample);
1994next_event:
1995 perf_evlist__mmap_consume(evlist, i);
1996
1997 if (interrupted)
1998 goto out_disable;
1999 }
2000 }
2001
2002 if (trace->nr_events == before) {
2003 int timeout = done ? 100 : -1;
2004
2005 if (poll(evlist->pollfd, evlist->nr_fds, timeout) > 0)
2006 goto again;
2007 } else {
2008 goto again;
2009 }
2010
2011out_disable:
2012 perf_evlist__disable(evlist);
2013
2014 if (!err) {
2015 if (trace->summary)
2016 trace__fprintf_thread_summary(trace, trace->output);
2017
2018 if (trace->show_tool_stats) {
2019 fprintf(trace->output, "Stats:\n "
2020 " vfs_getname : %" PRIu64 "\n"
2021 " proc_getname: %" PRIu64 "\n",
2022 trace->stats.vfs_getname,
2023 trace->stats.proc_getname);
2024 }
2025 }
2026
2027out_delete_evlist:
2028 perf_evlist__delete(evlist);
2029out:
2030 trace->live = false;
2031 return err;
2032{
2033 char errbuf[BUFSIZ];
2034
2035out_error_tp:
2036 perf_evlist__strerror_tp(evlist, errno, errbuf, sizeof(errbuf));
2037 goto out_error;
2038
2039out_error_open:
2040 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2041
2042out_error:
2043 fprintf(trace->output, "%s\n", errbuf);
2044 goto out_delete_evlist;
2045}
2046}
2047
2048static int trace__replay(struct trace *trace)
2049{
2050 const struct perf_evsel_str_handler handlers[] = {
2051 { "probe:vfs_getname", trace__vfs_getname, },
2052 };
2053 struct perf_data_file file = {
2054 .path = input_name,
2055 .mode = PERF_DATA_MODE_READ,
2056 };
2057 struct perf_session *session;
2058 struct perf_evsel *evsel;
2059 int err = -1;
2060
2061 trace->tool.sample = trace__process_sample;
2062 trace->tool.mmap = perf_event__process_mmap;
2063 trace->tool.mmap2 = perf_event__process_mmap2;
2064 trace->tool.comm = perf_event__process_comm;
2065 trace->tool.exit = perf_event__process_exit;
2066 trace->tool.fork = perf_event__process_fork;
2067 trace->tool.attr = perf_event__process_attr;
2068 trace->tool.tracing_data = perf_event__process_tracing_data;
2069 trace->tool.build_id = perf_event__process_build_id;
2070
2071 trace->tool.ordered_samples = true;
2072 trace->tool.ordering_requires_timestamps = true;
2073
2074 /* add tid to output */
2075 trace->multiple_threads = true;
2076
2077 if (symbol__init() < 0)
2078 return -1;
2079
2080 session = perf_session__new(&file, false, &trace->tool);
2081 if (session == NULL)
2082 return -ENOMEM;
2083
2084 trace->host = &session->machines.host;
2085
2086 err = perf_session__set_tracepoints_handlers(session, handlers);
2087 if (err)
2088 goto out;
2089
2090 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2091 "raw_syscalls:sys_enter");
2092 /* older kernels have syscalls tp versus raw_syscalls */
2093 if (evsel == NULL)
2094 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2095 "syscalls:sys_enter");
2096 if (evsel == NULL) {
2097 pr_err("Data file does not have raw_syscalls:sys_enter event\n");
2098 goto out;
2099 }
2100
2101 if (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2102 perf_evsel__init_sc_tp_ptr_field(evsel, args)) {
2103 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2104 goto out;
2105 }
2106
2107 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2108 "raw_syscalls:sys_exit");
2109 if (evsel == NULL)
2110 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2111 "syscalls:sys_exit");
2112 if (evsel == NULL) {
2113 pr_err("Data file does not have raw_syscalls:sys_exit event\n");
2114 goto out;
2115 }
2116
2117 if (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2118 perf_evsel__init_sc_tp_uint_field(evsel, ret)) {
2119 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2120 goto out;
2121 }
2122
2123 err = parse_target_str(trace);
2124 if (err != 0)
2125 goto out;
2126
2127 setup_pager();
2128
2129 err = perf_session__process_events(session, &trace->tool);
2130 if (err)
2131 pr_err("Failed to process events, error %d", err);
2132
2133 else if (trace->summary)
2134 trace__fprintf_thread_summary(trace, trace->output);
2135
2136out:
2137 perf_session__delete(session);
2138
2139 return err;
2140}
2141
/* Print the heading of the per-thread summary; returns fprintf()'s result. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2150
2151static size_t thread__dump_stats(struct thread_trace *ttrace,
2152 struct trace *trace, FILE *fp)
2153{
2154 struct stats *stats;
2155 size_t printed = 0;
2156 struct syscall *sc;
2157 struct int_node *inode = intlist__first(ttrace->syscall_stats);
2158
2159 if (inode == NULL)
2160 return 0;
2161
2162 printed += fprintf(fp, "\n");
2163
2164 printed += fprintf(fp, " syscall calls min avg max stddev\n");
2165 printed += fprintf(fp, " (msec) (msec) (msec) (%%)\n");
2166 printed += fprintf(fp, " --------------- -------- --------- --------- --------- ------\n");
2167
2168 /* each int_node is a syscall */
2169 while (inode) {
2170 stats = inode->priv;
2171 if (stats) {
2172 double min = (double)(stats->min) / NSEC_PER_MSEC;
2173 double max = (double)(stats->max) / NSEC_PER_MSEC;
2174 double avg = avg_stats(stats);
2175 double pct;
2176 u64 n = (u64) stats->n;
2177
2178 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2179 avg /= NSEC_PER_MSEC;
2180
2181 sc = &trace->syscalls.table[inode->i];
2182 printed += fprintf(fp, " %-15s", sc->name);
2183 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2184 n, min, avg);
2185 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2186 }
2187
2188 inode = intlist__next(inode);
2189 }
2190
2191 printed += fprintf(fp, "\n\n");
2192
2193 return printed;
2194}
2195
/*
 * Context handed to the per-thread summary callback through its void *priv
 * argument.
 */
struct summary_data {
	FILE		*fp;		/* stream the summary is written to */
	struct trace	*trace;		/* trace session being summarized */
	size_t		printed;	/* running total of fprintf() results */
};
2202
2203static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2204{
2205 struct summary_data *data = priv;
2206 FILE *fp = data->fp;
2207 size_t printed = data->printed;
2208 struct trace *trace = data->trace;
2209 struct thread_trace *ttrace = thread->priv;
2210 double ratio;
2211
2212 if (ttrace == NULL)
2213 return 0;
2214
2215 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2216
2217 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2218 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2219 printed += fprintf(fp, "%.1f%%", ratio);
2220 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2221 printed += thread__dump_stats(ttrace, trace, fp);
2222
2223 data->printed += printed;
2224
2225 return 0;
2226}
2227
2228static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2229{
2230 struct summary_data data = {
2231 .fp = fp,
2232 .trace = trace
2233 };
2234 data.printed = trace__fprintf_threads_header(fp);
2235
2236 machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2237
2238 return data.printed;
2239}
2240
2241static int trace__set_duration(const struct option *opt, const char *str,
2242 int unset __maybe_unused)
2243{
2244 struct trace *trace = opt->value;
2245
2246 trace->duration_filter = atof(str);
2247 return 0;
2248}
2249
2250static int trace__open_output(struct trace *trace, const char *filename)
2251{
2252 struct stat st;
2253
2254 if (!stat(filename, &st) && st.st_size) {
2255 char oldname[PATH_MAX];
2256
2257 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2258 unlink(oldname);
2259 rename(filename, oldname);
2260 }
2261
2262 trace->output = fopen(filename, "w");
2263
2264 return trace->output == NULL ? -errno : 0;
2265}
2266
2267int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2268{
2269 const char * const trace_usage[] = {
2270 "perf trace [<options>] [<command>]",
2271 "perf trace [<options>] -- <command> [<options>]",
2272 "perf trace record [<options>] [<command>]",
2273 "perf trace record [<options>] -- <command> [<options>]",
2274 NULL
2275 };
2276 struct trace trace = {
2277 .audit = {
2278 .machine = audit_detect_machine(),
2279 .open_id = audit_name_to_syscall("open", trace.audit.machine),
2280 },
2281 .syscalls = {
2282 . max = -1,
2283 },
2284 .opts = {
2285 .target = {
2286 .uid = UINT_MAX,
2287 .uses_mmap = true,
2288 },
2289 .user_freq = UINT_MAX,
2290 .user_interval = ULLONG_MAX,
2291 .no_buffering = true,
2292 .mmap_pages = 1024,
2293 },
2294 .output = stdout,
2295 .show_comm = true,
2296 };
2297 const char *output_name = NULL;
2298 const char *ev_qualifier_str = NULL;
2299 const struct option trace_options[] = {
2300 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2301 "show the thread COMM next to its id"),
2302 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2303 OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
2304 "list of events to trace"),
2305 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2306 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2307 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2308 "trace events on existing process id"),
2309 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2310 "trace events on existing thread id"),
2311 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2312 "system-wide collection from all CPUs"),
2313 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2314 "list of cpus to monitor"),
2315 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2316 "child tasks do not inherit counters"),
2317 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2318 "number of mmap data pages",
2319 perf_evlist__parse_mmap_pages),
2320 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2321 "user to profile"),
2322 OPT_CALLBACK(0, "duration", &trace, "float",
2323 "show only events with duration > N.M ms",
2324 trace__set_duration),
2325 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2326 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2327 OPT_BOOLEAN('T', "time", &trace.full_time,
2328 "Show full timestamp, not time relative to first start"),
2329 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2330 "Show only syscall summary with statistics"),
2331 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2332 "Show all syscalls and summary with statistics"),
2333 OPT_END()
2334 };
2335 int err;
2336 char bf[BUFSIZ];
2337
2338 if ((argc > 1) && (strcmp(argv[1], "record") == 0))
2339 return trace__record(argc-2, &argv[2]);
2340
2341 argc = parse_options(argc, argv, trace_options, trace_usage, 0);
2342
2343 /* summary_only implies summary option, but don't overwrite summary if set */
2344 if (trace.summary_only)
2345 trace.summary = trace.summary_only;
2346
2347 if (output_name != NULL) {
2348 err = trace__open_output(&trace, output_name);
2349 if (err < 0) {
2350 perror("failed to create output file");
2351 goto out;
2352 }
2353 }
2354
2355 if (ev_qualifier_str != NULL) {
2356 const char *s = ev_qualifier_str;
2357
2358 trace.not_ev_qualifier = *s == '!';
2359 if (trace.not_ev_qualifier)
2360 ++s;
2361 trace.ev_qualifier = strlist__new(true, s);
2362 if (trace.ev_qualifier == NULL) {
2363 fputs("Not enough memory to parse event qualifier",
2364 trace.output);
2365 err = -ENOMEM;
2366 goto out_close;
2367 }
2368 }
2369
2370 err = target__validate(&trace.opts.target);
2371 if (err) {
2372 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2373 fprintf(trace.output, "%s", bf);
2374 goto out_close;
2375 }
2376
2377 err = target__parse_uid(&trace.opts.target);
2378 if (err) {
2379 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2380 fprintf(trace.output, "%s", bf);
2381 goto out_close;
2382 }
2383
2384 if (!argc && target__none(&trace.opts.target))
2385 trace.opts.target.system_wide = true;
2386
2387 if (input_name)
2388 err = trace__replay(&trace);
2389 else
2390 err = trace__run(&trace, argc, argv);
2391
2392out_close:
2393 if (output_name != NULL)
2394 fclose(trace.output);
2395out:
2396 return err;
2397}
1/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 */
16
17#include "util/record.h"
18#include <traceevent/event-parse.h>
19#include <api/fs/tracing_path.h>
20#include <bpf/bpf.h>
21#include "util/bpf_map.h"
22#include "util/rlimit.h"
23#include "builtin.h"
24#include "util/cgroup.h"
25#include "util/color.h"
26#include "util/config.h"
27#include "util/debug.h"
28#include "util/dso.h"
29#include "util/env.h"
30#include "util/event.h"
31#include "util/evsel.h"
32#include "util/evsel_fprintf.h"
33#include "util/synthetic-events.h"
34#include "util/evlist.h"
35#include "util/evswitch.h"
36#include "util/mmap.h"
37#include <subcmd/pager.h>
38#include <subcmd/exec-cmd.h>
39#include "util/machine.h"
40#include "util/map.h"
41#include "util/symbol.h"
42#include "util/path.h"
43#include "util/session.h"
44#include "util/thread.h"
45#include <subcmd/parse-options.h>
46#include "util/strlist.h"
47#include "util/intlist.h"
48#include "util/thread_map.h"
49#include "util/stat.h"
50#include "util/tool.h"
51#include "util/util.h"
52#include "trace/beauty/beauty.h"
53#include "trace-event.h"
54#include "util/parse-events.h"
55#include "util/bpf-loader.h"
56#include "callchain.h"
57#include "print_binary.h"
58#include "string2.h"
59#include "syscalltbl.h"
60#include "rb_resort.h"
61#include "../perf.h"
62
63#include <errno.h>
64#include <inttypes.h>
65#include <poll.h>
66#include <signal.h>
67#include <stdlib.h>
68#include <string.h>
69#include <linux/err.h>
70#include <linux/filter.h>
71#include <linux/kernel.h>
72#include <linux/random.h>
73#include <linux/stringify.h>
74#include <linux/time64.h>
75#include <linux/zalloc.h>
76#include <fcntl.h>
77#include <sys/sysmacros.h>
78
79#include <linux/ctype.h>
80
/* Fallback definitions for build environments whose headers lack them. */
#ifndef O_CLOEXEC
#define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
#define F_LINUX_SPECIFIC_BASE	1024
#endif
88
89struct trace {
90 struct perf_tool tool;
91 struct syscalltbl *sctbl;
92 struct {
93 struct syscall *table;
94 struct bpf_map *map;
95 struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY
96 struct bpf_map *sys_enter,
97 *sys_exit;
98 } prog_array;
99 struct {
100 struct evsel *sys_enter,
101 *sys_exit,
102 *augmented;
103 } events;
104 struct bpf_program *unaugmented_prog;
105 } syscalls;
106 struct {
107 struct bpf_map *map;
108 } dump;
109 struct record_opts opts;
110 struct evlist *evlist;
111 struct machine *host;
112 struct thread *current;
113 struct bpf_object *bpf_obj;
114 struct cgroup *cgroup;
115 u64 base_time;
116 FILE *output;
117 unsigned long nr_events;
118 unsigned long nr_events_printed;
119 unsigned long max_events;
120 struct evswitch evswitch;
121 struct strlist *ev_qualifier;
122 struct {
123 size_t nr;
124 int *entries;
125 } ev_qualifier_ids;
126 struct {
127 size_t nr;
128 pid_t *entries;
129 struct bpf_map *map;
130 } filter_pids;
131 double duration_filter;
132 double runtime_ms;
133 struct {
134 u64 vfs_getname,
135 proc_getname;
136 } stats;
137 unsigned int max_stack;
138 unsigned int min_stack;
139 int raw_augmented_syscalls_args_size;
140 bool raw_augmented_syscalls;
141 bool fd_path_disabled;
142 bool sort_events;
143 bool not_ev_qualifier;
144 bool live;
145 bool full_time;
146 bool sched;
147 bool multiple_threads;
148 bool summary;
149 bool summary_only;
150 bool failure_only;
151 bool show_comm;
152 bool print_sample;
153 bool show_tool_stats;
154 bool trace_syscalls;
155 bool kernel_syscallchains;
156 s16 args_alignment;
157 bool show_tstamp;
158 bool show_duration;
159 bool show_zeros;
160 bool show_arg_names;
161 bool show_string_prefix;
162 bool force;
163 bool vfs_getname;
164 int trace_pgfaults;
165 struct {
166 struct ordered_events data;
167 u64 last;
168 } oe;
169};
170
171struct tp_field {
172 int offset;
173 union {
174 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
175 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
176 };
177};
178
179#define TP_UINT_FIELD(bits) \
180static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
181{ \
182 u##bits value; \
183 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
184 return value; \
185}
186
187TP_UINT_FIELD(8);
188TP_UINT_FIELD(16);
189TP_UINT_FIELD(32);
190TP_UINT_FIELD(64);
191
192#define TP_UINT_FIELD__SWAPPED(bits) \
193static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
194{ \
195 u##bits value; \
196 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
197 return bswap_##bits(value);\
198}
199
200TP_UINT_FIELD__SWAPPED(16);
201TP_UINT_FIELD__SWAPPED(32);
202TP_UINT_FIELD__SWAPPED(64);
203
204static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
205{
206 field->offset = offset;
207
208 switch (size) {
209 case 1:
210 field->integer = tp_field__u8;
211 break;
212 case 2:
213 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
214 break;
215 case 4:
216 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
217 break;
218 case 8:
219 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
220 break;
221 default:
222 return -1;
223 }
224
225 return 0;
226}
227
228static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
229{
230 return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
231}
232
233static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
234{
235 return sample->raw_data + field->offset;
236}
237
238static int __tp_field__init_ptr(struct tp_field *field, int offset)
239{
240 field->offset = offset;
241 field->pointer = tp_field__ptr;
242 return 0;
243}
244
245static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
246{
247 return __tp_field__init_ptr(field, format_field->offset);
248}
249
250struct syscall_tp {
251 struct tp_field id;
252 union {
253 struct tp_field args, ret;
254 };
255};
256
257static int perf_evsel__init_tp_uint_field(struct evsel *evsel,
258 struct tp_field *field,
259 const char *name)
260{
261 struct tep_format_field *format_field = perf_evsel__field(evsel, name);
262
263 if (format_field == NULL)
264 return -1;
265
266 return tp_field__init_uint(field, format_field, evsel->needs_swap);
267}
268
/*
 * Set up the 'name' member of the struct syscall_tp stored in evsel->priv as
 * an unsigned integer reader for the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc_tp__ = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc_tp__->name, #name); })
272
/*
 * Look up the tracepoint field called @name on @evsel and set up @field as
 * a pointer reader for it.
 *
 * Returns -1 when the field is not found, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct tep_format_field *fmt = perf_evsel__field(evsel, name);

	if (!fmt)
		return -1;

	return tp_field__init_ptr(field, fmt);
}
284
/*
 * Set up the 'name' member of the struct syscall_tp stored in evsel->priv as
 * a pointer reader for the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc_tp__ = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc_tp__->name, #name); })
288
289static void evsel__delete_priv(struct evsel *evsel)
290{
291 zfree(&evsel->priv);
292 evsel__delete(evsel);
293}
294
295static int perf_evsel__init_syscall_tp(struct evsel *evsel)
296{
297 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
298
299 if (evsel->priv != NULL) {
300 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
301 perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
302 goto out_delete;
303 return 0;
304 }
305
306 return -ENOMEM;
307out_delete:
308 zfree(&evsel->priv);
309 return -ENOENT;
310}
311
312static int perf_evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
313{
314 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
315
316 if (evsel->priv != NULL) {
317 struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
318 if (syscall_id == NULL)
319 syscall_id = perf_evsel__field(tp, "__syscall_nr");
320 if (syscall_id == NULL)
321 goto out_delete;
322 if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
323 goto out_delete;
324
325 return 0;
326 }
327
328 return -ENOMEM;
329out_delete:
330 zfree(&evsel->priv);
331 return -EINVAL;
332}
333
334static int perf_evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
335{
336 struct syscall_tp *sc = evsel->priv;
337
338 return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
339}
340
341static int perf_evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
342{
343 struct syscall_tp *sc = evsel->priv;
344
345 return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
346}
347
348static int perf_evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
349{
350 evsel->priv = malloc(sizeof(struct syscall_tp));
351 if (evsel->priv != NULL) {
352 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
353 goto out_delete;
354
355 evsel->handler = handler;
356 return 0;
357 }
358
359 return -ENOMEM;
360
361out_delete:
362 zfree(&evsel->priv);
363 return -ENOENT;
364}
365
/*
 * Create the evsel for the raw_syscalls:<direction> tracepoint, falling back
 * to syscalls:<direction>, and initialize it with @handler.
 *
 * Returns the new evsel, or NULL when neither tracepoint can be created or
 * the initialization fails (the evsel is deleted in the latter case).
 */
static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler) == 0)
		return evsel;

	evsel__delete_priv(evsel);
	return NULL;
}
386
/*
 * Read the 'name' member of the struct syscall_tp stored in evsel->priv from
 * @sample, through its integer reader.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *tp_fields__ = evsel->priv; \
	   tp_fields__->name.integer(&tp_fields__->name, sample); })

/* Same as above, but through the pointer reader of the member. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *tp_fields__ = evsel->priv; \
	   tp_fields__->name.pointer(&tp_fields__->name, sample); })
394
395size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
396{
397 int idx = val - sa->offset;
398
399 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
400 size_t printed = scnprintf(bf, size, intfmt, val);
401 if (show_prefix)
402 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
403 return printed;
404 }
405
406 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
407}
408
409static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
410 const char *intfmt,
411 struct syscall_arg *arg)
412{
413 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
414}
415
/* Strarray formatter that prints unnamed values as decimal integers. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
423
424size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
425{
426 return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
427}
428
429size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
430{
431 size_t printed;
432 int i;
433
434 for (i = 0; i < sas->nr_entries; ++i) {
435 struct strarray *sa = sas->entries[i];
436 int idx = val - sa->offset;
437
438 if (idx >= 0 && idx < sa->nr_entries) {
439 if (sa->entries[idx] == NULL)
440 break;
441 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
442 }
443 }
444
445 printed = scnprintf(bf, size, intfmt, val);
446 if (show_prefix)
447 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
448 return printed;
449}
450
451size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
452 struct syscall_arg *arg)
453{
454 return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
455}
456
457#ifndef AT_FDCWD
458#define AT_FDCWD -100
459#endif
460
461static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
462 struct syscall_arg *arg)
463{
464 int fd = arg->val;
465 const char *prefix = "AT_FD";
466
467 if (fd == AT_FDCWD)
468 return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
469
470 return syscall_arg__scnprintf_fd(bf, size, arg);
471}
472
473#define SCA_FDAT syscall_arg__scnprintf_fd_at
474
/* Forward declaration; the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
479
480size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
481{
482 return scnprintf(bf, size, "%#lx", arg->val);
483}
484
485size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
486{
487 if (arg->val == 0)
488 return scnprintf(bf, size, "NULL");
489 return syscall_arg__scnprintf_hex(bf, size, arg);
490}
491
492size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
493{
494 return scnprintf(bf, size, "%d", arg->val);
495}
496
497size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
498{
499 return scnprintf(bf, size, "%ld", arg->val);
500}
501
502static const char *bpf_cmd[] = {
503 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
504 "MAP_GET_NEXT_KEY", "PROG_LOAD",
505};
506static DEFINE_STRARRAY(bpf_cmd, "BPF_");
507
508static const char *fsmount_flags[] = {
509 [1] = "CLOEXEC",
510};
511static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");
512
513#include "trace/beauty/generated/fsconfig_arrays.c"
514
515static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");
516
517static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
518static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
519
520static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
521static DEFINE_STRARRAY(itimers, "ITIMER_");
522
523static const char *keyctl_options[] = {
524 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
525 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
526 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
527 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
528 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
529};
530static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
531
532static const char *whences[] = { "SET", "CUR", "END",
533#ifdef SEEK_DATA
534"DATA",
535#endif
536#ifdef SEEK_HOLE
537"HOLE",
538#endif
539};
540static DEFINE_STRARRAY(whences, "SEEK_");
541
542static const char *fcntl_cmds[] = {
543 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
544 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
545 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
546 "GETOWNER_UIDS",
547};
548static DEFINE_STRARRAY(fcntl_cmds, "F_");
549
550static const char *fcntl_linux_specific_cmds[] = {
551 "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
552 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
553 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
554};
555
556static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
557
558static struct strarray *fcntl_cmds_arrays[] = {
559 &strarray__fcntl_cmds,
560 &strarray__fcntl_linux_specific_cmds,
561};
562
563static DEFINE_STRARRAYS(fcntl_cmds_arrays);
564
565static const char *rlimit_resources[] = {
566 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
567 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
568 "RTTIME",
569};
570static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
571
572static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
573static DEFINE_STRARRAY(sighow, "SIG_");
574
575static const char *clockid[] = {
576 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
577 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
578 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
579};
580static DEFINE_STRARRAY(clockid, "CLOCK_");
581
582static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
583 struct syscall_arg *arg)
584{
585 bool show_prefix = arg->show_string_prefix;
586 const char *suffix = "_OK";
587 size_t printed = 0;
588 int mode = arg->val;
589
590 if (mode == F_OK) /* 0 */
591 return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
592#define P_MODE(n) \
593 if (mode & n##_OK) { \
594 printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
595 mode &= ~n##_OK; \
596 }
597
598 P_MODE(R);
599 P_MODE(W);
600 P_MODE(X);
601#undef P_MODE
602
603 if (mode)
604 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
605
606 return printed;
607}
608
609#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
610
/* Forward declaration; the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
615
616static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
617 struct syscall_arg *arg)
618{
619 bool show_prefix = arg->show_string_prefix;
620 const char *prefix = "O_";
621 int printed = 0, flags = arg->val;
622
623#define P_FLAG(n) \
624 if (flags & O_##n) { \
625 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
626 flags &= ~O_##n; \
627 }
628
629 P_FLAG(CLOEXEC);
630 P_FLAG(NONBLOCK);
631#undef P_FLAG
632
633 if (flags)
634 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
635
636 return printed;
637}
638
639#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
640
641#ifndef GRND_NONBLOCK
642#define GRND_NONBLOCK 0x0001
643#endif
644#ifndef GRND_RANDOM
645#define GRND_RANDOM 0x0002
646#endif
647
648static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
649 struct syscall_arg *arg)
650{
651 bool show_prefix = arg->show_string_prefix;
652 const char *prefix = "GRND_";
653 int printed = 0, flags = arg->val;
654
655#define P_FLAG(n) \
656 if (flags & GRND_##n) { \
657 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
658 flags &= ~GRND_##n; \
659 }
660
661 P_FLAG(RANDOM);
662 P_FLAG(NONBLOCK);
663#undef P_FLAG
664
665 if (flags)
666 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
667
668 return printed;
669}
670
671#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
672
/*
 * Initializers for a struct syscall_arg_fmt that formats its argument through
 * the strarray__<array> table.  The 'name' parameter is not used by the
 * expansion; it only labels the argument at the point of use.
 */
#define STRARRAY(name, array) \
	{ .scnprintf	= SCA_STRARRAY, \
	  .parm		= &strarray__##array, }

/* Same, but formatting the value with SCA_STRARRAY_FLAGS. */
#define STRARRAY_FLAGS(name, array) \
	{ .scnprintf	= SCA_STRARRAY_FLAGS, \
	  .parm		= &strarray__##array, }
680
681#include "trace/beauty/arch_errno_names.c"
682#include "trace/beauty/eventfd.c"
683#include "trace/beauty/futex_op.c"
684#include "trace/beauty/futex_val3.c"
685#include "trace/beauty/mmap.c"
686#include "trace/beauty/mode_t.c"
687#include "trace/beauty/msg_flags.c"
688#include "trace/beauty/open_flags.c"
689#include "trace/beauty/perf_event_open.c"
690#include "trace/beauty/pid.c"
691#include "trace/beauty/sched_policy.c"
692#include "trace/beauty/seccomp.c"
693#include "trace/beauty/signum.c"
694#include "trace/beauty/socket_type.c"
695#include "trace/beauty/waitid_options.c"
696
/* Describes how one syscall argument is formatted. */
struct syscall_arg_fmt {
	/* formatter writing the argument into bf; returns the printed length */
	size_t	      (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* optional hook taking the argument and its value, returning a value */
	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
	/* formatter specific data, e.g. a strarray */
	void	      *parm;
	const char    *name;
	bool	      show_zero;
};
704
705static struct syscall_fmt {
706 const char *name;
707 const char *alias;
708 struct {
709 const char *sys_enter,
710 *sys_exit;
711 } bpf_prog_name;
712 struct syscall_arg_fmt arg[6];
713 u8 nr_args;
714 bool errpid;
715 bool timeout;
716 bool hexret;
717} syscall_fmts[] = {
718 { .name = "access",
719 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
720 { .name = "arch_prctl",
721 .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
722 [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
723 { .name = "bind",
724 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
725 [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ },
726 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
727 { .name = "bpf",
728 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
729 { .name = "brk", .hexret = true,
730 .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
731 { .name = "clock_gettime",
732 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
733 { .name = "clone", .errpid = true, .nr_args = 5,
734 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
735 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
736 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
737 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
738 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
739 { .name = "close",
740 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
741 { .name = "connect",
742 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
743 [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ },
744 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
745 { .name = "epoll_ctl",
746 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
747 { .name = "eventfd2",
748 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
749 { .name = "fchmodat",
750 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
751 { .name = "fchownat",
752 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
753 { .name = "fcntl",
754 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
755 .parm = &strarrays__fcntl_cmds_arrays,
756 .show_zero = true, },
757 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
758 { .name = "flock",
759 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
760 { .name = "fsconfig",
761 .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
762 { .name = "fsmount",
763 .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
764 [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
765 { .name = "fspick",
766 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
767 [1] = { .scnprintf = SCA_FILENAME, /* path */ },
768 [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
769 { .name = "fstat", .alias = "newfstat", },
770 { .name = "fstatat", .alias = "newfstatat", },
771 { .name = "futex",
772 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
773 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
774 { .name = "futimesat",
775 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
776 { .name = "getitimer",
777 .arg = { [0] = STRARRAY(which, itimers), }, },
778 { .name = "getpid", .errpid = true, },
779 { .name = "getpgid", .errpid = true, },
780 { .name = "getppid", .errpid = true, },
781 { .name = "getrandom",
782 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
783 { .name = "getrlimit",
784 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
785 { .name = "gettid", .errpid = true, },
786 { .name = "ioctl",
787 .arg = {
788#if defined(__i386__) || defined(__x86_64__)
789/*
790 * FIXME: Make this available to all arches.
791 */
792 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
793 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
794#else
795 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
796#endif
797 { .name = "kcmp", .nr_args = 5,
798 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
799 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
800 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
801 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
802 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
803 { .name = "keyctl",
804 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
805 { .name = "kill",
806 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
807 { .name = "linkat",
808 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
809 { .name = "lseek",
810 .arg = { [2] = STRARRAY(whence, whences), }, },
811 { .name = "lstat", .alias = "newlstat", },
812 { .name = "madvise",
813 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
814 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
815 { .name = "mkdirat",
816 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
817 { .name = "mknodat",
818 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
819 { .name = "mmap", .hexret = true,
820/* The standard mmap maps to old_mmap on s390x */
821#if defined(__s390x__)
822 .alias = "old_mmap",
823#endif
824 .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
825 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ },
826 [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, },
827 { .name = "mount",
828 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
829 [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
830 .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
831 { .name = "move_mount",
832 .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ },
833 [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ },
834 [2] = { .scnprintf = SCA_FDAT, /* to_dfd */ },
835 [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ },
836 [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
837 { .name = "mprotect",
838 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
839 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
840 { .name = "mq_unlink",
841 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
842 { .name = "mremap", .hexret = true,
843 .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
844 { .name = "name_to_handle_at",
845 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
846 { .name = "newfstatat",
847 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
848 { .name = "open",
849 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
850 { .name = "open_by_handle_at",
851 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
852 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
853 { .name = "openat",
854 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
855 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
856 { .name = "perf_event_open",
857 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
858 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
859 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
860 { .name = "pipe2",
861 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
862 { .name = "pkey_alloc",
863 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
864 { .name = "pkey_free",
865 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
866 { .name = "pkey_mprotect",
867 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
868 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
869 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
870 { .name = "poll", .timeout = true, },
871 { .name = "ppoll", .timeout = true, },
872 { .name = "prctl",
873 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
874 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
875 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
876 { .name = "pread", .alias = "pread64", },
877 { .name = "preadv", .alias = "pread", },
878 { .name = "prlimit64",
879 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
880 { .name = "pwrite", .alias = "pwrite64", },
881 { .name = "readlinkat",
882 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
883 { .name = "recvfrom",
884 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
885 { .name = "recvmmsg",
886 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
887 { .name = "recvmsg",
888 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
889 { .name = "renameat",
890 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
891 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
892 { .name = "renameat2",
893 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
894 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
895 [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
896 { .name = "rt_sigaction",
897 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
898 { .name = "rt_sigprocmask",
899 .arg = { [0] = STRARRAY(how, sighow), }, },
900 { .name = "rt_sigqueueinfo",
901 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
902 { .name = "rt_tgsigqueueinfo",
903 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
904 { .name = "sched_setscheduler",
905 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
906 { .name = "seccomp",
907 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
908 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
909 { .name = "select", .timeout = true, },
910 { .name = "sendfile", .alias = "sendfile64", },
911 { .name = "sendmmsg",
912 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
913 { .name = "sendmsg",
914 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
915 { .name = "sendto",
916 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
917 [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
918 { .name = "set_tid_address", .errpid = true, },
919 { .name = "setitimer",
920 .arg = { [0] = STRARRAY(which, itimers), }, },
921 { .name = "setrlimit",
922 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
923 { .name = "socket",
924 .arg = { [0] = STRARRAY(family, socket_families),
925 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
926 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
927 { .name = "socketpair",
928 .arg = { [0] = STRARRAY(family, socket_families),
929 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
930 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
931 { .name = "stat", .alias = "newstat", },
932 { .name = "statx",
933 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
934 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
935 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
936 { .name = "swapoff",
937 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
938 { .name = "swapon",
939 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
940 { .name = "symlinkat",
941 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
942 { .name = "sync_file_range",
943 .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
944 { .name = "tgkill",
945 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
946 { .name = "tkill",
947 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
948 { .name = "umount2", .alias = "umount",
949 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
950 { .name = "uname", .alias = "newuname", },
951 { .name = "unlinkat",
952 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
953 { .name = "utimensat",
954 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
955 { .name = "wait4", .errpid = true,
956 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
957 { .name = "waitid", .errpid = true,
958 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
959};
960
961static int syscall_fmt__cmp(const void *name, const void *fmtp)
962{
963 const struct syscall_fmt *fmt = fmtp;
964 return strcmp(name, fmt->name);
965}
966
967static struct syscall_fmt *syscall_fmt__find(const char *name)
968{
969 const int nmemb = ARRAY_SIZE(syscall_fmts);
970 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
971}
972
973static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
974{
975 int i, nmemb = ARRAY_SIZE(syscall_fmts);
976
977 for (i = 0; i < nmemb; ++i) {
978 if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
979 return &syscall_fmts[i];
980 }
981
982 return NULL;
983}
984
/*
 * Per syscall id state, filled in lazily by trace__read_syscall_info().
 *
 * tp_format:   result of looking up the syscalls:sys_enter_NAME tracepoint
 *              format, may be an ERR_PTR when the lookup failed
 * nr_args:     number of entries in arg_fmt / arguments to print
 * args_size:   sum of the sizes of the syscall arguments, anything after that
 *              is augmented stuff: pathname for openat, etc.
 * is_exit:     is this "exit" or "exit_group"?
 * is_open:     is this "open" or "openat"? To associate the fd returned in
 *              sys_exit with the pathname in sys_enter.
 * nonexistent: Just a hole in the syscall table, syscall id not allocated
 * args:        first tracepoint field describing a real syscall argument
 * name:        syscall name as returned by the syscall table
 * fmt:         static entry from syscall_fmts, NULL if there is none
 * arg_fmt:     per argument formatters, allocated by syscall__alloc_arg_fmts()
 */
struct syscall {
	struct tep_event    *tp_format;
	int		    nr_args;
	int		    args_size;
	struct {
		struct bpf_program *sys_enter;
		struct bpf_program *sys_exit;
	}		    bpf_prog;
	bool		    is_exit;
	bool		    is_open;
	bool		    nonexistent;
	struct tep_format_field *args;
	const char	    *name;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};
1007
1008/*
1009 * Must match what is in the BPF program:
1010 *
1011 * tools/perf/examples/bpf/augmented_raw_syscalls.c
1012 */
1013struct bpf_map_syscall_entry {
1014 bool enabled;
1015 u16 string_args_len[6];
1016};
1017
1018/*
1019 * We need to have this 'calculated' boolean because in some cases we really
1020 * don't know what is the duration of a syscall, for instance, when we start
1021 * a session and some threads are waiting for a syscall to finish, say 'poll',
1022 * in which case all we can do is to print "( ? ) for duration and for the
1023 * start timestamp.
1024 */
1025static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
1026{
1027 double duration = (double)t / NSEC_PER_MSEC;
1028 size_t printed = fprintf(fp, "(");
1029
1030 if (!calculated)
1031 printed += fprintf(fp, " ");
1032 else if (duration >= 1.0)
1033 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1034 else if (duration >= 0.01)
1035 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1036 else
1037 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1038 return printed + fprintf(fp, "): ");
1039}
1040
1041/**
1042 * filename.ptr: The filename char pointer that will be vfs_getname'd
1043 * filename.entry_str_pos: Where to insert the string translated from
1044 * filename.ptr by the vfs_getname tracepoint/kprobe.
1045 * ret_scnprintf: syscall args may set this to a different syscall return
1046 * formatter, for instance, fcntl may return fds, file flags, etc.
1047 */
1048struct thread_trace {
1049 u64 entry_time;
1050 bool entry_pending;
1051 unsigned long nr_events;
1052 unsigned long pfmaj, pfmin;
1053 char *entry_str;
1054 double runtime_ms;
1055 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1056 struct {
1057 unsigned long ptr;
1058 short int entry_str_pos;
1059 bool pending_open;
1060 unsigned int namelen;
1061 char *name;
1062 } filename;
1063 struct {
1064 int max;
1065 struct file *table;
1066 } files;
1067
1068 struct intlist *syscall_stats;
1069};
1070
1071static struct thread_trace *thread_trace__new(void)
1072{
1073 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1074
1075 if (ttrace) {
1076 ttrace->files.max = -1;
1077 ttrace->syscall_stats = intlist__new(NULL);
1078 }
1079
1080 return ttrace;
1081}
1082
1083static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1084{
1085 struct thread_trace *ttrace;
1086
1087 if (thread == NULL)
1088 goto fail;
1089
1090 if (thread__priv(thread) == NULL)
1091 thread__set_priv(thread, thread_trace__new());
1092
1093 if (thread__priv(thread) == NULL)
1094 goto fail;
1095
1096 ttrace = thread__priv(thread);
1097 ++ttrace->nr_events;
1098
1099 return ttrace;
1100fail:
1101 color_fprintf(fp, PERF_COLOR_RED,
1102 "WARNING: not enough memory, dropping samples!\n");
1103 return NULL;
1104}
1105
1106
1107void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1108 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1109{
1110 struct thread_trace *ttrace = thread__priv(arg->thread);
1111
1112 ttrace->ret_scnprintf = ret_scnprintf;
1113}
1114
/* Bit flags, one bit each so they can be OR'ed together. */
#define TRACE_PFMAJ		0x1
#define TRACE_PFMIN		0x2

/* Size in bytes used for the per thread entry string buffer. */
static const size_t trace__entry_str_size = 2048;
1119
1120static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1121{
1122 if (fd < 0)
1123 return NULL;
1124
1125 if (fd > ttrace->files.max) {
1126 struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1127
1128 if (nfiles == NULL)
1129 return NULL;
1130
1131 if (ttrace->files.max != -1) {
1132 memset(nfiles + ttrace->files.max + 1, 0,
1133 (fd - ttrace->files.max) * sizeof(struct file));
1134 } else {
1135 memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1136 }
1137
1138 ttrace->files.table = nfiles;
1139 ttrace->files.max = fd;
1140 }
1141
1142 return ttrace->files.table + fd;
1143}
1144
/*
 * Public wrapper: fetch the fd table slot for 'fd' from the tracing state
 * attached to 'thread'. See thread_trace__files_entry() for the semantics.
 */
struct file *thread__files_entry(struct thread *thread, int fd)
{
	struct thread_trace *ttrace = thread__priv(thread);

	return thread_trace__files_entry(ttrace, fd);
}
1149
1150static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1151{
1152 struct thread_trace *ttrace = thread__priv(thread);
1153 struct file *file = thread_trace__files_entry(ttrace, fd);
1154
1155 if (file != NULL) {
1156 struct stat st;
1157 if (stat(pathname, &st) == 0)
1158 file->dev_maj = major(st.st_rdev);
1159 file->pathname = strdup(pathname);
1160 if (file->pathname)
1161 return 0;
1162 }
1163
1164 return -1;
1165}
1166
1167static int thread__read_fd_path(struct thread *thread, int fd)
1168{
1169 char linkname[PATH_MAX], pathname[PATH_MAX];
1170 struct stat st;
1171 int ret;
1172
1173 if (thread->pid_ == thread->tid) {
1174 scnprintf(linkname, sizeof(linkname),
1175 "/proc/%d/fd/%d", thread->pid_, fd);
1176 } else {
1177 scnprintf(linkname, sizeof(linkname),
1178 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1179 }
1180
1181 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1182 return -1;
1183
1184 ret = readlink(linkname, pathname, sizeof(pathname));
1185
1186 if (ret < 0 || ret > st.st_size)
1187 return -1;
1188
1189 pathname[ret] = '\0';
1190 return trace__set_fd_pathname(thread, fd, pathname);
1191}
1192
1193static const char *thread__fd_path(struct thread *thread, int fd,
1194 struct trace *trace)
1195{
1196 struct thread_trace *ttrace = thread__priv(thread);
1197
1198 if (ttrace == NULL || trace->fd_path_disabled)
1199 return NULL;
1200
1201 if (fd < 0)
1202 return NULL;
1203
1204 if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1205 if (!trace->live)
1206 return NULL;
1207 ++trace->stats.proc_getname;
1208 if (thread__read_fd_path(thread, fd))
1209 return NULL;
1210 }
1211
1212 return ttrace->files.table[fd].pathname;
1213}
1214
1215size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1216{
1217 int fd = arg->val;
1218 size_t printed = scnprintf(bf, size, "%d", fd);
1219 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1220
1221 if (path)
1222 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1223
1224 return printed;
1225}
1226
1227size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1228{
1229 size_t printed = scnprintf(bf, size, "%d", fd);
1230 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1231
1232 if (thread) {
1233 const char *path = thread__fd_path(thread, fd, trace);
1234
1235 if (path)
1236 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1237
1238 thread__put(thread);
1239 }
1240
1241 return printed;
1242}
1243
1244static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1245 struct syscall_arg *arg)
1246{
1247 int fd = arg->val;
1248 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1249 struct thread_trace *ttrace = thread__priv(arg->thread);
1250
1251 if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1252 zfree(&ttrace->files.table[fd].pathname);
1253
1254 return printed;
1255}
1256
1257static void thread__set_filename_pos(struct thread *thread, const char *bf,
1258 unsigned long ptr)
1259{
1260 struct thread_trace *ttrace = thread__priv(thread);
1261
1262 ttrace->filename.ptr = ptr;
1263 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1264}
1265
1266static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1267{
1268 struct augmented_arg *augmented_arg = arg->augmented.args;
1269 size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1270 /*
1271 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1272 * we would have two strings, each prefixed by its size.
1273 */
1274 int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1275
1276 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1277 arg->augmented.size -= consumed;
1278
1279 return printed;
1280}
1281
1282static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1283 struct syscall_arg *arg)
1284{
1285 unsigned long ptr = arg->val;
1286
1287 if (arg->augmented.args)
1288 return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1289
1290 if (!arg->trace->vfs_getname)
1291 return scnprintf(bf, size, "%#x", ptr);
1292
1293 thread__set_filename_pos(arg->thread, bf, ptr);
1294 return 0;
1295}
1296
1297static bool trace__filter_duration(struct trace *trace, double t)
1298{
1299 return t < (trace->duration_filter * NSEC_PER_MSEC);
1300}
1301
1302static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1303{
1304 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1305
1306 return fprintf(fp, "%10.3f ", ts);
1307}
1308
1309/*
1310 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1311 * using ttrace->entry_time for a thread that receives a sys_exit without
1312 * first having received a sys_enter ("poll" issued before tracing session
1313 * starts, lost sys_enter exit due to ring buffer overflow).
1314 */
1315static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1316{
1317 if (tstamp > 0)
1318 return __trace__fprintf_tstamp(trace, tstamp, fp);
1319
1320 return fprintf(fp, " ? ");
1321}
1322
/*
 * Flags set from sig_handler() and read by the rest of the tool.
 *
 * They are written from a signal handler, so they are volatile sig_atomic_t:
 * that is the only object type the C standard guarantees can be safely
 * assigned in an asynchronous handler, and volatile keeps the compiler from
 * caching the value in loops that poll it.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination, and remember whether it was SIGINT
 * that asked for it.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1331
1332static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1333{
1334 size_t printed = 0;
1335
1336 if (trace->multiple_threads) {
1337 if (trace->show_comm)
1338 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1339 printed += fprintf(fp, "%d ", thread->tid);
1340 }
1341
1342 return printed;
1343}
1344
1345static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1346 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1347{
1348 size_t printed = 0;
1349
1350 if (trace->show_tstamp)
1351 printed = trace__fprintf_tstamp(trace, tstamp, fp);
1352 if (trace->show_duration)
1353 printed += fprintf_duration(duration, duration_calculated, fp);
1354 return printed + trace__fprintf_comm_tid(trace, thread, fp);
1355}
1356
1357static int trace__process_event(struct trace *trace, struct machine *machine,
1358 union perf_event *event, struct perf_sample *sample)
1359{
1360 int ret = 0;
1361
1362 switch (event->header.type) {
1363 case PERF_RECORD_LOST:
1364 color_fprintf(trace->output, PERF_COLOR_RED,
1365 "LOST %" PRIu64 " events!\n", event->lost.lost);
1366 ret = machine__process_lost_event(machine, event, sample);
1367 break;
1368 default:
1369 ret = machine__process_event(machine, event, sample);
1370 break;
1371 }
1372
1373 return ret;
1374}
1375
1376static int trace__tool_process(struct perf_tool *tool,
1377 union perf_event *event,
1378 struct perf_sample *sample,
1379 struct machine *machine)
1380{
1381 struct trace *trace = container_of(tool, struct trace, tool);
1382 return trace__process_event(trace, machine, event, sample);
1383}
1384
1385static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1386{
1387 struct machine *machine = vmachine;
1388
1389 if (machine->kptr_restrict_warned)
1390 return NULL;
1391
1392 if (symbol_conf.kptr_restrict) {
1393 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1394 "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1395 "Kernel samples will not be resolved.\n");
1396 machine->kptr_restrict_warned = true;
1397 return NULL;
1398 }
1399
1400 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1401}
1402
1403static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
1404{
1405 int err = symbol__init(NULL);
1406
1407 if (err)
1408 return err;
1409
1410 trace->host = machine__new_host();
1411 if (trace->host == NULL)
1412 return -ENOMEM;
1413
1414 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1415 if (err < 0)
1416 goto out;
1417
1418 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1419 evlist->core.threads, trace__tool_process, false,
1420 1);
1421out:
1422 if (err)
1423 symbol__exit();
1424
1425 return err;
1426}
1427
1428static void trace__symbols__exit(struct trace *trace)
1429{
1430 machine__exit(trace->host);
1431 trace->host = NULL;
1432
1433 symbol__exit();
1434}
1435
1436static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1437{
1438 int idx;
1439
1440 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1441 nr_args = sc->fmt->nr_args;
1442
1443 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1444 if (sc->arg_fmt == NULL)
1445 return -1;
1446
1447 for (idx = 0; idx < nr_args; ++idx) {
1448 if (sc->fmt)
1449 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1450 }
1451
1452 sc->nr_args = nr_args;
1453 return 0;
1454}
1455
1456static int syscall__set_arg_fmts(struct syscall *sc)
1457{
1458 struct tep_format_field *field, *last_field = NULL;
1459 int idx = 0, len;
1460
1461 for (field = sc->args; field; field = field->next, ++idx) {
1462 last_field = field;
1463
1464 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1465 continue;
1466
1467 len = strlen(field->name);
1468
1469 if (strcmp(field->type, "const char *") == 0 &&
1470 ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
1471 strstr(field->name, "path") != NULL))
1472 sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1473 else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
1474 sc->arg_fmt[idx].scnprintf = SCA_PTR;
1475 else if (strcmp(field->type, "pid_t") == 0)
1476 sc->arg_fmt[idx].scnprintf = SCA_PID;
1477 else if (strcmp(field->type, "umode_t") == 0)
1478 sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1479 else if ((strcmp(field->type, "int") == 0 ||
1480 strcmp(field->type, "unsigned int") == 0 ||
1481 strcmp(field->type, "long") == 0) &&
1482 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
1483 /*
1484 * /sys/kernel/tracing/events/syscalls/sys_enter*
1485 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1486 * 65 int
1487 * 23 unsigned int
1488 * 7 unsigned long
1489 */
1490 sc->arg_fmt[idx].scnprintf = SCA_FD;
1491 }
1492 }
1493
1494 if (last_field)
1495 sc->args_size = last_field->offset + last_field->size;
1496
1497 return 0;
1498}
1499
1500static int trace__read_syscall_info(struct trace *trace, int id)
1501{
1502 char tp_name[128];
1503 struct syscall *sc;
1504 const char *name = syscalltbl__name(trace->sctbl, id);
1505
1506 if (trace->syscalls.table == NULL) {
1507 trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
1508 if (trace->syscalls.table == NULL)
1509 return -ENOMEM;
1510 }
1511
1512 sc = trace->syscalls.table + id;
1513 if (sc->nonexistent)
1514 return 0;
1515
1516 if (name == NULL) {
1517 sc->nonexistent = true;
1518 return 0;
1519 }
1520
1521 sc->name = name;
1522 sc->fmt = syscall_fmt__find(sc->name);
1523
1524 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1525 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1526
1527 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1528 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1529 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1530 }
1531
1532 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1533 return -ENOMEM;
1534
1535 if (IS_ERR(sc->tp_format))
1536 return PTR_ERR(sc->tp_format);
1537
1538 sc->args = sc->tp_format->format.fields;
1539 /*
1540 * We need to check and discard the first variable '__syscall_nr'
1541 * or 'nr' that mean the syscall number. It is needless here.
1542 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1543 */
1544 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1545 sc->args = sc->args->next;
1546 --sc->nr_args;
1547 }
1548
1549 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1550 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1551
1552 return syscall__set_arg_fmts(sc);
1553}
1554
/*
 * qsort()/bsearch() comparator for arrays of int.
 *
 * Returns a negative, zero or positive value when *a is less than, equal to
 * or greater than *b. The result is derived from comparisons rather than
 * from '*a - *b', because the subtraction overflows (undefined behaviour)
 * when the operands are far apart, e.g. INT_MAX and a negative value.
 */
static int intcmp(const void *a, const void *b)
{
	const int one = *(const int *)a, another = *(const int *)b;

	return (one > another) - (one < another);
}
1561
1562static int trace__validate_ev_qualifier(struct trace *trace)
1563{
1564 int err = 0;
1565 bool printed_invalid_prefix = false;
1566 struct str_node *pos;
1567 size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
1568
1569 trace->ev_qualifier_ids.entries = malloc(nr_allocated *
1570 sizeof(trace->ev_qualifier_ids.entries[0]));
1571
1572 if (trace->ev_qualifier_ids.entries == NULL) {
1573 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1574 trace->output);
1575 err = -EINVAL;
1576 goto out;
1577 }
1578
1579 strlist__for_each_entry(pos, trace->ev_qualifier) {
1580 const char *sc = pos->s;
1581 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1582
1583 if (id < 0) {
1584 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1585 if (id >= 0)
1586 goto matches;
1587
1588 if (!printed_invalid_prefix) {
1589 pr_debug("Skipping unknown syscalls: ");
1590 printed_invalid_prefix = true;
1591 } else {
1592 pr_debug(", ");
1593 }
1594
1595 pr_debug("%s", sc);
1596 continue;
1597 }
1598matches:
1599 trace->ev_qualifier_ids.entries[nr_used++] = id;
1600 if (match_next == -1)
1601 continue;
1602
1603 while (1) {
1604 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1605 if (id < 0)
1606 break;
1607 if (nr_allocated == nr_used) {
1608 void *entries;
1609
1610 nr_allocated += 8;
1611 entries = realloc(trace->ev_qualifier_ids.entries,
1612 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1613 if (entries == NULL) {
1614 err = -ENOMEM;
1615 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1616 goto out_free;
1617 }
1618 trace->ev_qualifier_ids.entries = entries;
1619 }
1620 trace->ev_qualifier_ids.entries[nr_used++] = id;
1621 }
1622 }
1623
1624 trace->ev_qualifier_ids.nr = nr_used;
1625 qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
1626out:
1627 if (printed_invalid_prefix)
1628 pr_debug("\n");
1629 return err;
1630out_free:
1631 zfree(&trace->ev_qualifier_ids.entries);
1632 trace->ev_qualifier_ids.nr = 0;
1633 goto out;
1634}
1635
1636static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
1637{
1638 bool in_ev_qualifier;
1639
1640 if (trace->ev_qualifier_ids.nr == 0)
1641 return true;
1642
1643 in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
1644 trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
1645
1646 if (in_ev_qualifier)
1647 return !trace->not_ev_qualifier;
1648
1649 return trace->not_ev_qualifier;
1650}
1651
1652/*
1653 * args is to be interpreted as a series of longs but we need to handle
1654 * 8-byte unaligned accesses. args points to raw_data within the event
1655 * and raw_data is guaranteed to be 8-byte unaligned because it is
1656 * preceded by raw_size which is a u32. So we need to copy args to a temp
1657 * variable to read it. Most notably this avoids extended load instructions
1658 * on unaligned addresses
1659 */
1660unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1661{
1662 unsigned long val;
1663 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1664
1665 memcpy(&val, p, sizeof(val));
1666 return val;
1667}
1668
1669static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1670 struct syscall_arg *arg)
1671{
1672 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1673 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1674
1675 return scnprintf(bf, size, "arg%d: ", arg->idx);
1676}
1677
1678/*
1679 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
1680 * as mount 'flags' argument that needs ignoring some magic flag, see comment
1681 * in tools/perf/trace/beauty/mount_flags.c
1682 */
1683static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
1684{
1685 if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
1686 return sc->arg_fmt[arg->idx].mask_val(arg, val);
1687
1688 return val;
1689}
1690
1691static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1692 struct syscall_arg *arg, unsigned long val)
1693{
1694 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1695 arg->val = val;
1696 if (sc->arg_fmt[arg->idx].parm)
1697 arg->parm = sc->arg_fmt[arg->idx].parm;
1698 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1699 }
1700 return scnprintf(bf, size, "%ld", val);
1701}
1702
1703static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1704 unsigned char *args, void *augmented_args, int augmented_args_size,
1705 struct trace *trace, struct thread *thread)
1706{
1707 size_t printed = 0;
1708 unsigned long val;
1709 u8 bit = 1;
1710 struct syscall_arg arg = {
1711 .args = args,
1712 .augmented = {
1713 .size = augmented_args_size,
1714 .args = augmented_args,
1715 },
1716 .idx = 0,
1717 .mask = 0,
1718 .trace = trace,
1719 .thread = thread,
1720 .show_string_prefix = trace->show_string_prefix,
1721 };
1722 struct thread_trace *ttrace = thread__priv(thread);
1723
1724 /*
1725 * Things like fcntl will set this in its 'cmd' formatter to pick the
1726 * right formatter for the return value (an fd? file flags?), which is
1727 * not needed for syscalls that always return a given type, say an fd.
1728 */
1729 ttrace->ret_scnprintf = NULL;
1730
1731 if (sc->args != NULL) {
1732 struct tep_format_field *field;
1733
1734 for (field = sc->args; field;
1735 field = field->next, ++arg.idx, bit <<= 1) {
1736 if (arg.mask & bit)
1737 continue;
1738
1739 val = syscall_arg__val(&arg, arg.idx);
1740 /*
1741 * Some syscall args need some mask, most don't and
1742 * return val untouched.
1743 */
1744 val = syscall__mask_val(sc, &arg, val);
1745
1746 /*
1747 * Suppress this argument if its value is zero and
1748 * and we don't have a string associated in an
1749 * strarray for it.
1750 */
1751 if (val == 0 &&
1752 !trace->show_zeros &&
1753 !(sc->arg_fmt &&
1754 (sc->arg_fmt[arg.idx].show_zero ||
1755 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1756 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1757 sc->arg_fmt[arg.idx].parm))
1758 continue;
1759
1760 printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
1761
1762 if (trace->show_arg_names)
1763 printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
1764
1765 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1766 }
1767 } else if (IS_ERR(sc->tp_format)) {
1768 /*
1769 * If we managed to read the tracepoint /format file, then we
1770 * may end up not having any args, like with gettid(), so only
1771 * print the raw args when we didn't manage to read it.
1772 */
1773 while (arg.idx < sc->nr_args) {
1774 if (arg.mask & bit)
1775 goto next_arg;
1776 val = syscall_arg__val(&arg, arg.idx);
1777 if (printed)
1778 printed += scnprintf(bf + printed, size - printed, ", ");
1779 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1780 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1781next_arg:
1782 ++arg.idx;
1783 bit <<= 1;
1784 }
1785 }
1786
1787 return printed;
1788}
1789
/*
 * Signature of the per tracepoint sample handlers: they receive the session,
 * the evsel the sample came from, the raw event and the parsed sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1793
1794static struct syscall *trace__syscall_info(struct trace *trace,
1795 struct evsel *evsel, int id)
1796{
1797 int err = 0;
1798
1799 if (id < 0) {
1800
1801 /*
1802 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1803 * before that, leaving at a higher verbosity level till that is
1804 * explained. Reproduced with plain ftrace with:
1805 *
1806 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1807 * grep "NR -1 " /t/trace_pipe
1808 *
1809 * After generating some load on the machine.
1810 */
1811 if (verbose > 1) {
1812 static u64 n;
1813 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1814 id, perf_evsel__name(evsel), ++n);
1815 }
1816 return NULL;
1817 }
1818
1819 err = -EINVAL;
1820
1821 if (id > trace->sctbl->syscalls.max_id)
1822 goto out_cant_read;
1823
1824 if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
1825 (err = trace__read_syscall_info(trace, id)) != 0)
1826 goto out_cant_read;
1827
1828 if (trace->syscalls.table[id].name == NULL) {
1829 if (trace->syscalls.table[id].nonexistent)
1830 return NULL;
1831 goto out_cant_read;
1832 }
1833
1834 return &trace->syscalls.table[id];
1835
1836out_cant_read:
1837 if (verbose > 0) {
1838 char sbuf[STRERR_BUFSIZE];
1839 fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
1840 if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
1841 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1842 fputs(" information\n", trace->output);
1843 }
1844 return NULL;
1845}
1846
1847static void thread__update_stats(struct thread_trace *ttrace,
1848 int id, struct perf_sample *sample)
1849{
1850 struct int_node *inode;
1851 struct stats *stats;
1852 u64 duration = 0;
1853
1854 inode = intlist__findnew(ttrace->syscall_stats, id);
1855 if (inode == NULL)
1856 return;
1857
1858 stats = inode->priv;
1859 if (stats == NULL) {
1860 stats = malloc(sizeof(struct stats));
1861 if (stats == NULL)
1862 return;
1863 init_stats(stats);
1864 inode->priv = stats;
1865 }
1866
1867 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1868 duration = sample->time - ttrace->entry_time;
1869
1870 update_stats(stats, duration);
1871}
1872
1873static int trace__printf_interrupted_entry(struct trace *trace)
1874{
1875 struct thread_trace *ttrace;
1876 size_t printed;
1877 int len;
1878
1879 if (trace->failure_only || trace->current == NULL)
1880 return 0;
1881
1882 ttrace = thread__priv(trace->current);
1883
1884 if (!ttrace->entry_pending)
1885 return 0;
1886
1887 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1888 printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
1889
1890 if (len < trace->args_alignment - 4)
1891 printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
1892
1893 printed += fprintf(trace->output, " ...\n");
1894
1895 ttrace->entry_pending = false;
1896 ++trace->nr_events_printed;
1897
1898 return printed;
1899}
1900
1901static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
1902 struct perf_sample *sample, struct thread *thread)
1903{
1904 int printed = 0;
1905
1906 if (trace->print_sample) {
1907 double ts = (double)sample->time / NSEC_PER_MSEC;
1908
1909 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1910 perf_evsel__name(evsel), ts,
1911 thread__comm_str(thread),
1912 sample->pid, sample->tid, sample->cpu);
1913 }
1914
1915 return printed;
1916}
1917
1918static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
1919{
1920 void *augmented_args = NULL;
1921 /*
1922 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
1923 * and there we get all 6 syscall args plus the tracepoint common fields
1924 * that gets calculated at the start and the syscall_nr (another long).
1925 * So we check if that is the case and if so don't look after the
1926 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
1927 * which is fixed.
1928 *
1929 * We'll revisit this later to pass s->args_size to the BPF augmenter
1930 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
1931 * copies only what we need for each syscall, like what happens when we
1932 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
1933 * traffic to just what is needed for each syscall.
1934 */
1935 int args_size = raw_augmented_args_size ?: sc->args_size;
1936
1937 *augmented_args_size = sample->raw_size - args_size;
1938 if (*augmented_args_size > 0)
1939 augmented_args = sample->raw_data + args_size;
1940
1941 return augmented_args;
1942}
1943
1944static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
1945 union perf_event *event __maybe_unused,
1946 struct perf_sample *sample)
1947{
1948 char *msg;
1949 void *args;
1950 int printed = 0;
1951 struct thread *thread;
1952 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1953 int augmented_args_size = 0;
1954 void *augmented_args = NULL;
1955 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1956 struct thread_trace *ttrace;
1957
1958 if (sc == NULL)
1959 return -1;
1960
1961 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1962 ttrace = thread__trace(thread, trace->output);
1963 if (ttrace == NULL)
1964 goto out_put;
1965
1966 trace__fprintf_sample(trace, evsel, sample, thread);
1967
1968 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1969
1970 if (ttrace->entry_str == NULL) {
1971 ttrace->entry_str = malloc(trace__entry_str_size);
1972 if (!ttrace->entry_str)
1973 goto out_put;
1974 }
1975
1976 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1977 trace__printf_interrupted_entry(trace);
1978 /*
1979 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
1980 * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
1981 * this breaks syscall__augmented_args() check for augmented args, as we calculate
1982 * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
1983 * so when handling, say the openat syscall, we end up getting 6 args for the
1984 * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
1985 * thinking that the extra 2 u64 args are the augmented filename, so just check
1986 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
1987 */
1988 if (evsel != trace->syscalls.events.sys_enter)
1989 augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1990 ttrace->entry_time = sample->time;
1991 msg = ttrace->entry_str;
1992 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1993
1994 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1995 args, augmented_args, augmented_args_size, trace, thread);
1996
1997 if (sc->is_exit) {
1998 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1999 int alignment = 0;
2000
2001 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
2002 printed = fprintf(trace->output, "%s)", ttrace->entry_str);
2003 if (trace->args_alignment > printed)
2004 alignment = trace->args_alignment - printed;
2005 fprintf(trace->output, "%*s= ?\n", alignment, " ");
2006 }
2007 } else {
2008 ttrace->entry_pending = true;
2009 /* See trace__vfs_getname & trace__sys_exit */
2010 ttrace->filename.pending_open = false;
2011 }
2012
2013 if (trace->current != thread) {
2014 thread__put(trace->current);
2015 trace->current = thread__get(thread);
2016 }
2017 err = 0;
2018out_put:
2019 thread__put(thread);
2020 return err;
2021}
2022
2023static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
2024 struct perf_sample *sample)
2025{
2026 struct thread_trace *ttrace;
2027 struct thread *thread;
2028 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2029 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2030 char msg[1024];
2031 void *args, *augmented_args = NULL;
2032 int augmented_args_size;
2033
2034 if (sc == NULL)
2035 return -1;
2036
2037 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2038 ttrace = thread__trace(thread, trace->output);
2039 /*
2040 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
2041 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
2042 */
2043 if (ttrace == NULL)
2044 goto out_put;
2045
2046 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
2047 augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
2048 syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
2049 fprintf(trace->output, "%s", msg);
2050 err = 0;
2051out_put:
2052 thread__put(thread);
2053 return err;
2054}
2055
2056static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
2057 struct perf_sample *sample,
2058 struct callchain_cursor *cursor)
2059{
2060 struct addr_location al;
2061 int max_stack = evsel->core.attr.sample_max_stack ?
2062 evsel->core.attr.sample_max_stack :
2063 trace->max_stack;
2064 int err;
2065
2066 if (machine__resolve(trace->host, &al, sample) < 0)
2067 return -1;
2068
2069 err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2070 addr_location__put(&al);
2071 return err;
2072}
2073
2074static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2075{
2076 /* TODO: user-configurable print_opts */
2077 const unsigned int print_opts = EVSEL__PRINT_SYM |
2078 EVSEL__PRINT_DSO |
2079 EVSEL__PRINT_UNKNOWN_AS_ADDR;
2080
2081 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output);
2082}
2083
/*
 * Translate errno value 'err' into its symbolic name, using the
 * architecture recorded in the environment associated with 'evsel'.
 */
static const char *errno_to_name(struct evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
2091
2092static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
2093 union perf_event *event __maybe_unused,
2094 struct perf_sample *sample)
2095{
2096 long ret;
2097 u64 duration = 0;
2098 bool duration_calculated = false;
2099 struct thread *thread;
2100 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
2101 int alignment = trace->args_alignment;
2102 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2103 struct thread_trace *ttrace;
2104
2105 if (sc == NULL)
2106 return -1;
2107
2108 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2109 ttrace = thread__trace(thread, trace->output);
2110 if (ttrace == NULL)
2111 goto out_put;
2112
2113 trace__fprintf_sample(trace, evsel, sample, thread);
2114
2115 if (trace->summary)
2116 thread__update_stats(ttrace, id, sample);
2117
2118 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2119
2120 if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2121 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2122 ttrace->filename.pending_open = false;
2123 ++trace->stats.vfs_getname;
2124 }
2125
2126 if (ttrace->entry_time) {
2127 duration = sample->time - ttrace->entry_time;
2128 if (trace__filter_duration(trace, duration))
2129 goto out;
2130 duration_calculated = true;
2131 } else if (trace->duration_filter)
2132 goto out;
2133
2134 if (sample->callchain) {
2135 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2136 if (callchain_ret == 0) {
2137 if (callchain_cursor.nr < trace->min_stack)
2138 goto out;
2139 callchain_ret = 1;
2140 }
2141 }
2142
2143 if (trace->summary_only || (ret >= 0 && trace->failure_only))
2144 goto out;
2145
2146 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2147
2148 if (ttrace->entry_pending) {
2149 printed = fprintf(trace->output, "%s", ttrace->entry_str);
2150 } else {
2151 printed += fprintf(trace->output, " ... [");
2152 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2153 printed += 9;
2154 printed += fprintf(trace->output, "]: %s()", sc->name);
2155 }
2156
2157 printed++; /* the closing ')' */
2158
2159 if (alignment > printed)
2160 alignment -= printed;
2161 else
2162 alignment = 0;
2163
2164 fprintf(trace->output, ")%*s= ", alignment, " ");
2165
2166 if (sc->fmt == NULL) {
2167 if (ret < 0)
2168 goto errno_print;
2169signed_print:
2170 fprintf(trace->output, "%ld", ret);
2171 } else if (ret < 0) {
2172errno_print: {
2173 char bf[STRERR_BUFSIZE];
2174 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2175 *e = errno_to_name(evsel, -ret);
2176
2177 fprintf(trace->output, "-1 %s (%s)", e, emsg);
2178 }
2179 } else if (ret == 0 && sc->fmt->timeout)
2180 fprintf(trace->output, "0 (Timeout)");
2181 else if (ttrace->ret_scnprintf) {
2182 char bf[1024];
2183 struct syscall_arg arg = {
2184 .val = ret,
2185 .thread = thread,
2186 .trace = trace,
2187 };
2188 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2189 ttrace->ret_scnprintf = NULL;
2190 fprintf(trace->output, "%s", bf);
2191 } else if (sc->fmt->hexret)
2192 fprintf(trace->output, "%#lx", ret);
2193 else if (sc->fmt->errpid) {
2194 struct thread *child = machine__find_thread(trace->host, ret, ret);
2195
2196 if (child != NULL) {
2197 fprintf(trace->output, "%ld", ret);
2198 if (child->comm_set)
2199 fprintf(trace->output, " (%s)", thread__comm_str(child));
2200 thread__put(child);
2201 }
2202 } else
2203 goto signed_print;
2204
2205 fputc('\n', trace->output);
2206
2207 /*
2208 * We only consider an 'event' for the sake of --max-events a non-filtered
2209 * sys_enter + sys_exit and other tracepoint events.
2210 */
2211 if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
2212 interrupted = true;
2213
2214 if (callchain_ret > 0)
2215 trace__fprintf_callchain(trace, sample);
2216 else if (callchain_ret < 0)
2217 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2218out:
2219 ttrace->entry_pending = false;
2220 err = 0;
2221out_put:
2222 thread__put(thread);
2223 return err;
2224}
2225
2226static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
2227 union perf_event *event __maybe_unused,
2228 struct perf_sample *sample)
2229{
2230 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2231 struct thread_trace *ttrace;
2232 size_t filename_len, entry_str_len, to_move;
2233 ssize_t remaining_space;
2234 char *pos;
2235 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2236
2237 if (!thread)
2238 goto out;
2239
2240 ttrace = thread__priv(thread);
2241 if (!ttrace)
2242 goto out_put;
2243
2244 filename_len = strlen(filename);
2245 if (filename_len == 0)
2246 goto out_put;
2247
2248 if (ttrace->filename.namelen < filename_len) {
2249 char *f = realloc(ttrace->filename.name, filename_len + 1);
2250
2251 if (f == NULL)
2252 goto out_put;
2253
2254 ttrace->filename.namelen = filename_len;
2255 ttrace->filename.name = f;
2256 }
2257
2258 strcpy(ttrace->filename.name, filename);
2259 ttrace->filename.pending_open = true;
2260
2261 if (!ttrace->filename.ptr)
2262 goto out_put;
2263
2264 entry_str_len = strlen(ttrace->entry_str);
2265 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2266 if (remaining_space <= 0)
2267 goto out_put;
2268
2269 if (filename_len > (size_t)remaining_space) {
2270 filename += filename_len - remaining_space;
2271 filename_len = remaining_space;
2272 }
2273
2274 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2275 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2276 memmove(pos + filename_len, pos, to_move);
2277 memcpy(pos, filename, filename_len);
2278
2279 ttrace->filename.ptr = 0;
2280 ttrace->filename.entry_str_pos = 0;
2281out_put:
2282 thread__put(thread);
2283out:
2284 return 0;
2285}
2286
2287static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
2288 union perf_event *event __maybe_unused,
2289 struct perf_sample *sample)
2290{
2291 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2292 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2293 struct thread *thread = machine__findnew_thread(trace->host,
2294 sample->pid,
2295 sample->tid);
2296 struct thread_trace *ttrace = thread__trace(thread, trace->output);
2297
2298 if (ttrace == NULL)
2299 goto out_dump;
2300
2301 ttrace->runtime_ms += runtime_ms;
2302 trace->runtime_ms += runtime_ms;
2303out_put:
2304 thread__put(thread);
2305 return 0;
2306
2307out_dump:
2308 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2309 evsel->name,
2310 perf_evsel__strval(evsel, sample, "comm"),
2311 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2312 runtime,
2313 perf_evsel__intval(evsel, sample, "vruntime"));
2314 goto out_put;
2315}
2316
2317static int bpf_output__printer(enum binary_printer_ops op,
2318 unsigned int val, void *extra __maybe_unused, FILE *fp)
2319{
2320 unsigned char ch = (unsigned char)val;
2321
2322 switch (op) {
2323 case BINARY_PRINT_CHAR_DATA:
2324 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
2325 case BINARY_PRINT_DATA_BEGIN:
2326 case BINARY_PRINT_LINE_BEGIN:
2327 case BINARY_PRINT_ADDR:
2328 case BINARY_PRINT_NUM_DATA:
2329 case BINARY_PRINT_NUM_PAD:
2330 case BINARY_PRINT_SEP:
2331 case BINARY_PRINT_CHAR_PAD:
2332 case BINARY_PRINT_LINE_END:
2333 case BINARY_PRINT_DATA_END:
2334 default:
2335 break;
2336 }
2337
2338 return 0;
2339}
2340
2341static void bpf_output__fprintf(struct trace *trace,
2342 struct perf_sample *sample)
2343{
2344 binary__fprintf(sample->raw_data, sample->raw_size, 8,
2345 bpf_output__printer, NULL, trace->output);
2346 ++trace->nr_events_printed;
2347}
2348
2349static int trace__event_handler(struct trace *trace, struct evsel *evsel,
2350 union perf_event *event __maybe_unused,
2351 struct perf_sample *sample)
2352{
2353 struct thread *thread;
2354 int callchain_ret = 0;
2355 /*
2356 * Check if we called perf_evsel__disable(evsel) due to, for instance,
2357 * this event's max_events having been hit and this is an entry coming
2358 * from the ring buffer that we should discard, since the max events
2359 * have already been considered/printed.
2360 */
2361 if (evsel->disabled)
2362 return 0;
2363
2364 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2365
2366 if (sample->callchain) {
2367 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2368 if (callchain_ret == 0) {
2369 if (callchain_cursor.nr < trace->min_stack)
2370 goto out;
2371 callchain_ret = 1;
2372 }
2373 }
2374
2375 trace__printf_interrupted_entry(trace);
2376 trace__fprintf_tstamp(trace, sample->time, trace->output);
2377
2378 if (trace->trace_syscalls && trace->show_duration)
2379 fprintf(trace->output, "( ): ");
2380
2381 if (thread)
2382 trace__fprintf_comm_tid(trace, thread, trace->output);
2383
2384 if (evsel == trace->syscalls.events.augmented) {
2385 int id = perf_evsel__sc_tp_uint(evsel, id, sample);
2386 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2387
2388 if (sc) {
2389 fprintf(trace->output, "%s(", sc->name);
2390 trace__fprintf_sys_enter(trace, evsel, sample);
2391 fputc(')', trace->output);
2392 goto newline;
2393 }
2394
2395 /*
2396 * XXX: Not having the associated syscall info or not finding/adding
2397 * the thread should never happen, but if it does...
2398 * fall thru and print it as a bpf_output event.
2399 */
2400 }
2401
2402 fprintf(trace->output, "%s:", evsel->name);
2403
2404 if (perf_evsel__is_bpf_output(evsel)) {
2405 bpf_output__fprintf(trace, sample);
2406 } else if (evsel->tp_format) {
2407 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2408 trace__fprintf_sys_enter(trace, evsel, sample)) {
2409 event_format__fprintf(evsel->tp_format, sample->cpu,
2410 sample->raw_data, sample->raw_size,
2411 trace->output);
2412 ++trace->nr_events_printed;
2413
2414 if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
2415 evsel__disable(evsel);
2416 evsel__close(evsel);
2417 }
2418 }
2419 }
2420
2421newline:
2422 fprintf(trace->output, "\n");
2423
2424 if (callchain_ret > 0)
2425 trace__fprintf_callchain(trace, sample);
2426 else if (callchain_ret < 0)
2427 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2428out:
2429 thread__put(thread);
2430 return 0;
2431}
2432
2433static void print_location(FILE *f, struct perf_sample *sample,
2434 struct addr_location *al,
2435 bool print_dso, bool print_sym)
2436{
2437
2438 if ((verbose > 0 || print_dso) && al->map)
2439 fprintf(f, "%s@", al->map->dso->long_name);
2440
2441 if ((verbose > 0 || print_sym) && al->sym)
2442 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2443 al->addr - al->sym->start);
2444 else if (al->map)
2445 fprintf(f, "0x%" PRIx64, al->addr);
2446 else
2447 fprintf(f, "0x%" PRIx64, sample->addr);
2448}
2449
2450static int trace__pgfault(struct trace *trace,
2451 struct evsel *evsel,
2452 union perf_event *event __maybe_unused,
2453 struct perf_sample *sample)
2454{
2455 struct thread *thread;
2456 struct addr_location al;
2457 char map_type = 'd';
2458 struct thread_trace *ttrace;
2459 int err = -1;
2460 int callchain_ret = 0;
2461
2462 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2463
2464 if (sample->callchain) {
2465 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2466 if (callchain_ret == 0) {
2467 if (callchain_cursor.nr < trace->min_stack)
2468 goto out_put;
2469 callchain_ret = 1;
2470 }
2471 }
2472
2473 ttrace = thread__trace(thread, trace->output);
2474 if (ttrace == NULL)
2475 goto out_put;
2476
2477 if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2478 ttrace->pfmaj++;
2479 else
2480 ttrace->pfmin++;
2481
2482 if (trace->summary_only)
2483 goto out;
2484
2485 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2486
2487 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2488
2489 fprintf(trace->output, "%sfault [",
2490 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2491 "maj" : "min");
2492
2493 print_location(trace->output, sample, &al, false, true);
2494
2495 fprintf(trace->output, "] => ");
2496
2497 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2498
2499 if (!al.map) {
2500 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2501
2502 if (al.map)
2503 map_type = 'x';
2504 else
2505 map_type = '?';
2506 }
2507
2508 print_location(trace->output, sample, &al, true, false);
2509
2510 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2511
2512 if (callchain_ret > 0)
2513 trace__fprintf_callchain(trace, sample);
2514 else if (callchain_ret < 0)
2515 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2516
2517 ++trace->nr_events_printed;
2518out:
2519 err = 0;
2520out_put:
2521 thread__put(thread);
2522 return err;
2523}
2524
2525static void trace__set_base_time(struct trace *trace,
2526 struct evsel *evsel,
2527 struct perf_sample *sample)
2528{
2529 /*
2530 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2531 * and don't use sample->time unconditionally, we may end up having
2532 * some other event in the future without PERF_SAMPLE_TIME for good
2533 * reason, i.e. we may not be interested in its timestamps, just in
2534 * it taking place, picking some piece of information when it
2535 * appears in our event stream (vfs_getname comes to mind).
2536 */
2537 if (trace->base_time == 0 && !trace->full_time &&
2538 (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
2539 trace->base_time = sample->time;
2540}
2541
2542static int trace__process_sample(struct perf_tool *tool,
2543 union perf_event *event,
2544 struct perf_sample *sample,
2545 struct evsel *evsel,
2546 struct machine *machine __maybe_unused)
2547{
2548 struct trace *trace = container_of(tool, struct trace, tool);
2549 struct thread *thread;
2550 int err = 0;
2551
2552 tracepoint_handler handler = evsel->handler;
2553
2554 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2555 if (thread && thread__is_filtered(thread))
2556 goto out;
2557
2558 trace__set_base_time(trace, evsel, sample);
2559
2560 if (handler) {
2561 ++trace->nr_events;
2562 handler(trace, evsel, event, sample);
2563 }
2564out:
2565 thread__put(thread);
2566 return err;
2567}
2568
2569static int trace__record(struct trace *trace, int argc, const char **argv)
2570{
2571 unsigned int rec_argc, i, j;
2572 const char **rec_argv;
2573 const char * const record_args[] = {
2574 "record",
2575 "-R",
2576 "-m", "1024",
2577 "-c", "1",
2578 };
2579
2580 const char * const sc_args[] = { "-e", };
2581 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2582 const char * const majpf_args[] = { "-e", "major-faults" };
2583 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2584 const char * const minpf_args[] = { "-e", "minor-faults" };
2585 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2586
2587 /* +1 is for the event string below */
2588 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2589 majpf_args_nr + minpf_args_nr + argc;
2590 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2591
2592 if (rec_argv == NULL)
2593 return -ENOMEM;
2594
2595 j = 0;
2596 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2597 rec_argv[j++] = record_args[i];
2598
2599 if (trace->trace_syscalls) {
2600 for (i = 0; i < sc_args_nr; i++)
2601 rec_argv[j++] = sc_args[i];
2602
2603 /* event string may be different for older kernels - e.g., RHEL6 */
2604 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2605 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2606 else if (is_valid_tracepoint("syscalls:sys_enter"))
2607 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2608 else {
2609 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2610 free(rec_argv);
2611 return -1;
2612 }
2613 }
2614
2615 if (trace->trace_pgfaults & TRACE_PFMAJ)
2616 for (i = 0; i < majpf_args_nr; i++)
2617 rec_argv[j++] = majpf_args[i];
2618
2619 if (trace->trace_pgfaults & TRACE_PFMIN)
2620 for (i = 0; i < minpf_args_nr; i++)
2621 rec_argv[j++] = minpf_args[i];
2622
2623 for (i = 0; i < (unsigned int)argc; i++)
2624 rec_argv[j++] = argv[i];
2625
2626 return cmd_record(j, rec_argv);
2627}
2628
2629static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2630
2631static bool evlist__add_vfs_getname(struct evlist *evlist)
2632{
2633 bool found = false;
2634 struct evsel *evsel, *tmp;
2635 struct parse_events_error err = { .idx = 0, };
2636 int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2637
2638 if (ret)
2639 return false;
2640
2641 evlist__for_each_entry_safe(evlist, evsel, tmp) {
2642 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2643 continue;
2644
2645 if (perf_evsel__field(evsel, "pathname")) {
2646 evsel->handler = trace__vfs_getname;
2647 found = true;
2648 continue;
2649 }
2650
2651 list_del_init(&evsel->core.node);
2652 evsel->evlist = NULL;
2653 evsel__delete(evsel);
2654 }
2655
2656 return found;
2657}
2658
2659static struct evsel *perf_evsel__new_pgfault(u64 config)
2660{
2661 struct evsel *evsel;
2662 struct perf_event_attr attr = {
2663 .type = PERF_TYPE_SOFTWARE,
2664 .mmap_data = 1,
2665 };
2666
2667 attr.config = config;
2668 attr.sample_period = 1;
2669
2670 event_attr_init(&attr);
2671
2672 evsel = evsel__new(&attr);
2673 if (evsel)
2674 evsel->handler = trace__pgfault;
2675
2676 return evsel;
2677}
2678
2679static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2680{
2681 const u32 type = event->header.type;
2682 struct evsel *evsel;
2683
2684 if (type != PERF_RECORD_SAMPLE) {
2685 trace__process_event(trace, trace->host, event, sample);
2686 return;
2687 }
2688
2689 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2690 if (evsel == NULL) {
2691 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2692 return;
2693 }
2694
2695 if (evswitch__discard(&trace->evswitch, evsel))
2696 return;
2697
2698 trace__set_base_time(trace, evsel, sample);
2699
2700 if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
2701 sample->raw_data == NULL) {
2702 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2703 perf_evsel__name(evsel), sample->tid,
2704 sample->cpu, sample->raw_size);
2705 } else {
2706 tracepoint_handler handler = evsel->handler;
2707 handler(trace, evsel, event, sample);
2708 }
2709
2710 if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
2711 interrupted = true;
2712}
2713
2714static int trace__add_syscall_newtp(struct trace *trace)
2715{
2716 int ret = -1;
2717 struct evlist *evlist = trace->evlist;
2718 struct evsel *sys_enter, *sys_exit;
2719
2720 sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2721 if (sys_enter == NULL)
2722 goto out;
2723
2724 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2725 goto out_delete_sys_enter;
2726
2727 sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2728 if (sys_exit == NULL)
2729 goto out_delete_sys_enter;
2730
2731 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2732 goto out_delete_sys_exit;
2733
2734 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2735 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2736
2737 evlist__add(evlist, sys_enter);
2738 evlist__add(evlist, sys_exit);
2739
2740 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2741 /*
2742 * We're interested only in the user space callchain
2743 * leading to the syscall, allow overriding that for
2744 * debugging reasons using --kernel_syscall_callchains
2745 */
2746 sys_exit->core.attr.exclude_callchain_kernel = 1;
2747 }
2748
2749 trace->syscalls.events.sys_enter = sys_enter;
2750 trace->syscalls.events.sys_exit = sys_exit;
2751
2752 ret = 0;
2753out:
2754 return ret;
2755
2756out_delete_sys_exit:
2757 evsel__delete_priv(sys_exit);
2758out_delete_sys_enter:
2759 evsel__delete_priv(sys_enter);
2760 goto out;
2761}
2762
2763static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
2764{
2765 int err = -1;
2766 struct evsel *sys_exit;
2767 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2768 trace->ev_qualifier_ids.nr,
2769 trace->ev_qualifier_ids.entries);
2770
2771 if (filter == NULL)
2772 goto out_enomem;
2773
2774 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2775 filter)) {
2776 sys_exit = trace->syscalls.events.sys_exit;
2777 err = perf_evsel__append_tp_filter(sys_exit, filter);
2778 }
2779
2780 free(filter);
2781out:
2782 return err;
2783out_enomem:
2784 errno = ENOMEM;
2785 goto out;
2786}
2787
2788#ifdef HAVE_LIBBPF_SUPPORT
2789static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
2790{
2791 if (trace->bpf_obj == NULL)
2792 return NULL;
2793
2794 return bpf_object__find_program_by_title(trace->bpf_obj, name);
2795}
2796
2797static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
2798 const char *prog_name, const char *type)
2799{
2800 struct bpf_program *prog;
2801
2802 if (prog_name == NULL) {
2803 char default_prog_name[256];
2804 scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->name);
2805 prog = trace__find_bpf_program_by_title(trace, default_prog_name);
2806 if (prog != NULL)
2807 goto out_found;
2808 if (sc->fmt && sc->fmt->alias) {
2809 scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->fmt->alias);
2810 prog = trace__find_bpf_program_by_title(trace, default_prog_name);
2811 if (prog != NULL)
2812 goto out_found;
2813 }
2814 goto out_unaugmented;
2815 }
2816
2817 prog = trace__find_bpf_program_by_title(trace, prog_name);
2818
2819 if (prog != NULL) {
2820out_found:
2821 return prog;
2822 }
2823
2824 pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
2825 prog_name, type, sc->name);
2826out_unaugmented:
2827 return trace->syscalls.unaugmented_prog;
2828}
2829
2830static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
2831{
2832 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2833
2834 if (sc == NULL)
2835 return;
2836
2837 sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
2838 sc->bpf_prog.sys_exit = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit : NULL, "exit");
2839}
2840
2841static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
2842{
2843 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2844 return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->syscalls.unaugmented_prog);
2845}
2846
2847static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
2848{
2849 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2850 return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog);
2851}
2852
2853static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry)
2854{
2855 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2856 int arg = 0;
2857
2858 if (sc == NULL)
2859 goto out;
2860
2861 for (; arg < sc->nr_args; ++arg) {
2862 entry->string_args_len[arg] = 0;
2863 if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) {
2864 /* Should be set like strace -s strsize */
2865 entry->string_args_len[arg] = PATH_MAX;
2866 }
2867 }
2868out:
2869 for (; arg < 6; ++arg)
2870 entry->string_args_len[arg] = 0;
2871}
2872static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
2873{
2874 int fd = bpf_map__fd(trace->syscalls.map);
2875 struct bpf_map_syscall_entry value = {
2876 .enabled = !trace->not_ev_qualifier,
2877 };
2878 int err = 0;
2879 size_t i;
2880
2881 for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
2882 int key = trace->ev_qualifier_ids.entries[i];
2883
2884 if (value.enabled) {
2885 trace__init_bpf_map_syscall_args(trace, key, &value);
2886 trace__init_syscall_bpf_progs(trace, key);
2887 }
2888
2889 err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
2890 if (err)
2891 break;
2892 }
2893
2894 return err;
2895}
2896
2897static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
2898{
2899 int fd = bpf_map__fd(trace->syscalls.map);
2900 struct bpf_map_syscall_entry value = {
2901 .enabled = enabled,
2902 };
2903 int err = 0, key;
2904
2905 for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
2906 if (enabled)
2907 trace__init_bpf_map_syscall_args(trace, key, &value);
2908
2909 err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
2910 if (err)
2911 break;
2912 }
2913
2914 return err;
2915}
2916
2917static int trace__init_syscalls_bpf_map(struct trace *trace)
2918{
2919 bool enabled = true;
2920
2921 if (trace->ev_qualifier_ids.nr)
2922 enabled = trace->not_ev_qualifier;
2923
2924 return __trace__init_syscalls_bpf_map(trace, enabled);
2925}
2926
2927static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
2928{
2929 struct tep_format_field *field, *candidate_field;
2930 int id;
2931
2932 /*
2933 * We're only interested in syscalls that have a pointer:
2934 */
2935 for (field = sc->args; field; field = field->next) {
2936 if (field->flags & TEP_FIELD_IS_POINTER)
2937 goto try_to_find_pair;
2938 }
2939
2940 return NULL;
2941
2942try_to_find_pair:
2943 for (id = 0; id < trace->sctbl->syscalls.nr_entries; ++id) {
2944 struct syscall *pair = trace__syscall_info(trace, NULL, id);
2945 struct bpf_program *pair_prog;
2946 bool is_candidate = false;
2947
2948 if (pair == NULL || pair == sc ||
2949 pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog)
2950 continue;
2951
2952 for (field = sc->args, candidate_field = pair->args;
2953 field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
2954 bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
2955 candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;
2956
2957 if (is_pointer) {
2958 if (!candidate_is_pointer) {
2959 // The candidate just doesn't copies our pointer arg, might copy other pointers we want.
2960 continue;
2961 }
2962 } else {
2963 if (candidate_is_pointer) {
2964 // The candidate might copy a pointer we don't have, skip it.
2965 goto next_candidate;
2966 }
2967 continue;
2968 }
2969
2970 if (strcmp(field->type, candidate_field->type))
2971 goto next_candidate;
2972
2973 is_candidate = true;
2974 }
2975
2976 if (!is_candidate)
2977 goto next_candidate;
2978
2979 /*
2980 * Check if the tentative pair syscall augmenter has more pointers, if it has,
2981 * then it may be collecting that and we then can't use it, as it would collect
2982 * more than what is common to the two syscalls.
2983 */
2984 if (candidate_field) {
2985 for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
2986 if (candidate_field->flags & TEP_FIELD_IS_POINTER)
2987 goto next_candidate;
2988 }
2989
2990 pair_prog = pair->bpf_prog.sys_enter;
2991 /*
2992 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
2993 * have been searched for, so search it here and if it returns the
2994 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
2995 * program for a filtered syscall on a non-filtered one.
2996 *
2997 * For instance, we have "!syscalls:sys_enter_renameat" and that is
2998 * useful for "renameat2".
2999 */
3000 if (pair_prog == NULL) {
3001 pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3002 if (pair_prog == trace->syscalls.unaugmented_prog)
3003 goto next_candidate;
3004 }
3005
3006 pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
3007 return pair_prog;
3008 next_candidate:
3009 continue;
3010 }
3011
3012 return NULL;
3013}
3014
3015static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
3016{
3017 int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter),
3018 map_exit_fd = bpf_map__fd(trace->syscalls.prog_array.sys_exit);
3019 int err = 0, key;
3020
3021 for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
3022 int prog_fd;
3023
3024 if (!trace__syscall_enabled(trace, key))
3025 continue;
3026
3027 trace__init_syscall_bpf_progs(trace, key);
3028
3029 // It'll get at least the "!raw_syscalls:unaugmented"
3030 prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
3031 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3032 if (err)
3033 break;
3034 prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
3035 err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
3036 if (err)
3037 break;
3038 }
3039
3040 /*
3041 * Now lets do a second pass looking for enabled syscalls without
3042 * an augmenter that have a signature that is a superset of another
3043 * syscall with an augmenter so that we can auto-reuse it.
3044 *
3045 * I.e. if we have an augmenter for the "open" syscall that has
3046 * this signature:
3047 *
3048 * int open(const char *pathname, int flags, mode_t mode);
3049 *
3050 * I.e. that will collect just the first string argument, then we
3051 * can reuse it for the 'creat' syscall, that has this signature:
3052 *
3053 * int creat(const char *pathname, mode_t mode);
3054 *
3055 * and for:
3056 *
3057 * int stat(const char *pathname, struct stat *statbuf);
3058 * int lstat(const char *pathname, struct stat *statbuf);
3059 *
3060 * Because the 'open' augmenter will collect the first arg as a string,
3061 * and leave alone all the other args, which already helps with
3062 * beautifying 'stat' and 'lstat''s pathname arg.
3063 *
3064 * Then, in time, when 'stat' gets an augmenter that collects both
3065 * first and second arg (this one on the raw_syscalls:sys_exit prog
3066 * array tail call, then that one will be used.
3067 */
3068 for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
3069 struct syscall *sc = trace__syscall_info(trace, NULL, key);
3070 struct bpf_program *pair_prog;
3071 int prog_fd;
3072
3073 if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
3074 continue;
3075
3076 /*
3077 * For now we're just reusing the sys_enter prog, and if it
3078 * already has an augmenter, we don't need to find one.
3079 */
3080 if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog)
3081 continue;
3082
3083 /*
3084 * Look at all the other syscalls for one that has a signature
3085 * that is close enough that we can share:
3086 */
3087 pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
3088 if (pair_prog == NULL)
3089 continue;
3090
3091 sc->bpf_prog.sys_enter = pair_prog;
3092
3093 /*
3094 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
3095 * with the fd for the program we're reusing:
3096 */
3097 prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
3098 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3099 if (err)
3100 break;
3101 }
3102
3103
3104 return err;
3105}
3106#else
3107static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
3108{
3109 return 0;
3110}
3111
3112static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
3113{
3114 return 0;
3115}
3116
3117static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace __maybe_unused,
3118 const char *name __maybe_unused)
3119{
3120 return NULL;
3121}
3122
3123static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused)
3124{
3125 return 0;
3126}
3127#endif // HAVE_LIBBPF_SUPPORT
3128
3129static int trace__set_ev_qualifier_filter(struct trace *trace)
3130{
3131 if (trace->syscalls.map)
3132 return trace__set_ev_qualifier_bpf_filter(trace);
3133 if (trace->syscalls.events.sys_enter)
3134 return trace__set_ev_qualifier_tp_filter(trace);
3135 return 0;
3136}
3137
3138static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
3139 size_t npids __maybe_unused, pid_t *pids __maybe_unused)
3140{
3141 int err = 0;
3142#ifdef HAVE_LIBBPF_SUPPORT
3143 bool value = true;
3144 int map_fd = bpf_map__fd(map);
3145 size_t i;
3146
3147 for (i = 0; i < npids; ++i) {
3148 err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
3149 if (err)
3150 break;
3151 }
3152#endif
3153 return err;
3154}
3155
3156static int trace__set_filter_loop_pids(struct trace *trace)
3157{
3158 unsigned int nr = 1, err;
3159 pid_t pids[32] = {
3160 getpid(),
3161 };
3162 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
3163
3164 while (thread && nr < ARRAY_SIZE(pids)) {
3165 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
3166
3167 if (parent == NULL)
3168 break;
3169
3170 if (!strcmp(thread__comm_str(parent), "sshd") ||
3171 strstarts(thread__comm_str(parent), "gnome-terminal")) {
3172 pids[nr++] = parent->tid;
3173 break;
3174 }
3175 thread = parent;
3176 }
3177
3178 err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids);
3179 if (!err && trace->filter_pids.map)
3180 err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
3181
3182 return err;
3183}
3184
3185static int trace__set_filter_pids(struct trace *trace)
3186{
3187 int err = 0;
3188 /*
3189 * Better not use !target__has_task() here because we need to cover the
3190 * case where no threads were specified in the command line, but a
3191 * workload was, and in that case we will fill in the thread_map when
3192 * we fork the workload in perf_evlist__prepare_workload.
3193 */
3194 if (trace->filter_pids.nr > 0) {
3195 err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
3196 trace->filter_pids.entries);
3197 if (!err && trace->filter_pids.map) {
3198 err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
3199 trace->filter_pids.entries);
3200 }
3201 } else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
3202 err = trace__set_filter_loop_pids(trace);
3203 }
3204
3205 return err;
3206}
3207
3208static int __trace__deliver_event(struct trace *trace, union perf_event *event)
3209{
3210 struct evlist *evlist = trace->evlist;
3211 struct perf_sample sample;
3212 int err;
3213
3214 err = perf_evlist__parse_sample(evlist, event, &sample);
3215 if (err)
3216 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
3217 else
3218 trace__handle_event(trace, event, &sample);
3219
3220 return 0;
3221}
3222
3223static int __trace__flush_events(struct trace *trace)
3224{
3225 u64 first = ordered_events__first_time(&trace->oe.data);
3226 u64 flush = trace->oe.last - NSEC_PER_SEC;
3227
3228 /* Is there some thing to flush.. */
3229 if (first && first < flush)
3230 return ordered_events__flush_time(&trace->oe.data, flush);
3231
3232 return 0;
3233}
3234
3235static int trace__flush_events(struct trace *trace)
3236{
3237 return !trace->sort_events ? 0 : __trace__flush_events(trace);
3238}
3239
3240static int trace__deliver_event(struct trace *trace, union perf_event *event)
3241{
3242 int err;
3243
3244 if (!trace->sort_events)
3245 return __trace__deliver_event(trace, event);
3246
3247 err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
3248 if (err && err != -1)
3249 return err;
3250
3251 err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0);
3252 if (err)
3253 return err;
3254
3255 return trace__flush_events(trace);
3256}
3257
3258static int ordered_events__deliver_event(struct ordered_events *oe,
3259 struct ordered_event *event)
3260{
3261 struct trace *trace = container_of(oe, struct trace, oe.data);
3262
3263 return __trace__deliver_event(trace, event->event);
3264}
3265
3266static int trace__run(struct trace *trace, int argc, const char **argv)
3267{
3268 struct evlist *evlist = trace->evlist;
3269 struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
3270 int err = -1, i;
3271 unsigned long before;
3272 const bool forks = argc > 0;
3273 bool draining = false;
3274
3275 trace->live = true;
3276
3277 if (!trace->raw_augmented_syscalls) {
3278 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
3279 goto out_error_raw_syscalls;
3280
3281 if (trace->trace_syscalls)
3282 trace->vfs_getname = evlist__add_vfs_getname(evlist);
3283 }
3284
3285 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
3286 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
3287 if (pgfault_maj == NULL)
3288 goto out_error_mem;
3289 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
3290 evlist__add(evlist, pgfault_maj);
3291 }
3292
3293 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
3294 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
3295 if (pgfault_min == NULL)
3296 goto out_error_mem;
3297 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
3298 evlist__add(evlist, pgfault_min);
3299 }
3300
3301 if (trace->sched &&
3302 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
3303 trace__sched_stat_runtime))
3304 goto out_error_sched_stat_runtime;
3305
3306 /*
3307 * If a global cgroup was set, apply it to all the events without an
3308 * explicit cgroup. I.e.:
3309 *
3310 * trace -G A -e sched:*switch
3311 *
3312 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
3313 * _and_ sched:sched_switch to the 'A' cgroup, while:
3314 *
3315 * trace -e sched:*switch -G A
3316 *
3317 * will only set the sched:sched_switch event to the 'A' cgroup, all the
3318 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
3319 * a cgroup (on the root cgroup, sys wide, etc).
3320 *
3321 * Multiple cgroups:
3322 *
3323 * trace -G A -e sched:*switch -G B
3324 *
3325 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
3326 * to the 'B' cgroup.
3327 *
3328 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
3329 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
3330 */
3331 if (trace->cgroup)
3332 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
3333
3334 err = perf_evlist__create_maps(evlist, &trace->opts.target);
3335 if (err < 0) {
3336 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
3337 goto out_delete_evlist;
3338 }
3339
3340 err = trace__symbols_init(trace, evlist);
3341 if (err < 0) {
3342 fprintf(trace->output, "Problems initializing symbol libraries!\n");
3343 goto out_delete_evlist;
3344 }
3345
3346 perf_evlist__config(evlist, &trace->opts, &callchain_param);
3347
3348 signal(SIGCHLD, sig_handler);
3349 signal(SIGINT, sig_handler);
3350
3351 if (forks) {
3352 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
3353 argv, false, NULL);
3354 if (err < 0) {
3355 fprintf(trace->output, "Couldn't run the workload!\n");
3356 goto out_delete_evlist;
3357 }
3358 }
3359
3360 err = evlist__open(evlist);
3361 if (err < 0)
3362 goto out_error_open;
3363
3364 err = bpf__apply_obj_config();
3365 if (err) {
3366 char errbuf[BUFSIZ];
3367
3368 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
3369 pr_err("ERROR: Apply config to BPF failed: %s\n",
3370 errbuf);
3371 goto out_error_open;
3372 }
3373
3374 err = trace__set_filter_pids(trace);
3375 if (err < 0)
3376 goto out_error_mem;
3377
3378 if (trace->syscalls.map)
3379 trace__init_syscalls_bpf_map(trace);
3380
3381 if (trace->syscalls.prog_array.sys_enter)
3382 trace__init_syscalls_bpf_prog_array_maps(trace);
3383
3384 if (trace->ev_qualifier_ids.nr > 0) {
3385 err = trace__set_ev_qualifier_filter(trace);
3386 if (err < 0)
3387 goto out_errno;
3388
3389 if (trace->syscalls.events.sys_exit) {
3390 pr_debug("event qualifier tracepoint filter: %s\n",
3391 trace->syscalls.events.sys_exit->filter);
3392 }
3393 }
3394
3395 /*
3396 * If the "close" syscall is not traced, then we will not have the
3397 * opportunity to, in syscall_arg__scnprintf_close_fd() invalidate the
3398 * fd->pathname table and were ending up showing the last value set by
3399 * syscalls opening a pathname and associating it with a descriptor or
3400 * reading it from /proc/pid/fd/ in cases where that doesn't make
3401 * sense.
3402 *
3403 * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
3404 * not in use.
3405 */
3406 trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));
3407
3408 err = perf_evlist__apply_filters(evlist, &evsel);
3409 if (err < 0)
3410 goto out_error_apply_filters;
3411
3412 if (trace->dump.map)
3413 bpf_map__fprintf(trace->dump.map, trace->output);
3414
3415 err = evlist__mmap(evlist, trace->opts.mmap_pages);
3416 if (err < 0)
3417 goto out_error_mmap;
3418
3419 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
3420 evlist__enable(evlist);
3421
3422 if (forks)
3423 perf_evlist__start_workload(evlist);
3424
3425 if (trace->opts.initial_delay) {
3426 usleep(trace->opts.initial_delay * 1000);
3427 evlist__enable(evlist);
3428 }
3429
3430 trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
3431 evlist->core.threads->nr > 1 ||
3432 evlist__first(evlist)->core.attr.inherit;
3433
3434 /*
3435 * Now that we already used evsel->core.attr to ask the kernel to setup the
3436 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
3437 * trace__resolve_callchain(), allowing per-event max-stack settings
3438 * to override an explicitly set --max-stack global setting.
3439 */
3440 evlist__for_each_entry(evlist, evsel) {
3441 if (evsel__has_callchain(evsel) &&
3442 evsel->core.attr.sample_max_stack == 0)
3443 evsel->core.attr.sample_max_stack = trace->max_stack;
3444 }
3445again:
3446 before = trace->nr_events;
3447
3448 for (i = 0; i < evlist->core.nr_mmaps; i++) {
3449 union perf_event *event;
3450 struct mmap *md;
3451
3452 md = &evlist->mmap[i];
3453 if (perf_mmap__read_init(md) < 0)
3454 continue;
3455
3456 while ((event = perf_mmap__read_event(md)) != NULL) {
3457 ++trace->nr_events;
3458
3459 err = trace__deliver_event(trace, event);
3460 if (err)
3461 goto out_disable;
3462
3463 perf_mmap__consume(md);
3464
3465 if (interrupted)
3466 goto out_disable;
3467
3468 if (done && !draining) {
3469 evlist__disable(evlist);
3470 draining = true;
3471 }
3472 }
3473 perf_mmap__read_done(md);
3474 }
3475
3476 if (trace->nr_events == before) {
3477 int timeout = done ? 100 : -1;
3478
3479 if (!draining && evlist__poll(evlist, timeout) > 0) {
3480 if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
3481 draining = true;
3482
3483 goto again;
3484 } else {
3485 if (trace__flush_events(trace))
3486 goto out_disable;
3487 }
3488 } else {
3489 goto again;
3490 }
3491
3492out_disable:
3493 thread__zput(trace->current);
3494
3495 evlist__disable(evlist);
3496
3497 if (trace->sort_events)
3498 ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
3499
3500 if (!err) {
3501 if (trace->summary)
3502 trace__fprintf_thread_summary(trace, trace->output);
3503
3504 if (trace->show_tool_stats) {
3505 fprintf(trace->output, "Stats:\n "
3506 " vfs_getname : %" PRIu64 "\n"
3507 " proc_getname: %" PRIu64 "\n",
3508 trace->stats.vfs_getname,
3509 trace->stats.proc_getname);
3510 }
3511 }
3512
3513out_delete_evlist:
3514 trace__symbols__exit(trace);
3515
3516 evlist__delete(evlist);
3517 cgroup__put(trace->cgroup);
3518 trace->evlist = NULL;
3519 trace->live = false;
3520 return err;
3521{
3522 char errbuf[BUFSIZ];
3523
3524out_error_sched_stat_runtime:
3525 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
3526 goto out_error;
3527
3528out_error_raw_syscalls:
3529 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
3530 goto out_error;
3531
3532out_error_mmap:
3533 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
3534 goto out_error;
3535
3536out_error_open:
3537 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
3538
3539out_error:
3540 fprintf(trace->output, "%s\n", errbuf);
3541 goto out_delete_evlist;
3542
3543out_error_apply_filters:
3544 fprintf(trace->output,
3545 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
3546 evsel->filter, perf_evsel__name(evsel), errno,
3547 str_error_r(errno, errbuf, sizeof(errbuf)));
3548 goto out_delete_evlist;
3549}
3550out_error_mem:
3551 fprintf(trace->output, "Not enough memory to run!\n");
3552 goto out_delete_evlist;
3553
3554out_errno:
3555 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
3556 goto out_delete_evlist;
3557}
3558
3559static int trace__replay(struct trace *trace)
3560{
3561 const struct evsel_str_handler handlers[] = {
3562 { "probe:vfs_getname", trace__vfs_getname, },
3563 };
3564 struct perf_data data = {
3565 .path = input_name,
3566 .mode = PERF_DATA_MODE_READ,
3567 .force = trace->force,
3568 };
3569 struct perf_session *session;
3570 struct evsel *evsel;
3571 int err = -1;
3572
3573 trace->tool.sample = trace__process_sample;
3574 trace->tool.mmap = perf_event__process_mmap;
3575 trace->tool.mmap2 = perf_event__process_mmap2;
3576 trace->tool.comm = perf_event__process_comm;
3577 trace->tool.exit = perf_event__process_exit;
3578 trace->tool.fork = perf_event__process_fork;
3579 trace->tool.attr = perf_event__process_attr;
3580 trace->tool.tracing_data = perf_event__process_tracing_data;
3581 trace->tool.build_id = perf_event__process_build_id;
3582 trace->tool.namespaces = perf_event__process_namespaces;
3583
3584 trace->tool.ordered_events = true;
3585 trace->tool.ordering_requires_timestamps = true;
3586
3587 /* add tid to output */
3588 trace->multiple_threads = true;
3589
3590 session = perf_session__new(&data, false, &trace->tool);
3591 if (IS_ERR(session))
3592 return PTR_ERR(session);
3593
3594 if (trace->opts.target.pid)
3595 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
3596
3597 if (trace->opts.target.tid)
3598 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
3599
3600 if (symbol__init(&session->header.env) < 0)
3601 goto out;
3602
3603 trace->host = &session->machines.host;
3604
3605 err = perf_session__set_tracepoints_handlers(session, handlers);
3606 if (err)
3607 goto out;
3608
3609 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3610 "raw_syscalls:sys_enter");
3611 /* older kernels have syscalls tp versus raw_syscalls */
3612 if (evsel == NULL)
3613 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3614 "syscalls:sys_enter");
3615
3616 if (evsel &&
3617 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
3618 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
3619 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
3620 goto out;
3621 }
3622
3623 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3624 "raw_syscalls:sys_exit");
3625 if (evsel == NULL)
3626 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3627 "syscalls:sys_exit");
3628 if (evsel &&
3629 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
3630 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
3631 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
3632 goto out;
3633 }
3634
3635 evlist__for_each_entry(session->evlist, evsel) {
3636 if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
3637 (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
3638 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
3639 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
3640 evsel->handler = trace__pgfault;
3641 }
3642
3643 setup_pager();
3644
3645 err = perf_session__process_events(session);
3646 if (err)
3647 pr_err("Failed to process events, error %d", err);
3648
3649 else if (trace->summary)
3650 trace__fprintf_thread_summary(trace, trace->output);
3651
3652out:
3653 perf_session__delete(session);
3654
3655 return err;
3656}
3657
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns the fprintf() result converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
3666
3667DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
3668 struct stats *stats;
3669 double msecs;
3670 int syscall;
3671)
3672{
3673 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
3674 struct stats *stats = source->priv;
3675
3676 entry->syscall = source->i;
3677 entry->stats = stats;
3678 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
3679}
3680
3681static size_t thread__dump_stats(struct thread_trace *ttrace,
3682 struct trace *trace, FILE *fp)
3683{
3684 size_t printed = 0;
3685 struct syscall *sc;
3686 struct rb_node *nd;
3687 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
3688
3689 if (syscall_stats == NULL)
3690 return 0;
3691
3692 printed += fprintf(fp, "\n");
3693
3694 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
3695 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
3696 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
3697
3698 resort_rb__for_each_entry(nd, syscall_stats) {
3699 struct stats *stats = syscall_stats_entry->stats;
3700 if (stats) {
3701 double min = (double)(stats->min) / NSEC_PER_MSEC;
3702 double max = (double)(stats->max) / NSEC_PER_MSEC;
3703 double avg = avg_stats(stats);
3704 double pct;
3705 u64 n = (u64) stats->n;
3706
3707 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
3708 avg /= NSEC_PER_MSEC;
3709
3710 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
3711 printed += fprintf(fp, " %-15s", sc->name);
3712 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3713 n, syscall_stats_entry->msecs, min, avg);
3714 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3715 }
3716 }
3717
3718 resort_rb__delete(syscall_stats);
3719 printed += fprintf(fp, "\n\n");
3720
3721 return printed;
3722}
3723
3724static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
3725{
3726 size_t printed = 0;
3727 struct thread_trace *ttrace = thread__priv(thread);
3728 double ratio;
3729
3730 if (ttrace == NULL)
3731 return 0;
3732
3733 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
3734
3735 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3736 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3737 printed += fprintf(fp, "%.1f%%", ratio);
3738 if (ttrace->pfmaj)
3739 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
3740 if (ttrace->pfmin)
3741 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3742 if (trace->sched)
3743 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
3744 else if (fputc('\n', fp) != EOF)
3745 ++printed;
3746
3747 printed += thread__dump_stats(ttrace, trace, fp);
3748
3749 return printed;
3750}
3751
3752static unsigned long thread__nr_events(struct thread_trace *ttrace)
3753{
3754 return ttrace ? ttrace->nr_events : 0;
3755}
3756
3757DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
3758 struct thread *thread;
3759)
3760{
3761 entry->thread = rb_entry(nd, struct thread, rb_node);
3762}
3763
3764static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
3765{
3766 size_t printed = trace__fprintf_threads_header(fp);
3767 struct rb_node *nd;
3768 int i;
3769
3770 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
3771 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
3772
3773 if (threads == NULL) {
3774 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
3775 return 0;
3776 }
3777
3778 resort_rb__for_each_entry(nd, threads)
3779 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
3780
3781 resort_rb__delete(threads);
3782 }
3783 return printed;
3784}
3785
3786static int trace__set_duration(const struct option *opt, const char *str,
3787 int unset __maybe_unused)
3788{
3789 struct trace *trace = opt->value;
3790
3791 trace->duration_filter = atof(str);
3792 return 0;
3793}
3794
3795static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
3796 int unset __maybe_unused)
3797{
3798 int ret = -1;
3799 size_t i;
3800 struct trace *trace = opt->value;
3801 /*
3802 * FIXME: introduce a intarray class, plain parse csv and create a
3803 * { int nr, int entries[] } struct...
3804 */
3805 struct intlist *list = intlist__new(str);
3806
3807 if (list == NULL)
3808 return -1;
3809
3810 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3811 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3812
3813 if (trace->filter_pids.entries == NULL)
3814 goto out;
3815
3816 trace->filter_pids.entries[0] = getpid();
3817
3818 for (i = 1; i < trace->filter_pids.nr; ++i)
3819 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3820
3821 intlist__delete(list);
3822 ret = 0;
3823out:
3824 return ret;
3825}
3826
3827static int trace__open_output(struct trace *trace, const char *filename)
3828{
3829 struct stat st;
3830
3831 if (!stat(filename, &st) && st.st_size) {
3832 char oldname[PATH_MAX];
3833
3834 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3835 unlink(oldname);
3836 rename(filename, oldname);
3837 }
3838
3839 trace->output = fopen(filename, "w");
3840
3841 return trace->output == NULL ? -errno : 0;
3842}
3843
3844static int parse_pagefaults(const struct option *opt, const char *str,
3845 int unset __maybe_unused)
3846{
3847 int *trace_pgfaults = opt->value;
3848
3849 if (strcmp(str, "all") == 0)
3850 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3851 else if (strcmp(str, "maj") == 0)
3852 *trace_pgfaults |= TRACE_PFMAJ;
3853 else if (strcmp(str, "min") == 0)
3854 *trace_pgfaults |= TRACE_PFMIN;
3855 else
3856 return -1;
3857
3858 return 0;
3859}
3860
3861static void evlist__set_evsel_handler(struct evlist *evlist, void *handler)
3862{
3863 struct evsel *evsel;
3864
3865 evlist__for_each_entry(evlist, evsel)
3866 evsel->handler = handler;
3867}
3868
3869static int evlist__set_syscall_tp_fields(struct evlist *evlist)
3870{
3871 struct evsel *evsel;
3872
3873 evlist__for_each_entry(evlist, evsel) {
3874 if (evsel->priv || !evsel->tp_format)
3875 continue;
3876
3877 if (strcmp(evsel->tp_format->system, "syscalls"))
3878 continue;
3879
3880 if (perf_evsel__init_syscall_tp(evsel))
3881 return -1;
3882
3883 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3884 struct syscall_tp *sc = evsel->priv;
3885
3886 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3887 return -1;
3888 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3889 struct syscall_tp *sc = evsel->priv;
3890
3891 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3892 return -1;
3893 }
3894 }
3895
3896 return 0;
3897}
3898
3899/*
3900 * XXX: Hackish, just splitting the combined -e+--event (syscalls
3901 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3902 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3903 *
3904 * It'd be better to introduce a parse_options() variant that would return a
3905 * list with the terms it didn't match to an event...
3906 */
3907static int trace__parse_events_option(const struct option *opt, const char *str,
3908 int unset __maybe_unused)
3909{
3910 struct trace *trace = (struct trace *)opt->value;
3911 const char *s = str;
3912 char *sep = NULL, *lists[2] = { NULL, NULL, };
3913 int len = strlen(str) + 1, err = -1, list, idx;
3914 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3915 char group_name[PATH_MAX];
3916 struct syscall_fmt *fmt;
3917
3918 if (strace_groups_dir == NULL)
3919 return -1;
3920
3921 if (*s == '!') {
3922 ++s;
3923 trace->not_ev_qualifier = true;
3924 }
3925
3926 while (1) {
3927 if ((sep = strchr(s, ',')) != NULL)
3928 *sep = '\0';
3929
3930 list = 0;
3931 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3932 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3933 list = 1;
3934 goto do_concat;
3935 }
3936
3937 fmt = syscall_fmt__find_by_alias(s);
3938 if (fmt != NULL) {
3939 list = 1;
3940 s = fmt->name;
3941 } else {
3942 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3943 if (access(group_name, R_OK) == 0)
3944 list = 1;
3945 }
3946do_concat:
3947 if (lists[list]) {
3948 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3949 } else {
3950 lists[list] = malloc(len);
3951 if (lists[list] == NULL)
3952 goto out;
3953 strcpy(lists[list], s);
3954 }
3955
3956 if (!sep)
3957 break;
3958
3959 *sep = ',';
3960 s = sep + 1;
3961 }
3962
3963 if (lists[1] != NULL) {
3964 struct strlist_config slist_config = {
3965 .dirname = strace_groups_dir,
3966 };
3967
3968 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3969 if (trace->ev_qualifier == NULL) {
3970 fputs("Not enough memory to parse event qualifier", trace->output);
3971 goto out;
3972 }
3973
3974 if (trace__validate_ev_qualifier(trace))
3975 goto out;
3976 trace->trace_syscalls = true;
3977 }
3978
3979 err = 0;
3980
3981 if (lists[0]) {
3982 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3983 "event selector. use 'perf list' to list available events",
3984 parse_events_option);
3985 err = parse_events_option(&o, lists[0], 0);
3986 }
3987out:
3988 if (sep)
3989 *sep = ',';
3990
3991 return err;
3992}
3993
3994static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3995{
3996 struct trace *trace = opt->value;
3997
3998 if (!list_empty(&trace->evlist->core.entries))
3999 return parse_cgroups(opt, str, unset);
4000
4001 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
4002
4003 return 0;
4004}
4005
4006static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace, const char *name)
4007{
4008 if (trace->bpf_obj == NULL)
4009 return NULL;
4010
4011 return bpf_object__find_map_by_name(trace->bpf_obj, name);
4012}
4013
4014static void trace__set_bpf_map_filtered_pids(struct trace *trace)
4015{
4016 trace->filter_pids.map = trace__find_bpf_map_by_name(trace, "pids_filtered");
4017}
4018
4019static void trace__set_bpf_map_syscalls(struct trace *trace)
4020{
4021 trace->syscalls.map = trace__find_bpf_map_by_name(trace, "syscalls");
4022 trace->syscalls.prog_array.sys_enter = trace__find_bpf_map_by_name(trace, "syscalls_sys_enter");
4023 trace->syscalls.prog_array.sys_exit = trace__find_bpf_map_by_name(trace, "syscalls_sys_exit");
4024}
4025
4026static int trace__config(const char *var, const char *value, void *arg)
4027{
4028 struct trace *trace = arg;
4029 int err = 0;
4030
4031 if (!strcmp(var, "trace.add_events")) {
4032 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
4033 "event selector. use 'perf list' to list available events",
4034 parse_events_option);
4035 /*
4036 * We can't propagate parse_event_option() return, as it is 1
4037 * for failure while perf_config() expects -1.
4038 */
4039 if (parse_events_option(&o, value, 0))
4040 err = -1;
4041 } else if (!strcmp(var, "trace.show_timestamp")) {
4042 trace->show_tstamp = perf_config_bool(var, value);
4043 } else if (!strcmp(var, "trace.show_duration")) {
4044 trace->show_duration = perf_config_bool(var, value);
4045 } else if (!strcmp(var, "trace.show_arg_names")) {
4046 trace->show_arg_names = perf_config_bool(var, value);
4047 if (!trace->show_arg_names)
4048 trace->show_zeros = true;
4049 } else if (!strcmp(var, "trace.show_zeros")) {
4050 bool new_show_zeros = perf_config_bool(var, value);
4051 if (!trace->show_arg_names && !new_show_zeros) {
4052 pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
4053 goto out;
4054 }
4055 trace->show_zeros = new_show_zeros;
4056 } else if (!strcmp(var, "trace.show_prefix")) {
4057 trace->show_string_prefix = perf_config_bool(var, value);
4058 } else if (!strcmp(var, "trace.no_inherit")) {
4059 trace->opts.no_inherit = perf_config_bool(var, value);
4060 } else if (!strcmp(var, "trace.args_alignment")) {
4061 int args_alignment = 0;
4062 if (perf_config_int(&args_alignment, var, value) == 0)
4063 trace->args_alignment = args_alignment;
4064 }
4065out:
4066 return err;
4067}
4068
/*
 * Entry point of the 'perf trace' builtin.
 *
 * In order: sets up the defaults in 'struct trace', applies the "trace.*"
 * config variables via perf_config(trace__config), parses the command line,
 * wires up the augmented syscalls BPF event when one is present, and then
 * hands over to trace__record() ('perf trace record ...'), trace__replay()
 * (when input_name is set, see -i/--input) or trace__run().
 *
 * Returns the result of whichever of those three ran, or a non-zero error
 * if the setup failed before reaching them.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * Defaults. The UINT_MAX/ULLONG_MAX/ULONG_MAX values double as "not set
	 * by the user" markers, see the mmap_pages and max_stack checks below.
	 */
	struct trace trace = {
		.opts = {
			.target = {
				.uid = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering = true,
			.mmap_pages = UINT_MAX,
		},
		.output = stderr,
		.show_comm = true,
		.show_tstamp = true,
		.show_duration = true,
		.show_arg_names = true,
		.args_alignment = 70,
		.trace_syscalls = false,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
		.max_events = ULONG_MAX,
	};
	const char *map_dump_str = NULL;
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		   "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		   "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
#ifdef HAVE_LIBBPF_SUPPORT
	OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
#endif
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN(0, "failure", &trace.failure_only,
		    "Show only syscalls that failed"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_ULONG(0, "max-events", &trace.max_events,
		"Set the maximum number of events to print, exit after that is reached. "),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
			"Sort batch of events before processing, use if getting out of order events"),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPTS_EVSWITCH(&trace.evswitch),
	OPT_END()
	};
	/* Both flip to false below if the option still holds its sentinel default */
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	struct evsel *evsel;
	const char * const trace_subcommands[] = { "record", NULL };
	int err = -1;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Parsing .perfconfig may entail creating a BPF event, that may need
	 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
	 * is too small. This affects just this process, not touching the
	 * global setting. If it fails we'll get something in 'perf trace -v'
	 * to help diagnose the problem.
	 */
	rlimit__bump_memlock();

	/* Config file first, so that the command line parsed next can add to it */
	err = perf_config(trace__config, &trace);
	if (err)
		goto out;

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

	/*
	 * evsel is an error pointer on failure, NULL when there is no
	 * "__augmented_syscalls__" output event to set up.
	 */
	evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
	if (IS_ERR(evsel)) {
		bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
		pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
		goto out;
	}

	if (evsel) {
		trace.syscalls.events.augmented = evsel;

		/* The BPF object is reached via the raw_syscalls:sys_enter evsel */
		evsel = perf_evlist__find_tracepoint_by_name(trace.evlist, "raw_syscalls:sys_enter");
		if (evsel == NULL) {
			pr_err("ERROR: raw_syscalls:sys_enter not found in the augmented BPF object\n");
			goto out;
		}

		if (evsel->bpf_obj == NULL) {
			pr_err("ERROR: raw_syscalls:sys_enter not associated to a BPF object\n");
			goto out;
		}

		trace.bpf_obj = evsel->bpf_obj;

		trace__set_bpf_map_filtered_pids(&trace);
		trace__set_bpf_map_syscalls(&trace);
		trace.syscalls.unaugmented_prog = trace__find_bpf_program_by_title(&trace, "!raw_syscalls:unaugmented");
	}

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	/* Back to the generic failure value for the 'goto out' paths that follow */
	err = -1;

	if (map_dump_str) {
		trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str);
		if (trace.dump.map == NULL) {
			pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
			goto out;
		}
	}

	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* A stack depth limit was given without --call-graph: default to dwarf */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/* Use a bigger default mmap size for callchains, only when running as root */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->core.nr_entries > 0) {
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
		if (evlist__set_syscall_tp_fields(trace.evlist)) {
			perror("failed to set syscalls:* tracepoint fields");
			goto out;
		}
	}

	if (trace.sort_events) {
		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
		ordered_events__set_copy_on_queue(&trace.oe.data, true);
	}

	/*
	 * If we are augmenting syscalls, then combine what we put in the
	 * __augmented_syscalls__ BPF map with what is in the
	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
	 *
	 * We'll switch to look at two BPF maps, one for sys_enter and the
	 * other for sys_exit when we start augmenting the sys_exit paths with
	 * buffers that are being copied from kernel to userspace, think 'read'
	 * syscall.
	 */
	if (trace.syscalls.events.augmented) {
		evlist__for_each_entry(trace.evlist, evsel) {
			bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;

			/*
			 * raw_syscalls:sys_exit gets the same setup as the
			 * syscalls:sys_exit_* events, so jump straight into
			 * that branch's body.
			 */
			if (raw_syscalls_sys_exit) {
				trace.raw_augmented_syscalls = true;
				goto init_augmented_syscall_tp;
			}

			/* Done only once, for the first matching sys_enter event */
			if (trace.syscalls.events.augmented->priv == NULL &&
			    strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) {
				struct evsel *augmented = trace.syscalls.events.augmented;
				if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
				    perf_evsel__init_augmented_syscall_tp_args(augmented))
					goto out;
				/*
				 * Augmented is __augmented_syscalls__ BPF_OUTPUT event
				 * Above we made sure we can get from the payload the tp fields
				 * that we get from syscalls:sys_enter tracefs format file.
				 */
				augmented->handler = trace__sys_enter;
				/*
				 * Now we do the same for the *syscalls:sys_enter event so that
				 * if we handle it directly, i.e. if the BPF prog returns 0 so
				 * as not to filter it, then we'll handle it just like we would
				 * for the BPF_OUTPUT one:
				 */
				if (perf_evsel__init_augmented_syscall_tp(evsel, evsel) ||
				    perf_evsel__init_augmented_syscall_tp_args(evsel))
					goto out;
				evsel->handler = trace__sys_enter;
			}

			if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
				struct syscall_tp *sc;
init_augmented_syscall_tp:
				if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
					goto out;
				sc = evsel->priv;
				/*
				 * For now with BPF raw_augmented we hook into
				 * raw_syscalls:sys_enter and there we get all
				 * 6 syscall args plus the tracepoint common
				 * fields and the syscall_nr (another long).
				 * So we check if that is the case and if so
				 * don't look after the sc->args_size but
				 * always after the full raw_syscalls:sys_enter
				 * payload, which is fixed.
				 *
				 * We'll revisit this later to pass
				 * s->args_size to the BPF augmenter (now
				 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
				 * so that it copies only what we need for each
				 * syscall, like what happens when we use
				 * syscalls:sys_enter_NAME, so that we reduce
				 * the kernel/userspace traffic to just what is
				 * needed for each syscall.
				 */
				if (trace.raw_augmented_syscalls)
					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
				perf_evsel__init_augmented_syscall_tp_ret(evsel);
				evsel->handler = trace__sys_exit;
			}
		}
	}

	/* 'perf trace record ...': hand the remaining arguments to trace__record() */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* Nothing was explicitly asked for: default to tracing syscalls */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->core.nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
	if (err)
		goto out_close;

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload to start and no target specified: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	/* trace.output is only ours to close when -o/--output opened it */
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}