/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc. Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"

#include <libaudit.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <linux/futex.h>
#include <linux/err.h>

/* For older distros: */
#ifndef MAP_STACK
# define MAP_STACK 0x20000
#endif

#ifndef MADV_HWPOISON
# define MADV_HWPOISON 100
#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE 12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE 13
#endif

#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE 1
#endif

#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK 00004000
#endif

#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC 02000000
#endif

#ifndef O_CLOEXEC
# define O_CLOEXEC 02000000
#endif

#ifndef SOCK_DCCP
# define SOCK_DCCP 6
#endif

#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC 02000000
#endif

#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK 00004000
#endif

#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC 0x40000000
#endif

#ifndef PERF_FLAG_FD_NO_GROUP
# define PERF_FLAG_FD_NO_GROUP (1UL << 0)
#endif

#ifndef PERF_FLAG_FD_OUTPUT
# define PERF_FLAG_FD_OUTPUT (1UL << 1)
#endif

#ifndef PERF_FLAG_PID_CGROUP
# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
#endif

struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value); \
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
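/*
 * For reference, TP_UINT_FIELD(16) above expands to roughly:
 *
 *	static u64 tp_field__u16(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u16 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 *
 * The memcpy() sidesteps unaligned loads from the packed raw_data payload,
 * while the __SWAPPED variants additionally byte-swap the result so that
 * perf.data files recorded on a machine of the opposite endianness are
 * still read correctly.
 */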

static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	switch (format_field->size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}

struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv; \
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
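
/*
 * Example: perf_evsel__init_sc_tp_uint_field(evsel, id) resolves the "id"
 * field in the evsel's tracepoint format and wires up the matching fetcher
 * in the struct syscall_tp hanging off evsel->priv, so later code can do
 * perf_evsel__sc_tp_uint(evsel, id, sample) to read it back out.
 */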

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv; \
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace *trace;
	void *parm;
	u8 idx;
	u8 mask;
};

struct strarray {
	int offset;
	int nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
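
/*
 * Example: for the epoll_ctl_ops[] = { "ADD", "DEL", "MOD", } table defined
 * further down, DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1) makes the strarray
 * beautifier map EPOLL_CTL_ADD (1) to "ADD", i.e. entries[val - offset];
 * values outside the table fall back to the numeric format.
 */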

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	struct strarray *sa = arg->parm;
	int idx = arg->val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		return scnprintf(bf, size, intfmt, arg->val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */

static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex

static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int

static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
					       struct syscall_arg *arg)
{
	int printed = 0, prot = arg->val;

	if (prot == PROT_NONE)
		return scnprintf(bf, size, "NONE");
#define P_MMAP_PROT(n) \
	if (prot & PROT_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		prot &= ~PROT_##n; \
	}

	P_MMAP_PROT(EXEC);
	P_MMAP_PROT(READ);
	P_MMAP_PROT(WRITE);
#ifdef PROT_SEM
	P_MMAP_PROT(SEM);
#endif
	P_MMAP_PROT(GROWSDOWN);
	P_MMAP_PROT(GROWSUP);
#undef P_MMAP_PROT

	if (prot)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);

	return printed;
}

#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
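
/*
 * With the P_MMAP_PROT() pattern above, e.g. PROT_READ|PROT_WRITE is
 * rendered as "READ|WRITE", and any bits left over after the known flags
 * are stripped get printed in hex, so new/unknown flags are never lost.
 * The flag beautifiers below all follow this same scheme.
 */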

static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define P_MMAP_FLAG(n) \
	if (flags & MAP_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~MAP_##n; \
	}

	P_MMAP_FLAG(SHARED);
	P_MMAP_FLAG(PRIVATE);
#ifdef MAP_32BIT
	P_MMAP_FLAG(32BIT);
#endif
	P_MMAP_FLAG(ANONYMOUS);
	P_MMAP_FLAG(DENYWRITE);
	P_MMAP_FLAG(EXECUTABLE);
	P_MMAP_FLAG(FILE);
	P_MMAP_FLAG(FIXED);
	P_MMAP_FLAG(GROWSDOWN);
#ifdef MAP_HUGETLB
	P_MMAP_FLAG(HUGETLB);
#endif
	P_MMAP_FLAG(LOCKED);
	P_MMAP_FLAG(NONBLOCK);
	P_MMAP_FLAG(NORESERVE);
	P_MMAP_FLAG(POPULATE);
	P_MMAP_FLAG(STACK);
#ifdef MAP_UNINITIALIZED
	P_MMAP_FLAG(UNINITIALIZED);
#endif
#undef P_MMAP_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags

static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
						  struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define P_MREMAP_FLAG(n) \
	if (flags & MREMAP_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~MREMAP_##n; \
	}

	P_MREMAP_FLAG(MAYMOVE);
#ifdef MREMAP_FIXED
	P_MREMAP_FLAG(FIXED);
#endif
#undef P_MREMAP_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags

static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
						      struct syscall_arg *arg)
{
	int behavior = arg->val;

	switch (behavior) {
#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
	P_MADV_BHV(NORMAL);
	P_MADV_BHV(RANDOM);
	P_MADV_BHV(SEQUENTIAL);
	P_MADV_BHV(WILLNEED);
	P_MADV_BHV(DONTNEED);
	P_MADV_BHV(REMOVE);
	P_MADV_BHV(DONTFORK);
	P_MADV_BHV(DOFORK);
	P_MADV_BHV(HWPOISON);
#ifdef MADV_SOFT_OFFLINE
	P_MADV_BHV(SOFT_OFFLINE);
#endif
	P_MADV_BHV(MERGEABLE);
	P_MADV_BHV(UNMERGEABLE);
#ifdef MADV_HUGEPAGE
	P_MADV_BHV(HUGEPAGE);
#endif
#ifdef MADV_NOHUGEPAGE
	P_MADV_BHV(NOHUGEPAGE);
#endif
#ifdef MADV_DONTDUMP
	P_MADV_BHV(DONTDUMP);
#endif
#ifdef MADV_DODUMP
	P_MADV_BHV(DODUMP);
#endif
#undef P_MADV_BHV
	default: break;
	}

	return scnprintf(bf, size, "%#x", behavior);
}

#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior

static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int printed = 0, op = arg->val;

	if (op == 0)
		return scnprintf(bf, size, "NONE");
#define P_CMD(cmd) \
	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
		op &= ~LOCK_##cmd; \
	}

	P_CMD(SH);
	P_CMD(EX);
	P_CMD(NB);
	P_CMD(UN);
	P_CMD(MAND);
	P_CMD(RW);
	P_CMD(READ);
	P_CMD(WRITE);
#undef P_CMD

	if (op)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);

	return printed;
}

#define SCA_FLOCK syscall_arg__scnprintf_flock

static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
{
	enum syscall_futex_args {
		SCF_UADDR = (1 << 0),
		SCF_OP = (1 << 1),
		SCF_VAL = (1 << 2),
		SCF_TIMEOUT = (1 << 3),
		SCF_UADDR2 = (1 << 4),
		SCF_VAL3 = (1 << 5),
	};
	int op = arg->val;
	int cmd = op & FUTEX_CMD_MASK;
	size_t printed = 0;

	switch (cmd) {
#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
	P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
	P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break;
	P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break;
	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break;
	P_FUTEX_OP(WAKE_OP); break;
	P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
	P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break;
	P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break;
	P_FUTEX_OP(WAIT_REQUEUE_PI); break;
	default: printed = scnprintf(bf, size, "%#x", cmd); break;
	}

	if (op & FUTEX_PRIVATE_FLAG)
		printed += scnprintf(bf + printed, size - printed, "|PRIV");

	if (op & FUTEX_CLOCK_REALTIME)
		printed += scnprintf(bf + printed, size - printed, "|CLKRT");

	return printed;
}

#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op
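
/*
 * Note that besides printing the operation, the futex beautifier sets bits
 * in arg->mask so that arguments which are meaningless for the decoded
 * command get suppressed, e.g. futex(uaddr, FUTEX_WAKE, val, ...) masks
 * the timeout, uaddr2 and val3 slots.
 */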

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

#ifndef SOCK_TYPE_MASK
#define SOCK_TYPE_MASK 0xf
#endif

static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed;
	int type = arg->val,
	    flags = type & ~SOCK_TYPE_MASK;

	type &= SOCK_TYPE_MASK;
	/*
	 * Can't use a strarray, MIPS may override for ABI reasons.
	 */
	switch (type) {
#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
	P_SK_TYPE(STREAM);
	P_SK_TYPE(DGRAM);
	P_SK_TYPE(RAW);
	P_SK_TYPE(RDM);
	P_SK_TYPE(SEQPACKET);
	P_SK_TYPE(DCCP);
	P_SK_TYPE(PACKET);
#undef P_SK_TYPE
	default:
		printed = scnprintf(bf, size, "%#x", type);
	}

#define P_SK_FLAG(n) \
	if (flags & SOCK_##n) { \
		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
		flags &= ~SOCK_##n; \
	}

	P_SK_FLAG(CLOEXEC);
	P_SK_FLAG(NONBLOCK);
#undef P_SK_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);

	return printed;
}

#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type

#ifndef MSG_PROBE
#define MSG_PROBE 0x10
#endif
#ifndef MSG_WAITFORONE
#define MSG_WAITFORONE 0x10000
#endif
#ifndef MSG_SENDPAGE_NOTLAST
#define MSG_SENDPAGE_NOTLAST 0x20000
#endif
#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
					       struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

	if (flags == 0)
		return scnprintf(bf, size, "NONE");
#define P_MSG_FLAG(n) \
	if (flags & MSG_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~MSG_##n; \
	}

	P_MSG_FLAG(OOB);
	P_MSG_FLAG(PEEK);
	P_MSG_FLAG(DONTROUTE);
	P_MSG_FLAG(TRYHARD);
	P_MSG_FLAG(CTRUNC);
	P_MSG_FLAG(PROBE);
	P_MSG_FLAG(TRUNC);
	P_MSG_FLAG(DONTWAIT);
	P_MSG_FLAG(EOR);
	P_MSG_FLAG(WAITALL);
	P_MSG_FLAG(FIN);
	P_MSG_FLAG(SYN);
	P_MSG_FLAG(CONFIRM);
	P_MSG_FLAG(RST);
	P_MSG_FLAG(ERRQUEUE);
	P_MSG_FLAG(NOSIGNAL);
	P_MSG_FLAG(MORE);
	P_MSG_FLAG(WAITFORONE);
	P_MSG_FLAG(SENDPAGE_NOTLAST);
	P_MSG_FLAG(FASTOPEN);
	P_MSG_FLAG(CMSG_CLOEXEC);
#undef P_MSG_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags

static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

	if (!(flags & O_CREAT))
		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */

	if (flags == 0)
		return scnprintf(bf, size, "RDONLY");
#define P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(APPEND);
	P_FLAG(ASYNC);
	P_FLAG(CLOEXEC);
	P_FLAG(CREAT);
	P_FLAG(DIRECT);
	P_FLAG(DIRECTORY);
	P_FLAG(EXCL);
	P_FLAG(LARGEFILE);
	P_FLAG(NOATIME);
	P_FLAG(NOCTTY);
#ifdef O_NONBLOCK
	P_FLAG(NONBLOCK);
#elif defined(O_NDELAY)
	P_FLAG(NDELAY);
#endif
#ifdef O_PATH
	P_FLAG(PATH);
#endif
	P_FLAG(RDWR);
#ifdef O_DSYNC
	if ((flags & O_SYNC) == O_SYNC)
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
	else {
		P_FLAG(DSYNC);
	}
#else
	P_FLAG(SYNC);
#endif
	P_FLAG(TRUNC);
	P_FLAG(WRONLY);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags

static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

	if (flags == 0)
		return 0;

#define P_FLAG(n) \
	if (flags & PERF_FLAG_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~PERF_FLAG_##n; \
	}

	P_FLAG(FD_NO_GROUP);
	P_FLAG(FD_OUTPUT);
	P_FLAG(PID_CGROUP);
	P_FLAG(FD_CLOEXEC);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags

static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

	if (flags == 0)
		return scnprintf(bf, size, "NONE");
#define P_FLAG(n) \
	if (flags & EFD_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~EFD_##n; \
	}

	P_FLAG(SEMAPHORE);
	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
{
	int sig = arg->val;

	switch (sig) {
#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
	P_SIGNUM(HUP);
	P_SIGNUM(INT);
	P_SIGNUM(QUIT);
	P_SIGNUM(ILL);
	P_SIGNUM(TRAP);
	P_SIGNUM(ABRT);
	P_SIGNUM(BUS);
	P_SIGNUM(FPE);
	P_SIGNUM(KILL);
	P_SIGNUM(USR1);
	P_SIGNUM(SEGV);
	P_SIGNUM(USR2);
	P_SIGNUM(PIPE);
	P_SIGNUM(ALRM);
	P_SIGNUM(TERM);
	P_SIGNUM(CHLD);
	P_SIGNUM(CONT);
	P_SIGNUM(STOP);
	P_SIGNUM(TSTP);
	P_SIGNUM(TTIN);
	P_SIGNUM(TTOU);
	P_SIGNUM(URG);
	P_SIGNUM(XCPU);
	P_SIGNUM(XFSZ);
	P_SIGNUM(VTALRM);
	P_SIGNUM(PROF);
	P_SIGNUM(WINCH);
	P_SIGNUM(IO);
	P_SIGNUM(PWR);
	P_SIGNUM(SYS);
#ifdef SIGEMT
	P_SIGNUM(EMT);
#endif
#ifdef SIGSTKFLT
	P_SIGNUM(STKFLT);
#endif
#ifdef SIGSWI
	P_SIGNUM(SWI);
#endif
	default: break;
	}

	return scnprintf(bf, size, "%#x", sig);
}

#define SCA_SIGNUM syscall_arg__scnprintf_signum

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS 0x5401

static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm = { [arg] = &strarray__##array, }
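
/*
 * Example: STRARRAY(0, cmd, bpf_cmd) in a syscall_fmts[] entry below expands
 * to .arg_scnprintf = { [0] = SCA_STRARRAY, } plus
 * .arg_parm = { [0] = &strarray__bpf_cmd, }, i.e. argument 0 gets printed
 * via the bpf_cmd string table.
 */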

static struct syscall_fmt {
	const char *name;
	const char *alias;
	size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void *arg_parm[6];
	bool errmsg;
	bool timeout;
	bool hexret;
} syscall_fmts[] = {
	{ .name = "access", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
			     [1] = SCA_ACCMODE, /* mode */ }, },
	{ .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name = "brk", .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name = "chdir", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "chmod", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "chroot", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name = "close", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name = "connect", .errmsg = true, },
	{ .name = "creat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "dup", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "dup2", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "dup3", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name = "eventfd2", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name = "faccessat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "fadvise64", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fallocate", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchdir", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchmod", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchmodat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "fchown", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchownat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "fcntl", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name = "fdatasync", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "flock", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name = "fsetxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fstat", .errmsg = true, .alias = "newfstat",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fstatat", .errmsg = true, .alias = "newfstatat",
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "fstatfs", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fsync", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "ftruncate", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "futex", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "getdents", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "getdents64", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name = "getxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "inotify_add_watch", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "ioctl", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name = "kill", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "lchown", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "lgetxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "linkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name = "listxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "llistxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "lremovexattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "lseek", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
	{ .name = "lsetxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "lstat", .errmsg = true, .alias = "newlstat",
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "lsxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "madvise", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name = "mkdir", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "mkdirat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
			     [1] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "mknod", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "mknodat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "mlock", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "mlockall", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "mmap", .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */
			     [4] = SCA_FD, /* fd */ }, },
	{ .name = "mprotect", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name = "mremap", .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name = "munlock", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "munmap", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "open", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
			     [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name = "openat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [1] = SCA_FILENAME, /* filename */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
			     [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD, /* group_fd */
			     [4] = SCA_PERF_FLAGS, /* flags */ }, },
	{ .name = "pipe2", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name = "poll", .errmsg = true, .timeout = true, },
	{ .name = "ppoll", .errmsg = true, .timeout = true, },
	{ .name = "pread", .errmsg = true, .alias = "pread64",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "preadv", .errmsg = true, .alias = "pread",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name = "pwrite", .errmsg = true, .alias = "pwrite64",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "pwritev", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "read", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "readlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
	{ .name = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [1] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "readv", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "recvfrom", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "recvmmsg", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "recvmsg", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "removexattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "renameat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "rmdir", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "select", .errmsg = true, .timeout = true, },
	{ .name = "sendmmsg", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "sendmsg", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "sendto", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name = "setxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "shutdown", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "socket", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name = "stat", .errmsg = true, .alias = "newstat",
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "statfs", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "swapoff", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name = "swapon", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name = "symlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "tgkill", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "tkill", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "truncate", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
	{ .name = "uname", .errmsg = true, .alias = "newuname", },
	{ .name = "unlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [1] = SCA_FILENAME, /* pathname */ }, },
	{ .name = "utime", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "utimensat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
			     [1] = SCA_FILENAME, /* filename */ }, },
	{ .name = "utimes", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
	{ .name = "vmsplice", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "write", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "writev", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

struct syscall {
	struct event_format *tp_format;
	int nr_args;
	struct format_field *args;
	const char *name;
	bool is_exit;
	struct syscall_fmt *fmt;
	size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void **arg_parm;
};

static size_t fprintf_duration(unsigned long t, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	u64 entry_time;
	u64 exit_time;
	bool entry_pending;
	unsigned long nr_events;
	unsigned long pfmaj, pfmin;
	char *entry_str;
	double runtime_ms;
	struct {
		unsigned long ptr;
		short int entry_str_pos;
		bool pending_open;
		unsigned int namelen;
		char *name;
	} filename;
	struct {
		int max;
		char **table;
	} paths;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

	if (ttrace) {
		ttrace->paths.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}

#define TRACE_PFMAJ (1 << 0)
#define TRACE_PFMIN (1 << 1)

static const size_t trace__entry_str_size = 2048;

struct trace {
	struct perf_tool tool;
	struct {
		int machine;
		int open_id;
	} audit;
	struct {
		int max;
		struct syscall *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		} events;
	} syscalls;
	struct record_opts opts;
	struct perf_evlist *evlist;
	struct machine *host;
	struct thread *current;
	u64 base_time;
	FILE *output;
	unsigned long nr_events;
	struct strlist *ev_qualifier;
	struct {
		size_t nr;
		int *entries;
	} ev_qualifier_ids;
	struct intlist *tid_list;
	struct intlist *pid_list;
	struct {
		size_t nr;
		pid_t *entries;
	} filter_pids;
	double duration_filter;
	double runtime_ms;
	struct {
		u64 vfs_getname,
		    proc_getname;
	} stats;
	bool not_ev_qualifier;
	bool live;
	bool full_time;
	bool sched;
	bool multiple_threads;
	bool summary;
	bool summary_only;
	bool show_comm;
	bool show_tool_stats;
	bool trace_syscalls;
	bool force;
	bool vfs_getname;
	int trace_pgfaults;
};

static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
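
/*
 * The paths table above is a sparse array indexed by fd: it grows on demand
 * via realloc() and the gap between the old and new maximum is zeroed, so a
 * NULL entry reliably means "path not cached yet" for thread__fd_path()
 * below.
 */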

static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}

static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if (fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}

static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}
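
/*
 * I.e. fd arguments come out annotated with the resolved path when one is
 * available, e.g. "3</etc/passwd>", falling back to just the number
 * otherwise.
 */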

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		++idx;
	}

	return 0;
}
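
/*
 * I.e. per-argument formatters come from the syscall_fmts[] entry when one
 * exists; otherwise pointer-typed tracepoint fields default to hex and
 * everything else is printed as a plain integer by
 * syscall__scnprintf_args() below.
 */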

static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = audit_syscall_to_name(id, trace->audit.machine);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * The first field carries the syscall number, named '__syscall_nr'
	 * or, on older kernels, 'nr'. It is redundant here, so skip it when
	 * present; it may be absent altogether on even older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}

static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = audit_name_to_syscall(sc, trace->audit.machine);

		if (id < 0) {
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx = 0,
			.mask = 0,
			.trace = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated with it in a
			 * strarray.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else {
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1865
1866typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1867 union perf_event *event,
1868 struct perf_sample *sample);
1869
1870static struct syscall *trace__syscall_info(struct trace *trace,
1871 struct perf_evsel *evsel, int id)
1872{
1873
1874 if (id < 0) {
1875
1876 /*
1877 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1878 * before that, leaving at a higher verbosity level till that is
1879 * explained. Reproduced with plain ftrace with:
1880 *
1881 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1882 * grep "NR -1 " /t/trace_pipe
1883 *
1884 * After generating some load on the machine.
1885 */
1886 if (verbose > 1) {
1887 static u64 n;
1888 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1889 id, perf_evsel__name(evsel), ++n);
1890 }
1891 return NULL;
1892 }
1893
1894 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1895 trace__read_syscall_info(trace, id))
1896 goto out_cant_read;
1897
1898 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1899 goto out_cant_read;
1900
1901 return &trace->syscalls.table[id];
1902
1903out_cant_read:
1904 if (verbose) {
1905 fprintf(trace->output, "Problems reading syscall %d", id);
1906 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1907 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1908 fputs(" information\n", trace->output);
1909 }
1910 return NULL;
1911}
1912
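/*
 * Account a completed syscall in the per-thread summary (-s/-S): syscall
 * stats live in an intlist keyed by syscall id, each node carrying a
 * struct stats fed with the observed durations.
 */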
1913static void thread__update_stats(struct thread_trace *ttrace,
1914 int id, struct perf_sample *sample)
1915{
1916 struct int_node *inode;
1917 struct stats *stats;
1918 u64 duration = 0;
1919
1920 inode = intlist__findnew(ttrace->syscall_stats, id);
1921 if (inode == NULL)
1922 return;
1923
1924 stats = inode->priv;
1925 if (stats == NULL) {
1926 stats = malloc(sizeof(struct stats));
1927 if (stats == NULL)
1928 return;
1929 init_stats(stats);
1930 inode->priv = stats;
1931 }
1932
1933 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1934 duration = sample->time - ttrace->entry_time;
1935
1936 update_stats(stats, duration);
1937}
1938
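/*
 * Flush a still pending syscall entry line, terminating it with ") ...",
 * before output for some other event gets interleaved with it.
 */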
1939static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1940{
1941 struct thread_trace *ttrace;
1942 u64 duration;
1943 size_t printed;
1944
1945 if (trace->current == NULL)
1946 return 0;
1947
1948 ttrace = thread__priv(trace->current);
1949
1950 if (!ttrace->entry_pending)
1951 return 0;
1952
1953 duration = sample->time - ttrace->entry_time;
1954
1955 printed = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1956 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1957 ttrace->entry_pending = false;
1958
1959 return printed;
1960}
1961
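/*
 * raw_syscalls:sys_enter handler: format "name(args" into ttrace->entry_str
 * and, except for exit/exit_group (which never return), defer printing to
 * trace__sys_exit so the return value and duration land on the same line.
 */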
1962static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1963 union perf_event *event __maybe_unused,
1964 struct perf_sample *sample)
1965{
1966 char *msg;
1967 void *args;
1968 size_t printed = 0;
1969 struct thread *thread;
1970 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1971 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1972 struct thread_trace *ttrace;
1973
1974 if (sc == NULL)
1975 return -1;
1976
1977 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978 ttrace = thread__trace(thread, trace->output);
1979 if (ttrace == NULL)
1980 goto out_put;
1981
1982 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1983
1984 if (ttrace->entry_str == NULL) {
1985 ttrace->entry_str = malloc(trace__entry_str_size);
1986 if (!ttrace->entry_str)
1987 goto out_put;
1988 }
1989
1990 if (!trace->summary_only)
1991 trace__printf_interrupted_entry(trace, sample);
1992
1993 ttrace->entry_time = sample->time;
1994 msg = ttrace->entry_str;
1995 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1996
1997 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1998 args, trace, thread);
1999
2000 if (sc->is_exit) {
2001 if (!trace->duration_filter && !trace->summary_only) {
2002 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
2003 fprintf(trace->output, "%-70s\n", ttrace->entry_str);
2004 }
2005 } else {
2006 ttrace->entry_pending = true;
2007 /* See trace__vfs_getname & trace__sys_exit */
2008 ttrace->filename.pending_open = false;
2009 }
2010
2011 if (trace->current != thread) {
2012 thread__put(trace->current);
2013 trace->current = thread__get(thread);
2014 }
2015 err = 0;
2016out_put:
2017 thread__put(thread);
2018 return err;
2019}
2020
2021static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2022 union perf_event *event __maybe_unused,
2023 struct perf_sample *sample)
2024{
2025 long ret;
2026 u64 duration = 0;
2027 struct thread *thread;
2028 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2029 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2030 struct thread_trace *ttrace;
2031
2032 if (sc == NULL)
2033 return -1;
2034
2035 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2036 ttrace = thread__trace(thread, trace->output);
2037 if (ttrace == NULL)
2038 goto out_put;
2039
2040 if (trace->summary)
2041 thread__update_stats(ttrace, id, sample);
2042
2043 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2044
2045 if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
2046 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2047 ttrace->filename.pending_open = false;
2048 ++trace->stats.vfs_getname;
2049 }
2050
2051 ttrace->exit_time = sample->time;
2052
2053 if (ttrace->entry_time) {
2054 duration = sample->time - ttrace->entry_time;
2055 if (trace__filter_duration(trace, duration))
2056 goto out;
2057 } else if (trace->duration_filter)
2058 goto out;
2059
2060 if (trace->summary_only)
2061 goto out;
2062
2063 trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2064
2065 if (ttrace->entry_pending) {
2066 fprintf(trace->output, "%-70s", ttrace->entry_str);
2067 } else {
2068 fprintf(trace->output, " ... [");
2069 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2070 fprintf(trace->output, "]: %s()", sc->name);
2071 }
2072
2073 if (sc->fmt == NULL) {
2074signed_print:
2075 fprintf(trace->output, ") = %ld", ret);
2076 } else if (ret < 0 && sc->fmt->errmsg) {
2077 char bf[STRERR_BUFSIZE];
2078 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2079 *e = audit_errno_to_name(-ret);
2080
2081 fprintf(trace->output, ") = -1 %s %s", e, emsg);
2082 } else if (ret == 0 && sc->fmt->timeout)
2083 fprintf(trace->output, ") = 0 Timeout");
2084 else if (sc->fmt->hexret)
2085 fprintf(trace->output, ") = %#lx", ret);
2086 else
2087 goto signed_print;
2088
2089 fputc('\n', trace->output);
2090out:
2091 ttrace->entry_pending = false;
2092 err = 0;
2093out_put:
2094 thread__put(thread);
2095 return err;
2096}
2097
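/*
 * probe:vfs_getname fires while an open syscall is in flight: stash the
 * pathname for later fd->path resolution and, if the syscall entry line is
 * still being assembled, splice the string into entry_str at the position
 * recorded in filename.entry_str_pos, truncating from the left if needed.
 */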
2098static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2099 union perf_event *event __maybe_unused,
2100 struct perf_sample *sample)
2101{
2102 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2103 struct thread_trace *ttrace;
2104 size_t filename_len, entry_str_len, to_move;
2105 ssize_t remaining_space;
2106 char *pos;
2107 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2108
2109 if (!thread)
2110 goto out;
2111
2112 ttrace = thread__priv(thread);
2113 if (!ttrace)
2114 goto out;
2115
2116 filename_len = strlen(filename);
2117
2118 if (ttrace->filename.namelen < filename_len) {
2119 char *f = realloc(ttrace->filename.name, filename_len + 1);
2120
2121 if (f == NULL)
2122 goto out;
2123
2124 ttrace->filename.namelen = filename_len;
2125 ttrace->filename.name = f;
2126 }
2127
2128 strcpy(ttrace->filename.name, filename);
2129 ttrace->filename.pending_open = true;
2130
2131 if (!ttrace->filename.ptr)
2132 goto out;
2133
2134 entry_str_len = strlen(ttrace->entry_str);
2135 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2136 if (remaining_space <= 0)
2137 goto out;
2138
2139 if (filename_len > (size_t)remaining_space) {
2140 filename += filename_len - remaining_space;
2141 filename_len = remaining_space;
2142 }
2143
2144 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2145 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2146 memmove(pos + filename_len, pos, to_move);
2147 memcpy(pos, filename, filename_len);
2148
2149 ttrace->filename.ptr = 0;
2150 ttrace->filename.entry_str_pos = 0;
2151out:
2152 return 0;
2153}
2154
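/*
 * Used with --sched: accumulate per-thread and global runtime so the
 * summary can show how long each thread actually ran.
 */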
2155static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2156 union perf_event *event __maybe_unused,
2157 struct perf_sample *sample)
2158{
2159 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2160 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2161 struct thread *thread = machine__findnew_thread(trace->host,
2162 sample->pid,
2163 sample->tid);
2164 struct thread_trace *ttrace = thread__trace(thread, trace->output);
2165
2166 if (ttrace == NULL)
2167 goto out_dump;
2168
2169 ttrace->runtime_ms += runtime_ms;
2170 trace->runtime_ms += runtime_ms;
2171 thread__put(thread);
2172 return 0;
2173
2174out_dump:
2175	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
2176 evsel->name,
2177 perf_evsel__strval(evsel, sample, "comm"),
2178 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2179 runtime,
2180 perf_evsel__intval(evsel, sample, "vruntime"));
2181 thread__put(thread);
2182 return 0;
2183}
2184
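/*
 * print_binary() callback used for bpf-output events: emit only the
 * printable characters of the raw payload, everything else as '.'.
 */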
2185static void bpf_output__printer(enum binary_printer_ops op,
2186 unsigned int val, void *extra)
2187{
2188 FILE *output = extra;
2189 unsigned char ch = (unsigned char)val;
2190
2191 switch (op) {
2192 case BINARY_PRINT_CHAR_DATA:
2193 fprintf(output, "%c", isprint(ch) ? ch : '.');
2194 break;
2195 case BINARY_PRINT_DATA_BEGIN:
2196 case BINARY_PRINT_LINE_BEGIN:
2197 case BINARY_PRINT_ADDR:
2198 case BINARY_PRINT_NUM_DATA:
2199 case BINARY_PRINT_NUM_PAD:
2200 case BINARY_PRINT_SEP:
2201 case BINARY_PRINT_CHAR_PAD:
2202 case BINARY_PRINT_LINE_END:
2203 case BINARY_PRINT_DATA_END:
2204 default:
2205 break;
2206 }
2207}
2208
2209static void bpf_output__fprintf(struct trace *trace,
2210 struct perf_sample *sample)
2211{
2212 print_binary(sample->raw_data, sample->raw_size, 8,
2213 bpf_output__printer, trace->output);
2214}
2215
2216static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2217 union perf_event *event __maybe_unused,
2218 struct perf_sample *sample)
2219{
2220 trace__printf_interrupted_entry(trace, sample);
2221 trace__fprintf_tstamp(trace, sample->time, trace->output);
2222
2223 if (trace->trace_syscalls)
2224 fprintf(trace->output, "( ): ");
2225
2226 fprintf(trace->output, "%s:", evsel->name);
2227
2228 if (perf_evsel__is_bpf_output(evsel)) {
2229 bpf_output__fprintf(trace, sample);
2230 } else if (evsel->tp_format) {
2231 event_format__fprintf(evsel->tp_format, sample->cpu,
2232 sample->raw_data, sample->raw_size,
2233 trace->output);
2234 }
2235
2236 fprintf(trace->output, ")\n");
2237 return 0;
2238}
2239
2240static void print_location(FILE *f, struct perf_sample *sample,
2241 struct addr_location *al,
2242 bool print_dso, bool print_sym)
2243{
2244
2245 if ((verbose || print_dso) && al->map)
2246 fprintf(f, "%s@", al->map->dso->long_name);
2247
2248 if ((verbose || print_sym) && al->sym)
2249 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2250 al->addr - al->sym->start);
2251 else if (al->map)
2252 fprintf(f, "0x%" PRIx64, al->addr);
2253 else
2254 fprintf(f, "0x%" PRIx64, sample->addr);
2255}
2256
2257static int trace__pgfault(struct trace *trace,
2258 struct perf_evsel *evsel,
2259 union perf_event *event __maybe_unused,
2260 struct perf_sample *sample)
2261{
2262 struct thread *thread;
2263 struct addr_location al;
2264 char map_type = 'd';
2265 struct thread_trace *ttrace;
2266 int err = -1;
2267
2268 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2269 ttrace = thread__trace(thread, trace->output);
2270 if (ttrace == NULL)
2271 goto out_put;
2272
2273 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2274 ttrace->pfmaj++;
2275 else
2276 ttrace->pfmin++;
2277
2278 if (trace->summary_only)
2279 goto out;
2280
2281 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2282 sample->ip, &al);
2283
2284 trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2285
2286 fprintf(trace->output, "%sfault [",
2287 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2288 "maj" : "min");
2289
2290 print_location(trace->output, sample, &al, false, true);
2291
2292 fprintf(trace->output, "] => ");
2293
2294 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2295 sample->addr, &al);
2296
2297 if (!al.map) {
2298 thread__find_addr_location(thread, sample->cpumode,
2299 MAP__FUNCTION, sample->addr, &al);
2300
2301 if (al.map)
2302 map_type = 'x';
2303 else
2304 map_type = '?';
2305 }
2306
2307 print_location(trace->output, sample, &al, true, false);
2308
2309 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2310out:
2311 err = 0;
2312out_put:
2313 thread__put(thread);
2314 return err;
2315}
2316
2317static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2318{
2319 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2320 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2321 return false;
2322
2323 if (trace->pid_list || trace->tid_list)
2324 return true;
2325
2326 return false;
2327}
2328
2329static int trace__process_sample(struct perf_tool *tool,
2330 union perf_event *event,
2331 struct perf_sample *sample,
2332 struct perf_evsel *evsel,
2333 struct machine *machine __maybe_unused)
2334{
2335 struct trace *trace = container_of(tool, struct trace, tool);
2336 int err = 0;
2337
2338 tracepoint_handler handler = evsel->handler;
2339
2340 if (skip_sample(trace, sample))
2341 return 0;
2342
2343 if (!trace->full_time && trace->base_time == 0)
2344 trace->base_time = sample->time;
2345
2346 if (handler) {
2347 ++trace->nr_events;
2348 handler(trace, evsel, event, sample);
2349 }
2350
2351 return err;
2352}
2353
2354static int parse_target_str(struct trace *trace)
2355{
2356 if (trace->opts.target.pid) {
2357 trace->pid_list = intlist__new(trace->opts.target.pid);
2358 if (trace->pid_list == NULL) {
2359 pr_err("Error parsing process id string\n");
2360 return -EINVAL;
2361 }
2362 }
2363
2364 if (trace->opts.target.tid) {
2365 trace->tid_list = intlist__new(trace->opts.target.tid);
2366 if (trace->tid_list == NULL) {
2367 pr_err("Error parsing thread id string\n");
2368 return -EINVAL;
2369 }
2370 }
2371
2372 return 0;
2373}
2374
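/*
 * 'perf trace record' just builds an argv and hands it to cmd_record(),
 * ending up with something like:
 *
 *   perf record -R -m 1024 -c 1 \
 *	-e raw_syscalls:sys_enter,raw_syscalls:sys_exit <workload>
 *
 * plus the major/minor-fault software events when -F/--pf was given.
 */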
2375static int trace__record(struct trace *trace, int argc, const char **argv)
2376{
2377 unsigned int rec_argc, i, j;
2378 const char **rec_argv;
2379 const char * const record_args[] = {
2380 "record",
2381 "-R",
2382 "-m", "1024",
2383 "-c", "1",
2384 };
2385
2386 const char * const sc_args[] = { "-e", };
2387 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2388 const char * const majpf_args[] = { "-e", "major-faults" };
2389 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2390 const char * const minpf_args[] = { "-e", "minor-faults" };
2391 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2392
2393 /* +1 is for the event string below */
2394 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2395 majpf_args_nr + minpf_args_nr + argc;
2396 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2397
2398 if (rec_argv == NULL)
2399 return -ENOMEM;
2400
2401 j = 0;
2402 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2403 rec_argv[j++] = record_args[i];
2404
2405 if (trace->trace_syscalls) {
2406 for (i = 0; i < sc_args_nr; i++)
2407 rec_argv[j++] = sc_args[i];
2408
2409 /* event string may be different for older kernels - e.g., RHEL6 */
2410 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2411 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2412 else if (is_valid_tracepoint("syscalls:sys_enter"))
2413 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2414		else {
2415			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			free(rec_argv); /* don't leak the argv we just built */
2416			return -1;
2417		}
2418 }
2419
2420 if (trace->trace_pgfaults & TRACE_PFMAJ)
2421 for (i = 0; i < majpf_args_nr; i++)
2422 rec_argv[j++] = majpf_args[i];
2423
2424 if (trace->trace_pgfaults & TRACE_PFMIN)
2425 for (i = 0; i < minpf_args_nr; i++)
2426 rec_argv[j++] = minpf_args[i];
2427
2428 for (i = 0; i < (unsigned int)argc; i++)
2429 rec_argv[j++] = argv[i];
2430
2431 return cmd_record(j, rec_argv, NULL);
2432}
2433
2434static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2435
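/*
 * Hook the probe:vfs_getname tracepoint, which must have been set up
 * beforehand with 'perf probe' and expose a "pathname" string field;
 * returns false when the probe or that field is missing.
 */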
2436static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2437{
2438 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2439
2440 if (IS_ERR(evsel))
2441 return false;
2442
2443 if (perf_evsel__field(evsel, "pathname") == NULL) {
2444 perf_evsel__delete(evsel);
2445 return false;
2446 }
2447
2448 evsel->handler = trace__vfs_getname;
2449 perf_evlist__add(evlist, evsel);
2450 return true;
2451}
2452
2453static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2454 u64 config)
2455{
2456 struct perf_evsel *evsel;
2457 struct perf_event_attr attr = {
2458 .type = PERF_TYPE_SOFTWARE,
2459 .mmap_data = 1,
2460 };
2461
2462 attr.config = config;
2463 attr.sample_period = 1;
2464
2465 event_attr_init(&attr);
2466
2467 evsel = perf_evsel__new(&attr);
2468 if (!evsel)
2469 return -ENOMEM;
2470
2471 evsel->handler = trace__pgfault;
2472 perf_evlist__add(evlist, evsel);
2473
2474 return 0;
2475}
2476
2477static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2478{
2479 const u32 type = event->header.type;
2480 struct perf_evsel *evsel;
2481
2482 if (!trace->full_time && trace->base_time == 0)
2483 trace->base_time = sample->time;
2484
2485 if (type != PERF_RECORD_SAMPLE) {
2486 trace__process_event(trace, trace->host, event, sample);
2487 return;
2488 }
2489
2490 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2491 if (evsel == NULL) {
2492 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2493 return;
2494 }
2495
2496 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2497 sample->raw_data == NULL) {
2498 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2499 perf_evsel__name(evsel), sample->tid,
2500 sample->cpu, sample->raw_size);
2501 } else {
2502 tracepoint_handler handler = evsel->handler;
2503 handler(trace, evsel, event, sample);
2504 }
2505}
2506
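/*
 * Create the raw_syscalls:sys_{enter,exit} events, caching the offsets of
 * the "args" pointer and "ret" value fields so the handlers can fetch them
 * straight from the raw sample payload.
 */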
2507static int trace__add_syscall_newtp(struct trace *trace)
2508{
2509 int ret = -1;
2510 struct perf_evlist *evlist = trace->evlist;
2511 struct perf_evsel *sys_enter, *sys_exit;
2512
2513 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2514 if (sys_enter == NULL)
2515 goto out;
2516
2517 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2518 goto out_delete_sys_enter;
2519
2520 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2521 if (sys_exit == NULL)
2522 goto out_delete_sys_enter;
2523
2524 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2525 goto out_delete_sys_exit;
2526
2527 perf_evlist__add(evlist, sys_enter);
2528 perf_evlist__add(evlist, sys_exit);
2529
2530 trace->syscalls.events.sys_enter = sys_enter;
2531 trace->syscalls.events.sys_exit = sys_exit;
2532
2533 ret = 0;
2534out:
2535 return ret;
2536
2537out_delete_sys_exit:
2538 perf_evsel__delete_priv(sys_exit);
2539out_delete_sys_enter:
2540 perf_evsel__delete_priv(sys_enter);
2541 goto out;
2542}
2543
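/*
 * Build a tracepoint filter from the validated syscall ids and append it to
 * both sys_enter and sys_exit, e.g. something like "id == 0 || id == 2", or
 * "id != 0 && id != 2" when the qualifier was negated with '!'.
 */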
2544static int trace__set_ev_qualifier_filter(struct trace *trace)
2545{
2546 int err = -1;
2547 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2548 trace->ev_qualifier_ids.nr,
2549 trace->ev_qualifier_ids.entries);
2550
2551 if (filter == NULL)
2552 goto out_enomem;
2553
2554 if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2555 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2556
2557 free(filter);
2558out:
2559 return err;
2560out_enomem:
2561 errno = ENOMEM;
2562 goto out;
2563}
2564
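/*
 * Live mode: set up the syscall/pagefault/sched events, optionally start a
 * workload, mmap the ring buffers and consume samples until the workload
 * exits or we are interrupted, draining what is still queued before
 * printing the optional summary.
 */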
2565static int trace__run(struct trace *trace, int argc, const char **argv)
2566{
2567 struct perf_evlist *evlist = trace->evlist;
2568 struct perf_evsel *evsel;
2569 int err = -1, i;
2570 unsigned long before;
2571 const bool forks = argc > 0;
2572 bool draining = false;
2573
2574 trace->live = true;
2575
2576 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2577 goto out_error_raw_syscalls;
2578
2579 if (trace->trace_syscalls)
2580 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2581
2582 if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2583 perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2584 goto out_error_mem;
2585 }
2586
2587 if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2588 perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2589 goto out_error_mem;
2590
2591 if (trace->sched &&
2592 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2593 trace__sched_stat_runtime))
2594 goto out_error_sched_stat_runtime;
2595
2596 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2597 if (err < 0) {
2598 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2599 goto out_delete_evlist;
2600 }
2601
2602 err = trace__symbols_init(trace, evlist);
2603 if (err < 0) {
2604 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2605 goto out_delete_evlist;
2606 }
2607
2608 perf_evlist__config(evlist, &trace->opts);
2609
2610 signal(SIGCHLD, sig_handler);
2611 signal(SIGINT, sig_handler);
2612
2613 if (forks) {
2614 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2615 argv, false, NULL);
2616 if (err < 0) {
2617 fprintf(trace->output, "Couldn't run the workload!\n");
2618 goto out_delete_evlist;
2619 }
2620 }
2621
2622 err = perf_evlist__open(evlist);
2623 if (err < 0)
2624 goto out_error_open;
2625
2626 err = bpf__apply_obj_config();
2627 if (err) {
2628 char errbuf[BUFSIZ];
2629
2630 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2631 pr_err("ERROR: Apply config to BPF failed: %s\n",
2632 errbuf);
2633 goto out_error_open;
2634 }
2635
2636 /*
2637 * Better not use !target__has_task() here because we need to cover the
2638 * case where no threads were specified in the command line, but a
2639 * workload was, and in that case we will fill in the thread_map when
2640 * we fork the workload in perf_evlist__prepare_workload.
2641 */
2642 if (trace->filter_pids.nr > 0)
2643 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2644 else if (thread_map__pid(evlist->threads, 0) == -1)
2645 err = perf_evlist__set_filter_pid(evlist, getpid());
2646
2647 if (err < 0)
2648 goto out_error_mem;
2649
2650 if (trace->ev_qualifier_ids.nr > 0) {
2651 err = trace__set_ev_qualifier_filter(trace);
2652 if (err < 0)
2653 goto out_errno;
2654
2655 pr_debug("event qualifier tracepoint filter: %s\n",
2656 trace->syscalls.events.sys_exit->filter);
2657 }
2658
2659 err = perf_evlist__apply_filters(evlist, &evsel);
2660 if (err < 0)
2661 goto out_error_apply_filters;
2662
2663 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2664 if (err < 0)
2665 goto out_error_mmap;
2666
2667 if (!target__none(&trace->opts.target))
2668 perf_evlist__enable(evlist);
2669
2670 if (forks)
2671 perf_evlist__start_workload(evlist);
2672
2673 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2674 evlist->threads->nr > 1 ||
2675 perf_evlist__first(evlist)->attr.inherit;
2676again:
2677 before = trace->nr_events;
2678
2679 for (i = 0; i < evlist->nr_mmaps; i++) {
2680 union perf_event *event;
2681
2682 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2683 struct perf_sample sample;
2684
2685 ++trace->nr_events;
2686
2687 err = perf_evlist__parse_sample(evlist, event, &sample);
2688 if (err) {
2689 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2690 goto next_event;
2691 }
2692
2693 trace__handle_event(trace, event, &sample);
2694next_event:
2695 perf_evlist__mmap_consume(evlist, i);
2696
2697 if (interrupted)
2698 goto out_disable;
2699
2700 if (done && !draining) {
2701 perf_evlist__disable(evlist);
2702 draining = true;
2703 }
2704 }
2705 }
2706
2707 if (trace->nr_events == before) {
2708 int timeout = done ? 100 : -1;
2709
2710 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2711 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2712 draining = true;
2713
2714 goto again;
2715 }
2716 } else {
2717 goto again;
2718 }
2719
2720out_disable:
2721 thread__zput(trace->current);
2722
2723 perf_evlist__disable(evlist);
2724
2725 if (!err) {
2726 if (trace->summary)
2727 trace__fprintf_thread_summary(trace, trace->output);
2728
2729 if (trace->show_tool_stats) {
2730 fprintf(trace->output, "Stats:\n "
2731 " vfs_getname : %" PRIu64 "\n"
2732 " proc_getname: %" PRIu64 "\n",
2733 trace->stats.vfs_getname,
2734 trace->stats.proc_getname);
2735 }
2736 }
2737
2738out_delete_evlist:
2739 perf_evlist__delete(evlist);
2740 trace->evlist = NULL;
2741 trace->live = false;
2742 return err;
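/*
 * Error handling is out of line: the labels below are only reached via the
 * gotos above, and the bare braces exist just to scope errbuf for the
 * strerror helpers used in most of them.
 */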
2743{
2744 char errbuf[BUFSIZ];
2745
2746out_error_sched_stat_runtime:
2747 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2748 goto out_error;
2749
2750out_error_raw_syscalls:
2751 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2752 goto out_error;
2753
2754out_error_mmap:
2755 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2756 goto out_error;
2757
2758out_error_open:
2759 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2760
2761out_error:
2762 fprintf(trace->output, "%s\n", errbuf);
2763 goto out_delete_evlist;
2764
2765out_error_apply_filters:
2766 fprintf(trace->output,
2767 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2768 evsel->filter, perf_evsel__name(evsel), errno,
2769 strerror_r(errno, errbuf, sizeof(errbuf)));
2770 goto out_delete_evlist;
2771}
2772out_error_mem:
2773 fprintf(trace->output, "Not enough memory to run!\n");
2774 goto out_delete_evlist;
2775
2776out_errno:
2777 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2778 goto out_delete_evlist;
2779}
2780
2781static int trace__replay(struct trace *trace)
2782{
2783 const struct perf_evsel_str_handler handlers[] = {
2784 { "probe:vfs_getname", trace__vfs_getname, },
2785 };
2786 struct perf_data_file file = {
2787 .path = input_name,
2788 .mode = PERF_DATA_MODE_READ,
2789 .force = trace->force,
2790 };
2791 struct perf_session *session;
2792 struct perf_evsel *evsel;
2793 int err = -1;
2794
2795 trace->tool.sample = trace__process_sample;
2796 trace->tool.mmap = perf_event__process_mmap;
2797 trace->tool.mmap2 = perf_event__process_mmap2;
2798 trace->tool.comm = perf_event__process_comm;
2799 trace->tool.exit = perf_event__process_exit;
2800 trace->tool.fork = perf_event__process_fork;
2801 trace->tool.attr = perf_event__process_attr;
2802 trace->tool.tracing_data = perf_event__process_tracing_data;
2803 trace->tool.build_id = perf_event__process_build_id;
2804
2805 trace->tool.ordered_events = true;
2806 trace->tool.ordering_requires_timestamps = true;
2807
2808 /* add tid to output */
2809 trace->multiple_threads = true;
2810
2811 session = perf_session__new(&file, false, &trace->tool);
2812 if (session == NULL)
2813 return -1;
2814
2815 if (symbol__init(&session->header.env) < 0)
2816 goto out;
2817
2818 trace->host = &session->machines.host;
2819
2820 err = perf_session__set_tracepoints_handlers(session, handlers);
2821 if (err)
2822 goto out;
2823
2824 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2825 "raw_syscalls:sys_enter");
2826 /* older kernels have syscalls tp versus raw_syscalls */
2827 if (evsel == NULL)
2828 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2829 "syscalls:sys_enter");
2830
2831 if (evsel &&
2832 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2833 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2834		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
2835 goto out;
2836 }
2837
2838 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2839 "raw_syscalls:sys_exit");
2840 if (evsel == NULL)
2841 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2842 "syscalls:sys_exit");
2843 if (evsel &&
2844 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2845 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2846		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
2847 goto out;
2848 }
2849
2850 evlist__for_each(session->evlist, evsel) {
2851 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2852 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2853 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2854 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2855 evsel->handler = trace__pgfault;
2856 }
2857
2858 err = parse_target_str(trace);
2859 if (err != 0)
2860 goto out;
2861
2862 setup_pager();
2863
2864 err = perf_session__process_events(session);
2865 if (err)
2866		pr_err("Failed to process events, error %d\n", err);
2868	else if (trace->summary)
2869 trace__fprintf_thread_summary(trace, trace->output);
2870
2871out:
2872 perf_session__delete(session);
2873
2874 return err;
2875}
2876
2877static size_t trace__fprintf_threads_header(FILE *fp)
2878{
2879 size_t printed;
2880
2881 printed = fprintf(fp, "\n Summary of events:\n\n");
2882
2883 return printed;
2884}
2885
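/*
 * Per-thread syscall stats table shown in the summary; with illustrative
 * values it comes out roughly as:
 *
 *   syscall            calls    total       min       avg       max  stddev
 *                               (msec)    (msec)    (msec)    (msec)     (%)
 *   --------------- -------- --------- --------- --------- --------- ------
 *   read                  42     0.846     0.011     0.020     0.057   9.65
 */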
2886static size_t thread__dump_stats(struct thread_trace *ttrace,
2887 struct trace *trace, FILE *fp)
2888{
2889 struct stats *stats;
2890 size_t printed = 0;
2891 struct syscall *sc;
2892 struct int_node *inode = intlist__first(ttrace->syscall_stats);
2893
2894 if (inode == NULL)
2895 return 0;
2896
2897 printed += fprintf(fp, "\n");
2898
2899 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2900 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2901 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2902
2903 /* each int_node is a syscall */
2904 while (inode) {
2905 stats = inode->priv;
2906 if (stats) {
2907 double min = (double)(stats->min) / NSEC_PER_MSEC;
2908 double max = (double)(stats->max) / NSEC_PER_MSEC;
2909 double avg = avg_stats(stats);
2910 double pct;
2911 u64 n = (u64) stats->n;
2912
2913 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2914 avg /= NSEC_PER_MSEC;
2915
2916 sc = &trace->syscalls.table[inode->i];
2917 printed += fprintf(fp, " %-15s", sc->name);
2918 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2919 n, avg * n, min, avg);
2920 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2921 }
2922
2923 inode = intlist__next(inode);
2924 }
2925
2926 printed += fprintf(fp, "\n\n");
2927
2928 return printed;
2929}
2930
2931/* struct used to pass data to per-thread function */
2932struct summary_data {
2933 FILE *fp;
2934 struct trace *trace;
2935 size_t printed;
2936};
2937
2938static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2939{
2940 struct summary_data *data = priv;
2941 FILE *fp = data->fp;
2942 size_t printed = data->printed;
2943 struct trace *trace = data->trace;
2944 struct thread_trace *ttrace = thread__priv(thread);
2945 double ratio;
2946
2947 if (ttrace == NULL)
2948 return 0;
2949
2950 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2951
2952 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2953 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2954 printed += fprintf(fp, "%.1f%%", ratio);
2955 if (ttrace->pfmaj)
2956 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2957 if (ttrace->pfmin)
2958 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2959 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2960 printed += thread__dump_stats(ttrace, trace, fp);
2961
2962 data->printed += printed;
2963
2964 return 0;
2965}
2966
2967static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2968{
2969 struct summary_data data = {
2970 .fp = fp,
2971 .trace = trace
2972 };
2973 data.printed = trace__fprintf_threads_header(fp);
2974
2975 machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2976
2977 return data.printed;
2978}
2979
2980static int trace__set_duration(const struct option *opt, const char *str,
2981 int unset __maybe_unused)
2982{
2983 struct trace *trace = opt->value;
2984
2985 trace->duration_filter = atof(str);
2986 return 0;
2987}
2988
2989static int trace__set_filter_pids(const struct option *opt, const char *str,
2990 int unset __maybe_unused)
2991{
2992 int ret = -1;
2993 size_t i;
2994 struct trace *trace = opt->value;
2995 /*
2996	 * FIXME: introduce an intarray class, plainly parse the csv and create a
2997 * { int nr, int entries[] } struct...
2998 */
2999 struct intlist *list = intlist__new(str);
3000
3001 if (list == NULL)
3002 return -1;
3003
3004 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3005 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3006
3007 if (trace->filter_pids.entries == NULL)
3008 goto out;
3009
3010 trace->filter_pids.entries[0] = getpid();
3011
3012 for (i = 1; i < trace->filter_pids.nr; ++i)
3013 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3014
3015 intlist__delete(list);
3016 ret = 0;
3017out:
3018 return ret;
3019}
3020
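/*
 * -o/--output: if the chosen file already exists and is non-empty, rename
 * it to "<name>.old" before opening a fresh one for writing.
 */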
3021static int trace__open_output(struct trace *trace, const char *filename)
3022{
3023 struct stat st;
3024
3025 if (!stat(filename, &st) && st.st_size) {
3026 char oldname[PATH_MAX];
3027
3028 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3029 unlink(oldname);
3030 rename(filename, oldname);
3031 }
3032
3033 trace->output = fopen(filename, "w");
3034
3035 return trace->output == NULL ? -errno : 0;
3036}
3037
3038static int parse_pagefaults(const struct option *opt, const char *str,
3039 int unset __maybe_unused)
3040{
3041 int *trace_pgfaults = opt->value;
3042
3043 if (strcmp(str, "all") == 0)
3044 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3045 else if (strcmp(str, "maj") == 0)
3046 *trace_pgfaults |= TRACE_PFMAJ;
3047 else if (strcmp(str, "min") == 0)
3048 *trace_pgfaults |= TRACE_PFMIN;
3049 else
3050 return -1;
3051
3052 return 0;
3053}
3054
3055static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3056{
3057 struct perf_evsel *evsel;
3058
3059 evlist__for_each(evlist, evsel)
3060 evsel->handler = handler;
3061}
3062
3063int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3064{
3065 const char *trace_usage[] = {
3066 "perf trace [<options>] [<command>]",
3067 "perf trace [<options>] -- <command> [<options>]",
3068 "perf trace record [<options>] [<command>]",
3069 "perf trace record [<options>] -- <command> [<options>]",
3070 NULL
3071 };
3072 struct trace trace = {
3073 .audit = {
3074 .machine = audit_detect_machine(),
3075 .open_id = audit_name_to_syscall("open", trace.audit.machine),
3076 },
3077 .syscalls = {
3078			.max = -1,
3079 },
3080 .opts = {
3081 .target = {
3082 .uid = UINT_MAX,
3083 .uses_mmap = true,
3084 },
3085 .user_freq = UINT_MAX,
3086 .user_interval = ULLONG_MAX,
3087 .no_buffering = true,
3088 .mmap_pages = UINT_MAX,
3089 .proc_map_timeout = 500,
3090 },
3091 .output = stderr,
3092 .show_comm = true,
3093 .trace_syscalls = true,
3094 };
3095 const char *output_name = NULL;
3096 const char *ev_qualifier_str = NULL;
3097 const struct option trace_options[] = {
3098 OPT_CALLBACK(0, "event", &trace.evlist, "event",
3099 "event selector. use 'perf list' to list available events",
3100 parse_events_option),
3101 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3102 "show the thread COMM next to its id"),
3103 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3104 OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3105 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3106 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3107 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3108 "trace events on existing process id"),
3109 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3110 "trace events on existing thread id"),
3111 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3112 "pids to filter (by the kernel)", trace__set_filter_pids),
3113 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3114 "system-wide collection from all CPUs"),
3115 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3116 "list of cpus to monitor"),
3117 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3118 "child tasks do not inherit counters"),
3119 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3120 "number of mmap data pages",
3121 perf_evlist__parse_mmap_pages),
3122 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3123 "user to profile"),
3124 OPT_CALLBACK(0, "duration", &trace, "float",
3125 "show only events with duration > N.M ms",
3126 trace__set_duration),
3127 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3128 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3129 OPT_BOOLEAN('T', "time", &trace.full_time,
3130 "Show full timestamp, not time relative to first start"),
3131 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3132 "Show only syscall summary with statistics"),
3133 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3134 "Show all syscalls and summary with statistics"),
3135 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3136 "Trace pagefaults", parse_pagefaults, "maj"),
3137 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3138 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3139 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3140 "per thread proc mmap processing timeout in ms"),
3141 OPT_END()
3142 };
3143 const char * const trace_subcommands[] = { "record", NULL };
3144 int err;
3145 char bf[BUFSIZ];
3146
3147 signal(SIGSEGV, sighandler_dump_stack);
3148 signal(SIGFPE, sighandler_dump_stack);
3149
3150 trace.evlist = perf_evlist__new();
3151
3152 if (trace.evlist == NULL) {
3153 pr_err("Not enough memory to run!\n");
3154 err = -ENOMEM;
3155 goto out;
3156 }
3157
3158 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3159 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3160
3161 if (trace.trace_pgfaults) {
3162 trace.opts.sample_address = true;
3163 trace.opts.sample_time = true;
3164 }
3165
3166 if (trace.evlist->nr_entries > 0)
3167 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3168
3169 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3170 return trace__record(&trace, argc-1, &argv[1]);
3171
3172 /* summary_only implies summary option, but don't overwrite summary if set */
3173 if (trace.summary_only)
3174 trace.summary = trace.summary_only;
3175
3176 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3177 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3178 pr_err("Please specify something to trace.\n");
3179 return -1;
3180 }
3181
3182 if (output_name != NULL) {
3183 err = trace__open_output(&trace, output_name);
3184 if (err < 0) {
3185 perror("failed to create output file");
3186 goto out;
3187 }
3188 }
3189
3190 if (ev_qualifier_str != NULL) {
3191 const char *s = ev_qualifier_str;
3192 struct strlist_config slist_config = {
3193 .dirname = system_path(STRACE_GROUPS_DIR),
3194 };
3195
3196 trace.not_ev_qualifier = *s == '!';
3197 if (trace.not_ev_qualifier)
3198 ++s;
3199 trace.ev_qualifier = strlist__new(s, &slist_config);
3200 if (trace.ev_qualifier == NULL) {
3201			fputs("Not enough memory to parse event qualifier\n",
3202 trace.output);
3203 err = -ENOMEM;
3204 goto out_close;
3205 }
3206
3207 err = trace__validate_ev_qualifier(&trace);
3208 if (err)
3209 goto out_close;
3210 }
3211
3212 err = target__validate(&trace.opts.target);
3213 if (err) {
3214 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3215 fprintf(trace.output, "%s", bf);
3216 goto out_close;
3217 }
3218
3219 err = target__parse_uid(&trace.opts.target);
3220 if (err) {
3221 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3222 fprintf(trace.output, "%s", bf);
3223 goto out_close;
3224 }
3225
3226 if (!argc && target__none(&trace.opts.target))
3227 trace.opts.target.system_wide = true;
3228
3229 if (input_name)
3230 err = trace__replay(&trace);
3231 else
3232 err = trace__run(&trace, argc, argv);
3233
3234out_close:
3235 if (output_name != NULL)
3236 fclose(trace.output);
3237out:
3238 return err;
3239}
1/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 */
16
17#include "util/record.h"
18#include <api/fs/tracing_path.h>
19#ifdef HAVE_LIBBPF_SUPPORT
20#include <bpf/bpf.h>
21#include <bpf/libbpf.h>
22#include <bpf/btf.h>
23#ifdef HAVE_BPF_SKEL
24#include "bpf_skel/augmented_raw_syscalls.skel.h"
25#endif
26#endif
27#include "util/bpf_map.h"
28#include "util/rlimit.h"
29#include "builtin.h"
30#include "util/cgroup.h"
31#include "util/color.h"
32#include "util/config.h"
33#include "util/debug.h"
34#include "util/dso.h"
35#include "util/env.h"
36#include "util/event.h"
37#include "util/evsel.h"
38#include "util/evsel_fprintf.h"
39#include "util/synthetic-events.h"
40#include "util/evlist.h"
41#include "util/evswitch.h"
42#include "util/mmap.h"
43#include <subcmd/pager.h>
44#include <subcmd/exec-cmd.h>
45#include "util/machine.h"
46#include "util/map.h"
47#include "util/symbol.h"
48#include "util/path.h"
49#include "util/session.h"
50#include "util/thread.h"
51#include <subcmd/parse-options.h>
52#include "util/strlist.h"
53#include "util/intlist.h"
54#include "util/thread_map.h"
55#include "util/stat.h"
56#include "util/tool.h"
57#include "util/util.h"
58#include "trace/beauty/beauty.h"
59#include "trace-event.h"
60#include "util/parse-events.h"
61#include "util/tracepoint.h"
62#include "callchain.h"
63#include "print_binary.h"
64#include "string2.h"
65#include "syscalltbl.h"
66#include "rb_resort.h"
67#include "../perf.h"
68#include "trace_augment.h"
69
70#include <errno.h>
71#include <inttypes.h>
72#include <poll.h>
73#include <signal.h>
74#include <stdlib.h>
75#include <string.h>
76#include <linux/err.h>
77#include <linux/filter.h>
78#include <linux/kernel.h>
79#include <linux/list_sort.h>
80#include <linux/random.h>
81#include <linux/stringify.h>
82#include <linux/time64.h>
83#include <linux/zalloc.h>
84#include <fcntl.h>
85#include <sys/sysmacros.h>
86
87#include <linux/ctype.h>
88#include <perf/mmap.h>
89
90#ifdef HAVE_LIBTRACEEVENT
91#include <event-parse.h>
92#endif
93
94#ifndef O_CLOEXEC
95# define O_CLOEXEC 02000000
96#endif
97
98#ifndef F_LINUX_SPECIFIC_BASE
99# define F_LINUX_SPECIFIC_BASE 1024
100#endif
101
102#define RAW_SYSCALL_ARGS_NUM 6
103
104/*
105 * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100
106 *
107 * We have to explicitely mark the direction of the flow of data, if from the
108 * kernel to user space or the other way around, since the BPF collector we
109 * have so far copies only from user to kernel space, mark the arguments that
110 * go that direction, so that we don´t end up collecting the previous contents
111 * for syscall args that goes from kernel to user space.
112 */
113struct syscall_arg_fmt {
114 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
115 bool (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val);
116 unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
117 void *parm;
118 const char *name;
119 u16 nr_entries; // for arrays
120 bool from_user;
121 bool show_zero;
122#ifdef HAVE_LIBBPF_SUPPORT
123 const struct btf_type *type;
124 int type_id; /* used in btf_dump */
125#endif
126};
127
128struct syscall_fmt {
129 const char *name;
130 const char *alias;
131 struct {
132 const char *sys_enter,
133 *sys_exit;
134 } bpf_prog_name;
135 struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM];
136 u8 nr_args;
137 bool errpid;
138 bool timeout;
139 bool hexret;
140};
141
142struct trace {
143 struct perf_tool tool;
144 struct syscalltbl *sctbl;
145 struct {
146 struct syscall *table;
147 struct {
148 struct evsel *sys_enter,
149 *sys_exit,
150 *bpf_output;
151 } events;
152 } syscalls;
153#ifdef HAVE_BPF_SKEL
154 struct augmented_raw_syscalls_bpf *skel;
155#endif
156#ifdef HAVE_LIBBPF_SUPPORT
157 struct btf *btf;
158#endif
159 struct record_opts opts;
160 struct evlist *evlist;
161 struct machine *host;
162 struct thread *current;
163 struct cgroup *cgroup;
164 u64 base_time;
165 FILE *output;
166 unsigned long nr_events;
167 unsigned long nr_events_printed;
168 unsigned long max_events;
169 struct evswitch evswitch;
170 struct strlist *ev_qualifier;
171 struct {
172 size_t nr;
173 int *entries;
174 } ev_qualifier_ids;
175 struct {
176 size_t nr;
177 pid_t *entries;
178 struct bpf_map *map;
179 } filter_pids;
180 double duration_filter;
181 double runtime_ms;
182 struct {
183 u64 vfs_getname,
184 proc_getname;
185 } stats;
186 unsigned int max_stack;
187 unsigned int min_stack;
188 int raw_augmented_syscalls_args_size;
189 bool raw_augmented_syscalls;
190 bool fd_path_disabled;
191 bool sort_events;
192 bool not_ev_qualifier;
193 bool live;
194 bool full_time;
195 bool sched;
196 bool multiple_threads;
197 bool summary;
198 bool summary_only;
199 bool errno_summary;
200 bool failure_only;
201 bool show_comm;
202 bool print_sample;
203 bool show_tool_stats;
204 bool trace_syscalls;
205 bool libtraceevent_print;
206 bool kernel_syscallchains;
207 s16 args_alignment;
208 bool show_tstamp;
209 bool show_duration;
210 bool show_zeros;
211 bool show_arg_names;
212 bool show_string_prefix;
213 bool force;
214 bool vfs_getname;
215 bool force_btf;
216 int trace_pgfaults;
217 char *perfconfig_events;
218 struct {
219 struct ordered_events data;
220 u64 last;
221 } oe;
222};
223
224static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused)
225{
226#ifdef HAVE_LIBBPF_SUPPORT
227 if (trace->btf != NULL)
228 return;
229
230 trace->btf = btf__load_vmlinux_btf();
231 if (verbose > 0) {
232 fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" :
233 "Failed to load vmlinux BTF\n");
234 }
235#endif
236}
237
238struct tp_field {
239 int offset;
240 union {
241 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
242 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
243 };
244};
245
246#define TP_UINT_FIELD(bits) \
247static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
248{ \
249 u##bits value; \
250 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
251 return value; \
252}
253
254TP_UINT_FIELD(8);
255TP_UINT_FIELD(16);
256TP_UINT_FIELD(32);
257TP_UINT_FIELD(64);
258
259#define TP_UINT_FIELD__SWAPPED(bits) \
260static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
261{ \
262 u##bits value; \
263 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
264 return bswap_##bits(value);\
265}
266
267TP_UINT_FIELD__SWAPPED(16);
268TP_UINT_FIELD__SWAPPED(32);
269TP_UINT_FIELD__SWAPPED(64);
270
271static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
272{
273 field->offset = offset;
274
275 switch (size) {
276 case 1:
277 field->integer = tp_field__u8;
278 break;
279 case 2:
280 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
281 break;
282 case 4:
283 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
284 break;
285 case 8:
286 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
287 break;
288 default:
289 return -1;
290 }
291
292 return 0;
293}
294
295static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
296{
297 return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
298}
299
300static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
301{
302 return sample->raw_data + field->offset;
303}
304
305static int __tp_field__init_ptr(struct tp_field *field, int offset)
306{
307 field->offset = offset;
308 field->pointer = tp_field__ptr;
309 return 0;
310}
311
312static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
313{
314 return __tp_field__init_ptr(field, format_field->offset);
315}
316
317struct syscall_tp {
318 struct tp_field id;
319 union {
320 struct tp_field args, ret;
321 };
322};
323
324/*
325 * The evsel->priv as used by 'perf trace'
326 * sc: for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
327 * fmt: for all the other tracepoints
328 */
329struct evsel_trace {
330 struct syscall_tp sc;
331 struct syscall_arg_fmt *fmt;
332};
333
334static struct evsel_trace *evsel_trace__new(void)
335{
336 return zalloc(sizeof(struct evsel_trace));
337}
338
339static void evsel_trace__delete(struct evsel_trace *et)
340{
341 if (et == NULL)
342 return;
343
344 zfree(&et->fmt);
345 free(et);
346}
347
348/*
349 * Used with raw_syscalls:sys_{enter,exit} and with the
350 * syscalls:sys_{enter,exit}_SYSCALL tracepoints
351 */
352static inline struct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
353{
354 struct evsel_trace *et = evsel->priv;
355
356 return &et->sc;
357}
358
359static struct syscall_tp *evsel__syscall_tp(struct evsel *evsel)
360{
361 if (evsel->priv == NULL) {
362 evsel->priv = evsel_trace__new();
363 if (evsel->priv == NULL)
364 return NULL;
365 }
366
367 return __evsel__syscall_tp(evsel);
368}
369
370/*
371 * Used with all the other tracepoints.
372 */
373static inline struct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
374{
375 struct evsel_trace *et = evsel->priv;
376
377 return et->fmt;
378}
379
380static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel)
381{
382 struct evsel_trace *et = evsel->priv;
383
384 if (evsel->priv == NULL) {
385 et = evsel->priv = evsel_trace__new();
386
387 if (et == NULL)
388 return NULL;
389 }
390
391 if (et->fmt == NULL) {
392 et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt));
393 if (et->fmt == NULL)
394 goto out_delete;
395 }
396
397 return __evsel__syscall_arg_fmt(evsel);
398
399out_delete:
400 evsel_trace__delete(evsel->priv);
401 evsel->priv = NULL;
402 return NULL;
403}
404
405static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name)
406{
407 struct tep_format_field *format_field = evsel__field(evsel, name);
408
409 if (format_field == NULL)
410 return -1;
411
412 return tp_field__init_uint(field, format_field, evsel->needs_swap);
413}
414
415#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
416 ({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
417 evsel__init_tp_uint_field(evsel, &sc->name, #name); })
418
419static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name)
420{
421 struct tep_format_field *format_field = evsel__field(evsel, name);
422
423 if (format_field == NULL)
424 return -1;
425
426 return tp_field__init_ptr(field, format_field);
427}
428
429#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
430 ({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
431 evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
432
433static void evsel__delete_priv(struct evsel *evsel)
434{
435 zfree(&evsel->priv);
436 evsel__delete(evsel);
437}
438
439static int evsel__init_syscall_tp(struct evsel *evsel)
440{
441 struct syscall_tp *sc = evsel__syscall_tp(evsel);
442
443 if (sc != NULL) {
444 if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
445 evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
446 return -ENOENT;
447
448 return 0;
449 }
450
451 return -ENOMEM;
452}
453
454static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
455{
456 struct syscall_tp *sc = evsel__syscall_tp(evsel);
457
458 if (sc != NULL) {
459 struct tep_format_field *syscall_id = evsel__field(tp, "id");
460 if (syscall_id == NULL)
461 syscall_id = evsel__field(tp, "__syscall_nr");
462 if (syscall_id == NULL ||
463 __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
464 return -EINVAL;
465
466 return 0;
467 }
468
469 return -ENOMEM;
470}
471
472static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
473{
474 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
475
476 return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
477}
478
479static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
480{
481 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
482
483 return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
484}
485
486static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
487{
488 if (evsel__syscall_tp(evsel) != NULL) {
489 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
490 return -ENOENT;
491
492 evsel->handler = handler;
493 return 0;
494 }
495
496 return -ENOMEM;
497}
498
499static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
500{
501 struct evsel *evsel = evsel__newtp("raw_syscalls", direction);
502
503 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
504 if (IS_ERR(evsel))
505 evsel = evsel__newtp("syscalls", direction);
506
507 if (IS_ERR(evsel))
508 return NULL;
509
510 if (evsel__init_raw_syscall_tp(evsel, handler))
511 goto out_delete;
512
513 return evsel;
514
515out_delete:
516 evsel__delete_priv(evsel);
517 return NULL;
518}
519
520#define perf_evsel__sc_tp_uint(evsel, name, sample) \
521 ({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
522 fields->name.integer(&fields->name, sample); })
523
524#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
525 ({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
526 fields->name.pointer(&fields->name, sample); })
527
528size_t strarray__scnprintf_suffix(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_suffix, int val)
529{
530 int idx = val - sa->offset;
531
532 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
533 size_t printed = scnprintf(bf, size, intfmt, val);
534 if (show_suffix)
535 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
536 return printed;
537 }
538
539 return scnprintf(bf, size, "%s%s", sa->entries[idx], show_suffix ? sa->prefix : "");
540}
541
542size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
543{
544 int idx = val - sa->offset;
545
546 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
547 size_t printed = scnprintf(bf, size, intfmt, val);
548 if (show_prefix)
549 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
550 return printed;
551 }
552
553 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
554}
555
556static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
557 const char *intfmt,
558 struct syscall_arg *arg)
559{
560 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
561}
562
563static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
564 struct syscall_arg *arg)
565{
566 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
567}
568
569#define SCA_STRARRAY syscall_arg__scnprintf_strarray
570
571bool syscall_arg__strtoul_strarray(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
572{
573 return strarray__strtoul(arg->parm, bf, size, ret);
574}
575
576bool syscall_arg__strtoul_strarray_flags(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
577{
578 return strarray__strtoul_flags(arg->parm, bf, size, ret);
579}
580
581bool syscall_arg__strtoul_strarrays(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
582{
583 return strarrays__strtoul(arg->parm, bf, size, ret);
584}
585
586size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
587{
588 return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
589}
590
591size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
592{
593 size_t printed;
594 int i;
595
596 for (i = 0; i < sas->nr_entries; ++i) {
597 struct strarray *sa = sas->entries[i];
598 int idx = val - sa->offset;
599
600 if (idx >= 0 && idx < sa->nr_entries) {
601 if (sa->entries[idx] == NULL)
602 break;
603 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
604 }
605 }
606
607 printed = scnprintf(bf, size, intfmt, val);
608 if (show_prefix)
609 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
610 return printed;
611}
612
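/*
 * The reverse mapping: parse a symbolic name of exactly 'size' bytes back
 * into its integer value, so that the names can be used where numbers are
 * expected, e.g. in filter expressions.
 */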
613bool strarray__strtoul(struct strarray *sa, char *bf, size_t size, u64 *ret)
614{
615 int i;
616
617 for (i = 0; i < sa->nr_entries; ++i) {
618 if (sa->entries[i] && strncmp(sa->entries[i], bf, size) == 0 && sa->entries[i][size] == '\0') {
619 *ret = sa->offset + i;
620 return true;
621 }
622 }
623
624 return false;
625}
626
627bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *ret)
628{
629 u64 val = 0;
630 char *tok = bf, *sep, *end;
631
632 *ret = 0;
633
634 while (size != 0) {
635 int toklen = size;
636
637 sep = memchr(tok, '|', size);
638 if (sep != NULL) {
639 size -= sep - tok + 1;
640
641 end = sep - 1;
642 while (end > tok && isspace(*end))
643 --end;
644
645 toklen = end - tok + 1;
646 }
647
648 while (isspace(*tok))
649 ++tok;
650
651 if (isalpha(*tok) || *tok == '_') {
652 if (!strarray__strtoul(sa, tok, toklen, &val))
653 return false;
654 } else
655 val = strtoul(tok, NULL, 0);
656
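 /*
  * Entries in a flags strarray are indexed by bit number plus one
  * (index 0 would be flag zero, i.e. no bits set), so convert the
  * parsed value back into its flag bit before ORing it in.
  */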
657 *ret |= (1 << (val - 1));
658
659 if (sep == NULL)
660 break;
661 tok = sep + 1;
662 }
663
664 return true;
665}
666
667bool strarrays__strtoul(struct strarrays *sas, char *bf, size_t size, u64 *ret)
668{
669 int i;
670
671 for (i = 0; i < sas->nr_entries; ++i) {
672 struct strarray *sa = sas->entries[i];
673
674 if (strarray__strtoul(sa, bf, size, ret))
675 return true;
676 }
677
678 return false;
679}
680
681size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
682 struct syscall_arg *arg)
683{
684 return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
685}
686
687#ifndef AT_FDCWD
688#define AT_FDCWD -100
689#endif
690
691static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
692 struct syscall_arg *arg)
693{
694 int fd = arg->val;
695 const char *prefix = "AT_FD";
696
697 if (fd == AT_FDCWD)
698 return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
699
700 return syscall_arg__scnprintf_fd(bf, size, arg);
701}
702
703#define SCA_FDAT syscall_arg__scnprintf_fd_at
704
705static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
706 struct syscall_arg *arg);
707
708#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
709
710size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
711{
712 return scnprintf(bf, size, "%#lx", arg->val);
713}
714
715size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
716{
717 if (arg->val == 0)
718 return scnprintf(bf, size, "NULL");
719 return syscall_arg__scnprintf_hex(bf, size, arg);
720}
721
722size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
723{
724 return scnprintf(bf, size, "%d", arg->val);
725}
726
727size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
728{
729 return scnprintf(bf, size, "%ld", arg->val);
730}
731
732static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
733{
734 // XXX Hey, maybe for sched:sched_switch prev/next comm fields we can
735 // fill missing comms using thread__set_comm()...
736 // here or in a special syscall_arg__scnprintf_pid_sched_tp...
737 return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
738}
739
740#define SCA_CHAR_ARRAY syscall_arg__scnprintf_char_array
741
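/*
 * String tables for the strarray beautifiers below: DEFINE_STRARRAY() pairs
 * the entries with the common "PREFIX_" shown or stripped when beautifying,
 * while DEFINE_STRARRAY_OFFSET() also records the value the first entry
 * corresponds to.
 */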
742static const char *bpf_cmd[] = {
743 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
744 "MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH",
745 "PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID",
746 "PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD",
747 "PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID",
748 "TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE",
749 "BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH",
750 "MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE",
751 "LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE",
752 "LINK_DETACH", "PROG_BIND_MAP",
753};
754static DEFINE_STRARRAY(bpf_cmd, "BPF_");
755
756static const char *fsmount_flags[] = {
757 [1] = "CLOEXEC",
758};
759static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");
760
761#include "trace/beauty/generated/fsconfig_arrays.c"
762
763static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");
764
765static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
766static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
767
768static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
769static DEFINE_STRARRAY(itimers, "ITIMER_");
770
771static const char *keyctl_options[] = {
772 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
773 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
774 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
775 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
776 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
777};
778static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
779
780static const char *whences[] = { "SET", "CUR", "END",
781#ifdef SEEK_DATA
782"DATA",
783#endif
784#ifdef SEEK_HOLE
785"HOLE",
786#endif
787};
788static DEFINE_STRARRAY(whences, "SEEK_");
789
790static const char *fcntl_cmds[] = {
791 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
792 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
793 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
794 "GETOWNER_UIDS",
795};
796static DEFINE_STRARRAY(fcntl_cmds, "F_");
797
798static const char *fcntl_linux_specific_cmds[] = {
799 "SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
800 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
801 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
802};
803
804static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
805
806static struct strarray *fcntl_cmds_arrays[] = {
807 &strarray__fcntl_cmds,
808 &strarray__fcntl_linux_specific_cmds,
809};
810
811static DEFINE_STRARRAYS(fcntl_cmds_arrays);
812
813static const char *rlimit_resources[] = {
814 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
815 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
816 "RTTIME",
817};
818static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
819
820static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
821static DEFINE_STRARRAY(sighow, "SIG_");
822
823static const char *clockid[] = {
824 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
825 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
826 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
827};
828static DEFINE_STRARRAY(clockid, "CLOCK_");
829
830static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
831 struct syscall_arg *arg)
832{
833 bool show_prefix = arg->show_string_prefix;
834 const char *suffix = "_OK";
835 size_t printed = 0;
836 int mode = arg->val;
837
838 if (mode == F_OK) /* 0 */
839 return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
840#define P_MODE(n) \
841 if (mode & n##_OK) { \
842 printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
843 mode &= ~n##_OK; \
844 }
845
846 P_MODE(R);
847 P_MODE(W);
848 P_MODE(X);
849#undef P_MODE
850
851 if (mode)
852 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
853
854 return printed;
855}
856
857#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
858
859static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
860 struct syscall_arg *arg);
861
862#define SCA_FILENAME syscall_arg__scnprintf_filename
863
864// 'argname' is purely documentation at this point; it replaces the comment that used to carry that info
865#define SCA_FILENAME_FROM_USER(argname) \
866 { .scnprintf = SCA_FILENAME, \
867 .from_user = true, }
868
869static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg);
870
871#define SCA_BUF syscall_arg__scnprintf_buf
872
873static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
874 struct syscall_arg *arg)
875{
876 bool show_prefix = arg->show_string_prefix;
877 const char *prefix = "O_";
878 int printed = 0, flags = arg->val;
879
880#define P_FLAG(n) \
881 if (flags & O_##n) { \
882 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
883 flags &= ~O_##n; \
884 }
885
886 P_FLAG(CLOEXEC);
887 P_FLAG(NONBLOCK);
888#undef P_FLAG
889
890 if (flags)
891 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
892
893 return printed;
894}
895
896#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
897
898#ifndef GRND_NONBLOCK
899#define GRND_NONBLOCK 0x0001
900#endif
901#ifndef GRND_RANDOM
902#define GRND_RANDOM 0x0002
903#endif
904
905static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
906 struct syscall_arg *arg)
907{
908 bool show_prefix = arg->show_string_prefix;
909 const char *prefix = "GRND_";
910 int printed = 0, flags = arg->val;
911
912#define P_FLAG(n) \
913 if (flags & GRND_##n) { \
914 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
915 flags &= ~GRND_##n; \
916 }
917
918 P_FLAG(RANDOM);
919 P_FLAG(NONBLOCK);
920#undef P_FLAG
921
922 if (flags)
923 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
924
925 return printed;
926}
927
928#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
929
930#ifdef HAVE_LIBBPF_SUPPORT
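/*
 * Resolve the "enum foo" in a tracepoint field's type string to its type in
 * vmlinux BTF, caching it in the arg format so enum values can be printed,
 * and parsed in filters, by name.
 */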
931static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
932{
933 int id;
934
935 type = strstr(type, "enum ");
936 if (type == NULL)
937 return;
938
939 type += 5; // skip "enum " to get the enumeration name
940
941 id = btf__find_by_name(btf, type);
942 if (id < 0)
943 return;
944
945 arg_fmt->type = btf__type_by_id(btf, id);
946}
947
948static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
949{
950 const struct btf_type *bt = arg->fmt->type;
951 struct btf *btf = arg->trace->btf;
952 struct btf_enum *be = btf_enum(bt);
953
954 for (int i = 0; i < btf_vlen(bt); ++i, ++be) {
955 const char *name = btf__name_by_offset(btf, be->name_off);
956 int max_len = max(size, strlen(name));
957
958 if (strncmp(name, bf, max_len) == 0) {
959 *val = be->val;
960 return true;
961 }
962 }
963
964 return false;
965}
966
967static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
968{
969 const struct btf_type *bt;
970 char *type = arg->type_name;
971 struct btf *btf;
972
973 trace__load_vmlinux_btf(arg->trace);
974
975 btf = arg->trace->btf;
976 if (btf == NULL)
977 return false;
978
979 if (arg->fmt->type == NULL) {
980 // See if this is an enum
981 syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type);
982 }
983
984 // Now let's see if we have a BTF type resolved
985 bt = arg->fmt->type;
986 if (bt == NULL)
987 return false;
988
989 // If it is an enum:
990 if (btf_is_enum(arg->fmt->type))
991 return syscall_arg__strtoul_btf_enum(bf, size, arg, val);
992
993 return false;
994}
995
996static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val)
997{
998 struct btf_enum *be = btf_enum(type);
999 const int nr_entries = btf_vlen(type);
1000
1001 for (int i = 0; i < nr_entries; ++i, ++be) {
1002 if (be->val == val) {
1003 return scnprintf(bf, size, "%s",
1004 btf__name_by_offset(btf, be->name_off));
1005 }
1006 }
1007
1008 return 0;
1009}
1010
1011struct trace_btf_dump_snprintf_ctx {
1012 char *bf;
1013 size_t printed, size;
1014};
1015
1016static void trace__btf_dump_snprintf(void *vctx, const char *fmt, va_list args)
1017{
1018 struct trace_btf_dump_snprintf_ctx *ctx = vctx;
1019
1020 ctx->printed += vscnprintf(ctx->bf + ctx->printed, ctx->size - ctx->printed, fmt, args);
1021}
1022
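/*
 * Pretty print a struct/union argument copied by the BPF augmenters using
 * libbpf's BTF dumper, accumulating the output in 'bf' via the snprintf
 * callback above, then consume this augmented arg so that the next one is
 * read from the right offset.
 */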
1023static size_t btf_struct_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, struct syscall_arg *arg)
1024{
1025 struct trace_btf_dump_snprintf_ctx ctx = {
1026 .bf = bf,
1027 .size = size,
1028 };
1029 struct augmented_arg *augmented_arg;
1030 int type_id, consumed;
1031 struct btf_dump *btf_dump;
1032
1033 LIBBPF_OPTS(btf_dump_opts, dump_opts);
1034 LIBBPF_OPTS(btf_dump_type_data_opts, dump_data_opts);
1035
1036 if (arg == NULL || arg->augmented.args == NULL)
1037 return 0;

 /* Only dereference 'arg' after the NULL check above. */
 augmented_arg = arg->augmented.args;
 type_id = arg->fmt->type_id;
1038
1039 dump_data_opts.compact = true;
1040 dump_data_opts.skip_names = !arg->trace->show_arg_names;
1041
1042 btf_dump = btf_dump__new(btf, trace__btf_dump_snprintf, &ctx, &dump_opts);
1043 if (btf_dump == NULL)
1044 return 0;
1045
1046 /* pretty print the struct data here */
1047 if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0) {
 btf_dump__free(btf_dump); /* don't leak the dumper on the early return */
1048 return 0;
 }
1049
1050 consumed = sizeof(*augmented_arg) + augmented_arg->size;
1051 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1052 arg->augmented.size -= consumed;
1053
1054 btf_dump__free(btf_dump);
1055
1056 return ctx.printed;
1057}
1058
1059static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf,
1060 size_t size, int val, char *type)
1061{
1062 struct syscall_arg_fmt *arg_fmt = arg->fmt;
1063
1064 if (trace->btf == NULL)
1065 return 0;
1066
1067 if (arg_fmt->type == NULL) {
1068 // Check if this is an enum and if we have the BTF type for it.
1069 syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type);
1070 }
1071
1072 // Did we manage to find a BTF type for the syscall/tracepoint argument?
1073 if (arg_fmt->type == NULL)
1074 return 0;
1075
1076 if (btf_is_enum(arg_fmt->type))
1077 return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val);
1078 else if (btf_is_struct(arg_fmt->type) || btf_is_union(arg_fmt->type))
1079 return btf_struct_scnprintf(arg_fmt->type, trace->btf, bf, size, arg);
1080
1081 return 0;
1082}
1083
1084#else // HAVE_LIBBPF_SUPPORT
1085static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused,
1086 char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused,
1087 char *type __maybe_unused)
1088{
1089 return 0;
1090}
1091
1092static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused,
1093 struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused)
1094{
1095 return false;
1096}
1097#endif // HAVE_LIBBPF_SUPPORT
1098
1099#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type
1100
1101#define STRARRAY(name, array) \
1102 { .scnprintf = SCA_STRARRAY, \
1103 .strtoul = STUL_STRARRAY, \
1104 .parm = &strarray__##array, }
1105
1106#define STRARRAY_FLAGS(name, array) \
1107 { .scnprintf = SCA_STRARRAY_FLAGS, \
1108 .strtoul = STUL_STRARRAY_FLAGS, \
1109 .parm = &strarray__##array, }
1110
1111#include "trace/beauty/arch_errno_names.c"
1112#include "trace/beauty/eventfd.c"
1113#include "trace/beauty/futex_op.c"
1114#include "trace/beauty/futex_val3.c"
1115#include "trace/beauty/mmap.c"
1116#include "trace/beauty/mode_t.c"
1117#include "trace/beauty/msg_flags.c"
1118#include "trace/beauty/open_flags.c"
1119#include "trace/beauty/perf_event_open.c"
1120#include "trace/beauty/pid.c"
1121#include "trace/beauty/sched_policy.c"
1122#include "trace/beauty/seccomp.c"
1123#include "trace/beauty/signum.c"
1124#include "trace/beauty/socket_type.c"
1125#include "trace/beauty/waitid_options.c"
1126
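/*
 * Table of per-syscall beautifiers. It MUST be kept sorted by ->name, as
 * syscall_fmt__find() looks up entries with bsearch().
 */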
1127static const struct syscall_fmt syscall_fmts[] = {
1128 { .name = "access",
1129 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
1130 { .name = "arch_prctl",
1131 .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
1132 [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
1133 { .name = "bind",
1134 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
1135 [1] = SCA_SOCKADDR_FROM_USER(umyaddr),
1136 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
1137 { .name = "bpf",
1138 .arg = { [0] = STRARRAY(cmd, bpf_cmd),
1139 [1] = { .from_user = true /* attr */, }, } },
1140 { .name = "brk", .hexret = true,
1141 .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
1142 { .name = "clock_gettime",
1143 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
1144 { .name = "clock_nanosleep",
1145 .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, },
1146 { .name = "clone", .errpid = true, .nr_args = 5,
1147 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
1148 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
1149 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
1150 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
1151 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
1152 { .name = "close",
1153 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
1154 { .name = "connect",
1155 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
1156 [1] = SCA_SOCKADDR_FROM_USER(servaddr),
1157 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
1158 { .name = "epoll_ctl",
1159 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
1160 { .name = "eventfd2",
1161 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
1162 { .name = "faccessat",
1163 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ },
1164 [1] = SCA_FILENAME_FROM_USER(pathname),
1165 [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
1166 { .name = "faccessat2",
1167 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ },
1168 [1] = SCA_FILENAME_FROM_USER(pathname),
1169 [2] = { .scnprintf = SCA_ACCMODE, /* mode */ },
1170 [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, },
1171 { .name = "fchmodat",
1172 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1173 { .name = "fchownat",
1174 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1175 { .name = "fcntl",
1176 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
1177 .strtoul = STUL_STRARRAYS,
1178 .parm = &strarrays__fcntl_cmds_arrays,
1179 .show_zero = true, },
1180 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
1181 { .name = "flock",
1182 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
1183 { .name = "fsconfig",
1184 .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
1185 { .name = "fsmount",
1186 .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
1187 [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
1188 { .name = "fspick",
1189 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1190 [1] = SCA_FILENAME_FROM_USER(path),
1191 [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
1192 { .name = "fstat", .alias = "newfstat", },
1193 { .name = "futex",
1194 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
1195 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
1196 { .name = "futimesat",
1197 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1198 { .name = "getitimer",
1199 .arg = { [0] = STRARRAY(which, itimers), }, },
1200 { .name = "getpid", .errpid = true, },
1201 { .name = "getpgid", .errpid = true, },
1202 { .name = "getppid", .errpid = true, },
1203 { .name = "getrandom",
1204 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
1205 { .name = "getrlimit",
1206 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
1207 { .name = "getsockopt",
1208 .arg = { [1] = STRARRAY(level, socket_level), }, },
1209 { .name = "gettid", .errpid = true, },
1210 { .name = "ioctl",
1211 .arg = {
1212#if defined(__i386__) || defined(__x86_64__)
1213/*
1214 * FIXME: Make this available to all arches.
1215 */
1216 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
1217 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
1218#else
1219 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
1220#endif
1221 { .name = "kcmp", .nr_args = 5,
1222 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
1223 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
1224 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
1225 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
1226 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
1227 { .name = "keyctl",
1228 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
1229 { .name = "kill",
1230 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1231 { .name = "linkat",
1232 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1233 { .name = "lseek",
1234 .arg = { [2] = STRARRAY(whence, whences), }, },
1235 { .name = "lstat", .alias = "newlstat", },
1236 { .name = "madvise",
1237 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
1238 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
1239 { .name = "mkdirat",
1240 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1241 { .name = "mknodat",
1242 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1243 { .name = "mmap", .hexret = true,
1244/* The standard mmap maps to old_mmap on s390x */
1245#if defined(__s390x__)
1246 .alias = "old_mmap",
1247#endif
1248 .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
1249 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */
1250 .strtoul = STUL_STRARRAY_FLAGS,
1251 .parm = &strarray__mmap_flags, },
1252 [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, },
1253 { .name = "mount",
1254 .arg = { [0] = SCA_FILENAME_FROM_USER(devname),
1255 [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
1256 .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
1257 { .name = "move_mount",
1258 .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ },
1259 [1] = SCA_FILENAME_FROM_USER(pathname),
1260 [2] = { .scnprintf = SCA_FDAT, /* to_dfd */ },
1261 [3] = SCA_FILENAME_FROM_USER(pathname),
1262 [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
1263 { .name = "mprotect",
1264 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
1265 [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, },
1266 { .name = "mq_unlink",
1267 .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, },
1268 { .name = "mremap", .hexret = true,
1269 .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
1270 { .name = "name_to_handle_at",
1271 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1272 { .name = "nanosleep",
1273 .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, },
1274 { .name = "newfstatat", .alias = "fstatat",
1275 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ },
1276 [1] = SCA_FILENAME_FROM_USER(pathname),
1277 [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
1278 { .name = "open",
1279 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1280 { .name = "open_by_handle_at",
1281 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1282 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1283 { .name = "openat",
1284 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1285 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1286 { .name = "perf_event_open",
1287 .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr),
1288 [2] = { .scnprintf = SCA_INT, /* cpu */ },
1289 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
1290 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
1291 { .name = "pipe2",
1292 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
1293 { .name = "pkey_alloc",
1294 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
1295 { .name = "pkey_free",
1296 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
1297 { .name = "pkey_mprotect",
1298 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
1299 [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
1300 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
1301 { .name = "poll", .timeout = true, },
1302 { .name = "ppoll", .timeout = true, },
1303 { .name = "prctl",
1304 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */
1305 .strtoul = STUL_STRARRAY,
1306 .parm = &strarray__prctl_options, },
1307 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
1308 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
1309 { .name = "pread", .alias = "pread64", },
1310 { .name = "preadv", .alias = "pread", },
1311 { .name = "prlimit64",
1312 .arg = { [1] = STRARRAY(resource, rlimit_resources),
1313 [2] = { .from_user = true /* new_rlim */, }, }, },
1314 { .name = "pwrite", .alias = "pwrite64", },
1315 { .name = "readlinkat",
1316 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1317 { .name = "recvfrom",
1318 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1319 { .name = "recvmmsg",
1320 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1321 { .name = "recvmsg",
1322 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1323 { .name = "renameat",
1324 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1325 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
1326 { .name = "renameat2",
1327 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1328 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
1329 [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
1330 { .name = "rseq", .errpid = true,
1331 .arg = { [0] = { .from_user = true /* rseq */, }, }, },
1332 { .name = "rt_sigaction",
1333 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1334 { .name = "rt_sigprocmask",
1335 .arg = { [0] = STRARRAY(how, sighow), }, },
1336 { .name = "rt_sigqueueinfo",
1337 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1338 { .name = "rt_tgsigqueueinfo",
1339 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1340 { .name = "sched_setscheduler",
1341 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
1342 { .name = "seccomp",
1343 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
1344 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
1345 { .name = "select", .timeout = true, },
1346 { .name = "sendfile", .alias = "sendfile64", },
1347 { .name = "sendmmsg",
1348 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1349 { .name = "sendmsg",
1350 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1351 { .name = "sendto",
1352 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
1353 [4] = SCA_SOCKADDR_FROM_USER(addr), }, },
1354 { .name = "set_robust_list", .errpid = true,
1355 .arg = { [0] = { .from_user = true /* head */, }, }, },
1356 { .name = "set_tid_address", .errpid = true, },
1357 { .name = "setitimer",
1358 .arg = { [0] = STRARRAY(which, itimers), }, },
1359 { .name = "setrlimit",
1360 .arg = { [0] = STRARRAY(resource, rlimit_resources),
1361 [1] = { .from_user = true /* rlim */, }, }, },
1362 { .name = "setsockopt",
1363 .arg = { [1] = STRARRAY(level, socket_level), }, },
1364 { .name = "socket",
1365 .arg = { [0] = STRARRAY(family, socket_families),
1366 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1367 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1368 { .name = "socketpair",
1369 .arg = { [0] = STRARRAY(family, socket_families),
1370 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1371 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1372 { .name = "stat", .alias = "newstat", },
1373 { .name = "statx",
1374 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
1375 [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ },
1376 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
1377 { .name = "swapoff",
1378 .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
1379 { .name = "swapon",
1380 .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
1381 { .name = "symlinkat",
1382 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1383 { .name = "sync_file_range",
1384 .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
1385 { .name = "tgkill",
1386 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1387 { .name = "tkill",
1388 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1389 { .name = "umount2", .alias = "umount",
1390 .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, },
1391 { .name = "uname", .alias = "newuname", },
1392 { .name = "unlinkat",
1393 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1394 [1] = SCA_FILENAME_FROM_USER(pathname),
1395 [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
1396 { .name = "utimensat",
1397 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
1398 { .name = "wait4", .errpid = true,
1399 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1400 { .name = "waitid", .errpid = true,
1401 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1402 { .name = "write",
1403 .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
1404};
1405
1406static int syscall_fmt__cmp(const void *name, const void *fmtp)
1407{
1408 const struct syscall_fmt *fmt = fmtp;
1409 return strcmp(name, fmt->name);
1410}
1411
1412static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts,
1413 const int nmemb,
1414 const char *name)
1415{
1416 return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1417}
1418
1419static const struct syscall_fmt *syscall_fmt__find(const char *name)
1420{
1421 const int nmemb = ARRAY_SIZE(syscall_fmts);
1422 return __syscall_fmt__find(syscall_fmts, nmemb, name);
1423}
1424
1425static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts,
1426 const int nmemb, const char *alias)
1427{
1428 int i;
1429
1430 for (i = 0; i < nmemb; ++i) {
1431 if (fmts[i].alias && strcmp(fmts[i].alias, alias) == 0)
1432 return &fmts[i];
1433 }
1434
1435 return NULL;
1436}
1437
1438static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
1439{
1440 const int nmemb = ARRAY_SIZE(syscall_fmts);
1441 return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
1442}
1443
1444/*
1445 * is_exit: is this "exit" or "exit_group"?
1446 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
1447 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
1448 * nonexistent: Just a hole in the syscall table, syscall id not allocated
1449 */
1450struct syscall {
1451 struct tep_event *tp_format;
1452 int nr_args;
1453 int args_size;
1454 struct {
1455 struct bpf_program *sys_enter,
1456 *sys_exit;
1457 } bpf_prog;
1458 bool is_exit;
1459 bool is_open;
1460 bool nonexistent;
1461 bool use_btf;
1462 struct tep_format_field *args;
1463 const char *name;
1464 const struct syscall_fmt *fmt;
1465 struct syscall_arg_fmt *arg_fmt;
1466};
1467
1468/*
1469 * We need this 'calculated' boolean because in some cases we really don't
1470 * know the duration of a syscall, for instance, when we start a session and
1471 * some threads are already waiting for a syscall to finish, say 'poll', in
1472 * which case all we can do is print "( ? )" for the duration and for the
1473 * start timestamp.
1474 */
1475static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
1476{
1477 double duration = (double)t / NSEC_PER_MSEC;
1478 size_t printed = fprintf(fp, "(");
1479
1480 if (!calculated)
1481 printed += fprintf(fp, " ");
1482 else if (duration >= 1.0)
1483 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1484 else if (duration >= 0.01)
1485 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1486 else
1487 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1488 return printed + fprintf(fp, "): ");
1489}
1490
1491/**
1492 * filename.ptr: The filename char pointer that will be vfs_getname'd
1493 * filename.entry_str_pos: Where to insert the string translated from
1494 * filename.ptr by the vfs_getname tracepoint/kprobe.
1495 * ret_scnprintf: syscall args may set this to a different syscall return
1496 * formatter, for instance, fcntl may return fds, file flags, etc.
1497 */
1498struct thread_trace {
1499 u64 entry_time;
1500 bool entry_pending;
1501 unsigned long nr_events;
1502 unsigned long pfmaj, pfmin;
1503 char *entry_str;
1504 double runtime_ms;
1505 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1506 struct {
1507 unsigned long ptr;
1508 short int entry_str_pos;
1509 bool pending_open;
1510 unsigned int namelen;
1511 char *name;
1512 } filename;
1513 struct {
1514 int max;
1515 struct file *table;
1516 } files;
1517
1518 struct intlist *syscall_stats;
1519};
1520
1521static struct thread_trace *thread_trace__new(void)
1522{
1523 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1524
1525 if (ttrace) {
1526 ttrace->files.max = -1;
1527 ttrace->syscall_stats = intlist__new(NULL);
1528 }
1529
1530 return ttrace;
1531}
1532
1533static void thread_trace__free_files(struct thread_trace *ttrace);
1534
1535static void thread_trace__delete(void *pttrace)
1536{
1537 struct thread_trace *ttrace = pttrace;
1538
1539 if (!ttrace)
1540 return;
1541
1542 intlist__delete(ttrace->syscall_stats);
1543 ttrace->syscall_stats = NULL;
1544 thread_trace__free_files(ttrace);
1545 zfree(&ttrace->entry_str);
1546 free(ttrace);
1547}
1548
1549static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1550{
1551 struct thread_trace *ttrace;
1552
1553 if (thread == NULL)
1554 goto fail;
1555
1556 if (thread__priv(thread) == NULL)
1557 thread__set_priv(thread, thread_trace__new());
1558
1559 if (thread__priv(thread) == NULL)
1560 goto fail;
1561
1562 ttrace = thread__priv(thread);
1563 ++ttrace->nr_events;
1564
1565 return ttrace;
1566fail:
1567 color_fprintf(fp, PERF_COLOR_RED,
1568 "WARNING: not enough memory, dropping samples!\n");
1569 return NULL;
1570}
1571
1573void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1574 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1575{
1576 struct thread_trace *ttrace = thread__priv(arg->thread);
1577
1578 ttrace->ret_scnprintf = ret_scnprintf;
1579}
1580
1581#define TRACE_PFMAJ (1 << 0)
1582#define TRACE_PFMIN (1 << 1)
1583
1584static const size_t trace__entry_str_size = 2048;
1585
1586static void thread_trace__free_files(struct thread_trace *ttrace)
1587{
1588 for (int i = 0; i <= ttrace->files.max; ++i) { /* the table holds files.max + 1 entries */
1589 struct file *file = ttrace->files.table + i;
1590 zfree(&file->pathname);
1591 }
1592
1593 zfree(&ttrace->files.table);
1594 ttrace->files.max = -1;
1595}
1596
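/*
 * Grow the per-thread fd -> struct file table on demand, zeroing any newly
 * allocated entries, and return the entry for 'fd'.
 */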
1597static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1598{
1599 if (fd < 0)
1600 return NULL;
1601
1602 if (fd > ttrace->files.max) {
1603 struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1604
1605 if (nfiles == NULL)
1606 return NULL;
1607
1608 if (ttrace->files.max != -1) {
1609 memset(nfiles + ttrace->files.max + 1, 0,
1610 (fd - ttrace->files.max) * sizeof(struct file));
1611 } else {
1612 memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1613 }
1614
1615 ttrace->files.table = nfiles;
1616 ttrace->files.max = fd;
1617 }
1618
1619 return ttrace->files.table + fd;
1620}
1621
1622struct file *thread__files_entry(struct thread *thread, int fd)
1623{
1624 return thread_trace__files_entry(thread__priv(thread), fd);
1625}
1626
1627static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1628{
1629 struct thread_trace *ttrace = thread__priv(thread);
1630 struct file *file = thread_trace__files_entry(ttrace, fd);
1631
1632 if (file != NULL) {
1633 struct stat st;
1634 if (stat(pathname, &st) == 0)
1635 file->dev_maj = major(st.st_rdev);
1636 file->pathname = strdup(pathname);
1637 if (file->pathname)
1638 return 0;
1639 }
1640
1641 return -1;
1642}
1643
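/*
 * Resolve 'fd' to a pathname by reading the /proc/<pid>/fd/<fd> (or
 * /proc/<pid>/task/<tid>/fd/<fd>) symlink, caching it in the thread's
 * file table.
 */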
1644static int thread__read_fd_path(struct thread *thread, int fd)
1645{
1646 char linkname[PATH_MAX], pathname[PATH_MAX];
1647 struct stat st;
1648 int ret;
1649
1650 if (thread__pid(thread) == thread__tid(thread)) {
1651 scnprintf(linkname, sizeof(linkname),
1652 "/proc/%d/fd/%d", thread__pid(thread), fd);
1653 } else {
1654 scnprintf(linkname, sizeof(linkname),
1655 "/proc/%d/task/%d/fd/%d",
1656 thread__pid(thread), thread__tid(thread), fd);
1657 }
1658
1659 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1660 return -1;
1661
1662 ret = readlink(linkname, pathname, sizeof(pathname));
1663
1664 if (ret < 0 || ret > st.st_size)
1665 return -1;
1666
1667 pathname[ret] = '\0';
1668 return trace__set_fd_pathname(thread, fd, pathname);
1669}
1670
1671static const char *thread__fd_path(struct thread *thread, int fd,
1672 struct trace *trace)
1673{
1674 struct thread_trace *ttrace = thread__priv(thread);
1675
1676 if (ttrace == NULL || trace->fd_path_disabled)
1677 return NULL;
1678
1679 if (fd < 0)
1680 return NULL;
1681
1682 if (fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL) {
1683 if (!trace->live)
1684 return NULL;
1685 ++trace->stats.proc_getname;
1686 if (thread__read_fd_path(thread, fd))
1687 return NULL;
1688 }
1689
1690 return ttrace->files.table[fd].pathname;
1691}
1692
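/*
 * Print an fd decorated with the path it resolves to, e.g. "3</etc/passwd>",
 * when that can be determined.
 */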
1693size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1694{
1695 int fd = arg->val;
1696 size_t printed = scnprintf(bf, size, "%d", fd);
1697 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1698
1699 if (path)
1700 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1701
1702 return printed;
1703}
1704
1705size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1706{
1707 size_t printed = scnprintf(bf, size, "%d", fd);
1708 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1709
1710 if (thread) {
1711 const char *path = thread__fd_path(thread, fd, trace);
1712
1713 if (path)
1714 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1715
1716 thread__put(thread);
1717 }
1718
1719 return printed;
1720}
1721
1722static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1723 struct syscall_arg *arg)
1724{
1725 int fd = arg->val;
1726 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1727 struct thread_trace *ttrace = thread__priv(arg->thread);
1728
1729 if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1730 zfree(&ttrace->files.table[fd].pathname);
1731
1732 return printed;
1733}
1734
1735static void thread__set_filename_pos(struct thread *thread, const char *bf,
1736 unsigned long ptr)
1737{
1738 struct thread_trace *ttrace = thread__priv(thread);
1739
1740 ttrace->filename.ptr = ptr;
1741 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1742}
1743
1744static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1745{
1746 struct augmented_arg *augmented_arg = arg->augmented.args;
1747 size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1748 /*
1749 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1750 * we would have two strings, each prefixed by its size.
1751 */
1752 int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1753
1754 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1755 arg->augmented.size -= consumed;
1756
1757 return printed;
1758}
1759
1760static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1761 struct syscall_arg *arg)
1762{
1763 unsigned long ptr = arg->val;
1764
1765 if (arg->augmented.args)
1766 return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1767
1768 if (!arg->trace->vfs_getname)
1769 return scnprintf(bf, size, "%#x", ptr);
1770
1771 thread__set_filename_pos(arg->thread, bf, ptr);
1772 return 0;
1773}
1774
1775#define MAX_CONTROL_CHAR 31
1776#define MAX_ASCII 127
1777
1778static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg)
1779{
1780 struct augmented_arg *augmented_arg = arg->augmented.args;
1781 unsigned char *orig;
1782 size_t printed = 0;
1783 int consumed;
1784
1785 if (augmented_arg == NULL)
1786 return 0;

 /* Only compute the payload pointer after the NULL check above. */
 orig = (unsigned char *)augmented_arg->value;
1787
1788 for (int j = 0; j < augmented_arg->size; ++j) {
1789 bool control_char = orig[j] <= MAX_CONTROL_CHAR || orig[j] >= MAX_ASCII;
1790 /* print control characters (0-31 and 127) and non-ASCII bytes as '\' followed by their decimal value */
1791 printed += scnprintf(bf + printed, size - printed, control_char ? "\\%d" : "%c", (int)orig[j]);
1792 }
1793
1794 consumed = sizeof(*augmented_arg) + augmented_arg->size;
1795 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1796 arg->augmented.size -= consumed;
1797
1798 return printed;
1799}
1800
1801static bool trace__filter_duration(struct trace *trace, double t)
1802{
1803 return t < (trace->duration_filter * NSEC_PER_MSEC);
1804}
1805
1806static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1807{
1808 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1809
1810 return fprintf(fp, "%10.3f ", ts);
1811}
1812
1813/*
1814 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1815 * using ttrace->entry_time for a thread that receives a sys_exit without
1816 * first having received a sys_enter ("poll" issued before the tracing
1817 * session started, or a sys_enter lost to ring buffer overflow).
1818 */
1819static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1820{
1821 if (tstamp > 0)
1822 return __trace__fprintf_tstamp(trace, tstamp, fp);
1823
1824 return fprintf(fp, " ? ");
1825}
1826
1827static pid_t workload_pid = -1;
1828static volatile sig_atomic_t done = false;
1829static volatile sig_atomic_t interrupted = false;
1830
1831static void sighandler_interrupt(int sig __maybe_unused)
1832{
1833 done = interrupted = true;
1834}
1835
1836static void sighandler_chld(int sig __maybe_unused, siginfo_t *info,
1837 void *context __maybe_unused)
1838{
1839 if (info->si_pid == workload_pid)
1840 done = true;
1841}
1842
1843static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1844{
1845 size_t printed = 0;
1846
1847 if (trace->multiple_threads) {
1848 if (trace->show_comm)
1849 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1850 printed += fprintf(fp, "%d ", thread__tid(thread));
1851 }
1852
1853 return printed;
1854}
1855
1856static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1857 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1858{
1859 size_t printed = 0;
1860
1861 if (trace->show_tstamp)
1862 printed = trace__fprintf_tstamp(trace, tstamp, fp);
1863 if (trace->show_duration)
1864 printed += fprintf_duration(duration, duration_calculated, fp);
1865 return printed + trace__fprintf_comm_tid(trace, thread, fp);
1866}
1867
1868static int trace__process_event(struct trace *trace, struct machine *machine,
1869 union perf_event *event, struct perf_sample *sample)
1870{
1871 int ret = 0;
1872
1873 switch (event->header.type) {
1874 case PERF_RECORD_LOST:
1875 color_fprintf(trace->output, PERF_COLOR_RED,
1876 "LOST %" PRIu64 " events!\n", (u64)event->lost.lost);
1877 ret = machine__process_lost_event(machine, event, sample);
1878 break;
1879 default:
1880 ret = machine__process_event(machine, event, sample);
1881 break;
1882 }
1883
1884 return ret;
1885}
1886
1887static int trace__tool_process(const struct perf_tool *tool,
1888 union perf_event *event,
1889 struct perf_sample *sample,
1890 struct machine *machine)
1891{
1892 struct trace *trace = container_of(tool, struct trace, tool);
1893 return trace__process_event(trace, machine, event, sample);
1894}
1895
1896static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1897{
1898 struct machine *machine = vmachine;
1899
1900 if (machine->kptr_restrict_warned)
1901 return NULL;
1902
1903 if (symbol_conf.kptr_restrict) {
1904 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1905 "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1906 "Kernel samples will not be resolved.\n");
1907 machine->kptr_restrict_warned = true;
1908 return NULL;
1909 }
1910
1911 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1912}
1913
1914static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
1915{
1916 int err = symbol__init(NULL);
1917
1918 if (err)
1919 return err;
1920
1921 trace->host = machine__new_host();
1922 if (trace->host == NULL)
1923 return -ENOMEM;
1924
1925 thread__set_priv_destructor(thread_trace__delete);
1926
1927 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1928 if (err < 0)
1929 goto out;
1930
1931 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1932 evlist->core.threads, trace__tool_process,
1933 true, false, 1);
1934out:
1935 if (err)
1936 symbol__exit();
1937
1938 return err;
1939}
1940
1941static void trace__symbols__exit(struct trace *trace)
1942{
1943 machine__exit(trace->host);
1944 trace->host = NULL;
1945
1946 symbol__exit();
1947}
1948
1949static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1950{
1951 int idx;
1952
1953 if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0)
1954 nr_args = sc->fmt->nr_args;
1955
1956 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1957 if (sc->arg_fmt == NULL)
1958 return -1;
1959
1960 for (idx = 0; idx < nr_args; ++idx) {
1961 if (sc->fmt)
1962 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1963 }
1964
1965 sc->nr_args = nr_args;
1966 return 0;
1967}
1968
1969static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = {
1970 { .name = "msr", .scnprintf = SCA_X86_MSR, .strtoul = STUL_X86_MSR, },
1971 { .name = "vector", .scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, },
1972};
1973
1974static int syscall_arg_fmt__cmp(const void *name, const void *fmtp)
1975{
1976 const struct syscall_arg_fmt *fmt = fmtp;
1977 return strcmp(name, fmt->name);
1978}
1979
1980static const struct syscall_arg_fmt *
1981__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb,
1982 const char *name)
1983{
1984 return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp);
1985}
1986
1987static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name)
1988{
1989 const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name);
1990 return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
1991}
1992
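/*
 * Walk the tracepoint fields picking a default beautifier for each arg from
 * heuristics on its type and name whenever one wasn't set via the
 * syscall_fmts[] table, returning the last field so that the caller can
 * compute args_size.
 */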
1993static struct tep_format_field *
1994syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
1995 bool *use_btf)
1996{
1997 struct tep_format_field *last_field = NULL;
1998 int len;
1999
2000 for (; field; field = field->next, ++arg) {
2001 last_field = field;
2002
2003 if (arg->scnprintf)
2004 continue;
2005
2006 len = strlen(field->name);
2007
2008 // As far as heuristics (or intention) go, this seems to hold true, and makes sense!
2009 if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const "))
2010 arg->from_user = true;
2011
2012 if (strcmp(field->type, "const char *") == 0 &&
2013 ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
2014 strstr(field->name, "path") != NULL)) {
2015 arg->scnprintf = SCA_FILENAME;
2016 } else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
2017 arg->scnprintf = SCA_PTR;
2018 else if (strcmp(field->type, "pid_t") == 0)
2019 arg->scnprintf = SCA_PID;
2020 else if (strcmp(field->type, "umode_t") == 0)
2021 arg->scnprintf = SCA_MODE_T;
2022 else if ((field->flags & TEP_FIELD_IS_ARRAY) && strstr(field->type, "char")) {
2023 arg->scnprintf = SCA_CHAR_ARRAY;
2024 arg->nr_entries = field->arraylen;
2025 } else if ((strcmp(field->type, "int") == 0 ||
2026 strcmp(field->type, "unsigned int") == 0 ||
2027 strcmp(field->type, "long") == 0) &&
2028 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
2029 /*
2030 * /sys/kernel/tracing/events/syscalls/sys_enter*
2031 * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
2032 * 65 int
2033 * 23 unsigned int
2034 * 7 unsigned long
2035 */
2036 arg->scnprintf = SCA_FD;
2037 } else if (strstr(field->type, "enum") && use_btf != NULL) {
2038 *use_btf = true;
2039 arg->strtoul = STUL_BTF_TYPE;
2040 } else {
2041 const struct syscall_arg_fmt *fmt =
2042 syscall_arg_fmt__find_by_name(field->name);
2043
2044 if (fmt) {
2045 arg->scnprintf = fmt->scnprintf;
2046 arg->strtoul = fmt->strtoul;
2047 }
2048 }
2049 }
2050
2051 return last_field;
2052}
2053
2054static int syscall__set_arg_fmts(struct syscall *sc)
2055{
2056 struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args,
2057 &sc->use_btf);
2058
2059 if (last_field)
2060 sc->args_size = last_field->offset + last_field->size;
2061
2062 return 0;
2063}
2064
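/*
 * Lazily fill in the syscall table entry for 'id': resolve its name, look
 * up its entry in the beautifier table and its sys_enter tracepoint format,
 * then set up the per-argument formatters.
 */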
2065static int trace__read_syscall_info(struct trace *trace, int id)
2066{
2067 char tp_name[128];
2068 struct syscall *sc;
2069 const char *name = syscalltbl__name(trace->sctbl, id);
2070 int err;
2071
2072#ifdef HAVE_SYSCALL_TABLE_SUPPORT
2073 if (trace->syscalls.table == NULL) {
2074 trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
2075 if (trace->syscalls.table == NULL)
2076 return -ENOMEM;
2077 }
2078#else
2079 if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) {
2080 // When using libaudit we don't know beforehand what the max syscall id is
2081 struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
2082
2083 if (table == NULL)
2084 return -ENOMEM;
2085
2086 // Zero the whole table if brand new, otherwise just the newly allocated entries
2087 if (trace->syscalls.table == NULL)
2088 memset(table, 0, (id + 1) * sizeof(*sc));
2089 else
2090 memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc));
2091
2092 trace->syscalls.table = table;
2093 trace->sctbl->syscalls.max_id = id;
2094 }
2095#endif
2096 sc = trace->syscalls.table + id;
2097 if (sc->nonexistent)
2098 return -EEXIST;
2099
2100 if (name == NULL) {
2101 sc->nonexistent = true;
2102 return -EEXIST;
2103 }
2104
2105 sc->name = name;
2106 sc->fmt = syscall_fmt__find(sc->name);
2107
2108 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
2109 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2110
2111 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
2112 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
2113 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2114 }
2115
2116 /*
2117 * Failed to read the tracepoint format via the sysfs node, so the
2118 * tracepoint doesn't exist: set the 'nonexistent' flag.
2119 */
2120 if (IS_ERR(sc->tp_format)) {
2121 sc->nonexistent = true;
2122 return PTR_ERR(sc->tp_format);
2123 }
2124
2125 /*
2126 * The tracepoint format contains __syscall_nr field, so it's one more
2127 * than the actual number of syscall arguments.
2128 */
2129 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
2130 RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields - 1))
2131 return -ENOMEM;
2132
2133 sc->args = sc->tp_format->format.fields;
2134 /*
2135 * Check for and discard the first field, '__syscall_nr' (called
2136 * just 'nr' on older kernels), which carries the syscall number
2137 * and is needless here.
2138 */
2139 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
2140 sc->args = sc->args->next;
2141 --sc->nr_args;
2142 }
2143
2144 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
2145 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
2146
2147 err = syscall__set_arg_fmts(sc);
2148
2149 /* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */
2150 if (sc->use_btf)
2151 trace__load_vmlinux_btf(trace);
2152
2153 return err;
2154}
2155
2156static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf)
2157{
2158 struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
2159
2160 if (fmt != NULL) {
2161 syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, use_btf);
2162 return 0;
2163 }
2164
2165 return -ENOMEM;
2166}
2167
2168static int intcmp(const void *a, const void *b)
2169{
2170 const int *one = a, *another = b;
2171
2172 return *one - *another;
2173}
2174
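/*
 * Translate the list of syscall names/globs in the -e/--expr qualifier into
 * a sorted array of syscall ids for trace__syscall_enabled() to bsearch().
 */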
2175static int trace__validate_ev_qualifier(struct trace *trace)
2176{
2177 int err = 0;
2178 bool printed_invalid_prefix = false;
2179 struct str_node *pos;
2180 size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
2181
2182 trace->ev_qualifier_ids.entries = malloc(nr_allocated *
2183 sizeof(trace->ev_qualifier_ids.entries[0]));
2184
2185 if (trace->ev_qualifier_ids.entries == NULL) {
2186 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
2187 trace->output);
2188 err = -EINVAL;
2189 goto out;
2190 }
2191
2192 strlist__for_each_entry(pos, trace->ev_qualifier) {
2193 const char *sc = pos->s;
2194 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
2195
2196 if (id < 0) {
2197 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
2198 if (id >= 0)
2199 goto matches;
2200
2201 if (!printed_invalid_prefix) {
2202 pr_debug("Skipping unknown syscalls: ");
2203 printed_invalid_prefix = true;
2204 } else {
2205 pr_debug(", ");
2206 }
2207
2208 pr_debug("%s", sc);
2209 continue;
2210 }
2211matches:
2212 trace->ev_qualifier_ids.entries[nr_used++] = id;
2213 if (match_next == -1)
2214 continue;
2215
2216 while (1) {
2217 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
2218 if (id < 0)
2219 break;
2220 if (nr_allocated == nr_used) {
2221 void *entries;
2222
2223 nr_allocated += 8;
2224 entries = realloc(trace->ev_qualifier_ids.entries,
2225 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
2226 if (entries == NULL) {
2227 err = -ENOMEM;
2228 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
2229 goto out_free;
2230 }
2231 trace->ev_qualifier_ids.entries = entries;
2232 }
2233 trace->ev_qualifier_ids.entries[nr_used++] = id;
2234 }
2235 }
2236
2237 trace->ev_qualifier_ids.nr = nr_used;
2238 qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
2239out:
2240 if (printed_invalid_prefix)
2241 pr_debug("\n");
2242 return err;
2243out_free:
2244 zfree(&trace->ev_qualifier_ids.entries);
2245 trace->ev_qualifier_ids.nr = 0;
2246 goto out;
2247}
2248
2249static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
2250{
2251 bool in_ev_qualifier;
2252
2253 if (trace->ev_qualifier_ids.nr == 0)
2254 return true;
2255
2256 in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
2257 trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
2258
2259 if (in_ev_qualifier)
2260 return !trace->not_ev_qualifier;
2261
2262 return trace->not_ev_qualifier;
2263}
2264
2265/*
2266 * args is to be interpreted as a series of longs but we need to handle
2267 * 8-byte unaligned accesses. args points to raw_data within the event
2268 * and raw_data is guaranteed not to be 8-byte aligned because it is
2269 * preceded by raw_size, which is a u32. So we need to copy args to a temp
2270 * variable to read it. Most notably this avoids extended load instructions
2271 * on unaligned addresses.
2272 */
2273unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
2274{
2275 unsigned long val;
2276 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
2277
2278 memcpy(&val, p, sizeof(val));
2279 return val;
2280}
2281
2282static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
2283 struct syscall_arg *arg)
2284{
2285 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
2286 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
2287
2288 return scnprintf(bf, size, "arg%d: ", arg->idx);
2289}
2290
/*
 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
 * as the mount 'flags' argument, where a magic flag has to be ignored first,
 * see the comment in tools/perf/trace/beauty/mount_flags.c
 */
static unsigned long syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsigned long val)
{
	if (fmt && fmt->mask_val)
		return fmt->mask_val(arg, val);

	return val;
}

static size_t syscall_arg_fmt__scnprintf_val(struct syscall_arg_fmt *fmt, char *bf, size_t size,
					     struct syscall_arg *arg, unsigned long val)
{
	if (fmt && fmt->scnprintf) {
		arg->val = val;
		if (fmt->parm)
			arg->parm = fmt->parm;
		return fmt->scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, void *augmented_args, int augmented_args_size,
				      struct trace *trace, struct thread *thread)
{
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args = args,
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx = 0,
		.mask = 0,
		.trace = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};
	struct thread_trace *ttrace = thread__priv(thread);
	void *default_scnprintf;

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct tep_format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			arg.fmt = &sc->arg_fmt[arg.idx];
			val = syscall_arg__val(&arg, arg.idx);
			/*
			 * Some syscall args need some mask, most don't and
			 * return val untouched.
			 */
			val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);

			/*
			 * Suppress this argument if its value is zero and show_zero
			 * property isn't set.
			 *
			 * If it has a BTF type, then override the zero suppression knob
			 * as the common case is for zero in an enum to have an associated entry.
			 */
			if (val == 0 && !trace->show_zeros &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE))
				continue;

			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

			if (trace->show_arg_names)
				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

			default_scnprintf = sc->arg_fmt[arg.idx].scnprintf;

			if (trace->force_btf || default_scnprintf == NULL || default_scnprintf == SCA_PTR) {
				btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed,
								   size - printed, val, field->type);
				if (btf_printed) {
					printed += btf_printed;
					continue;
				}
			}

			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
								  bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}

typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);

static struct syscall *trace__syscall_info(struct trace *trace,
					   struct evsel *evsel, int id)
{
	int err = 0;

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, evsel__name(evsel), ++n);
		}
		return NULL;
	}

	err = -EINVAL;

#ifdef HAVE_SYSCALL_TABLE_SUPPORT
	if (id > trace->sctbl->syscalls.max_id) {
#else
	if (id >= trace->sctbl->syscalls.max_id) {
		/*
		 * With libaudit we don't know beforehand what is the max_id,
		 * so we let trace__read_syscall_info() figure that out as we
		 * go on reading syscalls.
		 */
		err = trace__read_syscall_info(trace, id);
		if (err)
#endif
		goto out_cant_read;
	}

	if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
	    (err = trace__read_syscall_info(trace, id)) != 0)
		goto out_cant_read;

	if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		char sbuf[STRERR_BUFSIZE];
		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
		if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}

struct syscall_stats {
	struct stats stats;
	u64 nr_failures;
	int max_errno;
	u32 *errnos;
};

static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace,
				 int id, struct perf_sample *sample, long err, bool errno_summary)
{
	struct int_node *inode;
	struct syscall_stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = zalloc(sizeof(*stats));
		if (stats == NULL)
			return;

		init_stats(&stats->stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(&stats->stats, duration);

	if (err < 0) {
		++stats->nr_failures;

		if (!errno_summary)
			return;

		err = -err;
		if (err > stats->max_errno) {
			u32 *new_errnos = realloc(stats->errnos, err * sizeof(u32));

			if (new_errnos) {
				memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32));
			} else {
				pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n",
					 thread__comm_str(thread), thread__pid(thread),
					 thread__tid(thread));
				return;
			}

			stats->errnos = new_errnos;
			stats->max_errno = err;
		}

		++stats->errnos[err - 1];
	}
}

static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;
	int len;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);

	if (len < trace->args_alignment - 4)
		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");

	printed += fprintf(trace->output, " ...\n");

	ttrace->entry_pending = false;
	++trace->nr_events_printed;

	return printed;
}
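
/*
 * Sketch of how this pairs with trace__sys_exit(): when another thread's
 * event arrives between a sys_enter and its sys_exit, the pending entry is
 * flushed above as, with made up values:
 *
 *	0.123 ( ): bash/1234 openat(dfd: CWD, filename: "/etc/passwd") ...
 *
 * and when the matching sys_exit finally arrives, trace__sys_exit() emits:
 *
 *	0.456 (0.333 ms): bash/1234  ... [continued]: openat()) = 3
 */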

static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
				 struct perf_sample *sample, struct thread *thread)
{
	int printed = 0;

	if (trace->print_sample) {
		double ts = (double)sample->time / NSEC_PER_MSEC;

		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
				   evsel__name(evsel), ts,
				   thread__comm_str(thread),
				   sample->pid, sample->tid, sample->cpu);
	}

	return printed;
}

static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
{
	void *augmented_args = NULL;
	/*
	 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
	 * and there we get all 6 syscall args plus the tracepoint common fields
	 * that gets calculated at the start and the syscall_nr (another long).
	 * So we check if that is the case and if so don't look after the
	 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
	 * which is fixed.
	 *
	 * We'll revisit this later to pass s->args_size to the BPF augmenter
	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c), so that it
	 * copies only what we need for each syscall, like what happens when we
	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
	 * traffic to just what is needed for each syscall.
	 */
	int args_size = raw_augmented_args_size ?: sc->args_size;

	*augmented_args_size = sample->raw_size - args_size;
	if (*augmented_args_size > 0)
		augmented_args = sample->raw_data + args_size;

	return augmented_args;
}
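
/*
 * Example of the arithmetic above, with made up sizes: with raw_syscalls
 * augmentation, raw_augmented_args_size covers the common tracepoint fields
 * plus syscall_nr plus the 6 args, say 64 bytes; a raw_size of 80 then
 * leaves 16 trailing bytes of augmented payload (e.g. a copied filename),
 * which is what gets returned. If nothing follows the args,
 * *augmented_args_size ends up <= 0 and NULL is returned.
 */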

static void syscall__exit(struct syscall *sc)
{
	if (!sc)
		return;

	zfree(&sc->arg_fmt);
}

static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	int printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	int augmented_args_size = 0;
	void *augmented_args = NULL;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);
	/*
	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
	 * arguments, even if the syscall being handled, say "openat", uses only 4.
	 * That breaks the syscall__augmented_args() check for augmented args, as we
	 * calculate syscall->args_size using each syscalls:sys_enter_NAME tracefs
	 * format file, so when handling, say, the openat syscall via the
	 * raw_syscalls:sys_enter event we end up getting 6 args when we expected
	 * just 4 and mistakenly take the extra 2 u64 args as the augmented filename.
	 * So just check here and avoid using augmented syscalls when the evsel is
	 * the raw_syscalls one.
	 */
	if (evsel != trace->syscalls.events.sys_enter)
		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, augmented_args, augmented_args_size, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			int alignment = 0;

			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
			if (trace->args_alignment > printed)
				alignment = trace->args_alignment - printed;
			fprintf(trace->output, "%*s= ?\n", alignment, " ");
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
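
/*
 * Note that for non is_exit syscalls nothing is printed above, the
 * "name(args" string is just kept in ttrace->entry_str and only emitted by
 * trace__sys_exit() (or by trace__printf_interrupted_entry()), so that the
 * return value can land on the same line, strace style. A sketch of the
 * resulting line, with made up values:
 *
 *	0.050 (0.010 ms): cat/42 read(fd: 3, buf: 0x7ffc1234, count: 4096) = 512
 */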

static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args, *augmented_args = NULL;
	int augmented_args_size;
	size_t printed = 0;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	printed += syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
	fprintf(trace->output, "%.*s", (int)printed, msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;
	int max_stack = evsel->core.attr.sample_max_stack ?
			evsel->core.attr.sample_max_stack :
			trace->max_stack;
	int err = -1;

	addr_location__init(&al);
	if (machine__resolve(trace->host, &al, sample) < 0)
		goto out;

	err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
out:
	addr_location__exit(&al);
	return err;
}

static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
					EVSEL__PRINT_DSO |
					EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
}

static const char *errno_to_name(struct evsel *evsel, int err)
{
	struct perf_env *env = evsel__env(evsel);

	return perf_env__arch_strerrno(env, err);
}

static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
	int alignment = trace->args_alignment;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	if (trace->summary)
		thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary);

	if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		printed = fprintf(trace->output, "%s", ttrace->entry_str);
	} else {
		printed += fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		printed += 9;
		printed += fprintf(trace->output, "]: %s()", sc->name);
	}

	printed++; /* the closing ')' */

	if (alignment > printed)
		alignment -= printed;
	else
		alignment = 0;

	fprintf(trace->output, ")%*s= ", alignment, " ");

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, "%ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, "-1 %s (%s)", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, "0 (Timeout)");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val = ret,
			.thread = thread,
			.trace = trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, "%s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, "%#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, "%ld", ret);
			if (thread__comm_set(child))
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	/*
	 * We only consider an 'event' for the sake of --max-events a non-filtered
	 * sys_enter + sys_exit and other tracepoint events.
	 */
	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
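
/*
 * Sketch of the splice above, with a made up entry_str: if the enter handler
 * left "openat(dfd: CWD, filename: " pending, with entry_str_pos pointing
 * just past "filename: ", a probe:vfs_getname hit with pathname "/etc/fstab"
 * turns it into "openat(dfd: CWD, filename: /etc/fstab", memmove()ing the
 * tail of the original string out of the way first. Pathnames that don't fit
 * get truncated at the left, keeping the trailing components.
 */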

static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
		evsel->name,
		evsel__strval(evsel, sample, "comm"),
		(pid_t)evsel__intval(evsel, sample, "pid"),
		runtime,
		evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}

static int bpf_output__printer(enum binary_printer_ops op,
			       unsigned int val, void *extra __maybe_unused, FILE *fp)
{
	unsigned char ch = (unsigned char)val;

	switch (op) {
	case BINARY_PRINT_CHAR_DATA:
		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
	case BINARY_PRINT_DATA_BEGIN:
	case BINARY_PRINT_LINE_BEGIN:
	case BINARY_PRINT_ADDR:
	case BINARY_PRINT_NUM_DATA:
	case BINARY_PRINT_NUM_PAD:
	case BINARY_PRINT_SEP:
	case BINARY_PRINT_CHAR_PAD:
	case BINARY_PRINT_LINE_END:
	case BINARY_PRINT_DATA_END:
	default:
		break;
	}

	return 0;
}

static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
	++trace->nr_events_printed;
}

static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, struct perf_sample *sample,
				       struct thread *thread, void *augmented_args, int augmented_args_size)
{
	char bf[2048];
	size_t size = sizeof(bf);
	struct tep_format_field *field = evsel->tp_format->format.fields;
	struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel);
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg syscall_arg = {
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx = 0,
		.mask = 0,
		.trace = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};

	for (; field && arg; field = field->next, ++syscall_arg.idx, bit <<= 1, ++arg) {
		if (syscall_arg.mask & bit)
			continue;

		syscall_arg.len = 0;
		syscall_arg.fmt = arg;
		if (field->flags & TEP_FIELD_IS_ARRAY) {
			int offset = field->offset;

			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
				offset = format_field__intval(field, sample, evsel->needs_swap);
				syscall_arg.len = offset >> 16;
				offset &= 0xffff;
				if (tep_field_is_relative(field->flags))
					offset += field->offset + field->size;
			}

			val = (uintptr_t)(sample->raw_data + offset);
		} else
			val = format_field__intval(field, sample, evsel->needs_swap);
		/*
		 * Some syscall args need some mask, most don't and
		 * return val untouched.
		 */
		val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);

		/* Suppress this argument if its value is zero and show_zero property isn't set. */
		if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE)
			continue;

		printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

		if (trace->show_arg_names)
			printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

		btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type);
		if (btf_printed) {
			printed += btf_printed;
			continue;
		}

		printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
	}

	return printed + fprintf(trace->output, "%.*s", (int)printed, bf);
}

static int trace__event_handler(struct trace *trace, struct evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	struct thread *thread;
	int callchain_ret = 0;

	if (evsel->nr_events_printed >= evsel->max_events)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	if (trace->trace_syscalls && trace->show_duration)
		fprintf(trace->output, "( ): ");

	if (thread)
		trace__fprintf_comm_tid(trace, thread, trace->output);

	if (evsel == trace->syscalls.events.bpf_output) {
		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
		struct syscall *sc = trace__syscall_info(trace, evsel, id);

		if (sc) {
			fprintf(trace->output, "%s(", sc->name);
			trace__fprintf_sys_enter(trace, evsel, sample);
			fputc(')', trace->output);
			goto newline;
		}

		/*
		 * XXX: Not having the associated syscall info or not finding/adding
		 * the thread should never happen, but if it does...
		 * fall thru and print it as a bpf_output event.
		 */
	}

	fprintf(trace->output, "%s(", evsel->name);

	if (evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			if (trace->libtraceevent_print) {
				event_format__fprintf(evsel->tp_format, sample->cpu,
						      sample->raw_data, sample->raw_size,
						      trace->output);
			} else {
				trace__fprintf_tp_fields(trace, evsel, sample, thread, NULL, 0);
			}
		}
	}

newline:
	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));

	++trace->nr_events_printed;

	if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
		evsel__disable(evsel);
		evsel__close(evsel);
	}
out:
	thread__put(thread);
	return 0;
}

static void print_location(FILE *f, struct perf_sample *sample,
			   struct addr_location *al,
			   bool print_dso, bool print_sym)
{

	if ((verbose > 0 || print_dso) && al->map)
		fprintf(f, "%s@", dso__long_name(map__dso(al->map)));

	if ((verbose > 0 || print_sym) && al->sym)
		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
			al->addr - al->sym->start);
	else if (al->map)
		fprintf(f, "0x%" PRIx64, al->addr);
	else
		fprintf(f, "0x%" PRIx64, sample->addr);
}

static int trace__pgfault(struct trace *trace,
			  struct evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	addr_location__init(&al);
	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			if (cursor->nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) {
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));

	++trace->nr_events_printed;
out:
	err = 0;
out_put:
	thread__put(thread);
	addr_location__exit(&al);
	return err;
}
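
/*
 * Illustrative pgfault line, with made up addresses/symbols, showing how the
 * two print_location() calls combine, first where the fault happened (symbol
 * preferred), then what was being accessed (DSO preferred), with the map
 * type/level pair at the end:
 *
 *	0.321 ( ): cp/7734 minfault [memcpy+0x1c] => /usr/lib/libc.so.6@0x18d000 (d.)
 */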

static void trace__set_base_time(struct trace *trace,
				 struct evsel *evsel,
				 struct perf_sample *sample)
{
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use sample->time unconditionally, we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for good
	 * reason, i.e. we may not be interested in its timestamps, just in
	 * it taking place, picking some piece of information when it
	 * appears in our event stream (vfs_getname comes to mind).
	 */
	if (trace->base_time == 0 && !trace->full_time &&
	    (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
		trace->base_time = sample->time;
}

static int trace__process_sample(const struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	struct thread *thread;
	int err = 0;

	tracepoint_handler handler = evsel->handler;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	if (thread && thread__is_filtered(thread))
		goto out;

	trace__set_base_time(trace, evsel, sample);

	if (handler) {
		++trace->nr_events;
		handler(trace, evsel, event, sample);
	}
out:
	thread__put(thread);
	return err;
}

static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};
	pid_t pid = getpid();
	char *filter = asprintf__tp_filter_pids(1, &pid);
	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
	int err = -1;

	/* +3 is for the event string below and the pid filter */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 3 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL || filter == NULL)
		goto out_free;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			goto out_free;
		}
	}

	rec_argv[j++] = "--filter";
	rec_argv[j++] = filter;

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	err = cmd_record(j, rec_argv);
out_free:
	free(filter);
	free(rec_argv);
	return err;
}
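
/*
 * For reference, with syscalls and both page fault classes enabled, the argv
 * assembled above amounts to something like:
 *
 *	perf record -R -m 1024 -c 1 \
 *		-e raw_syscalls:sys_enter,raw_syscalls:sys_exit \
 *		--filter <pids expr from asprintf__tp_filter_pids()> \
 *		-e major-faults -e minor-faults <workload>
 *
 * with the tp filter excluding perf's own pid from the recorded stream.
 */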

static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);

static bool evlist__add_vfs_getname(struct evlist *evlist)
{
	bool found = false;
	struct evsel *evsel, *tmp;
	struct parse_events_error err;
	int ret;

	parse_events_error__init(&err);
	ret = parse_events(evlist, "probe:vfs_getname*", &err);
	parse_events_error__exit(&err);
	if (ret)
		return false;

	evlist__for_each_entry_safe(evlist, evsel, tmp) {
		if (!strstarts(evsel__name(evsel), "probe:vfs_getname"))
			continue;

		if (evsel__field(evsel, "pathname")) {
			evsel->handler = trace__vfs_getname;
			found = true;
			continue;
		}

		list_del_init(&evsel->core.node);
		evsel->evlist = NULL;
		evsel__delete(evsel);
	}

	return found;
}
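
/*
 * Note: probe:vfs_getname doesn't exist by default, it has to have been set
 * up beforehand with something along the lines of (the exact line/variable
 * being kernel version dependent):
 *
 *	perf probe 'vfs_getname=getname_flags:72 pathname=result->name:string'
 *
 * When no such probe is found, perf trace simply does without the filename
 * beautifying it enables.
 */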

static struct evsel *evsel__new_pgfault(u64 config)
{
	struct evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = evsel__new(&attr);
	if (evsel)
		evsel->handler = trace__pgfault;

	return evsel;
}

static void evlist__free_syscall_tp_fields(struct evlist *evlist)
{
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		evsel_trace__delete(evsel->priv);
		evsel->priv = NULL;
	}
}

static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	if (evswitch__discard(&trace->evswitch, evsel))
		return;

	trace__set_base_time(trace, evsel, sample);

	if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
			evsel__name(evsel), sample->tid,
			sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}

	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;
}

static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct evlist *evlist = trace->evlist;
	struct evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	evlist__add(evlist, sys_enter);
	evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->core.attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	evsel__delete_priv(sys_enter);
	goto out;
}

static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
{
	int err = -1;
	struct evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
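
/*
 * A minimal sketch, assuming asprintf_expr_inout_ints() does what its name
 * suggests: for a qualifier covering, say, ids 2, 3 and 257 it builds a
 * tracepoint filter along the lines of "id == 2 || id == 3 || id == 257"
 * (or the negated && form when trace->not_ev_qualifier is set), which then
 * gets appended to both raw_syscalls tracepoints.
 */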

#ifdef HAVE_BPF_SKEL
static int syscall_arg_fmt__cache_btf_struct(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
{
	int id;

	if (arg_fmt->type != NULL)
		return -1;

	id = btf__find_by_name(btf, type);
	if (id < 0)
		return -1;

	arg_fmt->type = btf__type_by_id(btf, id);
	arg_fmt->type_id = id;

	return 0;
}

static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
{
	struct bpf_program *pos, *prog = NULL;
	const char *sec_name;

	if (trace->skel->obj == NULL)
		return NULL;

	bpf_object__for_each_program(pos, trace->skel->obj) {
		sec_name = bpf_program__section_name(pos);
		if (sec_name && !strcmp(sec_name, name)) {
			prog = pos;
			break;
		}
	}

	return prog;
}

static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
							const char *prog_name, const char *type)
{
	struct bpf_program *prog;

	if (prog_name == NULL) {
		char default_prog_name[256];
		scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name);
		prog = trace__find_bpf_program_by_title(trace, default_prog_name);
		if (prog != NULL)
			goto out_found;
		if (sc->fmt && sc->fmt->alias) {
			scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
			prog = trace__find_bpf_program_by_title(trace, default_prog_name);
			if (prog != NULL)
				goto out_found;
		}
		goto out_unaugmented;
	}

	prog = trace__find_bpf_program_by_title(trace, prog_name);

	if (prog != NULL) {
out_found:
		return prog;
	}

	pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
		 prog_name, type, sc->name);
out_unaugmented:
	return trace->skel->progs.syscall_unaugmented;
}

static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
{
	struct syscall *sc = trace__syscall_info(trace, NULL, id);

	if (sc == NULL)
		return;

	sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
	sc->bpf_prog.sys_exit = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit : NULL, "exit");
}

static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
{
	struct syscall *sc = trace__syscall_info(trace, NULL, id);
	return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
}

static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
{
	struct syscall *sc = trace__syscall_info(trace, NULL, id);
	return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
}

static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int key, unsigned int *beauty_array)
{
	struct tep_format_field *field;
	struct syscall *sc = trace__syscall_info(trace, NULL, key);
	const struct btf_type *bt;
	char *struct_offset, *tmp, name[32];
	bool can_augment = false;
	int i, cnt;

	if (sc == NULL)
		return -1;

	trace__load_vmlinux_btf(trace);
	if (trace->btf == NULL)
		return -1;

	for (i = 0, field = sc->args; field; ++i, field = field->next) {
		// XXX We're only collecting pointer payloads _from_ user space
		if (!sc->arg_fmt[i].from_user)
			continue;

		struct_offset = strstr(field->type, "struct ");
		if (struct_offset == NULL)
			struct_offset = strstr(field->type, "union ");
		else
			struct_offset++; // skip one extra char: "struct " is 7 chars while "union " is only 6

		if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */
			struct_offset += 6;

			/* for 'struct foo *', we only want 'foo' */
			for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) {
			}

			strncpy(name, struct_offset, cnt);
			name[cnt] = '\0';

			/* cache struct's btf_type and type_id */
			if (syscall_arg_fmt__cache_btf_struct(&sc->arg_fmt[i], trace->btf, name))
				continue;

			bt = sc->arg_fmt[i].type;
			beauty_array[i] = bt->size;
			can_augment = true;
		} else if (field->flags & TEP_FIELD_IS_POINTER && /* string */
			   strcmp(field->type, "const char *") == 0 &&
			   (strstr(field->name, "name") ||
			    strstr(field->name, "path") ||
			    strstr(field->name, "file") ||
			    strstr(field->name, "root") ||
			    strstr(field->name, "key") ||
			    strstr(field->name, "special") ||
			    strstr(field->name, "type") ||
			    strstr(field->name, "description"))) {
			beauty_array[i] = 1;
			can_augment = true;
		} else if (field->flags & TEP_FIELD_IS_POINTER && /* buffer */
			   strstr(field->type, "char *") &&
			   (strstr(field->name, "buf") ||
			    strstr(field->name, "val") ||
			    strstr(field->name, "msg"))) {
			int j;
			struct tep_format_field *field_tmp;

			/* find the size of the buffer that appears in pairs with buf */
			for (j = 0, field_tmp = sc->args; field_tmp; ++j, field_tmp = field_tmp->next) {
				if (!(field_tmp->flags & TEP_FIELD_IS_POINTER) && /* only integers */
				    (strstr(field_tmp->name, "count") ||
				     strstr(field_tmp->name, "siz") || /* size, bufsiz */
				     (strstr(field_tmp->name, "len") && strcmp(field_tmp->name, "filename")))) {
					/* filename's got 'len' in it, we don't want that */
					beauty_array[i] = -(j + 1);
					can_augment = true;
					break;
				}
			}
		}
	}

	if (can_augment)
		return 0;

	return -1;
}

static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
{
	struct tep_format_field *field, *candidate_field;
	/*
	 * We're only interested in syscalls that have a pointer:
	 */
	for (field = sc->args; field; field = field->next) {
		if (field->flags & TEP_FIELD_IS_POINTER)
			goto try_to_find_pair;
	}

	return NULL;

try_to_find_pair:
	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
		int id = syscalltbl__id_at_idx(trace->sctbl, i);
		struct syscall *pair = trace__syscall_info(trace, NULL, id);
		struct bpf_program *pair_prog;
		bool is_candidate = false;

		if (pair == NULL || pair == sc ||
		    pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
			continue;

		for (field = sc->args, candidate_field = pair->args;
		     field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
			bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
			     candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;

			if (is_pointer) {
				if (!candidate_is_pointer) {
					// The candidate just doesn't copy our pointer arg, but it might copy other pointers we want.
					continue;
				}
			} else {
				if (candidate_is_pointer) {
					// The candidate might copy a pointer we don't have, skip it.
					goto next_candidate;
				}
				continue;
			}

			if (strcmp(field->type, candidate_field->type))
				goto next_candidate;

			/*
			 * This is limited in the BPF program but sys_write
			 * uses "const char *" for its "buf" arg so we need to
			 * use some heuristic that is kinda future proof...
			 */
			if (strcmp(field->type, "const char *") == 0 &&
			    !(strstr(field->name, "name") ||
			      strstr(field->name, "path") ||
			      strstr(field->name, "file") ||
			      strstr(field->name, "root") ||
			      strstr(field->name, "description")))
				goto next_candidate;

			is_candidate = true;
		}

		if (!is_candidate)
			goto next_candidate;

		/*
		 * Check if the tentative pair syscall augmenter has more pointers, if it has,
		 * then it may be collecting that and we then can't use it, as it would collect
		 * more than what is common to the two syscalls.
		 */
		if (candidate_field) {
			for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
				if (candidate_field->flags & TEP_FIELD_IS_POINTER)
					goto next_candidate;
		}

		pair_prog = pair->bpf_prog.sys_enter;
		/*
		 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
		 * have been searched for, so search it here and if it returns the
		 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
		 * program for a filtered syscall on a non-filtered one.
		 *
		 * For instance, we have "!syscalls:sys_enter_renameat" and that is
		 * useful for "renameat2".
		 */
		if (pair_prog == NULL) {
			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
			if (pair_prog == trace->skel->progs.syscall_unaugmented)
				goto next_candidate;
		}

		pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
		return pair_prog;
	next_candidate:
		continue;
	}

	return NULL;
}

static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
{
	int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
	int map_exit_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
	int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter);
	int err = 0;
	unsigned int beauty_array[6];

	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
		int prog_fd, key = syscalltbl__id_at_idx(trace->sctbl, i);

		if (!trace__syscall_enabled(trace, key))
			continue;

		trace__init_syscall_bpf_progs(trace, key);

		// It'll get at least the "!raw_syscalls:unaugmented"
		prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
		if (err)
			break;
		prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
		err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
		if (err)
			break;

		/* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */
		memset(beauty_array, 0, sizeof(beauty_array));
		err = trace__bpf_sys_enter_beauty_map(trace, key, (unsigned int *)beauty_array);
		if (err)
			continue;
		err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY);
		if (err)
			break;
	}

	/*
	 * Now lets do a second pass looking for enabled syscalls without
	 * an augmenter that have a signature that is a superset of another
	 * syscall with an augmenter so that we can auto-reuse it.
	 *
	 * I.e. if we have an augmenter for the "open" syscall that has
	 * this signature:
	 *
	 *   int open(const char *pathname, int flags, mode_t mode);
	 *
	 * I.e. that will collect just the first string argument, then we
	 * can reuse it for the 'creat' syscall, that has this signature:
	 *
	 *   int creat(const char *pathname, mode_t mode);
	 *
	 * and for:
	 *
	 *   int stat(const char *pathname, struct stat *statbuf);
	 *   int lstat(const char *pathname, struct stat *statbuf);
	 *
	 * Because the 'open' augmenter will collect the first arg as a string,
	 * and leave alone all the other args, which already helps with
	 * beautifying 'stat' and 'lstat''s pathname arg.
	 *
	 * Then, in time, when 'stat' gets an augmenter that collects both
	 * first and second arg (this one on the raw_syscalls:sys_exit prog
	 * array tail call), then that one will be used.
	 */
	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
		int key = syscalltbl__id_at_idx(trace->sctbl, i);
		struct syscall *sc = trace__syscall_info(trace, NULL, key);
		struct bpf_program *pair_prog;
		int prog_fd;

		if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
			continue;

		/*
		 * For now we're just reusing the sys_enter prog, and if it
		 * already has an augmenter, we don't need to find one.
		 */
		if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
			continue;

		/*
		 * Look at all the other syscalls for one that has a signature
		 * that is close enough that we can share:
		 */
		pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
		if (pair_prog == NULL)
			continue;

		sc->bpf_prog.sys_enter = pair_prog;

		/*
		 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
		 * with the fd for the program we're reusing:
		 */
		prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
		err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
		if (err)
			break;
	}

	return err;
}
#endif // HAVE_BPF_SKEL

static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	if (trace->syscalls.events.sys_enter)
		return trace__set_ev_qualifier_tp_filter(trace);
	return 0;
}

static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
				    size_t npids __maybe_unused, pid_t *pids __maybe_unused)
{
	int err = 0;
#ifdef HAVE_LIBBPF_SUPPORT
	bool value = true;
	int map_fd = bpf_map__fd(map);
	size_t i;

	for (i = 0; i < npids; ++i) {
		err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
		if (err)
			break;
	}
#endif
	return err;
}

static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	int err;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host,
							     thread__ppid(thread),
							     thread__ppid(thread));

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd") ||
		    strstarts(thread__comm_str(parent), "gnome-terminal")) {
			pids[nr++] = thread__tid(parent);
			break;
		}
		thread = parent;
	}

	err = evlist__append_tp_filter_pids(trace->evlist, nr, pids);
	if (!err && trace->filter_pids.map)
		err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);

	return err;
}
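
/*
 * The rationale, spelled out: tracing the sshd/terminal that perf writes its
 * output through would make every output line generate more syscalls in
 * those processes, which would generate more output, ad infinitum. So,
 * besides perf's own pid, the first sshd/gnome-terminal ancestor found is
 * filtered too, e.g. for a made up chain like:
 *
 *	sshd(1100) -> bash(1200) -> perf(1234)
 *
 * both 1234 and 1100 end up in the filter.
 */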

static int trace__set_filter_pids(struct trace *trace)
{
	int err = 0;
	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0) {
		err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
						    trace->filter_pids.entries);
		if (!err && trace->filter_pids.map) {
			err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
						       trace->filter_pids.entries);
		}
	} else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
		err = trace__set_filter_loop_pids(trace);
	}

	return err;
}

static int __trace__deliver_event(struct trace *trace, union perf_event *event)
{
	struct evlist *evlist = trace->evlist;
	struct perf_sample sample;
	int err = evlist__parse_sample(evlist, event, &sample);

	if (err)
		fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
	else
		trace__handle_event(trace, event, &sample);

	return 0;
}

static int __trace__flush_events(struct trace *trace)
{
	u64 first = ordered_events__first_time(&trace->oe.data);
	u64 flush = trace->oe.last - NSEC_PER_SEC;

	/* Is there something to flush? */
4044 if (first && first < flush)
4045 return ordered_events__flush_time(&trace->oe.data, flush);
4046
4047 return 0;
4048}
4049
4050static int trace__flush_events(struct trace *trace)
4051{
4052 return !trace->sort_events ? 0 : __trace__flush_events(trace);
4053}
4054
4055static int trace__deliver_event(struct trace *trace, union perf_event *event)
4056{
4057 int err;
4058
4059 if (!trace->sort_events)
4060 return __trace__deliver_event(trace, event);
4061
4062 err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
4063 if (err && err != -1)
4064 return err;
4065
4066 err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL);
4067 if (err)
4068 return err;
4069
4070 return trace__flush_events(trace);
4071}
4072
4073static int ordered_events__deliver_event(struct ordered_events *oe,
4074 struct ordered_event *event)
4075{
4076 struct trace *trace = container_of(oe, struct trace, oe.data);
4077
4078 return __trace__deliver_event(trace, event->event);
4079}
4080
4081static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg,
4082 char **type)
4083{
4084 struct tep_format_field *field;
4085 struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel);
4086
4087 if (evsel->tp_format == NULL || fmt == NULL)
4088 return NULL;
4089
4090 for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt)
4091 if (strcmp(field->name, arg) == 0) {
4092 *type = field->type;
4093 return fmt;
4094 }
4095
4096 return NULL;
4097}
4098
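/*
 * Expand symbolic constants in a tracepoint filter into numbers using each
 * argument's ->strtoul() resolver, so that, for instance, a filter such as
 * "msr==IA32_TSC" (an illustrative example) becomes "msr==0x10" before it
 * is handed to the kernel. If an argument name can't be resolved the filter
 * is left untouched and will simply fail later, when applied.
 */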
4099static int trace__expand_filter(struct trace *trace, struct evsel *evsel)
4100{
4101 char *tok, *left = evsel->filter, *new_filter = evsel->filter;
4102
4103 while ((tok = strpbrk(left, "=<>!")) != NULL) {
4104 char *right = tok + 1, *right_end;
4105
4106 if (*right == '=')
4107 ++right;
4108
4109 while (isspace(*right))
4110 ++right;
4111
4112 if (*right == '\0')
4113 break;
4114
4115 while (!isalpha(*left))
4116 if (++left == tok) {
4117 /*
4118 * Bail out, can't find the name of the argument that is being
4119 * used in the filter, let it try to set this filter, will fail later.
4120 */
4121 return 0;
4122 }
4123
4124 right_end = right + 1;
4125 while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
4126 ++right_end;
4127
4128 if (isalpha(*right)) {
4129 struct syscall_arg_fmt *fmt;
4130 int left_size = tok - left,
4131 right_size = right_end - right;
4132 char arg[128], *type;
4133
4134 while (isspace(left[left_size - 1]))
4135 --left_size;
4136
4137 scnprintf(arg, sizeof(arg), "%.*s", left_size, left);
4138
4139 fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type);
4140 if (fmt == NULL) {
4141 pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
4142 arg, evsel->name, evsel->filter);
4143 return -1;
4144 }
4145
4146 pr_debug2("trying to expand \"%s\" \"%.*s\" \"%.*s\" -> ",
4147 arg, (int)(right - tok), tok, right_size, right);
4148
4149 if (fmt->strtoul) {
4150 u64 val;
4151 struct syscall_arg syscall_arg = {
4152 .trace = trace,
4153 .fmt = fmt,
4154 .type_name = type,
4155 .parm = fmt->parm,
4156 };
4157
4158 if (fmt->strtoul(right, right_size, &syscall_arg, &val)) {
4159 char *n, expansion[19];
4160					int expansion_length = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val);
4161 int expansion_offset = right - new_filter;
4162
4163 pr_debug("%s", expansion);
4164
4165 if (asprintf(&n, "%.*s%s%s", expansion_offset, new_filter, expansion, right_end) < 0) {
4166 pr_debug(" out of memory!\n");
4167						if (new_filter != evsel->filter)
							free(new_filter);
4168 return -1;
4169 }
4170 if (new_filter != evsel->filter)
4171 free(new_filter);
4172					left = n + expansion_offset + expansion_length;
4173 new_filter = n;
4174 } else {
4175 pr_err("\"%.*s\" not found for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4176 right_size, right, arg, evsel->name, evsel->filter);
4177 return -1;
4178 }
4179 } else {
4180 pr_err("No resolver (strtoul) for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4181 arg, evsel->name, evsel->filter);
4182 return -1;
4183 }
4184
4185 pr_debug("\n");
4186 } else {
4187 left = right_end;
4188 }
4189 }
4190
4191 if (new_filter != evsel->filter) {
4192 pr_debug("New filter for %s: %s\n", evsel->name, new_filter);
4193 evsel__set_filter(evsel, new_filter);
4194 free(new_filter);
4195 }
4196
4197 return 0;
4198}
4199
4200static int trace__expand_filters(struct trace *trace, struct evsel **err_evsel)
4201{
4202 struct evlist *evlist = trace->evlist;
4203 struct evsel *evsel;
4204
4205 evlist__for_each_entry(evlist, evsel) {
4206 if (evsel->filter == NULL)
4207 continue;
4208
4209 if (trace__expand_filter(trace, evsel)) {
4210 *err_evsel = evsel;
4211 return -1;
4212 }
4213 }
4214
4215 return 0;
4216}
4217
4218static int trace__run(struct trace *trace, int argc, const char **argv)
4219{
4220 struct evlist *evlist = trace->evlist;
4221 struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
4222 int err = -1, i;
4223 unsigned long before;
4224 const bool forks = argc > 0;
4225 bool draining = false;
4226
4227 trace->live = true;
4228
4229 if (!trace->raw_augmented_syscalls) {
4230 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
4231 goto out_error_raw_syscalls;
4232
4233 if (trace->trace_syscalls)
4234 trace->vfs_getname = evlist__add_vfs_getname(evlist);
4235 }
4236
4237 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
4238 pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
4239 if (pgfault_maj == NULL)
4240 goto out_error_mem;
4241 evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
4242 evlist__add(evlist, pgfault_maj);
4243 }
4244
4245 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
4246 pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
4247 if (pgfault_min == NULL)
4248 goto out_error_mem;
4249 evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
4250 evlist__add(evlist, pgfault_min);
4251 }
4252
4253 /* Enable ignoring missing threads when -u/-p option is defined. */
4254 trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;
4255
4256 if (trace->sched &&
4257 evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
4258 goto out_error_sched_stat_runtime;
4259 /*
4260 * If a global cgroup was set, apply it to all the events without an
4261 * explicit cgroup. I.e.:
4262 *
4263 * trace -G A -e sched:*switch
4264 *
4265 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
4266 * _and_ sched:sched_switch to the 'A' cgroup, while:
4267 *
4268 * trace -e sched:*switch -G A
4269 *
4270 * will only set the sched:sched_switch event to the 'A' cgroup, all the
4271	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
4272 * a cgroup (on the root cgroup, sys wide, etc).
4273 *
4274 * Multiple cgroups:
4275 *
4276 * trace -G A -e sched:*switch -G B
4277 *
4278 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
4279 * to the 'B' cgroup.
4280 *
4281 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
4282 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
4283 */
4284 if (trace->cgroup)
4285 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
4286
4287 err = evlist__create_maps(evlist, &trace->opts.target);
4288 if (err < 0) {
4289 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
4290 goto out_delete_evlist;
4291 }
4292
4293 err = trace__symbols_init(trace, evlist);
4294 if (err < 0) {
4295 fprintf(trace->output, "Problems initializing symbol libraries!\n");
4296 goto out_delete_evlist;
4297 }
4298
4299 evlist__config(evlist, &trace->opts, &callchain_param);
4300
4301 if (forks) {
4302 err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL);
4303 if (err < 0) {
4304 fprintf(trace->output, "Couldn't run the workload!\n");
4305 goto out_delete_evlist;
4306 }
4307 workload_pid = evlist->workload.pid;
4308 }
4309
4310 err = evlist__open(evlist);
4311 if (err < 0)
4312 goto out_error_open;
4313#ifdef HAVE_BPF_SKEL
4314 if (trace->syscalls.events.bpf_output) {
4315 struct perf_cpu cpu;
4316
4317 /*
4318 * Set up the __augmented_syscalls__ BPF map to hold for each
4319 * CPU the bpf-output event's file descriptor.
4320 */
4321 perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
4322 bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
4323 &cpu.cpu, sizeof(int),
4324 xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
4325 cpu.cpu, 0),
4326 sizeof(__u32), BPF_ANY);
4327 }
4328 }
4329
4330 if (trace->skel)
4331 trace->filter_pids.map = trace->skel->maps.pids_filtered;
4332#endif
4333 err = trace__set_filter_pids(trace);
4334 if (err < 0)
4335 goto out_error_mem;
4336
4337#ifdef HAVE_BPF_SKEL
4338 if (trace->skel && trace->skel->progs.sys_enter)
4339 trace__init_syscalls_bpf_prog_array_maps(trace);
4340#endif
4341
4342 if (trace->ev_qualifier_ids.nr > 0) {
4343 err = trace__set_ev_qualifier_filter(trace);
4344 if (err < 0)
4345 goto out_errno;
4346
4347 if (trace->syscalls.events.sys_exit) {
4348 pr_debug("event qualifier tracepoint filter: %s\n",
4349 trace->syscalls.events.sys_exit->filter);
4350 }
4351 }
4352
4353 /*
4354	 * If the "close" syscall is not traced, then we will not have the
4355	 * opportunity, in syscall_arg__scnprintf_close_fd(), to invalidate the
4356	 * fd->pathname table, and we would end up showing the last value set by
4357	 * syscalls opening a pathname and associating it with a descriptor, or
4358	 * reading it from /proc/pid/fd/, in cases where that doesn't make
4359	 * sense.
4360 *
4361 * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
4362 * not in use.
4363 */
4364 trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));
4365
4366 err = trace__expand_filters(trace, &evsel);
4367 if (err)
4368 goto out_delete_evlist;
4369 err = evlist__apply_filters(evlist, &evsel, &trace->opts.target);
4370 if (err < 0)
4371 goto out_error_apply_filters;
4372
4373 err = evlist__mmap(evlist, trace->opts.mmap_pages);
4374 if (err < 0)
4375 goto out_error_mmap;
4376
4377 if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
4378 evlist__enable(evlist);
4379
4380 if (forks)
4381 evlist__start_workload(evlist);
4382
4383 if (trace->opts.target.initial_delay) {
4384 usleep(trace->opts.target.initial_delay * 1000);
4385 evlist__enable(evlist);
4386 }
4387
4388 trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
4389 perf_thread_map__nr(evlist->core.threads) > 1 ||
4390 evlist__first(evlist)->core.attr.inherit;
4391
4392 /*
4393	 * Now that we have already used evsel->core.attr to ask the kernel to set
4394	 * up the events, let's reuse evsel->core.attr.sample_max_stack as the limit in
4395 * trace__resolve_callchain(), allowing per-event max-stack settings
4396 * to override an explicitly set --max-stack global setting.
4397 */
4398 evlist__for_each_entry(evlist, evsel) {
4399 if (evsel__has_callchain(evsel) &&
4400 evsel->core.attr.sample_max_stack == 0)
4401 evsel->core.attr.sample_max_stack = trace->max_stack;
4402 }
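	/*
	 * Main event loop: drain each mmap ring buffer, then poll for more.
	 * Once 'done' is set (e.g. the forked workload exited) the events are
	 * disabled and we keep draining until a poll finds nothing left.
	 */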
4403again:
4404 before = trace->nr_events;
4405
4406 for (i = 0; i < evlist->core.nr_mmaps; i++) {
4407 union perf_event *event;
4408 struct mmap *md;
4409
4410 md = &evlist->mmap[i];
4411 if (perf_mmap__read_init(&md->core) < 0)
4412 continue;
4413
4414 while ((event = perf_mmap__read_event(&md->core)) != NULL) {
4415 ++trace->nr_events;
4416
4417 err = trace__deliver_event(trace, event);
4418 if (err)
4419 goto out_disable;
4420
4421 perf_mmap__consume(&md->core);
4422
4423 if (interrupted)
4424 goto out_disable;
4425
4426 if (done && !draining) {
4427 evlist__disable(evlist);
4428 draining = true;
4429 }
4430 }
4431 perf_mmap__read_done(&md->core);
4432 }
4433
4434 if (trace->nr_events == before) {
4435 int timeout = done ? 100 : -1;
4436
4437 if (!draining && evlist__poll(evlist, timeout) > 0) {
4438 if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
4439 draining = true;
4440
4441 goto again;
4442 } else {
4443 if (trace__flush_events(trace))
4444 goto out_disable;
4445 }
4446 } else {
4447 goto again;
4448 }
4449
4450out_disable:
4451 thread__zput(trace->current);
4452
4453 evlist__disable(evlist);
4454
4455 if (trace->sort_events)
4456 ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
4457
4458 if (!err) {
4459 if (trace->summary)
4460 trace__fprintf_thread_summary(trace, trace->output);
4461
4462 if (trace->show_tool_stats) {
4463 fprintf(trace->output, "Stats:\n "
4464 " vfs_getname : %" PRIu64 "\n"
4465 " proc_getname: %" PRIu64 "\n",
4466 trace->stats.vfs_getname,
4467 trace->stats.proc_getname);
4468 }
4469 }
4470
4471out_delete_evlist:
4472 trace__symbols__exit(trace);
4473 evlist__free_syscall_tp_fields(evlist);
4474 evlist__delete(evlist);
4475 cgroup__put(trace->cgroup);
4476 trace->evlist = NULL;
4477 trace->live = false;
4478 return err;
4479{
4480 char errbuf[BUFSIZ];
4481
4482out_error_sched_stat_runtime:
4483 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
4484 goto out_error;
4485
4486out_error_raw_syscalls:
4487 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
4488 goto out_error;
4489
4490out_error_mmap:
4491 evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
4492 goto out_error;
4493
4494out_error_open:
4495 evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
4496
4497out_error:
4498 fprintf(trace->output, "%s\n", errbuf);
4499 goto out_delete_evlist;
4500
4501out_error_apply_filters:
4502 fprintf(trace->output,
4503 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
4504 evsel->filter, evsel__name(evsel), errno,
4505 str_error_r(errno, errbuf, sizeof(errbuf)));
4506 goto out_delete_evlist;
4507}
4508out_error_mem:
4509 fprintf(trace->output, "Not enough memory to run!\n");
4510 goto out_delete_evlist;
4511
4512out_errno:
4513 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
4514 goto out_delete_evlist;
4515}
4516
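/*
 * Replay mode ('perf trace -i perf.data'): reuse the live mode sample
 * handlers as perf_session callbacks and process a previously recorded
 * file instead of opening live events.
 */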
4517static int trace__replay(struct trace *trace)
4518{
4519 const struct evsel_str_handler handlers[] = {
4520 { "probe:vfs_getname", trace__vfs_getname, },
4521 };
4522 struct perf_data data = {
4523 .path = input_name,
4524 .mode = PERF_DATA_MODE_READ,
4525 .force = trace->force,
4526 };
4527 struct perf_session *session;
4528 struct evsel *evsel;
4529 int err = -1;
4530
4531 trace->tool.sample = trace__process_sample;
4532 trace->tool.mmap = perf_event__process_mmap;
4533 trace->tool.mmap2 = perf_event__process_mmap2;
4534 trace->tool.comm = perf_event__process_comm;
4535 trace->tool.exit = perf_event__process_exit;
4536 trace->tool.fork = perf_event__process_fork;
4537 trace->tool.attr = perf_event__process_attr;
4538 trace->tool.tracing_data = perf_event__process_tracing_data;
4539 trace->tool.build_id = perf_event__process_build_id;
4540 trace->tool.namespaces = perf_event__process_namespaces;
4541
4542 trace->tool.ordered_events = true;
4543 trace->tool.ordering_requires_timestamps = true;
4544
4545 /* add tid to output */
4546 trace->multiple_threads = true;
4547
4548 session = perf_session__new(&data, &trace->tool);
4549 if (IS_ERR(session))
4550 return PTR_ERR(session);
4551
4552 if (trace->opts.target.pid)
4553 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
4554
4555 if (trace->opts.target.tid)
4556 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
4557
4558 if (symbol__init(&session->header.env) < 0)
4559 goto out;
4560
4561 trace->host = &session->machines.host;
4562
4563 err = perf_session__set_tracepoints_handlers(session, handlers);
4564 if (err)
4565 goto out;
4566
4567 evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter");
4568 trace->syscalls.events.sys_enter = evsel;
4569	/* older kernels have syscalls:* tracepoints instead of raw_syscalls:* */
4570 if (evsel == NULL)
4571 evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter");
4572
4573 if (evsel &&
4574 (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
4575 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
4576		pr_err("Error during initialization of raw_syscalls:sys_enter event\n");
4577 goto out;
4578 }
4579
4580 evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit");
4581 trace->syscalls.events.sys_exit = evsel;
4582 if (evsel == NULL)
4583 evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit");
4584 if (evsel &&
4585 (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
4586 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
4587		pr_err("Error during initialization of raw_syscalls:sys_exit event\n");
4588 goto out;
4589 }
4590
4591 evlist__for_each_entry(session->evlist, evsel) {
4592 if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
4593 (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
4594 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
4595 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
4596 evsel->handler = trace__pgfault;
4597 }
4598
4599 setup_pager();
4600
4601 err = perf_session__process_events(session);
4602 if (err)
4603		pr_err("Failed to process events, error %d\n", err);
4604
4605 else if (trace->summary)
4606 trace__fprintf_thread_summary(trace, trace->output);
4607
4608out:
4609 perf_session__delete(session);
4610
4611 return err;
4612}
4613
4614static size_t trace__fprintf_threads_header(FILE *fp)
4615{
4616 size_t printed;
4617
4618 printed = fprintf(fp, "\n Summary of events:\n\n");
4619
4620 return printed;
4621}
4622
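/*
 * Re-sort the per-thread syscall stats (kept in an intlist indexed by
 * syscall id) by total time spent, computed as number of calls times
 * average duration, so the summary can print the costliest syscalls first.
 */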
4623DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
4624 struct syscall_stats *stats;
4625 double msecs;
4626 int syscall;
4627)
4628{
4629 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
4630 struct syscall_stats *stats = source->priv;
4631
4632 entry->syscall = source->i;
4633 entry->stats = stats;
4634 entry->msecs = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0;
4635}
4636
4637static size_t thread__dump_stats(struct thread_trace *ttrace,
4638 struct trace *trace, FILE *fp)
4639{
4640 size_t printed = 0;
4641 struct syscall *sc;
4642 struct rb_node *nd;
4643 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
4644
4645 if (syscall_stats == NULL)
4646 return 0;
4647
4648 printed += fprintf(fp, "\n");
4649
4650 printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
4651 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
4652 printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
4653
4654 resort_rb__for_each_entry(nd, syscall_stats) {
4655 struct syscall_stats *stats = syscall_stats_entry->stats;
4656 if (stats) {
4657 double min = (double)(stats->stats.min) / NSEC_PER_MSEC;
4658 double max = (double)(stats->stats.max) / NSEC_PER_MSEC;
4659 double avg = avg_stats(&stats->stats);
4660 double pct;
4661 u64 n = (u64)stats->stats.n;
4662
4663 pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
4664 avg /= NSEC_PER_MSEC;
4665
4666 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
4667 printed += fprintf(fp, " %-15s", sc->name);
4668 printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
4669 n, stats->nr_failures, syscall_stats_entry->msecs, min, avg);
4670 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
4671
4672 if (trace->errno_summary && stats->nr_failures) {
4673 int e;
4674
4675 for (e = 0; e < stats->max_errno; ++e) {
4676 if (stats->errnos[e] != 0)
4677 fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]);
4678 }
4679 }
4680 }
4681 }
4682
4683 resort_rb__delete(syscall_stats);
4684 printed += fprintf(fp, "\n\n");
4685
4686 return printed;
4687}
4688
4689static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
4690{
4691 size_t printed = 0;
4692 struct thread_trace *ttrace = thread__priv(thread);
4693 double ratio;
4694
4695 if (ttrace == NULL)
4696 return 0;
4697
4698 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
4699
4700 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread));
4701 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
4702 printed += fprintf(fp, "%.1f%%", ratio);
4703 if (ttrace->pfmaj)
4704 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
4705 if (ttrace->pfmin)
4706 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
4707 if (trace->sched)
4708 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
4709 else if (fputc('\n', fp) != EOF)
4710 ++printed;
4711
4712 printed += thread__dump_stats(ttrace, trace, fp);
4713
4714 return printed;
4715}
4716
4717static unsigned long thread__nr_events(struct thread_trace *ttrace)
4718{
4719 return ttrace ? ttrace->nr_events : 0;
4720}
4721
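/*
 * Sort threads for the summary by number of events, ascending, so the
 * busiest threads are printed last, nearest the end of the output;
 * ties are broken by tid.
 */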
4722static int trace_nr_events_cmp(void *priv __maybe_unused,
4723 const struct list_head *la,
4724 const struct list_head *lb)
4725{
4726 struct thread_list *a = list_entry(la, struct thread_list, list);
4727 struct thread_list *b = list_entry(lb, struct thread_list, list);
4728 unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread));
4729 unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread));
4730
4731 if (a_nr_events != b_nr_events)
4732 return a_nr_events < b_nr_events ? -1 : 1;
4733
4734	/* Identical number of events, place smaller tids first. */
4735 return thread__tid(a->thread) < thread__tid(b->thread)
4736 ? -1
4737 : (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0);
4738}
4739
4740static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
4741{
4742 size_t printed = trace__fprintf_threads_header(fp);
4743 LIST_HEAD(threads);
4744
4745 if (machine__thread_list(trace->host, &threads) == 0) {
4746 struct thread_list *pos;
4747
4748 list_sort(NULL, &threads, trace_nr_events_cmp);
4749
4750 list_for_each_entry(pos, &threads, list)
4751 printed += trace__fprintf_thread(fp, pos->thread, trace);
4752 }
4753 thread_list__delete(&threads);
4754 return printed;
4755}
4756
4757static int trace__set_duration(const struct option *opt, const char *str,
4758 int unset __maybe_unused)
4759{
4760 struct trace *trace = opt->value;
4761
4762 trace->duration_filter = atof(str);
4763 return 0;
4764}
4765
4766static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
4767 int unset __maybe_unused)
4768{
4769 int ret = -1;
4770 size_t i;
4771 struct trace *trace = opt->value;
4772 /*
4773	 * FIXME: introduce an intarray class, parse the plain CSV and create a
4774 * { int nr, int entries[] } struct...
4775 */
4776 struct intlist *list = intlist__new(str);
4777
4778 if (list == NULL)
4779 return -1;
4780
4781 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
4782 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
4783
4784 if (trace->filter_pids.entries == NULL)
4785 goto out;
4786
4787 trace->filter_pids.entries[0] = getpid();
4788
4789 for (i = 1; i < trace->filter_pids.nr; ++i)
4790 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
4791
4792 intlist__delete(list);
4793 ret = 0;
4794out:
4795 return ret;
4796}
4797
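/*
 * Open the -o/--output file, first rotating any existing non-empty file
 * of the same name to <name>.old, like perf record does with perf.data.
 */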
4798static int trace__open_output(struct trace *trace, const char *filename)
4799{
4800 struct stat st;
4801
4802 if (!stat(filename, &st) && st.st_size) {
4803 char oldname[PATH_MAX];
4804
4805 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
4806 unlink(oldname);
4807 rename(filename, oldname);
4808 }
4809
4810 trace->output = fopen(filename, "w");
4811
4812 return trace->output == NULL ? -errno : 0;
4813}
4814
4815static int parse_pagefaults(const struct option *opt, const char *str,
4816 int unset __maybe_unused)
4817{
4818 int *trace_pgfaults = opt->value;
4819
4820 if (strcmp(str, "all") == 0)
4821 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
4822 else if (strcmp(str, "maj") == 0)
4823 *trace_pgfaults |= TRACE_PFMAJ;
4824 else if (strcmp(str, "min") == 0)
4825 *trace_pgfaults |= TRACE_PFMIN;
4826 else
4827 return -1;
4828
4829 return 0;
4830}
4831
4832static void evlist__set_default_evsel_handler(struct evlist *evlist, void *handler)
4833{
4834 struct evsel *evsel;
4835
4836 evlist__for_each_entry(evlist, evsel) {
4837 if (evsel->handler == NULL)
4838 evsel->handler = handler;
4839 }
4840}
4841
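/*
 * Copy the hand-crafted formatters from the syscall_fmt table into this
 * evsel's per-argument formatter array, skipping the leading syscall
 * number field ("__syscall_nr" or "nr") that precedes the actual syscall
 * arguments in the syscalls:* tracepoint format.
 */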
4842static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name)
4843{
4844 struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
4845
4846 if (fmt) {
4847 const struct syscall_fmt *scfmt = syscall_fmt__find(name);
4848
4849 if (scfmt) {
4850 int skip = 0;
4851
4852 if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 ||
4853 strcmp(evsel->tp_format->format.fields->name, "nr") == 0)
4854 ++skip;
4855
4856 memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt));
4857 }
4858 }
4859}
4860
4861static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf)
4862{
4863 struct evsel *evsel;
4864
4865 evlist__for_each_entry(evlist, evsel) {
4866 if (evsel->priv || !evsel->tp_format)
4867 continue;
4868
4869 if (strcmp(evsel->tp_format->system, "syscalls")) {
4870 evsel__init_tp_arg_scnprintf(evsel, use_btf);
4871 continue;
4872 }
4873
4874 if (evsel__init_syscall_tp(evsel))
4875 return -1;
4876
4877 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
4878 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4879
4880 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
4881 return -1;
4882
4883 evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1);
4884 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
4885 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
4886
4887 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
4888 return -1;
4889
4890 evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1);
4891 }
4892 }
4893
4894 return 0;
4895}
4896
4897/*
4898 * XXX: Hackish, just splitting the combined -e/--event list into syscalls
4899 * (raw_syscalls:sys_{enter,exit}) and events (tracepoints, HW, SW, etc) to use
4900 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
4901 *
4902 * It'd be better to introduce a parse_options() variant that would return a
4903 * list with the terms it didn't match to an event...
4904 */
4905static int trace__parse_events_option(const struct option *opt, const char *str,
4906 int unset __maybe_unused)
4907{
4908 struct trace *trace = (struct trace *)opt->value;
4909 const char *s = str;
4910 char *sep = NULL, *lists[2] = { NULL, NULL, };
4911 int len = strlen(str) + 1, err = -1, list, idx;
4912 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
4913 char group_name[PATH_MAX];
4914 const struct syscall_fmt *fmt;
4915
4916 if (strace_groups_dir == NULL)
4917 return -1;
4918
4919 if (*s == '!') {
4920 ++s;
4921 trace->not_ev_qualifier = true;
4922 }
4923
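	/*
	 * Split the comma-separated string: syscall names, aliases and strace
	 * group files go into lists[1], everything else into lists[0], which
	 * is later handed to parse_events_option().
	 */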
4924 while (1) {
4925 if ((sep = strchr(s, ',')) != NULL)
4926 *sep = '\0';
4927
4928 list = 0;
4929 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
4930 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
4931 list = 1;
4932 goto do_concat;
4933 }
4934
4935 fmt = syscall_fmt__find_by_alias(s);
4936 if (fmt != NULL) {
4937 list = 1;
4938 s = fmt->name;
4939 } else {
4940 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
4941 if (access(group_name, R_OK) == 0)
4942 list = 1;
4943 }
4944do_concat:
4945 if (lists[list]) {
4946 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
4947 } else {
4948 lists[list] = malloc(len);
4949 if (lists[list] == NULL)
4950 goto out;
4951 strcpy(lists[list], s);
4952 }
4953
4954 if (!sep)
4955 break;
4956
4957 *sep = ',';
4958 s = sep + 1;
4959 }
4960
4961 if (lists[1] != NULL) {
4962 struct strlist_config slist_config = {
4963 .dirname = strace_groups_dir,
4964 };
4965
4966 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
4967 if (trace->ev_qualifier == NULL) {
4968			fputs("Not enough memory to parse event qualifier\n", trace->output);
4969 goto out;
4970 }
4971
4972 if (trace__validate_ev_qualifier(trace))
4973 goto out;
4974 trace->trace_syscalls = true;
4975 }
4976
4977 err = 0;
4978
4979 if (lists[0]) {
4980 struct parse_events_option_args parse_events_option_args = {
4981 .evlistp = &trace->evlist,
4982 };
4983 struct option o = {
4984 .value = &parse_events_option_args,
4985 };
4986 err = parse_events_option(&o, lists[0], 0);
4987 }
4988out:
4989 free(strace_groups_dir);
4990 free(lists[0]);
4991 free(lists[1]);
4992 if (sep)
4993 *sep = ',';
4994
4995 return err;
4996}
4997
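/*
 * -G/--cgroup: if events were already specified, the standard parse_cgroups()
 * path applies the cgroup to them; otherwise remember it as the default
 * cgroup for the events created later (see the comment in trace__run()).
 */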
4998static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
4999{
5000 struct trace *trace = opt->value;
5001
5002 if (!list_empty(&trace->evlist->core.entries)) {
5003 struct option o = {
5004 .value = &trace->evlist,
5005 };
5006 return parse_cgroups(&o, str, unset);
5007 }
5008 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
5009
5010 return 0;
5011}
5012
5013static int trace__config(const char *var, const char *value, void *arg)
5014{
5015 struct trace *trace = arg;
5016 int err = 0;
5017
5018 if (!strcmp(var, "trace.add_events")) {
5019 trace->perfconfig_events = strdup(value);
5020 if (trace->perfconfig_events == NULL) {
5021 pr_err("Not enough memory for %s\n", "trace.add_events");
5022 return -1;
5023 }
5024 } else if (!strcmp(var, "trace.show_timestamp")) {
5025 trace->show_tstamp = perf_config_bool(var, value);
5026 } else if (!strcmp(var, "trace.show_duration")) {
5027 trace->show_duration = perf_config_bool(var, value);
5028 } else if (!strcmp(var, "trace.show_arg_names")) {
5029 trace->show_arg_names = perf_config_bool(var, value);
5030 if (!trace->show_arg_names)
5031 trace->show_zeros = true;
5032 } else if (!strcmp(var, "trace.show_zeros")) {
5033 bool new_show_zeros = perf_config_bool(var, value);
5034 if (!trace->show_arg_names && !new_show_zeros) {
5035 pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
5036 goto out;
5037 }
5038 trace->show_zeros = new_show_zeros;
5039 } else if (!strcmp(var, "trace.show_prefix")) {
5040 trace->show_string_prefix = perf_config_bool(var, value);
5041 } else if (!strcmp(var, "trace.no_inherit")) {
5042 trace->opts.no_inherit = perf_config_bool(var, value);
5043 } else if (!strcmp(var, "trace.args_alignment")) {
5044 int args_alignment = 0;
5045 if (perf_config_int(&args_alignment, var, value) == 0)
5046 trace->args_alignment = args_alignment;
5047 } else if (!strcmp(var, "trace.tracepoint_beautifiers")) {
5048 if (strcasecmp(value, "libtraceevent") == 0)
5049 trace->libtraceevent_print = true;
5050 else if (strcasecmp(value, "libbeauty") == 0)
5051 trace->libtraceevent_print = false;
5052 }
5053out:
5054 return err;
5055}
5056
5057static void trace__exit(struct trace *trace)
5058{
5059 int i;
5060
5061 strlist__delete(trace->ev_qualifier);
5062 zfree(&trace->ev_qualifier_ids.entries);
5063 if (trace->syscalls.table) {
5064 for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
5065 syscall__exit(&trace->syscalls.table[i]);
5066 zfree(&trace->syscalls.table);
5067 }
5068 syscalltbl__delete(trace->sctbl);
5069 zfree(&trace->perfconfig_events);
5070}
5071
5072#ifdef HAVE_BPF_SKEL
5073static int bpf__setup_bpf_output(struct evlist *evlist)
5074{
5075 int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
5076
5077 if (err)
5078 pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");
5079
5080 return err;
5081}
5082#endif
5083
5084int cmd_trace(int argc, const char **argv)
5085{
5086 const char *trace_usage[] = {
5087 "perf trace [<options>] [<command>]",
5088 "perf trace [<options>] -- <command> [<options>]",
5089 "perf trace record [<options>] [<command>]",
5090 "perf trace record [<options>] -- <command> [<options>]",
5091 NULL
5092 };
5093 struct trace trace = {
5094 .opts = {
5095 .target = {
5096 .uid = UINT_MAX,
5097 .uses_mmap = true,
5098 },
5099 .user_freq = UINT_MAX,
5100 .user_interval = ULLONG_MAX,
5101 .no_buffering = true,
5102 .mmap_pages = UINT_MAX,
5103 },
5104 .output = stderr,
5105 .show_comm = true,
5106 .show_tstamp = true,
5107 .show_duration = true,
5108 .show_arg_names = true,
5109 .args_alignment = 70,
5110 .trace_syscalls = false,
5111 .kernel_syscallchains = false,
5112 .max_stack = UINT_MAX,
5113 .max_events = ULONG_MAX,
5114 };
5115 const char *output_name = NULL;
5116 const struct option trace_options[] = {
5117 OPT_CALLBACK('e', "event", &trace, "event",
5118 "event/syscall selector. use 'perf list' to list available events",
5119 trace__parse_events_option),
5120 OPT_CALLBACK(0, "filter", &trace.evlist, "filter",
5121 "event filter", parse_filter),
5122 OPT_BOOLEAN(0, "comm", &trace.show_comm,
5123 "show the thread COMM next to its id"),
5124 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
5125 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
5126 trace__parse_events_option),
5127 OPT_STRING('o', "output", &output_name, "file", "output file name"),
5128 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
5129 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
5130 "trace events on existing process id"),
5131 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
5132 "trace events on existing thread id"),
5133 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
5134 "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
5135 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
5136 "system-wide collection from all CPUs"),
5137 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
5138 "list of cpus to monitor"),
5139 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
5140 "child tasks do not inherit counters"),
5141 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
5142 "number of mmap data pages", evlist__parse_mmap_pages),
5143 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
5144 "user to profile"),
5145 OPT_CALLBACK(0, "duration", &trace, "float",
5146 "show only events with duration > N.M ms",
5147 trace__set_duration),
5148 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
5149 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
5150 OPT_BOOLEAN('T', "time", &trace.full_time,
5151 "Show full timestamp, not time relative to first start"),
5152 OPT_BOOLEAN(0, "failure", &trace.failure_only,
5153 "Show only syscalls that failed"),
5154 OPT_BOOLEAN('s', "summary", &trace.summary_only,
5155 "Show only syscall summary with statistics"),
5156 OPT_BOOLEAN('S', "with-summary", &trace.summary,
5157 "Show all syscalls and summary with statistics"),
5158 OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
5159 "Show errno stats per syscall, use with -s or -S"),
5160 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
5161 "Trace pagefaults", parse_pagefaults, "maj"),
5162 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
5163 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
5164 OPT_CALLBACK(0, "call-graph", &trace.opts,
5165 "record_mode[,record_size]", record_callchain_help,
5166 &record_parse_callchain_opt),
5167 OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
5168 "Use libtraceevent to print the tracepoint arguments."),
5169 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
5170 "Show the kernel callchains on the syscall exit path"),
5171 OPT_ULONG(0, "max-events", &trace.max_events,
5172 "Set the maximum number of events to print, exit after that is reached. "),
5173 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
5174 "Set the minimum stack depth when parsing the callchain, "
5175 "anything below the specified depth will be ignored."),
5176 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
5177 "Set the maximum stack depth when parsing the callchain, "
5178 "anything beyond the specified depth will be ignored. "
5179 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
5180 OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
5181 "Sort batch of events before processing, use if getting out of order events"),
5182 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
5183 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
5184 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
5185 "per thread proc mmap processing timeout in ms"),
5186 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
5187 trace__parse_cgroups),
5188 OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay,
5189 "ms to wait before starting measurement after program "
5190 "start"),
5191 OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
5192		       " to customized ones"),
5193 OPTS_EVSWITCH(&trace.evswitch),
5194 OPT_END()
5195 };
5196 bool __maybe_unused max_stack_user_set = true;
5197 bool mmap_pages_user_set = true;
5198 struct evsel *evsel;
5199 const char * const trace_subcommands[] = { "record", NULL };
5200 int err = -1;
5201 char bf[BUFSIZ];
5202 struct sigaction sigchld_act;
5203
5204 signal(SIGSEGV, sighandler_dump_stack);
5205 signal(SIGFPE, sighandler_dump_stack);
5206 signal(SIGINT, sighandler_interrupt);
5207
5208 memset(&sigchld_act, 0, sizeof(sigchld_act));
5209 sigchld_act.sa_flags = SA_SIGINFO;
5210 sigchld_act.sa_sigaction = sighandler_chld;
5211 sigaction(SIGCHLD, &sigchld_act, NULL);
5212
5213 trace.evlist = evlist__new();
5214 trace.sctbl = syscalltbl__new();
5215
5216 if (trace.evlist == NULL || trace.sctbl == NULL) {
5217 pr_err("Not enough memory to run!\n");
5218 err = -ENOMEM;
5219 goto out;
5220 }
5221
5222 /*
5223 * Parsing .perfconfig may entail creating a BPF event, that may need
5224 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
5225 * is too small. This affects just this process, not touching the
5226 * global setting. If it fails we'll get something in 'perf trace -v'
5227 * to help diagnose the problem.
5228 */
5229 rlimit__bump_memlock();
5230
5231 err = perf_config(trace__config, &trace);
5232 if (err)
5233 goto out;
5234
5235 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
5236 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
5237
5238 /*
5239	 * Here we have already passed through trace__parse_events_option(), which
5240	 * figured out whether -e syscall_name was used; if not, but --event
5241	 * foo:bar was used, the user is interested _just_ in those, say,
5242	 * tracepoint events, not in the strace-like syscall-name-based mode.
5243	 *
5244	 * This is important because we need to check if strace-like mode is
5245	 * needed to decide if we should filter out the eBPF
5246 * __augmented_syscalls__ code, if it is in the mix, say, via
5247 * .perfconfig trace.add_events, and filter those out.
5248 */
5249 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
5250	    trace.evlist->core.nr_entries == 0 /* Was --event used? */) {
5251 trace.trace_syscalls = true;
5252 }
5253 /*
5254	 * Now that we have --verbose figured out, let's see if we need to parse
5255 * events from .perfconfig, so that if those events fail parsing, say some
5256 * BPF program fails, then we'll be able to use --verbose to see what went
5257 * wrong in more detail.
5258 */
5259 if (trace.perfconfig_events != NULL) {
5260 struct parse_events_error parse_err;
5261
5262 parse_events_error__init(&parse_err);
5263 err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err);
5264 if (err)
5265 parse_events_error__print(&parse_err, trace.perfconfig_events);
5266 parse_events_error__exit(&parse_err);
5267 if (err)
5268 goto out;
5269 }
5270
5271 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
5272 usage_with_options_msg(trace_usage, trace_options,
5273 "cgroup monitoring only available in system-wide mode");
5274 }
5275
5276#ifdef HAVE_BPF_SKEL
5277 if (!trace.trace_syscalls)
5278 goto skip_augmentation;
5279
5280 if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) {
5281		pr_debug("Syscall augmentation fails with record, disabling augmentation\n");
5282 goto skip_augmentation;
5283 }
5284
5285 trace.skel = augmented_raw_syscalls_bpf__open();
5286 if (!trace.skel) {
5287		pr_debug("Failed to open augmented syscalls BPF skeleton\n");
5288 } else {
5289 /*
5290 * Disable attaching the BPF programs except for sys_enter and
5291 * sys_exit that tail call into this as necessary.
5292 */
5293 struct bpf_program *prog;
5294
5295 bpf_object__for_each_program(prog, trace.skel->obj) {
5296 if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit)
5297 bpf_program__set_autoattach(prog, /*autoattach=*/false);
5298 }
5299
5300 err = augmented_raw_syscalls_bpf__load(trace.skel);
5301
5302 if (err < 0) {
5303 libbpf_strerror(err, bf, sizeof(bf));
5304 pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf);
5305 } else {
5306 augmented_raw_syscalls_bpf__attach(trace.skel);
5307 trace__add_syscall_newtp(&trace);
5308 }
5309 }
5310
5311 err = bpf__setup_bpf_output(trace.evlist);
5312 if (err) {
5313 libbpf_strerror(err, bf, sizeof(bf));
5314 pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
5315 goto out;
5316 }
5317 trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
5318 assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__"));
5319skip_augmentation:
5320#endif
5321 err = -1;
5322
5323 if (trace.trace_pgfaults) {
5324 trace.opts.sample_address = true;
5325 trace.opts.sample_time = true;
5326 }
5327
5328 if (trace.opts.mmap_pages == UINT_MAX)
5329 mmap_pages_user_set = false;
5330
5331 if (trace.max_stack == UINT_MAX) {
5332 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
5333 max_stack_user_set = false;
5334 }
5335
5336#ifdef HAVE_DWARF_UNWIND_SUPPORT
5337 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
5338 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
5339 }
5340#endif
5341
5342 if (callchain_param.enabled) {
5343 if (!mmap_pages_user_set && geteuid() == 0)
5344 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
5345
5346 symbol_conf.use_callchain = true;
5347 }
5348
5349 if (trace.evlist->core.nr_entries > 0) {
5350 bool use_btf = false;
5351
5352 evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
5353 if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) {
5354 perror("failed to set syscalls:* tracepoint fields");
5355 goto out;
5356 }
5357
5358 if (use_btf)
5359 trace__load_vmlinux_btf(&trace);
5360 }
5361
5362 if (trace.sort_events) {
5363 ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
5364 ordered_events__set_copy_on_queue(&trace.oe.data, true);
5365 }
5366
5367 /*
5368 * If we are augmenting syscalls, then combine what we put in the
5369 * __augmented_syscalls__ BPF map with what is in the
5370 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
5371 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
5372 *
5373 * We'll switch to look at two BPF maps, one for sys_enter and the
5374 * other for sys_exit when we start augmenting the sys_exit paths with
5375 * buffers that are being copied from kernel to userspace, think 'read'
5376 * syscall.
5377 */
5378 if (trace.syscalls.events.bpf_output) {
5379 evlist__for_each_entry(trace.evlist, evsel) {
5380 bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit");
5381
5382 if (raw_syscalls_sys_exit) {
5383 trace.raw_augmented_syscalls = true;
5384 goto init_augmented_syscall_tp;
5385 }
5386
5387 if (trace.syscalls.events.bpf_output->priv == NULL &&
5388 strstr(evsel__name(evsel), "syscalls:sys_enter")) {
5389 struct evsel *augmented = trace.syscalls.events.bpf_output;
5390 if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
5391 evsel__init_augmented_syscall_tp_args(augmented))
5392 goto out;
5393 /*
5394				 * 'augmented' is the __augmented_syscalls__ BPF_OUTPUT event.
5395				 * Above we made sure we can get from its payload the tp fields
5396				 * that we get from the syscalls:sys_enter tracefs format file.
5397 */
5398 augmented->handler = trace__sys_enter;
5399 /*
5400 * Now we do the same for the *syscalls:sys_enter event so that
5401 * if we handle it directly, i.e. if the BPF prog returns 0 so
5402 * as not to filter it, then we'll handle it just like we would
5403 * for the BPF_OUTPUT one:
5404 */
5405 if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
5406 evsel__init_augmented_syscall_tp_args(evsel))
5407 goto out;
5408 evsel->handler = trace__sys_enter;
5409 }
5410
5411 if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) {
5412 struct syscall_tp *sc;
5413init_augmented_syscall_tp:
5414 if (evsel__init_augmented_syscall_tp(evsel, evsel))
5415 goto out;
5416 sc = __evsel__syscall_tp(evsel);
5417 /*
5418 * For now with BPF raw_augmented we hook into
5419 * raw_syscalls:sys_enter and there we get all
5420 * 6 syscall args plus the tracepoint common
5421 * fields and the syscall_nr (another long).
5422 * So we check if that is the case and if so
5423 * don't look after the sc->args_size but
5424 * always after the full raw_syscalls:sys_enter
5425 * payload, which is fixed.
5426 *
5427 * We'll revisit this later to pass
5428				 * sc->args_size to the BPF augmenter (now
5429 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
5430 * so that it copies only what we need for each
5431 * syscall, like what happens when we use
5432 * syscalls:sys_enter_NAME, so that we reduce
5433 * the kernel/userspace traffic to just what is
5434 * needed for each syscall.
5435 */
5436 if (trace.raw_augmented_syscalls)
5437 trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
5438 evsel__init_augmented_syscall_tp_ret(evsel);
5439 evsel->handler = trace__sys_exit;
5440 }
5441 }
5442 }
5443
5444 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
5445 return trace__record(&trace, argc-1, &argv[1]);
5446
5447 /* Using just --errno-summary will trigger --summary */
5448 if (trace.errno_summary && !trace.summary && !trace.summary_only)
5449 trace.summary_only = true;
5450
5451 /* summary_only implies summary option, but don't overwrite summary if set */
5452 if (trace.summary_only)
5453 trace.summary = trace.summary_only;
5454
5455 /* Keep exited threads, otherwise information might be lost for summary */
5456 if (trace.summary)
5457 symbol_conf.keep_exited_threads = true;
5458
5459 if (output_name != NULL) {
5460 err = trace__open_output(&trace, output_name);
5461 if (err < 0) {
5462 perror("failed to create output file");
5463 goto out;
5464 }
5465 }
5466
5467 err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
5468 if (err)
5469 goto out_close;
5470
5471 err = target__validate(&trace.opts.target);
5472 if (err) {
5473 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5474 fprintf(trace.output, "%s", bf);
5475 goto out_close;
5476 }
5477
5478 err = target__parse_uid(&trace.opts.target);
5479 if (err) {
5480 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5481 fprintf(trace.output, "%s", bf);
5482 goto out_close;
5483 }
5484
5485 if (!argc && target__none(&trace.opts.target))
5486 trace.opts.target.system_wide = true;
5487
5488 if (input_name)
5489 err = trace__replay(&trace);
5490 else
5491 err = trace__run(&trace, argc, argv);
5492
5493out_close:
5494 if (output_name != NULL)
5495 fclose(trace.output);
5496out:
5497 trace__exit(&trace);
5498#ifdef HAVE_BPF_SKEL
5499 augmented_raw_syscalls_bpf__destroy(trace.skel);
5500#endif
5501 return err;
5502}