Loading...
1/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 */
16
17#include "util/record.h"
18#include <traceevent/event-parse.h>
19#include <api/fs/tracing_path.h>
20#include <bpf/bpf.h>
21#include "util/bpf_map.h"
22#include "util/rlimit.h"
23#include "builtin.h"
24#include "util/cgroup.h"
25#include "util/color.h"
26#include "util/config.h"
27#include "util/debug.h"
28#include "util/dso.h"
29#include "util/env.h"
30#include "util/event.h"
31#include "util/evsel.h"
32#include "util/evsel_fprintf.h"
33#include "util/synthetic-events.h"
34#include "util/evlist.h"
35#include "util/evswitch.h"
36#include "util/mmap.h"
37#include <subcmd/pager.h>
38#include <subcmd/exec-cmd.h>
39#include "util/machine.h"
40#include "util/map.h"
41#include "util/symbol.h"
42#include "util/path.h"
43#include "util/session.h"
44#include "util/thread.h"
45#include <subcmd/parse-options.h>
46#include "util/strlist.h"
47#include "util/intlist.h"
48#include "util/thread_map.h"
49#include "util/stat.h"
50#include "util/tool.h"
51#include "util/util.h"
52#include "trace/beauty/beauty.h"
53#include "trace-event.h"
54#include "util/parse-events.h"
55#include "util/bpf-loader.h"
56#include "callchain.h"
57#include "print_binary.h"
58#include "string2.h"
59#include "syscalltbl.h"
60#include "rb_resort.h"
61#include "../perf.h"
62
63#include <errno.h>
64#include <inttypes.h>
65#include <poll.h>
66#include <signal.h>
67#include <stdlib.h>
68#include <string.h>
69#include <linux/err.h>
70#include <linux/filter.h>
71#include <linux/kernel.h>
72#include <linux/random.h>
73#include <linux/stringify.h>
74#include <linux/time64.h>
75#include <linux/zalloc.h>
76#include <fcntl.h>
77#include <sys/sysmacros.h>
78
79#include <linux/ctype.h>
80
/* Fallback definition, used only when the included headers do not provide O_CLOEXEC. */
#ifndef O_CLOEXEC
# define O_CLOEXEC 02000000
#endif

/*
 * Fallback definition, used only when the included headers do not provide
 * F_LINUX_SPECIFIC_BASE; it is the offset of the fcntl_linux_specific_cmds
 * string table.
 */
#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE 1024
#endif
88
89struct trace {
90 struct perf_tool tool;
91 struct syscalltbl *sctbl;
92 struct {
93 struct syscall *table;
94 struct bpf_map *map;
95 struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY
96 struct bpf_map *sys_enter,
97 *sys_exit;
98 } prog_array;
99 struct {
100 struct evsel *sys_enter,
101 *sys_exit,
102 *augmented;
103 } events;
104 struct bpf_program *unaugmented_prog;
105 } syscalls;
106 struct {
107 struct bpf_map *map;
108 } dump;
109 struct record_opts opts;
110 struct evlist *evlist;
111 struct machine *host;
112 struct thread *current;
113 struct bpf_object *bpf_obj;
114 struct cgroup *cgroup;
115 u64 base_time;
116 FILE *output;
117 unsigned long nr_events;
118 unsigned long nr_events_printed;
119 unsigned long max_events;
120 struct evswitch evswitch;
121 struct strlist *ev_qualifier;
122 struct {
123 size_t nr;
124 int *entries;
125 } ev_qualifier_ids;
126 struct {
127 size_t nr;
128 pid_t *entries;
129 struct bpf_map *map;
130 } filter_pids;
131 double duration_filter;
132 double runtime_ms;
133 struct {
134 u64 vfs_getname,
135 proc_getname;
136 } stats;
137 unsigned int max_stack;
138 unsigned int min_stack;
139 int raw_augmented_syscalls_args_size;
140 bool raw_augmented_syscalls;
141 bool fd_path_disabled;
142 bool sort_events;
143 bool not_ev_qualifier;
144 bool live;
145 bool full_time;
146 bool sched;
147 bool multiple_threads;
148 bool summary;
149 bool summary_only;
150 bool failure_only;
151 bool show_comm;
152 bool print_sample;
153 bool show_tool_stats;
154 bool trace_syscalls;
155 bool kernel_syscallchains;
156 s16 args_alignment;
157 bool show_tstamp;
158 bool show_duration;
159 bool show_zeros;
160 bool show_arg_names;
161 bool show_string_prefix;
162 bool force;
163 bool vfs_getname;
164 int trace_pgfaults;
165 struct {
166 struct ordered_events data;
167 u64 last;
168 } oe;
169};
170
/*
 * Accessor for one field in the raw data of a sample: the byte offset of the
 * field plus the function used to read it, either as an integer value
 * (->integer) or as a pointer into the raw data (->pointer). Only one of
 * the two readers is set for a given field, as they share storage.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
178
179#define TP_UINT_FIELD(bits) \
180static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
181{ \
182 u##bits value; \
183 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
184 return value; \
185}
186
187TP_UINT_FIELD(8);
188TP_UINT_FIELD(16);
189TP_UINT_FIELD(32);
190TP_UINT_FIELD(64);
191
192#define TP_UINT_FIELD__SWAPPED(bits) \
193static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
194{ \
195 u##bits value; \
196 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
197 return bswap_##bits(value);\
198}
199
200TP_UINT_FIELD__SWAPPED(16);
201TP_UINT_FIELD__SWAPPED(32);
202TP_UINT_FIELD__SWAPPED(64);
203
204static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
205{
206 field->offset = offset;
207
208 switch (size) {
209 case 1:
210 field->integer = tp_field__u8;
211 break;
212 case 2:
213 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
214 break;
215 case 4:
216 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
217 break;
218 case 8:
219 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
220 break;
221 default:
222 return -1;
223 }
224
225 return 0;
226}
227
228static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
229{
230 return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
231}
232
233static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
234{
235 return sample->raw_data + field->offset;
236}
237
238static int __tp_field__init_ptr(struct tp_field *field, int offset)
239{
240 field->offset = offset;
241 field->pointer = tp_field__ptr;
242 return 0;
243}
244
245static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
246{
247 return __tp_field__init_ptr(field, format_field->offset);
248}
249
/*
 * Field accessors kept in evsel->priv for syscall tracepoints: the syscall
 * id plus either the args or the ret accessor. 'args' and 'ret' share
 * storage, so only one of them is meaningful for a given evsel.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
256
257static int perf_evsel__init_tp_uint_field(struct evsel *evsel,
258 struct tp_field *field,
259 const char *name)
260{
261 struct tep_format_field *format_field = perf_evsel__field(evsel, name);
262
263 if (format_field == NULL)
264 return -1;
265
266 return tp_field__init_uint(field, format_field, evsel->needs_swap);
267}
268
/*
 * Initialize the unsigned integer accessor for the member @name of the
 * struct syscall_tp hanging off @evsel->priv, using the tracepoint field
 * with that same name. Evaluates to the perf_evsel__init_tp_uint_field()
 * result.
 *
 * @evsel is parenthesized so that any pointer expression can be passed,
 * note that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
272
/*
 * Look up the tracepoint field @name in @evsel and set up @field to return
 * a pointer to it inside the sample raw data.
 *
 * Returns -1 if the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct tep_format_field *fmt_field = perf_evsel__field(evsel, name);

	if (fmt_field != NULL)
		return tp_field__init_ptr(field, fmt_field);

	return -1;
}
284
/*
 * Initialize the pointer accessor for the member @name of the struct
 * syscall_tp hanging off @evsel->priv, using the tracepoint field with that
 * same name. Evaluates to the perf_evsel__init_tp_ptr_field() result.
 *
 * @evsel is parenthesized so that any pointer expression can be passed,
 * note that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
288
/* Release the private area hanging off @evsel and then delete the evsel itself. */
static void evsel__delete_priv(struct evsel *evsel)
{
	zfree(&evsel->priv);
	evsel__delete(evsel);
}
294
295static int perf_evsel__init_syscall_tp(struct evsel *evsel)
296{
297 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
298
299 if (evsel->priv != NULL) {
300 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
301 perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
302 goto out_delete;
303 return 0;
304 }
305
306 return -ENOMEM;
307out_delete:
308 zfree(&evsel->priv);
309 return -ENOENT;
310}
311
312static int perf_evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
313{
314 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
315
316 if (evsel->priv != NULL) {
317 struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
318 if (syscall_id == NULL)
319 syscall_id = perf_evsel__field(tp, "__syscall_nr");
320 if (syscall_id == NULL)
321 goto out_delete;
322 if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
323 goto out_delete;
324
325 return 0;
326 }
327
328 return -ENOMEM;
329out_delete:
330 zfree(&evsel->priv);
331 return -EINVAL;
332}
333
334static int perf_evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
335{
336 struct syscall_tp *sc = evsel->priv;
337
338 return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
339}
340
341static int perf_evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
342{
343 struct syscall_tp *sc = evsel->priv;
344
345 return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
346}
347
348static int perf_evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
349{
350 evsel->priv = malloc(sizeof(struct syscall_tp));
351 if (evsel->priv != NULL) {
352 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
353 goto out_delete;
354
355 evsel->handler = handler;
356 return 0;
357 }
358
359 return -ENOMEM;
360
361out_delete:
362 zfree(&evsel->priv);
363 return -ENOENT;
364}
365
/*
 * Create the evsel for the raw_syscalls:@direction tracepoint, with its
 * struct syscall_tp private area initialized and @handler installed.
 *
 * Returns the new evsel, or NULL if the tracepoint could not be created or
 * initialized.
 */
static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_raw_syscall_tp(evsel, handler) == 0)
		return evsel;

	evsel__delete_priv(evsel);
	return NULL;
}
386
/*
 * Read the member @name of the struct syscall_tp hanging off @evsel->priv
 * from @sample, as an integer. The macro arguments are parenthesized so
 * that arbitrary expressions can be passed.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, (sample)); })

/*
 * Same as perf_evsel__sc_tp_uint(), but yields a pointer to the member
 * inside the sample raw data.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, (sample)); })
394
395size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
396{
397 int idx = val - sa->offset;
398
399 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
400 size_t printed = scnprintf(bf, size, intfmt, val);
401 if (show_prefix)
402 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
403 return printed;
404 }
405
406 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
407}
408
/*
 * Format arg->val using the struct strarray passed in arg->parm, falling
 * back to @intfmt for values that are not in the table.
 */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
}
415
/*
 * Syscall argument formatter for values described by a struct strarray,
 * values not in the table are printed in decimal ("%d").
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
423
/*
 * Syscall argument formatter that hands arg->val and the struct strarray in
 * arg->parm to strarray__scnprintf_flags().
 */
size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
{
	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
}
428
429size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
430{
431 size_t printed;
432 int i;
433
434 for (i = 0; i < sas->nr_entries; ++i) {
435 struct strarray *sa = sas->entries[i];
436 int idx = val - sa->offset;
437
438 if (idx >= 0 && idx < sa->nr_entries) {
439 if (sa->entries[idx] == NULL)
440 break;
441 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
442 }
443 }
444
445 printed = scnprintf(bf, size, intfmt, val);
446 if (show_prefix)
447 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
448 return printed;
449}
450
/*
 * Syscall argument formatter for values described by a struct strarrays
 * passed in arg->parm, values not in any table are printed in decimal.
 */
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
}
456
457#ifndef AT_FDCWD
458#define AT_FDCWD -100
459#endif
460
461static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
462 struct syscall_arg *arg)
463{
464 int fd = arg->val;
465 const char *prefix = "AT_FD";
466
467 if (fd == AT_FDCWD)
468 return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
469
470 return syscall_arg__scnprintf_fd(bf, size, arg);
471}
472
473#define SCA_FDAT syscall_arg__scnprintf_fd_at
474
/* Forward declaration, needed by the syscall_fmts table; the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
479
/* Syscall argument formatter: print arg->val in hexadecimal, using "%#lx". */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
484
485size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
486{
487 if (arg->val == 0)
488 return scnprintf(bf, size, "NULL");
489 return syscall_arg__scnprintf_hex(bf, size, arg);
490}
491
/*
 * Syscall argument formatter: print arg->val as a decimal int.
 * NOTE(review): arg->val is printed with "%lx"/"%ld" elsewhere in this file,
 * so it looks wider than int; confirm that passing it to "%d" is intended.
 */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}
496
/* Syscall argument formatter: print arg->val as a decimal long, using "%ld". */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
501
/*
 * String tables used to pretty print syscall arguments. Each array is
 * indexed by the argument value (minus the table offset, when there is
 * one) and the string passed to DEFINE_STRARRAY*() is the prefix that is
 * optionally printed before the entry.
 */

/* bpf() 'cmd' argument */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd, "BPF_");

/* fsmount() 'flags' argument, printed via STRARRAY_FLAGS() */
static const char *fsmount_flags[] = {
	[1] = "CLOEXEC",
};
static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");

/* fsconfig() 'cmd' argument, the fsconfig_cmds array comes from the generated file */
#include "trace/beauty/generated/fsconfig_arrays.c"

static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");

/* epoll_ctl() 'op' argument, the table starts at value 1 */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);

/* getitimer()/setitimer() 'which' argument */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers, "ITIMER_");

/* keyctl() 'option' argument */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
531
/*
 * lseek() 'whence' argument. "DATA" and "HOLE" are only present when the
 * headers define SEEK_DATA and SEEK_HOLE. The entries are positional, so
 * this layout assumes SEEK_HOLE is never defined without SEEK_DATA.
 */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences, "SEEK_");
541
/* fcntl() 'cmd' argument: the commands that start at value 0 */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds, "F_");

/*
 * fcntl() 'cmd' argument: the commands that start at F_LINUX_SPECIFIC_BASE.
 * Slots 3 and 4 are left empty (NULL).
 */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);

/* Both ranges together, this is what the "fcntl" syscall_fmts entry uses */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);
564
/* 'resource' argument of getrlimit(), setrlimit() and prlimit64() */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");

/* rt_sigprocmask() 'how' argument */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow, "SIG_");

/* clock_gettime() 'clk_id' argument */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid, "CLOCK_");
581
582static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
583 struct syscall_arg *arg)
584{
585 bool show_prefix = arg->show_string_prefix;
586 const char *suffix = "_OK";
587 size_t printed = 0;
588 int mode = arg->val;
589
590 if (mode == F_OK) /* 0 */
591 return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
592#define P_MODE(n) \
593 if (mode & n##_OK) { \
594 printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
595 mode &= ~n##_OK; \
596 }
597
598 P_MODE(R);
599 P_MODE(W);
600 P_MODE(X);
601#undef P_MODE
602
603 if (mode)
604 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
605
606 return printed;
607}
608
609#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
610
/* Forward declaration, needed by the syscall_fmts table; the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
615
616static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
617 struct syscall_arg *arg)
618{
619 bool show_prefix = arg->show_string_prefix;
620 const char *prefix = "O_";
621 int printed = 0, flags = arg->val;
622
623#define P_FLAG(n) \
624 if (flags & O_##n) { \
625 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
626 flags &= ~O_##n; \
627 }
628
629 P_FLAG(CLOEXEC);
630 P_FLAG(NONBLOCK);
631#undef P_FLAG
632
633 if (flags)
634 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
635
636 return printed;
637}
638
639#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
640
641#ifndef GRND_NONBLOCK
642#define GRND_NONBLOCK 0x0001
643#endif
644#ifndef GRND_RANDOM
645#define GRND_RANDOM 0x0002
646#endif
647
648static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
649 struct syscall_arg *arg)
650{
651 bool show_prefix = arg->show_string_prefix;
652 const char *prefix = "GRND_";
653 int printed = 0, flags = arg->val;
654
655#define P_FLAG(n) \
656 if (flags & GRND_##n) { \
657 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
658 flags &= ~GRND_##n; \
659 }
660
661 P_FLAG(RANDOM);
662 P_FLAG(NONBLOCK);
663#undef P_FLAG
664
665 if (flags)
666 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
667
668 return printed;
669}
670
671#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
672
/*
 * Initializers for a struct syscall_arg_fmt entry that is formatted via the
 * string table strarray__<array>. The 'name' parameter is not used in the
 * expansion, at the call sites it just documents the argument name.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }

/* Same as STRARRAY(), but the value is formatted with SCA_STRARRAY_FLAGS */
#define STRARRAY_FLAGS(name, array) \
	  { .scnprintf	= SCA_STRARRAY_FLAGS, \
	    .parm	= &strarray__##array, }
680
681#include "trace/beauty/arch_errno_names.c"
682#include "trace/beauty/eventfd.c"
683#include "trace/beauty/futex_op.c"
684#include "trace/beauty/futex_val3.c"
685#include "trace/beauty/mmap.c"
686#include "trace/beauty/mode_t.c"
687#include "trace/beauty/msg_flags.c"
688#include "trace/beauty/open_flags.c"
689#include "trace/beauty/perf_event_open.c"
690#include "trace/beauty/pid.c"
691#include "trace/beauty/sched_policy.c"
692#include "trace/beauty/seccomp.c"
693#include "trace/beauty/signum.c"
694#include "trace/beauty/socket_type.c"
695#include "trace/beauty/waitid_options.c"
696
/*
 * How to format one syscall argument:
 *
 * scnprintf: formatter, writes the argument into bf and returns the length
 * mask_val:  optional hook that receives the argument value and returns a value
 * parm:      formatter specific data, e.g. the string table for SCA_STRARRAY
 * name:      argument name, set by entries that also provide nr_args
 * show_zero: presumably forces printing of a zero value - confirm at the users
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
704
705static struct syscall_fmt {
706 const char *name;
707 const char *alias;
708 struct {
709 const char *sys_enter,
710 *sys_exit;
711 } bpf_prog_name;
712 struct syscall_arg_fmt arg[6];
713 u8 nr_args;
714 bool errpid;
715 bool timeout;
716 bool hexret;
717} syscall_fmts[] = {
718 { .name = "access",
719 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
720 { .name = "arch_prctl",
721 .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
722 [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
723 { .name = "bind",
724 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
725 [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ },
726 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
727 { .name = "bpf",
728 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
729 { .name = "brk", .hexret = true,
730 .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
731 { .name = "clock_gettime",
732 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
733 { .name = "clone", .errpid = true, .nr_args = 5,
734 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
735 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
736 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
737 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
738 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
739 { .name = "close",
740 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
741 { .name = "connect",
742 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
743 [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ },
744 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
745 { .name = "epoll_ctl",
746 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
747 { .name = "eventfd2",
748 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
749 { .name = "fchmodat",
750 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
751 { .name = "fchownat",
752 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
753 { .name = "fcntl",
754 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
755 .parm = &strarrays__fcntl_cmds_arrays,
756 .show_zero = true, },
757 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
758 { .name = "flock",
759 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
760 { .name = "fsconfig",
761 .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
762 { .name = "fsmount",
763 .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
764 [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
765 { .name = "fspick",
766 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
767 [1] = { .scnprintf = SCA_FILENAME, /* path */ },
768 [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
769 { .name = "fstat", .alias = "newfstat", },
770 { .name = "fstatat", .alias = "newfstatat", },
771 { .name = "futex",
772 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
773 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
774 { .name = "futimesat",
775 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
776 { .name = "getitimer",
777 .arg = { [0] = STRARRAY(which, itimers), }, },
778 { .name = "getpid", .errpid = true, },
779 { .name = "getpgid", .errpid = true, },
780 { .name = "getppid", .errpid = true, },
781 { .name = "getrandom",
782 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
783 { .name = "getrlimit",
784 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
785 { .name = "gettid", .errpid = true, },
786 { .name = "ioctl",
787 .arg = {
788#if defined(__i386__) || defined(__x86_64__)
789/*
790 * FIXME: Make this available to all arches.
791 */
792 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
793 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
794#else
795 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
796#endif
797 { .name = "kcmp", .nr_args = 5,
798 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
799 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
800 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
801 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
802 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
803 { .name = "keyctl",
804 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
805 { .name = "kill",
806 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
807 { .name = "linkat",
808 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
809 { .name = "lseek",
810 .arg = { [2] = STRARRAY(whence, whences), }, },
811 { .name = "lstat", .alias = "newlstat", },
812 { .name = "madvise",
813 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
814 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
815 { .name = "mkdirat",
816 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
817 { .name = "mknodat",
818 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
819 { .name = "mmap", .hexret = true,
820/* The standard mmap maps to old_mmap on s390x */
821#if defined(__s390x__)
822 .alias = "old_mmap",
823#endif
824 .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
825 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ },
826 [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, },
827 { .name = "mount",
828 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
829 [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
830 .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
831 { .name = "move_mount",
832 .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ },
833 [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ },
834 [2] = { .scnprintf = SCA_FDAT, /* to_dfd */ },
835 [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ },
836 [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
837 { .name = "mprotect",
838 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
839 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
840 { .name = "mq_unlink",
841 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
842 { .name = "mremap", .hexret = true,
843 .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
844 { .name = "name_to_handle_at",
845 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
846 { .name = "newfstatat",
847 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
848 { .name = "open",
849 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
850 { .name = "open_by_handle_at",
851 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
852 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
853 { .name = "openat",
854 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
855 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
856 { .name = "perf_event_open",
857 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
858 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
859 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
860 { .name = "pipe2",
861 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
862 { .name = "pkey_alloc",
863 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
864 { .name = "pkey_free",
865 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
866 { .name = "pkey_mprotect",
867 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
868 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
869 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
870 { .name = "poll", .timeout = true, },
871 { .name = "ppoll", .timeout = true, },
872 { .name = "prctl",
873 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
874 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
875 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
876 { .name = "pread", .alias = "pread64", },
877 { .name = "preadv", .alias = "pread", },
878 { .name = "prlimit64",
879 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
880 { .name = "pwrite", .alias = "pwrite64", },
881 { .name = "readlinkat",
882 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
883 { .name = "recvfrom",
884 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
885 { .name = "recvmmsg",
886 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
887 { .name = "recvmsg",
888 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
889 { .name = "renameat",
890 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
891 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
892 { .name = "renameat2",
893 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
894 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
895 [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
896 { .name = "rt_sigaction",
897 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
898 { .name = "rt_sigprocmask",
899 .arg = { [0] = STRARRAY(how, sighow), }, },
900 { .name = "rt_sigqueueinfo",
901 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
902 { .name = "rt_tgsigqueueinfo",
903 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
904 { .name = "sched_setscheduler",
905 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
906 { .name = "seccomp",
907 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
908 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
909 { .name = "select", .timeout = true, },
910 { .name = "sendfile", .alias = "sendfile64", },
911 { .name = "sendmmsg",
912 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
913 { .name = "sendmsg",
914 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
915 { .name = "sendto",
916 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
917 [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
918 { .name = "set_tid_address", .errpid = true, },
919 { .name = "setitimer",
920 .arg = { [0] = STRARRAY(which, itimers), }, },
921 { .name = "setrlimit",
922 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
923 { .name = "socket",
924 .arg = { [0] = STRARRAY(family, socket_families),
925 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
926 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
927 { .name = "socketpair",
928 .arg = { [0] = STRARRAY(family, socket_families),
929 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
930 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
931 { .name = "stat", .alias = "newstat", },
932 { .name = "statx",
933 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
934 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
935 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
936 { .name = "swapoff",
937 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
938 { .name = "swapon",
939 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
940 { .name = "symlinkat",
941 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
942 { .name = "sync_file_range",
943 .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
944 { .name = "tgkill",
945 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
946 { .name = "tkill",
947 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
948 { .name = "umount2", .alias = "umount",
949 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
950 { .name = "uname", .alias = "newuname", },
951 { .name = "unlinkat",
952 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
953 { .name = "utimensat",
954 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
955 { .name = "wait4", .errpid = true,
956 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
957 { .name = "waitid", .errpid = true,
958 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
959};
960
961static int syscall_fmt__cmp(const void *name, const void *fmtp)
962{
963 const struct syscall_fmt *fmt = fmtp;
964 return strcmp(name, fmt->name);
965}
966
967static struct syscall_fmt *syscall_fmt__find(const char *name)
968{
969 const int nmemb = ARRAY_SIZE(syscall_fmts);
970 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
971}
972
973static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
974{
975 int i, nmemb = ARRAY_SIZE(syscall_fmts);
976
977 for (i = 0; i < nmemb; ++i) {
978 if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
979 return &syscall_fmts[i];
980 }
981
982 return NULL;
983}
984
/*
 * Per syscall id state, filled in by trace__read_syscall_info().
 *
 * is_exit:     set for "exit" and "exit_group"
 * is_open:     set for "open" and "openat", used to associate the fd returned
 *              in sys_exit with the pathname seen in sys_enter
 * args_size:   sum of the sizes of the syscall arguments, whatever comes after
 *              that in the payload is augmented data (the pathname for
 *              openat, etc)
 * nonexistent: just a hole in the syscall table, i.e. the id is not allocated
 */
struct syscall {
	struct tep_event	*tp_format;
	int			nr_args;
	int			args_size;
	struct {
		struct bpf_program	*sys_enter;
		struct bpf_program	*sys_exit;
	} bpf_prog;
	bool			is_exit;
	bool			is_open;
	bool			nonexistent;
	struct tep_format_field	*args;
	const char		*name;
	struct syscall_fmt	*fmt;
	struct syscall_arg_fmt	*arg_fmt;
};
1007
1008/*
1009 * Must match what is in the BPF program:
1010 *
1011 * tools/perf/examples/bpf/augmented_raw_syscalls.c
1012 */
1013struct bpf_map_syscall_entry {
1014 bool enabled;
1015 u16 string_args_len[6];
1016};
1017
1018/*
1019 * We need to have this 'calculated' boolean because in some cases we really
1020 * don't know what is the duration of a syscall, for instance, when we start
1021 * a session and some threads are waiting for a syscall to finish, say 'poll',
1022 * in which case all we can do is to print "( ? ) for duration and for the
1023 * start timestamp.
1024 */
1025static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
1026{
1027 double duration = (double)t / NSEC_PER_MSEC;
1028 size_t printed = fprintf(fp, "(");
1029
1030 if (!calculated)
1031 printed += fprintf(fp, " ");
1032 else if (duration >= 1.0)
1033 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1034 else if (duration >= 0.01)
1035 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1036 else
1037 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1038 return printed + fprintf(fp, "): ");
1039}
1040
1041/**
1042 * filename.ptr: The filename char pointer that will be vfs_getname'd
1043 * filename.entry_str_pos: Where to insert the string translated from
1044 * filename.ptr by the vfs_getname tracepoint/kprobe.
1045 * ret_scnprintf: syscall args may set this to a different syscall return
1046 * formatter, for instance, fcntl may return fds, file flags, etc.
1047 */
1048struct thread_trace {
1049 u64 entry_time;
1050 bool entry_pending;
1051 unsigned long nr_events;
1052 unsigned long pfmaj, pfmin;
1053 char *entry_str;
1054 double runtime_ms;
1055 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1056 struct {
1057 unsigned long ptr;
1058 short int entry_str_pos;
1059 bool pending_open;
1060 unsigned int namelen;
1061 char *name;
1062 } filename;
1063 struct {
1064 int max;
1065 struct file *table;
1066 } files;
1067
1068 struct intlist *syscall_stats;
1069};
1070
1071static struct thread_trace *thread_trace__new(void)
1072{
1073 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1074
1075 if (ttrace) {
1076 ttrace->files.max = -1;
1077 ttrace->syscall_stats = intlist__new(NULL);
1078 }
1079
1080 return ttrace;
1081}
1082
1083static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1084{
1085 struct thread_trace *ttrace;
1086
1087 if (thread == NULL)
1088 goto fail;
1089
1090 if (thread__priv(thread) == NULL)
1091 thread__set_priv(thread, thread_trace__new());
1092
1093 if (thread__priv(thread) == NULL)
1094 goto fail;
1095
1096 ttrace = thread__priv(thread);
1097 ++ttrace->nr_events;
1098
1099 return ttrace;
1100fail:
1101 color_fprintf(fp, PERF_COLOR_RED,
1102 "WARNING: not enough memory, dropping samples!\n");
1103 return NULL;
1104}
1105
1106
1107void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1108 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1109{
1110 struct thread_trace *ttrace = thread__priv(arg->thread);
1111
1112 ttrace->ret_scnprintf = ret_scnprintf;
1113}
1114
/* Bit flags, presumably selecting major/minor page fault tracing - TODO confirm */
#define TRACE_PFMAJ (1 << 0)
#define TRACE_PFMIN (1 << 1)

/* Size of the per thread buffer where the sys_enter line is formatted */
static const size_t trace__entry_str_size = 2048;
1119
1120static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1121{
1122 if (fd < 0)
1123 return NULL;
1124
1125 if (fd > ttrace->files.max) {
1126 struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1127
1128 if (nfiles == NULL)
1129 return NULL;
1130
1131 if (ttrace->files.max != -1) {
1132 memset(nfiles + ttrace->files.max + 1, 0,
1133 (fd - ttrace->files.max) * sizeof(struct file));
1134 } else {
1135 memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1136 }
1137
1138 ttrace->files.table = nfiles;
1139 ttrace->files.max = fd;
1140 }
1141
1142 return ttrace->files.table + fd;
1143}
1144
/*
 * Return the file table slot for @fd in @thread's tracing state, growing the
 * table if needed.
 *
 * Returns NULL when @thread has no tracing state attached, when @fd is
 * negative or when the table can't be grown.
 */
struct file *thread__files_entry(struct thread *thread, int fd)
{
	struct thread_trace *ttrace = thread__priv(thread);

	/* Same guard as thread__fd_path(): no per thread state, no table */
	if (ttrace == NULL)
		return NULL;

	return thread_trace__files_entry(ttrace, fd);
}
1149
1150static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1151{
1152 struct thread_trace *ttrace = thread__priv(thread);
1153 struct file *file = thread_trace__files_entry(ttrace, fd);
1154
1155 if (file != NULL) {
1156 struct stat st;
1157 if (stat(pathname, &st) == 0)
1158 file->dev_maj = major(st.st_rdev);
1159 file->pathname = strdup(pathname);
1160 if (file->pathname)
1161 return 0;
1162 }
1163
1164 return -1;
1165}
1166
1167static int thread__read_fd_path(struct thread *thread, int fd)
1168{
1169 char linkname[PATH_MAX], pathname[PATH_MAX];
1170 struct stat st;
1171 int ret;
1172
1173 if (thread->pid_ == thread->tid) {
1174 scnprintf(linkname, sizeof(linkname),
1175 "/proc/%d/fd/%d", thread->pid_, fd);
1176 } else {
1177 scnprintf(linkname, sizeof(linkname),
1178 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1179 }
1180
1181 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1182 return -1;
1183
1184 ret = readlink(linkname, pathname, sizeof(pathname));
1185
1186 if (ret < 0 || ret > st.st_size)
1187 return -1;
1188
1189 pathname[ret] = '\0';
1190 return trace__set_fd_pathname(thread, fd, pathname);
1191}
1192
1193static const char *thread__fd_path(struct thread *thread, int fd,
1194 struct trace *trace)
1195{
1196 struct thread_trace *ttrace = thread__priv(thread);
1197
1198 if (ttrace == NULL || trace->fd_path_disabled)
1199 return NULL;
1200
1201 if (fd < 0)
1202 return NULL;
1203
1204 if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1205 if (!trace->live)
1206 return NULL;
1207 ++trace->stats.proc_getname;
1208 if (thread__read_fd_path(thread, fd))
1209 return NULL;
1210 }
1211
1212 return ttrace->files.table[fd].pathname;
1213}
1214
1215size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1216{
1217 int fd = arg->val;
1218 size_t printed = scnprintf(bf, size, "%d", fd);
1219 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1220
1221 if (path)
1222 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1223
1224 return printed;
1225}
1226
1227size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1228{
1229 size_t printed = scnprintf(bf, size, "%d", fd);
1230 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1231
1232 if (thread) {
1233 const char *path = thread__fd_path(thread, fd, trace);
1234
1235 if (path)
1236 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1237
1238 thread__put(thread);
1239 }
1240
1241 return printed;
1242}
1243
1244static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1245 struct syscall_arg *arg)
1246{
1247 int fd = arg->val;
1248 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1249 struct thread_trace *ttrace = thread__priv(arg->thread);
1250
1251 if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1252 zfree(&ttrace->files.table[fd].pathname);
1253
1254 return printed;
1255}
1256
1257static void thread__set_filename_pos(struct thread *thread, const char *bf,
1258 unsigned long ptr)
1259{
1260 struct thread_trace *ttrace = thread__priv(thread);
1261
1262 ttrace->filename.ptr = ptr;
1263 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1264}
1265
1266static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1267{
1268 struct augmented_arg *augmented_arg = arg->augmented.args;
1269 size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1270 /*
1271 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1272 * we would have two strings, each prefixed by its size.
1273 */
1274 int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1275
1276 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1277 arg->augmented.size -= consumed;
1278
1279 return printed;
1280}
1281
1282static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1283 struct syscall_arg *arg)
1284{
1285 unsigned long ptr = arg->val;
1286
1287 if (arg->augmented.args)
1288 return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1289
1290 if (!arg->trace->vfs_getname)
1291 return scnprintf(bf, size, "%#x", ptr);
1292
1293 thread__set_filename_pos(arg->thread, bf, ptr);
1294 return 0;
1295}
1296
1297static bool trace__filter_duration(struct trace *trace, double t)
1298{
1299 return t < (trace->duration_filter * NSEC_PER_MSEC);
1300}
1301
1302static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1303{
1304 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1305
1306 return fprintf(fp, "%10.3f ", ts);
1307}
1308
1309/*
1310 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1311 * using ttrace->entry_time for a thread that receives a sys_exit without
1312 * first having received a sys_enter ("poll" issued before tracing session
1313 * starts, lost sys_enter exit due to ring buffer overflow).
1314 */
1315static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1316{
1317 if (tstamp > 0)
1318 return __trace__fprintf_tstamp(trace, tstamp, fp);
1319
1320 return fprintf(fp, " ? ");
1321}
1322
/*
 * Flags set from sig_handler():
 *
 * done:        some handled signal arrived, time to wrap up
 * interrupted: the last handled signal was SIGINT
 *
 * They are written asynchronously from a signal handler, so they must be
 * 'volatile sig_atomic_t': a plain bool gives no guarantee that the store is
 * performed atomically nor that readers outside the handler reload it.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/* Signal handler: only records that, and which kind of, signal was received */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1331
1332static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1333{
1334 size_t printed = 0;
1335
1336 if (trace->multiple_threads) {
1337 if (trace->show_comm)
1338 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1339 printed += fprintf(fp, "%d ", thread->tid);
1340 }
1341
1342 return printed;
1343}
1344
1345static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1346 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1347{
1348 size_t printed = 0;
1349
1350 if (trace->show_tstamp)
1351 printed = trace__fprintf_tstamp(trace, tstamp, fp);
1352 if (trace->show_duration)
1353 printed += fprintf_duration(duration, duration_calculated, fp);
1354 return printed + trace__fprintf_comm_tid(trace, thread, fp);
1355}
1356
1357static int trace__process_event(struct trace *trace, struct machine *machine,
1358 union perf_event *event, struct perf_sample *sample)
1359{
1360 int ret = 0;
1361
1362 switch (event->header.type) {
1363 case PERF_RECORD_LOST:
1364 color_fprintf(trace->output, PERF_COLOR_RED,
1365 "LOST %" PRIu64 " events!\n", event->lost.lost);
1366 ret = machine__process_lost_event(machine, event, sample);
1367 break;
1368 default:
1369 ret = machine__process_event(machine, event, sample);
1370 break;
1371 }
1372
1373 return ret;
1374}
1375
1376static int trace__tool_process(struct perf_tool *tool,
1377 union perf_event *event,
1378 struct perf_sample *sample,
1379 struct machine *machine)
1380{
1381 struct trace *trace = container_of(tool, struct trace, tool);
1382 return trace__process_event(trace, machine, event, sample);
1383}
1384
1385static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1386{
1387 struct machine *machine = vmachine;
1388
1389 if (machine->kptr_restrict_warned)
1390 return NULL;
1391
1392 if (symbol_conf.kptr_restrict) {
1393 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1394 "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1395 "Kernel samples will not be resolved.\n");
1396 machine->kptr_restrict_warned = true;
1397 return NULL;
1398 }
1399
1400 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1401}
1402
1403static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
1404{
1405 int err = symbol__init(NULL);
1406
1407 if (err)
1408 return err;
1409
1410 trace->host = machine__new_host();
1411 if (trace->host == NULL)
1412 return -ENOMEM;
1413
1414 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1415 if (err < 0)
1416 goto out;
1417
1418 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1419 evlist->core.threads, trace__tool_process, false,
1420 1);
1421out:
1422 if (err)
1423 symbol__exit();
1424
1425 return err;
1426}
1427
1428static void trace__symbols__exit(struct trace *trace)
1429{
1430 machine__exit(trace->host);
1431 trace->host = NULL;
1432
1433 symbol__exit();
1434}
1435
1436static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1437{
1438 int idx;
1439
1440 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1441 nr_args = sc->fmt->nr_args;
1442
1443 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1444 if (sc->arg_fmt == NULL)
1445 return -1;
1446
1447 for (idx = 0; idx < nr_args; ++idx) {
1448 if (sc->fmt)
1449 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1450 }
1451
1452 sc->nr_args = nr_args;
1453 return 0;
1454}
1455
1456static int syscall__set_arg_fmts(struct syscall *sc)
1457{
1458 struct tep_format_field *field, *last_field = NULL;
1459 int idx = 0, len;
1460
1461 for (field = sc->args; field; field = field->next, ++idx) {
1462 last_field = field;
1463
1464 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1465 continue;
1466
1467 len = strlen(field->name);
1468
1469 if (strcmp(field->type, "const char *") == 0 &&
1470 ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
1471 strstr(field->name, "path") != NULL))
1472 sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1473 else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
1474 sc->arg_fmt[idx].scnprintf = SCA_PTR;
1475 else if (strcmp(field->type, "pid_t") == 0)
1476 sc->arg_fmt[idx].scnprintf = SCA_PID;
1477 else if (strcmp(field->type, "umode_t") == 0)
1478 sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1479 else if ((strcmp(field->type, "int") == 0 ||
1480 strcmp(field->type, "unsigned int") == 0 ||
1481 strcmp(field->type, "long") == 0) &&
1482 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
1483 /*
1484 * /sys/kernel/tracing/events/syscalls/sys_enter*
1485 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1486 * 65 int
1487 * 23 unsigned int
1488 * 7 unsigned long
1489 */
1490 sc->arg_fmt[idx].scnprintf = SCA_FD;
1491 }
1492 }
1493
1494 if (last_field)
1495 sc->args_size = last_field->offset + last_field->size;
1496
1497 return 0;
1498}
1499
1500static int trace__read_syscall_info(struct trace *trace, int id)
1501{
1502 char tp_name[128];
1503 struct syscall *sc;
1504 const char *name = syscalltbl__name(trace->sctbl, id);
1505
1506 if (trace->syscalls.table == NULL) {
1507 trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
1508 if (trace->syscalls.table == NULL)
1509 return -ENOMEM;
1510 }
1511
1512 sc = trace->syscalls.table + id;
1513 if (sc->nonexistent)
1514 return 0;
1515
1516 if (name == NULL) {
1517 sc->nonexistent = true;
1518 return 0;
1519 }
1520
1521 sc->name = name;
1522 sc->fmt = syscall_fmt__find(sc->name);
1523
1524 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1525 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1526
1527 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1528 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1529 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1530 }
1531
1532 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1533 return -ENOMEM;
1534
1535 if (IS_ERR(sc->tp_format))
1536 return PTR_ERR(sc->tp_format);
1537
1538 sc->args = sc->tp_format->format.fields;
1539 /*
1540 * We need to check and discard the first variable '__syscall_nr'
1541 * or 'nr' that mean the syscall number. It is needless here.
1542 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1543 */
1544 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1545 sc->args = sc->args->next;
1546 --sc->nr_args;
1547 }
1548
1549 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1550 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1551
1552 return syscall__set_arg_fmts(sc);
1553}
1554
/*
 * qsort()/bsearch() comparator for arrays of int.
 *
 * Returns a negative value, zero or a positive value when *a is respectively
 * less than, equal to or greater than *b. Compares instead of subtracting,
 * as the difference of two ints may overflow, which is undefined behavior.
 */
static int intcmp(const void *a, const void *b)
{
	const int one = *(const int *)a, another = *(const int *)b;

	return (one > another) - (one < another);
}
1561
1562static int trace__validate_ev_qualifier(struct trace *trace)
1563{
1564 int err = 0;
1565 bool printed_invalid_prefix = false;
1566 struct str_node *pos;
1567 size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
1568
1569 trace->ev_qualifier_ids.entries = malloc(nr_allocated *
1570 sizeof(trace->ev_qualifier_ids.entries[0]));
1571
1572 if (trace->ev_qualifier_ids.entries == NULL) {
1573 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1574 trace->output);
1575 err = -EINVAL;
1576 goto out;
1577 }
1578
1579 strlist__for_each_entry(pos, trace->ev_qualifier) {
1580 const char *sc = pos->s;
1581 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1582
1583 if (id < 0) {
1584 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1585 if (id >= 0)
1586 goto matches;
1587
1588 if (!printed_invalid_prefix) {
1589 pr_debug("Skipping unknown syscalls: ");
1590 printed_invalid_prefix = true;
1591 } else {
1592 pr_debug(", ");
1593 }
1594
1595 pr_debug("%s", sc);
1596 continue;
1597 }
1598matches:
1599 trace->ev_qualifier_ids.entries[nr_used++] = id;
1600 if (match_next == -1)
1601 continue;
1602
1603 while (1) {
1604 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1605 if (id < 0)
1606 break;
1607 if (nr_allocated == nr_used) {
1608 void *entries;
1609
1610 nr_allocated += 8;
1611 entries = realloc(trace->ev_qualifier_ids.entries,
1612 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1613 if (entries == NULL) {
1614 err = -ENOMEM;
1615 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1616 goto out_free;
1617 }
1618 trace->ev_qualifier_ids.entries = entries;
1619 }
1620 trace->ev_qualifier_ids.entries[nr_used++] = id;
1621 }
1622 }
1623
1624 trace->ev_qualifier_ids.nr = nr_used;
1625 qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
1626out:
1627 if (printed_invalid_prefix)
1628 pr_debug("\n");
1629 return err;
1630out_free:
1631 zfree(&trace->ev_qualifier_ids.entries);
1632 trace->ev_qualifier_ids.nr = 0;
1633 goto out;
1634}
1635
1636static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
1637{
1638 bool in_ev_qualifier;
1639
1640 if (trace->ev_qualifier_ids.nr == 0)
1641 return true;
1642
1643 in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
1644 trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
1645
1646 if (in_ev_qualifier)
1647 return !trace->not_ev_qualifier;
1648
1649 return trace->not_ev_qualifier;
1650}
1651
1652/*
1653 * args is to be interpreted as a series of longs but we need to handle
1654 * 8-byte unaligned accesses. args points to raw_data within the event
1655 * and raw_data is guaranteed to be 8-byte unaligned because it is
1656 * preceded by raw_size which is a u32. So we need to copy args to a temp
1657 * variable to read it. Most notably this avoids extended load instructions
1658 * on unaligned addresses
1659 */
1660unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1661{
1662 unsigned long val;
1663 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1664
1665 memcpy(&val, p, sizeof(val));
1666 return val;
1667}
1668
1669static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1670 struct syscall_arg *arg)
1671{
1672 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1673 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1674
1675 return scnprintf(bf, size, "arg%d: ", arg->idx);
1676}
1677
1678/*
1679 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
1680 * as mount 'flags' argument that needs ignoring some magic flag, see comment
1681 * in tools/perf/trace/beauty/mount_flags.c
1682 */
1683static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
1684{
1685 if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
1686 return sc->arg_fmt[arg->idx].mask_val(arg, val);
1687
1688 return val;
1689}
1690
1691static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1692 struct syscall_arg *arg, unsigned long val)
1693{
1694 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1695 arg->val = val;
1696 if (sc->arg_fmt[arg->idx].parm)
1697 arg->parm = sc->arg_fmt[arg->idx].parm;
1698 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1699 }
1700 return scnprintf(bf, size, "%ld", val);
1701}
1702
1703static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1704 unsigned char *args, void *augmented_args, int augmented_args_size,
1705 struct trace *trace, struct thread *thread)
1706{
1707 size_t printed = 0;
1708 unsigned long val;
1709 u8 bit = 1;
1710 struct syscall_arg arg = {
1711 .args = args,
1712 .augmented = {
1713 .size = augmented_args_size,
1714 .args = augmented_args,
1715 },
1716 .idx = 0,
1717 .mask = 0,
1718 .trace = trace,
1719 .thread = thread,
1720 .show_string_prefix = trace->show_string_prefix,
1721 };
1722 struct thread_trace *ttrace = thread__priv(thread);
1723
1724 /*
1725 * Things like fcntl will set this in its 'cmd' formatter to pick the
1726 * right formatter for the return value (an fd? file flags?), which is
1727 * not needed for syscalls that always return a given type, say an fd.
1728 */
1729 ttrace->ret_scnprintf = NULL;
1730
1731 if (sc->args != NULL) {
1732 struct tep_format_field *field;
1733
1734 for (field = sc->args; field;
1735 field = field->next, ++arg.idx, bit <<= 1) {
1736 if (arg.mask & bit)
1737 continue;
1738
1739 val = syscall_arg__val(&arg, arg.idx);
1740 /*
1741 * Some syscall args need some mask, most don't and
1742 * return val untouched.
1743 */
1744 val = syscall__mask_val(sc, &arg, val);
1745
1746 /*
1747 * Suppress this argument if its value is zero and
1748 * and we don't have a string associated in an
1749 * strarray for it.
1750 */
1751 if (val == 0 &&
1752 !trace->show_zeros &&
1753 !(sc->arg_fmt &&
1754 (sc->arg_fmt[arg.idx].show_zero ||
1755 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1756 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1757 sc->arg_fmt[arg.idx].parm))
1758 continue;
1759
1760 printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
1761
1762 if (trace->show_arg_names)
1763 printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
1764
1765 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1766 }
1767 } else if (IS_ERR(sc->tp_format)) {
1768 /*
1769 * If we managed to read the tracepoint /format file, then we
1770 * may end up not having any args, like with gettid(), so only
1771 * print the raw args when we didn't manage to read it.
1772 */
1773 while (arg.idx < sc->nr_args) {
1774 if (arg.mask & bit)
1775 goto next_arg;
1776 val = syscall_arg__val(&arg, arg.idx);
1777 if (printed)
1778 printed += scnprintf(bf + printed, size - printed, ", ");
1779 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1780 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1781next_arg:
1782 ++arg.idx;
1783 bit <<= 1;
1784 }
1785 }
1786
1787 return printed;
1788}
1789
/*
 * Signature of the functions that handle a sample for a given evsel, such as
 * trace__sys_enter().
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1793
1794static struct syscall *trace__syscall_info(struct trace *trace,
1795 struct evsel *evsel, int id)
1796{
1797 int err = 0;
1798
1799 if (id < 0) {
1800
1801 /*
1802 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1803 * before that, leaving at a higher verbosity level till that is
1804 * explained. Reproduced with plain ftrace with:
1805 *
1806 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1807 * grep "NR -1 " /t/trace_pipe
1808 *
1809 * After generating some load on the machine.
1810 */
1811 if (verbose > 1) {
1812 static u64 n;
1813 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1814 id, perf_evsel__name(evsel), ++n);
1815 }
1816 return NULL;
1817 }
1818
1819 err = -EINVAL;
1820
1821 if (id > trace->sctbl->syscalls.max_id)
1822 goto out_cant_read;
1823
1824 if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
1825 (err = trace__read_syscall_info(trace, id)) != 0)
1826 goto out_cant_read;
1827
1828 if (trace->syscalls.table[id].name == NULL) {
1829 if (trace->syscalls.table[id].nonexistent)
1830 return NULL;
1831 goto out_cant_read;
1832 }
1833
1834 return &trace->syscalls.table[id];
1835
1836out_cant_read:
1837 if (verbose > 0) {
1838 char sbuf[STRERR_BUFSIZE];
1839 fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
1840 if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
1841 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1842 fputs(" information\n", trace->output);
1843 }
1844 return NULL;
1845}
1846
1847static void thread__update_stats(struct thread_trace *ttrace,
1848 int id, struct perf_sample *sample)
1849{
1850 struct int_node *inode;
1851 struct stats *stats;
1852 u64 duration = 0;
1853
1854 inode = intlist__findnew(ttrace->syscall_stats, id);
1855 if (inode == NULL)
1856 return;
1857
1858 stats = inode->priv;
1859 if (stats == NULL) {
1860 stats = malloc(sizeof(struct stats));
1861 if (stats == NULL)
1862 return;
1863 init_stats(stats);
1864 inode->priv = stats;
1865 }
1866
1867 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1868 duration = sample->time - ttrace->entry_time;
1869
1870 update_stats(stats, duration);
1871}
1872
1873static int trace__printf_interrupted_entry(struct trace *trace)
1874{
1875 struct thread_trace *ttrace;
1876 size_t printed;
1877 int len;
1878
1879 if (trace->failure_only || trace->current == NULL)
1880 return 0;
1881
1882 ttrace = thread__priv(trace->current);
1883
1884 if (!ttrace->entry_pending)
1885 return 0;
1886
1887 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1888 printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
1889
1890 if (len < trace->args_alignment - 4)
1891 printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
1892
1893 printed += fprintf(trace->output, " ...\n");
1894
1895 ttrace->entry_pending = false;
1896 ++trace->nr_events_printed;
1897
1898 return printed;
1899}
1900
1901static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
1902 struct perf_sample *sample, struct thread *thread)
1903{
1904 int printed = 0;
1905
1906 if (trace->print_sample) {
1907 double ts = (double)sample->time / NSEC_PER_MSEC;
1908
1909 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1910 perf_evsel__name(evsel), ts,
1911 thread__comm_str(thread),
1912 sample->pid, sample->tid, sample->cpu);
1913 }
1914
1915 return printed;
1916}
1917
1918static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
1919{
1920 void *augmented_args = NULL;
1921 /*
1922 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
1923 * and there we get all 6 syscall args plus the tracepoint common fields
1924 * that gets calculated at the start and the syscall_nr (another long).
1925 * So we check if that is the case and if so don't look after the
1926 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
1927 * which is fixed.
1928 *
1929 * We'll revisit this later to pass s->args_size to the BPF augmenter
1930 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
1931 * copies only what we need for each syscall, like what happens when we
1932 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
1933 * traffic to just what is needed for each syscall.
1934 */
1935 int args_size = raw_augmented_args_size ?: sc->args_size;
1936
1937 *augmented_args_size = sample->raw_size - args_size;
1938 if (*augmented_args_size > 0)
1939 augmented_args = sample->raw_data + args_size;
1940
1941 return augmented_args;
1942}
1943
1944static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
1945 union perf_event *event __maybe_unused,
1946 struct perf_sample *sample)
1947{
1948 char *msg;
1949 void *args;
1950 int printed = 0;
1951 struct thread *thread;
1952 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1953 int augmented_args_size = 0;
1954 void *augmented_args = NULL;
1955 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1956 struct thread_trace *ttrace;
1957
1958 if (sc == NULL)
1959 return -1;
1960
1961 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1962 ttrace = thread__trace(thread, trace->output);
1963 if (ttrace == NULL)
1964 goto out_put;
1965
1966 trace__fprintf_sample(trace, evsel, sample, thread);
1967
1968 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1969
1970 if (ttrace->entry_str == NULL) {
1971 ttrace->entry_str = malloc(trace__entry_str_size);
1972 if (!ttrace->entry_str)
1973 goto out_put;
1974 }
1975
1976 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1977 trace__printf_interrupted_entry(trace);
1978 /*
1979 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
1980 * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
1981 * this breaks syscall__augmented_args() check for augmented args, as we calculate
1982 * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
1983 * so when handling, say the openat syscall, we end up getting 6 args for the
1984 * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
1985 * thinking that the extra 2 u64 args are the augmented filename, so just check
1986 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
1987 */
1988 if (evsel != trace->syscalls.events.sys_enter)
1989 augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1990 ttrace->entry_time = sample->time;
1991 msg = ttrace->entry_str;
1992 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1993
1994 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1995 args, augmented_args, augmented_args_size, trace, thread);
1996
1997 if (sc->is_exit) {
1998 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1999 int alignment = 0;
2000
2001 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
2002 printed = fprintf(trace->output, "%s)", ttrace->entry_str);
2003 if (trace->args_alignment > printed)
2004 alignment = trace->args_alignment - printed;
2005 fprintf(trace->output, "%*s= ?\n", alignment, " ");
2006 }
2007 } else {
2008 ttrace->entry_pending = true;
2009 /* See trace__vfs_getname & trace__sys_exit */
2010 ttrace->filename.pending_open = false;
2011 }
2012
2013 if (trace->current != thread) {
2014 thread__put(trace->current);
2015 trace->current = thread__get(thread);
2016 }
2017 err = 0;
2018out_put:
2019 thread__put(thread);
2020 return err;
2021}
2022
2023static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
2024 struct perf_sample *sample)
2025{
2026 struct thread_trace *ttrace;
2027 struct thread *thread;
2028 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2029 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2030 char msg[1024];
2031 void *args, *augmented_args = NULL;
2032 int augmented_args_size;
2033
2034 if (sc == NULL)
2035 return -1;
2036
2037 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2038 ttrace = thread__trace(thread, trace->output);
2039 /*
2040 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
2041 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
2042 */
2043 if (ttrace == NULL)
2044 goto out_put;
2045
2046 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
2047 augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
2048 syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
2049 fprintf(trace->output, "%s", msg);
2050 err = 0;
2051out_put:
2052 thread__put(thread);
2053 return err;
2054}
2055
2056static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
2057 struct perf_sample *sample,
2058 struct callchain_cursor *cursor)
2059{
2060 struct addr_location al;
2061 int max_stack = evsel->core.attr.sample_max_stack ?
2062 evsel->core.attr.sample_max_stack :
2063 trace->max_stack;
2064 int err;
2065
2066 if (machine__resolve(trace->host, &al, sample) < 0)
2067 return -1;
2068
2069 err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2070 addr_location__put(&al);
2071 return err;
2072}
2073
2074static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2075{
2076 /* TODO: user-configurable print_opts */
2077 const unsigned int print_opts = EVSEL__PRINT_SYM |
2078 EVSEL__PRINT_DSO |
2079 EVSEL__PRINT_UNKNOWN_AS_ADDR;
2080
2081 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output);
2082}
2083
/*
 * Translate the errno value @err into its name, using the errno table of
 * the architecture found in the perf_env associated with @evsel.
 */
static const char *errno_to_name(struct evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
2091
2092static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
2093 union perf_event *event __maybe_unused,
2094 struct perf_sample *sample)
2095{
2096 long ret;
2097 u64 duration = 0;
2098 bool duration_calculated = false;
2099 struct thread *thread;
2100 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
2101 int alignment = trace->args_alignment;
2102 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2103 struct thread_trace *ttrace;
2104
2105 if (sc == NULL)
2106 return -1;
2107
2108 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2109 ttrace = thread__trace(thread, trace->output);
2110 if (ttrace == NULL)
2111 goto out_put;
2112
2113 trace__fprintf_sample(trace, evsel, sample, thread);
2114
2115 if (trace->summary)
2116 thread__update_stats(ttrace, id, sample);
2117
2118 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2119
2120 if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2121 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2122 ttrace->filename.pending_open = false;
2123 ++trace->stats.vfs_getname;
2124 }
2125
2126 if (ttrace->entry_time) {
2127 duration = sample->time - ttrace->entry_time;
2128 if (trace__filter_duration(trace, duration))
2129 goto out;
2130 duration_calculated = true;
2131 } else if (trace->duration_filter)
2132 goto out;
2133
2134 if (sample->callchain) {
2135 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2136 if (callchain_ret == 0) {
2137 if (callchain_cursor.nr < trace->min_stack)
2138 goto out;
2139 callchain_ret = 1;
2140 }
2141 }
2142
2143 if (trace->summary_only || (ret >= 0 && trace->failure_only))
2144 goto out;
2145
2146 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2147
2148 if (ttrace->entry_pending) {
2149 printed = fprintf(trace->output, "%s", ttrace->entry_str);
2150 } else {
2151 printed += fprintf(trace->output, " ... [");
2152 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2153 printed += 9;
2154 printed += fprintf(trace->output, "]: %s()", sc->name);
2155 }
2156
2157 printed++; /* the closing ')' */
2158
2159 if (alignment > printed)
2160 alignment -= printed;
2161 else
2162 alignment = 0;
2163
2164 fprintf(trace->output, ")%*s= ", alignment, " ");
2165
2166 if (sc->fmt == NULL) {
2167 if (ret < 0)
2168 goto errno_print;
2169signed_print:
2170 fprintf(trace->output, "%ld", ret);
2171 } else if (ret < 0) {
2172errno_print: {
2173 char bf[STRERR_BUFSIZE];
2174 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2175 *e = errno_to_name(evsel, -ret);
2176
2177 fprintf(trace->output, "-1 %s (%s)", e, emsg);
2178 }
2179 } else if (ret == 0 && sc->fmt->timeout)
2180 fprintf(trace->output, "0 (Timeout)");
2181 else if (ttrace->ret_scnprintf) {
2182 char bf[1024];
2183 struct syscall_arg arg = {
2184 .val = ret,
2185 .thread = thread,
2186 .trace = trace,
2187 };
2188 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2189 ttrace->ret_scnprintf = NULL;
2190 fprintf(trace->output, "%s", bf);
2191 } else if (sc->fmt->hexret)
2192 fprintf(trace->output, "%#lx", ret);
2193 else if (sc->fmt->errpid) {
2194 struct thread *child = machine__find_thread(trace->host, ret, ret);
2195
2196 if (child != NULL) {
2197 fprintf(trace->output, "%ld", ret);
2198 if (child->comm_set)
2199 fprintf(trace->output, " (%s)", thread__comm_str(child));
2200 thread__put(child);
2201 }
2202 } else
2203 goto signed_print;
2204
2205 fputc('\n', trace->output);
2206
2207 /*
2208 * We only consider an 'event' for the sake of --max-events a non-filtered
2209 * sys_enter + sys_exit and other tracepoint events.
2210 */
2211 if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
2212 interrupted = true;
2213
2214 if (callchain_ret > 0)
2215 trace__fprintf_callchain(trace, sample);
2216 else if (callchain_ret < 0)
2217 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2218out:
2219 ttrace->entry_pending = false;
2220 err = 0;
2221out_put:
2222 thread__put(thread);
2223 return err;
2224}
2225
2226static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
2227 union perf_event *event __maybe_unused,
2228 struct perf_sample *sample)
2229{
2230 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2231 struct thread_trace *ttrace;
2232 size_t filename_len, entry_str_len, to_move;
2233 ssize_t remaining_space;
2234 char *pos;
2235 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2236
2237 if (!thread)
2238 goto out;
2239
2240 ttrace = thread__priv(thread);
2241 if (!ttrace)
2242 goto out_put;
2243
2244 filename_len = strlen(filename);
2245 if (filename_len == 0)
2246 goto out_put;
2247
2248 if (ttrace->filename.namelen < filename_len) {
2249 char *f = realloc(ttrace->filename.name, filename_len + 1);
2250
2251 if (f == NULL)
2252 goto out_put;
2253
2254 ttrace->filename.namelen = filename_len;
2255 ttrace->filename.name = f;
2256 }
2257
2258 strcpy(ttrace->filename.name, filename);
2259 ttrace->filename.pending_open = true;
2260
2261 if (!ttrace->filename.ptr)
2262 goto out_put;
2263
2264 entry_str_len = strlen(ttrace->entry_str);
2265 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2266 if (remaining_space <= 0)
2267 goto out_put;
2268
2269 if (filename_len > (size_t)remaining_space) {
2270 filename += filename_len - remaining_space;
2271 filename_len = remaining_space;
2272 }
2273
2274 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2275 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2276 memmove(pos + filename_len, pos, to_move);
2277 memcpy(pos, filename, filename_len);
2278
2279 ttrace->filename.ptr = 0;
2280 ttrace->filename.entry_str_pos = 0;
2281out_put:
2282 thread__put(thread);
2283out:
2284 return 0;
2285}
2286
2287static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
2288 union perf_event *event __maybe_unused,
2289 struct perf_sample *sample)
2290{
2291 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2292 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2293 struct thread *thread = machine__findnew_thread(trace->host,
2294 sample->pid,
2295 sample->tid);
2296 struct thread_trace *ttrace = thread__trace(thread, trace->output);
2297
2298 if (ttrace == NULL)
2299 goto out_dump;
2300
2301 ttrace->runtime_ms += runtime_ms;
2302 trace->runtime_ms += runtime_ms;
2303out_put:
2304 thread__put(thread);
2305 return 0;
2306
2307out_dump:
2308 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2309 evsel->name,
2310 perf_evsel__strval(evsel, sample, "comm"),
2311 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2312 runtime,
2313 perf_evsel__intval(evsel, sample, "vruntime"));
2314 goto out_put;
2315}
2316
2317static int bpf_output__printer(enum binary_printer_ops op,
2318 unsigned int val, void *extra __maybe_unused, FILE *fp)
2319{
2320 unsigned char ch = (unsigned char)val;
2321
2322 switch (op) {
2323 case BINARY_PRINT_CHAR_DATA:
2324 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
2325 case BINARY_PRINT_DATA_BEGIN:
2326 case BINARY_PRINT_LINE_BEGIN:
2327 case BINARY_PRINT_ADDR:
2328 case BINARY_PRINT_NUM_DATA:
2329 case BINARY_PRINT_NUM_PAD:
2330 case BINARY_PRINT_SEP:
2331 case BINARY_PRINT_CHAR_PAD:
2332 case BINARY_PRINT_LINE_END:
2333 case BINARY_PRINT_DATA_END:
2334 default:
2335 break;
2336 }
2337
2338 return 0;
2339}
2340
2341static void bpf_output__fprintf(struct trace *trace,
2342 struct perf_sample *sample)
2343{
2344 binary__fprintf(sample->raw_data, sample->raw_size, 8,
2345 bpf_output__printer, NULL, trace->output);
2346 ++trace->nr_events_printed;
2347}
2348
2349static int trace__event_handler(struct trace *trace, struct evsel *evsel,
2350 union perf_event *event __maybe_unused,
2351 struct perf_sample *sample)
2352{
2353 struct thread *thread;
2354 int callchain_ret = 0;
2355 /*
2356 * Check if we called perf_evsel__disable(evsel) due to, for instance,
2357 * this event's max_events having been hit and this is an entry coming
2358 * from the ring buffer that we should discard, since the max events
2359 * have already been considered/printed.
2360 */
2361 if (evsel->disabled)
2362 return 0;
2363
2364 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2365
2366 if (sample->callchain) {
2367 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2368 if (callchain_ret == 0) {
2369 if (callchain_cursor.nr < trace->min_stack)
2370 goto out;
2371 callchain_ret = 1;
2372 }
2373 }
2374
2375 trace__printf_interrupted_entry(trace);
2376 trace__fprintf_tstamp(trace, sample->time, trace->output);
2377
2378 if (trace->trace_syscalls && trace->show_duration)
2379 fprintf(trace->output, "( ): ");
2380
2381 if (thread)
2382 trace__fprintf_comm_tid(trace, thread, trace->output);
2383
2384 if (evsel == trace->syscalls.events.augmented) {
2385 int id = perf_evsel__sc_tp_uint(evsel, id, sample);
2386 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2387
2388 if (sc) {
2389 fprintf(trace->output, "%s(", sc->name);
2390 trace__fprintf_sys_enter(trace, evsel, sample);
2391 fputc(')', trace->output);
2392 goto newline;
2393 }
2394
2395 /*
2396 * XXX: Not having the associated syscall info or not finding/adding
2397 * the thread should never happen, but if it does...
2398 * fall thru and print it as a bpf_output event.
2399 */
2400 }
2401
2402 fprintf(trace->output, "%s:", evsel->name);
2403
2404 if (perf_evsel__is_bpf_output(evsel)) {
2405 bpf_output__fprintf(trace, sample);
2406 } else if (evsel->tp_format) {
2407 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2408 trace__fprintf_sys_enter(trace, evsel, sample)) {
2409 event_format__fprintf(evsel->tp_format, sample->cpu,
2410 sample->raw_data, sample->raw_size,
2411 trace->output);
2412 ++trace->nr_events_printed;
2413
2414 if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
2415 evsel__disable(evsel);
2416 evsel__close(evsel);
2417 }
2418 }
2419 }
2420
2421newline:
2422 fprintf(trace->output, "\n");
2423
2424 if (callchain_ret > 0)
2425 trace__fprintf_callchain(trace, sample);
2426 else if (callchain_ret < 0)
2427 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2428out:
2429 thread__put(thread);
2430 return 0;
2431}
2432
2433static void print_location(FILE *f, struct perf_sample *sample,
2434 struct addr_location *al,
2435 bool print_dso, bool print_sym)
2436{
2437
2438 if ((verbose > 0 || print_dso) && al->map)
2439 fprintf(f, "%s@", al->map->dso->long_name);
2440
2441 if ((verbose > 0 || print_sym) && al->sym)
2442 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2443 al->addr - al->sym->start);
2444 else if (al->map)
2445 fprintf(f, "0x%" PRIx64, al->addr);
2446 else
2447 fprintf(f, "0x%" PRIx64, sample->addr);
2448}
2449
2450static int trace__pgfault(struct trace *trace,
2451 struct evsel *evsel,
2452 union perf_event *event __maybe_unused,
2453 struct perf_sample *sample)
2454{
2455 struct thread *thread;
2456 struct addr_location al;
2457 char map_type = 'd';
2458 struct thread_trace *ttrace;
2459 int err = -1;
2460 int callchain_ret = 0;
2461
2462 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2463
2464 if (sample->callchain) {
2465 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2466 if (callchain_ret == 0) {
2467 if (callchain_cursor.nr < trace->min_stack)
2468 goto out_put;
2469 callchain_ret = 1;
2470 }
2471 }
2472
2473 ttrace = thread__trace(thread, trace->output);
2474 if (ttrace == NULL)
2475 goto out_put;
2476
2477 if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2478 ttrace->pfmaj++;
2479 else
2480 ttrace->pfmin++;
2481
2482 if (trace->summary_only)
2483 goto out;
2484
2485 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2486
2487 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2488
2489 fprintf(trace->output, "%sfault [",
2490 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2491 "maj" : "min");
2492
2493 print_location(trace->output, sample, &al, false, true);
2494
2495 fprintf(trace->output, "] => ");
2496
2497 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2498
2499 if (!al.map) {
2500 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2501
2502 if (al.map)
2503 map_type = 'x';
2504 else
2505 map_type = '?';
2506 }
2507
2508 print_location(trace->output, sample, &al, true, false);
2509
2510 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2511
2512 if (callchain_ret > 0)
2513 trace__fprintf_callchain(trace, sample);
2514 else if (callchain_ret < 0)
2515 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2516
2517 ++trace->nr_events_printed;
2518out:
2519 err = 0;
2520out_put:
2521 thread__put(thread);
2522 return err;
2523}
2524
2525static void trace__set_base_time(struct trace *trace,
2526 struct evsel *evsel,
2527 struct perf_sample *sample)
2528{
2529 /*
2530 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2531 * and don't use sample->time unconditionally, we may end up having
2532 * some other event in the future without PERF_SAMPLE_TIME for good
2533 * reason, i.e. we may not be interested in its timestamps, just in
2534 * it taking place, picking some piece of information when it
2535 * appears in our event stream (vfs_getname comes to mind).
2536 */
2537 if (trace->base_time == 0 && !trace->full_time &&
2538 (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
2539 trace->base_time = sample->time;
2540}
2541
2542static int trace__process_sample(struct perf_tool *tool,
2543 union perf_event *event,
2544 struct perf_sample *sample,
2545 struct evsel *evsel,
2546 struct machine *machine __maybe_unused)
2547{
2548 struct trace *trace = container_of(tool, struct trace, tool);
2549 struct thread *thread;
2550 int err = 0;
2551
2552 tracepoint_handler handler = evsel->handler;
2553
2554 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2555 if (thread && thread__is_filtered(thread))
2556 goto out;
2557
2558 trace__set_base_time(trace, evsel, sample);
2559
2560 if (handler) {
2561 ++trace->nr_events;
2562 handler(trace, evsel, event, sample);
2563 }
2564out:
2565 thread__put(thread);
2566 return err;
2567}
2568
2569static int trace__record(struct trace *trace, int argc, const char **argv)
2570{
2571 unsigned int rec_argc, i, j;
2572 const char **rec_argv;
2573 const char * const record_args[] = {
2574 "record",
2575 "-R",
2576 "-m", "1024",
2577 "-c", "1",
2578 };
2579
2580 const char * const sc_args[] = { "-e", };
2581 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2582 const char * const majpf_args[] = { "-e", "major-faults" };
2583 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2584 const char * const minpf_args[] = { "-e", "minor-faults" };
2585 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2586
2587 /* +1 is for the event string below */
2588 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2589 majpf_args_nr + minpf_args_nr + argc;
2590 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2591
2592 if (rec_argv == NULL)
2593 return -ENOMEM;
2594
2595 j = 0;
2596 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2597 rec_argv[j++] = record_args[i];
2598
2599 if (trace->trace_syscalls) {
2600 for (i = 0; i < sc_args_nr; i++)
2601 rec_argv[j++] = sc_args[i];
2602
2603 /* event string may be different for older kernels - e.g., RHEL6 */
2604 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2605 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2606 else if (is_valid_tracepoint("syscalls:sys_enter"))
2607 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2608 else {
2609 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2610 free(rec_argv);
2611 return -1;
2612 }
2613 }
2614
2615 if (trace->trace_pgfaults & TRACE_PFMAJ)
2616 for (i = 0; i < majpf_args_nr; i++)
2617 rec_argv[j++] = majpf_args[i];
2618
2619 if (trace->trace_pgfaults & TRACE_PFMIN)
2620 for (i = 0; i < minpf_args_nr; i++)
2621 rec_argv[j++] = minpf_args[i];
2622
2623 for (i = 0; i < (unsigned int)argc; i++)
2624 rec_argv[j++] = argv[i];
2625
2626 return cmd_record(j, rec_argv);
2627}
2628
2629static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2630
2631static bool evlist__add_vfs_getname(struct evlist *evlist)
2632{
2633 bool found = false;
2634 struct evsel *evsel, *tmp;
2635 struct parse_events_error err = { .idx = 0, };
2636 int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2637
2638 if (ret)
2639 return false;
2640
2641 evlist__for_each_entry_safe(evlist, evsel, tmp) {
2642 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2643 continue;
2644
2645 if (perf_evsel__field(evsel, "pathname")) {
2646 evsel->handler = trace__vfs_getname;
2647 found = true;
2648 continue;
2649 }
2650
2651 list_del_init(&evsel->core.node);
2652 evsel->evlist = NULL;
2653 evsel__delete(evsel);
2654 }
2655
2656 return found;
2657}
2658
2659static struct evsel *perf_evsel__new_pgfault(u64 config)
2660{
2661 struct evsel *evsel;
2662 struct perf_event_attr attr = {
2663 .type = PERF_TYPE_SOFTWARE,
2664 .mmap_data = 1,
2665 };
2666
2667 attr.config = config;
2668 attr.sample_period = 1;
2669
2670 event_attr_init(&attr);
2671
2672 evsel = evsel__new(&attr);
2673 if (evsel)
2674 evsel->handler = trace__pgfault;
2675
2676 return evsel;
2677}
2678
2679static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2680{
2681 const u32 type = event->header.type;
2682 struct evsel *evsel;
2683
2684 if (type != PERF_RECORD_SAMPLE) {
2685 trace__process_event(trace, trace->host, event, sample);
2686 return;
2687 }
2688
2689 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2690 if (evsel == NULL) {
2691 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2692 return;
2693 }
2694
2695 if (evswitch__discard(&trace->evswitch, evsel))
2696 return;
2697
2698 trace__set_base_time(trace, evsel, sample);
2699
2700 if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
2701 sample->raw_data == NULL) {
2702 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2703 perf_evsel__name(evsel), sample->tid,
2704 sample->cpu, sample->raw_size);
2705 } else {
2706 tracepoint_handler handler = evsel->handler;
2707 handler(trace, evsel, event, sample);
2708 }
2709
2710 if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
2711 interrupted = true;
2712}
2713
2714static int trace__add_syscall_newtp(struct trace *trace)
2715{
2716 int ret = -1;
2717 struct evlist *evlist = trace->evlist;
2718 struct evsel *sys_enter, *sys_exit;
2719
2720 sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2721 if (sys_enter == NULL)
2722 goto out;
2723
2724 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2725 goto out_delete_sys_enter;
2726
2727 sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2728 if (sys_exit == NULL)
2729 goto out_delete_sys_enter;
2730
2731 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2732 goto out_delete_sys_exit;
2733
2734 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2735 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2736
2737 evlist__add(evlist, sys_enter);
2738 evlist__add(evlist, sys_exit);
2739
2740 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2741 /*
2742 * We're interested only in the user space callchain
2743 * leading to the syscall, allow overriding that for
2744 * debugging reasons using --kernel_syscall_callchains
2745 */
2746 sys_exit->core.attr.exclude_callchain_kernel = 1;
2747 }
2748
2749 trace->syscalls.events.sys_enter = sys_enter;
2750 trace->syscalls.events.sys_exit = sys_exit;
2751
2752 ret = 0;
2753out:
2754 return ret;
2755
2756out_delete_sys_exit:
2757 evsel__delete_priv(sys_exit);
2758out_delete_sys_enter:
2759 evsel__delete_priv(sys_enter);
2760 goto out;
2761}
2762
2763static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
2764{
2765 int err = -1;
2766 struct evsel *sys_exit;
2767 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2768 trace->ev_qualifier_ids.nr,
2769 trace->ev_qualifier_ids.entries);
2770
2771 if (filter == NULL)
2772 goto out_enomem;
2773
2774 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2775 filter)) {
2776 sys_exit = trace->syscalls.events.sys_exit;
2777 err = perf_evsel__append_tp_filter(sys_exit, filter);
2778 }
2779
2780 free(filter);
2781out:
2782 return err;
2783out_enomem:
2784 errno = ENOMEM;
2785 goto out;
2786}
2787
2788#ifdef HAVE_LIBBPF_SUPPORT
2789static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
2790{
2791 if (trace->bpf_obj == NULL)
2792 return NULL;
2793
2794 return bpf_object__find_program_by_title(trace->bpf_obj, name);
2795}
2796
2797static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
2798 const char *prog_name, const char *type)
2799{
2800 struct bpf_program *prog;
2801
2802 if (prog_name == NULL) {
2803 char default_prog_name[256];
2804 scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->name);
2805 prog = trace__find_bpf_program_by_title(trace, default_prog_name);
2806 if (prog != NULL)
2807 goto out_found;
2808 if (sc->fmt && sc->fmt->alias) {
2809 scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->fmt->alias);
2810 prog = trace__find_bpf_program_by_title(trace, default_prog_name);
2811 if (prog != NULL)
2812 goto out_found;
2813 }
2814 goto out_unaugmented;
2815 }
2816
2817 prog = trace__find_bpf_program_by_title(trace, prog_name);
2818
2819 if (prog != NULL) {
2820out_found:
2821 return prog;
2822 }
2823
2824 pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
2825 prog_name, type, sc->name);
2826out_unaugmented:
2827 return trace->syscalls.unaugmented_prog;
2828}
2829
2830static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
2831{
2832 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2833
2834 if (sc == NULL)
2835 return;
2836
2837 sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
2838 sc->bpf_prog.sys_exit = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit : NULL, "exit");
2839}
2840
2841static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
2842{
2843 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2844 return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->syscalls.unaugmented_prog);
2845}
2846
2847static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
2848{
2849 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2850 return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog);
2851}
2852
2853static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry)
2854{
2855 struct syscall *sc = trace__syscall_info(trace, NULL, id);
2856 int arg = 0;
2857
2858 if (sc == NULL)
2859 goto out;
2860
2861 for (; arg < sc->nr_args; ++arg) {
2862 entry->string_args_len[arg] = 0;
2863 if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) {
2864 /* Should be set like strace -s strsize */
2865 entry->string_args_len[arg] = PATH_MAX;
2866 }
2867 }
2868out:
2869 for (; arg < 6; ++arg)
2870 entry->string_args_len[arg] = 0;
2871}
2872static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
2873{
2874 int fd = bpf_map__fd(trace->syscalls.map);
2875 struct bpf_map_syscall_entry value = {
2876 .enabled = !trace->not_ev_qualifier,
2877 };
2878 int err = 0;
2879 size_t i;
2880
2881 for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
2882 int key = trace->ev_qualifier_ids.entries[i];
2883
2884 if (value.enabled) {
2885 trace__init_bpf_map_syscall_args(trace, key, &value);
2886 trace__init_syscall_bpf_progs(trace, key);
2887 }
2888
2889 err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
2890 if (err)
2891 break;
2892 }
2893
2894 return err;
2895}
2896
2897static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
2898{
2899 int fd = bpf_map__fd(trace->syscalls.map);
2900 struct bpf_map_syscall_entry value = {
2901 .enabled = enabled,
2902 };
2903 int err = 0, key;
2904
2905 for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
2906 if (enabled)
2907 trace__init_bpf_map_syscall_args(trace, key, &value);
2908
2909 err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
2910 if (err)
2911 break;
2912 }
2913
2914 return err;
2915}
2916
2917static int trace__init_syscalls_bpf_map(struct trace *trace)
2918{
2919 bool enabled = true;
2920
2921 if (trace->ev_qualifier_ids.nr)
2922 enabled = trace->not_ev_qualifier;
2923
2924 return __trace__init_syscalls_bpf_map(trace, enabled);
2925}
2926
2927static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
2928{
2929 struct tep_format_field *field, *candidate_field;
2930 int id;
2931
2932 /*
2933 * We're only interested in syscalls that have a pointer:
2934 */
2935 for (field = sc->args; field; field = field->next) {
2936 if (field->flags & TEP_FIELD_IS_POINTER)
2937 goto try_to_find_pair;
2938 }
2939
2940 return NULL;
2941
2942try_to_find_pair:
2943 for (id = 0; id < trace->sctbl->syscalls.nr_entries; ++id) {
2944 struct syscall *pair = trace__syscall_info(trace, NULL, id);
2945 struct bpf_program *pair_prog;
2946 bool is_candidate = false;
2947
2948 if (pair == NULL || pair == sc ||
2949 pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog)
2950 continue;
2951
2952 for (field = sc->args, candidate_field = pair->args;
2953 field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
2954 bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
2955 candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;
2956
2957 if (is_pointer) {
2958 if (!candidate_is_pointer) {
2959 // The candidate just doesn't copies our pointer arg, might copy other pointers we want.
2960 continue;
2961 }
2962 } else {
2963 if (candidate_is_pointer) {
2964 // The candidate might copy a pointer we don't have, skip it.
2965 goto next_candidate;
2966 }
2967 continue;
2968 }
2969
2970 if (strcmp(field->type, candidate_field->type))
2971 goto next_candidate;
2972
2973 is_candidate = true;
2974 }
2975
2976 if (!is_candidate)
2977 goto next_candidate;
2978
2979 /*
2980 * Check if the tentative pair syscall augmenter has more pointers, if it has,
2981 * then it may be collecting that and we then can't use it, as it would collect
2982 * more than what is common to the two syscalls.
2983 */
2984 if (candidate_field) {
2985 for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
2986 if (candidate_field->flags & TEP_FIELD_IS_POINTER)
2987 goto next_candidate;
2988 }
2989
2990 pair_prog = pair->bpf_prog.sys_enter;
2991 /*
2992 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
2993 * have been searched for, so search it here and if it returns the
2994 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
2995 * program for a filtered syscall on a non-filtered one.
2996 *
2997 * For instance, we have "!syscalls:sys_enter_renameat" and that is
2998 * useful for "renameat2".
2999 */
3000 if (pair_prog == NULL) {
3001 pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3002 if (pair_prog == trace->syscalls.unaugmented_prog)
3003 goto next_candidate;
3004 }
3005
3006 pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
3007 return pair_prog;
3008 next_candidate:
3009 continue;
3010 }
3011
3012 return NULL;
3013}
3014
3015static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
3016{
3017 int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter),
3018 map_exit_fd = bpf_map__fd(trace->syscalls.prog_array.sys_exit);
3019 int err = 0, key;
3020
3021 for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
3022 int prog_fd;
3023
3024 if (!trace__syscall_enabled(trace, key))
3025 continue;
3026
3027 trace__init_syscall_bpf_progs(trace, key);
3028
3029 // It'll get at least the "!raw_syscalls:unaugmented"
3030 prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
3031 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3032 if (err)
3033 break;
3034 prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
3035 err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
3036 if (err)
3037 break;
3038 }
3039
3040 /*
3041 * Now lets do a second pass looking for enabled syscalls without
3042 * an augmenter that have a signature that is a superset of another
3043 * syscall with an augmenter so that we can auto-reuse it.
3044 *
3045 * I.e. if we have an augmenter for the "open" syscall that has
3046 * this signature:
3047 *
3048 * int open(const char *pathname, int flags, mode_t mode);
3049 *
3050 * I.e. that will collect just the first string argument, then we
3051 * can reuse it for the 'creat' syscall, that has this signature:
3052 *
3053 * int creat(const char *pathname, mode_t mode);
3054 *
3055 * and for:
3056 *
3057 * int stat(const char *pathname, struct stat *statbuf);
3058 * int lstat(const char *pathname, struct stat *statbuf);
3059 *
3060 * Because the 'open' augmenter will collect the first arg as a string,
3061 * and leave alone all the other args, which already helps with
3062 * beautifying 'stat' and 'lstat''s pathname arg.
3063 *
3064 * Then, in time, when 'stat' gets an augmenter that collects both
3065 * first and second arg (this one on the raw_syscalls:sys_exit prog
3066 * array tail call, then that one will be used.
3067 */
3068 for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
3069 struct syscall *sc = trace__syscall_info(trace, NULL, key);
3070 struct bpf_program *pair_prog;
3071 int prog_fd;
3072
3073 if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
3074 continue;
3075
3076 /*
3077 * For now we're just reusing the sys_enter prog, and if it
3078 * already has an augmenter, we don't need to find one.
3079 */
3080 if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog)
3081 continue;
3082
3083 /*
3084 * Look at all the other syscalls for one that has a signature
3085 * that is close enough that we can share:
3086 */
3087 pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
3088 if (pair_prog == NULL)
3089 continue;
3090
3091 sc->bpf_prog.sys_enter = pair_prog;
3092
3093 /*
3094 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
3095 * with the fd for the program we're reusing:
3096 */
3097 prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
3098 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3099 if (err)
3100 break;
3101 }
3102
3103
3104 return err;
3105}
3106#else
/*
 * Stub for the #else branch of HAVE_LIBBPF_SUPPORT: there is no BPF filter
 * to set, so it just reports success.
 */
static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
{
	return 0;
}
3111
/*
 * Stub for the #else branch of HAVE_LIBBPF_SUPPORT: no BPF map to
 * initialize, always returns 0.
 */
static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
{
	return 0;
}
3116
/*
 * Stub for the #else branch of HAVE_LIBBPF_SUPPORT: no BPF object to search,
 * so no program is ever found (returns NULL).
 */
static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace __maybe_unused,
							    const char *name __maybe_unused)
{
	return NULL;
}
3122
/*
 * Stub for the #else branch of HAVE_LIBBPF_SUPPORT: there are no prog array
 * maps to fill, always returns 0.
 */
static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused)
{
	return 0;
}
3127#endif // HAVE_LIBBPF_SUPPORT
3128
3129static int trace__set_ev_qualifier_filter(struct trace *trace)
3130{
3131 if (trace->syscalls.map)
3132 return trace__set_ev_qualifier_bpf_filter(trace);
3133 if (trace->syscalls.events.sys_enter)
3134 return trace__set_ev_qualifier_tp_filter(trace);
3135 return 0;
3136}
3137
3138static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
3139 size_t npids __maybe_unused, pid_t *pids __maybe_unused)
3140{
3141 int err = 0;
3142#ifdef HAVE_LIBBPF_SUPPORT
3143 bool value = true;
3144 int map_fd = bpf_map__fd(map);
3145 size_t i;
3146
3147 for (i = 0; i < npids; ++i) {
3148 err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
3149 if (err)
3150 break;
3151 }
3152#endif
3153 return err;
3154}
3155
3156static int trace__set_filter_loop_pids(struct trace *trace)
3157{
3158 unsigned int nr = 1, err;
3159 pid_t pids[32] = {
3160 getpid(),
3161 };
3162 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
3163
3164 while (thread && nr < ARRAY_SIZE(pids)) {
3165 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
3166
3167 if (parent == NULL)
3168 break;
3169
3170 if (!strcmp(thread__comm_str(parent), "sshd") ||
3171 strstarts(thread__comm_str(parent), "gnome-terminal")) {
3172 pids[nr++] = parent->tid;
3173 break;
3174 }
3175 thread = parent;
3176 }
3177
3178 err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids);
3179 if (!err && trace->filter_pids.map)
3180 err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
3181
3182 return err;
3183}
3184
3185static int trace__set_filter_pids(struct trace *trace)
3186{
3187 int err = 0;
3188 /*
3189 * Better not use !target__has_task() here because we need to cover the
3190 * case where no threads were specified in the command line, but a
3191 * workload was, and in that case we will fill in the thread_map when
3192 * we fork the workload in perf_evlist__prepare_workload.
3193 */
3194 if (trace->filter_pids.nr > 0) {
3195 err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
3196 trace->filter_pids.entries);
3197 if (!err && trace->filter_pids.map) {
3198 err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
3199 trace->filter_pids.entries);
3200 }
3201 } else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
3202 err = trace__set_filter_loop_pids(trace);
3203 }
3204
3205 return err;
3206}
3207
3208static int __trace__deliver_event(struct trace *trace, union perf_event *event)
3209{
3210 struct evlist *evlist = trace->evlist;
3211 struct perf_sample sample;
3212 int err;
3213
3214 err = perf_evlist__parse_sample(evlist, event, &sample);
3215 if (err)
3216 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
3217 else
3218 trace__handle_event(trace, event, &sample);
3219
3220 return 0;
3221}
3222
3223static int __trace__flush_events(struct trace *trace)
3224{
3225 u64 first = ordered_events__first_time(&trace->oe.data);
3226 u64 flush = trace->oe.last - NSEC_PER_SEC;
3227
3228 /* Is there some thing to flush.. */
3229 if (first && first < flush)
3230 return ordered_events__flush_time(&trace->oe.data, flush);
3231
3232 return 0;
3233}
3234
3235static int trace__flush_events(struct trace *trace)
3236{
3237 return !trace->sort_events ? 0 : __trace__flush_events(trace);
3238}
3239
3240static int trace__deliver_event(struct trace *trace, union perf_event *event)
3241{
3242 int err;
3243
3244 if (!trace->sort_events)
3245 return __trace__deliver_event(trace, event);
3246
3247 err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
3248 if (err && err != -1)
3249 return err;
3250
3251 err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0);
3252 if (err)
3253 return err;
3254
3255 return trace__flush_events(trace);
3256}
3257
3258static int ordered_events__deliver_event(struct ordered_events *oe,
3259 struct ordered_event *event)
3260{
3261 struct trace *trace = container_of(oe, struct trace, oe.data);
3262
3263 return __trace__deliver_event(trace, event->event);
3264}
3265
/*
 * Live tracing: set up the events on trace->evlist, optionally fork the
 * workload given in argv (when argc > 0), open and mmap the events, then loop
 * reading events from the ring buffers and delivering them until tracing is
 * done or interrupted. On the way out the events are disabled, pending sorted
 * events are flushed, the optional summary/tool stats are printed and the
 * evlist is deleted.
 *
 * The error labels after the normal return path print a message on
 * trace->output and then jump back to out_delete_evlist for the teardown.
 *
 * Returns 0 on success, a non-zero error otherwise.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct evlist *evlist = trace->evlist;
	struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	/* Classic syscall tracepoints are only added when not using raw augmented syscalls */
	if (!trace->raw_augmented_syscalls) {
		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
			goto out_error_raw_syscalls;

		if (trace->trace_syscalls)
			trace->vfs_getname = evlist__add_vfs_getname(evlist);
	}

	/* Optional major/minor page fault software events */
	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * 	trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * 	trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/* NOTE(review): any failure here is reported as "Not enough memory to run!" - confirm that is intended */
	err = trace__set_filter_pids(trace);
	if (err < 0)
		goto out_error_mem;

	/* NOTE(review): the return values of the two BPF map init calls below are not checked */
	if (trace->syscalls.map)
		trace__init_syscalls_bpf_map(trace);

	if (trace->syscalls.prog_array.sys_enter)
		trace__init_syscalls_bpf_prog_array_maps(trace);

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		if (trace->syscalls.events.sys_exit) {
			pr_debug("event qualifier tracepoint filter: %s\n",
				 trace->syscalls.events.sys_exit->filter);
		}
	}

	/*
	 * If the "close" syscall is not traced, then we will not have the
	 * opportunity to, in syscall_arg__scnprintf_close_fd() invalidate the
	 * fd->pathname table and were ending up showing the last value set by
	 * syscalls opening a pathname and associating it with a descriptor or
	 * reading it from /proc/pid/fd/ in cases where that doesn't make
	 * sense.
	 *
	 *  So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
	 *  not in use.
	 */
	trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));

	/* On failure the error path below reads evsel->filter to name the rejected filter */
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	if (trace->dump.map)
		bpf_map__fprintf(trace->dump.map, trace->output);

	err = evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	/* With an initial delay, enabling is postponed until after the sleep below */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		evlist__enable(evlist);
	}

	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
				  evlist->core.threads->nr > 1 ||
				  evlist__first(evlist)->core.attr.inherit;

	/*
	 * Now that we already used evsel->core.attr to ask the kernel to setup the
	 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->core.attr.sample_max_stack == 0)
			evsel->core.attr.sample_max_stack = trace->max_stack;
	}
again:
	/* Remember the event count to detect whether this pass read anything */
	before = trace->nr_events;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		union perf_event *event;
		struct mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			++trace->nr_events;

			err = trace__deliver_event(trace, event);
			if (err)
				goto out_disable;

			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* Once done, stop producing new events and just drain what is buffered */
			if (done && !draining) {
				evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		/* Nothing was read: block in poll (100ms timeout when done), unless already draining */
		int timeout = done ? 100 : -1;

		if (!draining && evlist__poll(evlist, timeout) > 0) {
			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
				draining = true;

			goto again;
		} else {
			if (trace__flush_events(trace))
				goto out_disable;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	evlist__disable(evlist);

	if (trace->sort_events)
		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error reporting labels. The ones that need a message buffer live inside
 * this nested block so that 'errbuf' is only declared for them; all of them
 * end by jumping back to out_delete_evlist.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
3558
3559static int trace__replay(struct trace *trace)
3560{
3561 const struct evsel_str_handler handlers[] = {
3562 { "probe:vfs_getname", trace__vfs_getname, },
3563 };
3564 struct perf_data data = {
3565 .path = input_name,
3566 .mode = PERF_DATA_MODE_READ,
3567 .force = trace->force,
3568 };
3569 struct perf_session *session;
3570 struct evsel *evsel;
3571 int err = -1;
3572
3573 trace->tool.sample = trace__process_sample;
3574 trace->tool.mmap = perf_event__process_mmap;
3575 trace->tool.mmap2 = perf_event__process_mmap2;
3576 trace->tool.comm = perf_event__process_comm;
3577 trace->tool.exit = perf_event__process_exit;
3578 trace->tool.fork = perf_event__process_fork;
3579 trace->tool.attr = perf_event__process_attr;
3580 trace->tool.tracing_data = perf_event__process_tracing_data;
3581 trace->tool.build_id = perf_event__process_build_id;
3582 trace->tool.namespaces = perf_event__process_namespaces;
3583
3584 trace->tool.ordered_events = true;
3585 trace->tool.ordering_requires_timestamps = true;
3586
3587 /* add tid to output */
3588 trace->multiple_threads = true;
3589
3590 session = perf_session__new(&data, false, &trace->tool);
3591 if (IS_ERR(session))
3592 return PTR_ERR(session);
3593
3594 if (trace->opts.target.pid)
3595 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
3596
3597 if (trace->opts.target.tid)
3598 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
3599
3600 if (symbol__init(&session->header.env) < 0)
3601 goto out;
3602
3603 trace->host = &session->machines.host;
3604
3605 err = perf_session__set_tracepoints_handlers(session, handlers);
3606 if (err)
3607 goto out;
3608
3609 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3610 "raw_syscalls:sys_enter");
3611 /* older kernels have syscalls tp versus raw_syscalls */
3612 if (evsel == NULL)
3613 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3614 "syscalls:sys_enter");
3615
3616 if (evsel &&
3617 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
3618 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
3619 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
3620 goto out;
3621 }
3622
3623 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3624 "raw_syscalls:sys_exit");
3625 if (evsel == NULL)
3626 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3627 "syscalls:sys_exit");
3628 if (evsel &&
3629 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
3630 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
3631 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
3632 goto out;
3633 }
3634
3635 evlist__for_each_entry(session->evlist, evsel) {
3636 if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
3637 (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
3638 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
3639 evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
3640 evsel->handler = trace__pgfault;
3641 }
3642
3643 setup_pager();
3644
3645 err = perf_session__process_events(session);
3646 if (err)
3647 pr_err("Failed to process events, error %d", err);
3648
3649 else if (trace->summary)
3650 trace__fprintf_thread_summary(trace, trace->output);
3651
3652out:
3653 perf_session__delete(session);
3654
3655 return err;
3656}
3657
/*
 * Print the heading of the per thread summary to 'fp'.
 *
 * Returns the value returned by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
3666
/*
 * Resorted view of the per thread syscall stats, ordered by the comparison
 * 'a->msecs > b->msecs'.
 *
 * The body below fills one sorted 'entry' from the int_node 'nd' points to:
 * the node's integer key is stored as the syscall id, its priv pointer as the
 * stats, and msecs is computed as n * average / NSEC_PER_MSEC (0 when the
 * node has no stats).
 *
 * NOTE(review): 'entry', 'nd', 'a' and 'b' are provided by DEFINE_RESORT_RB,
 * which is defined outside this chunk (presumably in rb_resort.h) - confirm
 * there.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
3680
3681static size_t thread__dump_stats(struct thread_trace *ttrace,
3682 struct trace *trace, FILE *fp)
3683{
3684 size_t printed = 0;
3685 struct syscall *sc;
3686 struct rb_node *nd;
3687 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
3688
3689 if (syscall_stats == NULL)
3690 return 0;
3691
3692 printed += fprintf(fp, "\n");
3693
3694 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
3695 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
3696 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
3697
3698 resort_rb__for_each_entry(nd, syscall_stats) {
3699 struct stats *stats = syscall_stats_entry->stats;
3700 if (stats) {
3701 double min = (double)(stats->min) / NSEC_PER_MSEC;
3702 double max = (double)(stats->max) / NSEC_PER_MSEC;
3703 double avg = avg_stats(stats);
3704 double pct;
3705 u64 n = (u64) stats->n;
3706
3707 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
3708 avg /= NSEC_PER_MSEC;
3709
3710 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
3711 printed += fprintf(fp, " %-15s", sc->name);
3712 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3713 n, syscall_stats_entry->msecs, min, avg);
3714 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3715 }
3716 }
3717
3718 resort_rb__delete(syscall_stats);
3719 printed += fprintf(fp, "\n\n");
3720
3721 return printed;
3722}
3723
3724static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
3725{
3726 size_t printed = 0;
3727 struct thread_trace *ttrace = thread__priv(thread);
3728 double ratio;
3729
3730 if (ttrace == NULL)
3731 return 0;
3732
3733 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
3734
3735 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3736 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3737 printed += fprintf(fp, "%.1f%%", ratio);
3738 if (ttrace->pfmaj)
3739 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
3740 if (ttrace->pfmin)
3741 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3742 if (trace->sched)
3743 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
3744 else if (fputc('\n', fp) != EOF)
3745 ++printed;
3746
3747 printed += thread__dump_stats(ttrace, trace, fp);
3748
3749 return printed;
3750}
3751
3752static unsigned long thread__nr_events(struct thread_trace *ttrace)
3753{
3754 return ttrace ? ttrace->nr_events : 0;
3755}
3756
/*
 * Resorted view of threads, ordered by comparing the number of events
 * recorded in each thread's private thread_trace (threads without one count
 * as 0 events, see thread__nr_events()).
 *
 * The body fills one sorted 'entry' with the struct thread that embeds the
 * rb_node 'nd' points to.
 *
 * NOTE(review): 'entry', 'nd', 'a' and 'b' come from DEFINE_RESORT_RB, which
 * is defined outside this chunk - confirm its exact ordering semantics there.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
3763
3764static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
3765{
3766 size_t printed = trace__fprintf_threads_header(fp);
3767 struct rb_node *nd;
3768 int i;
3769
3770 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
3771 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
3772
3773 if (threads == NULL) {
3774 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
3775 return 0;
3776 }
3777
3778 resort_rb__for_each_entry(nd, threads)
3779 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
3780
3781 resort_rb__delete(threads);
3782 }
3783 return printed;
3784}
3785
3786static int trace__set_duration(const struct option *opt, const char *str,
3787 int unset __maybe_unused)
3788{
3789 struct trace *trace = opt->value;
3790
3791 trace->duration_filter = atof(str);
3792 return 0;
3793}
3794
3795static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
3796 int unset __maybe_unused)
3797{
3798 int ret = -1;
3799 size_t i;
3800 struct trace *trace = opt->value;
3801 /*
3802 * FIXME: introduce a intarray class, plain parse csv and create a
3803 * { int nr, int entries[] } struct...
3804 */
3805 struct intlist *list = intlist__new(str);
3806
3807 if (list == NULL)
3808 return -1;
3809
3810 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3811 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3812
3813 if (trace->filter_pids.entries == NULL)
3814 goto out;
3815
3816 trace->filter_pids.entries[0] = getpid();
3817
3818 for (i = 1; i < trace->filter_pids.nr; ++i)
3819 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3820
3821 intlist__delete(list);
3822 ret = 0;
3823out:
3824 return ret;
3825}
3826
3827static int trace__open_output(struct trace *trace, const char *filename)
3828{
3829 struct stat st;
3830
3831 if (!stat(filename, &st) && st.st_size) {
3832 char oldname[PATH_MAX];
3833
3834 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3835 unlink(oldname);
3836 rename(filename, oldname);
3837 }
3838
3839 trace->output = fopen(filename, "w");
3840
3841 return trace->output == NULL ? -errno : 0;
3842}
3843
3844static int parse_pagefaults(const struct option *opt, const char *str,
3845 int unset __maybe_unused)
3846{
3847 int *trace_pgfaults = opt->value;
3848
3849 if (strcmp(str, "all") == 0)
3850 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3851 else if (strcmp(str, "maj") == 0)
3852 *trace_pgfaults |= TRACE_PFMAJ;
3853 else if (strcmp(str, "min") == 0)
3854 *trace_pgfaults |= TRACE_PFMIN;
3855 else
3856 return -1;
3857
3858 return 0;
3859}
3860
3861static void evlist__set_evsel_handler(struct evlist *evlist, void *handler)
3862{
3863 struct evsel *evsel;
3864
3865 evlist__for_each_entry(evlist, evsel)
3866 evsel->handler = handler;
3867}
3868
3869static int evlist__set_syscall_tp_fields(struct evlist *evlist)
3870{
3871 struct evsel *evsel;
3872
3873 evlist__for_each_entry(evlist, evsel) {
3874 if (evsel->priv || !evsel->tp_format)
3875 continue;
3876
3877 if (strcmp(evsel->tp_format->system, "syscalls"))
3878 continue;
3879
3880 if (perf_evsel__init_syscall_tp(evsel))
3881 return -1;
3882
3883 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3884 struct syscall_tp *sc = evsel->priv;
3885
3886 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3887 return -1;
3888 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3889 struct syscall_tp *sc = evsel->priv;
3890
3891 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3892 return -1;
3893 }
3894 }
3895
3896 return 0;
3897}
3898
3899/*
3900 * XXX: Hackish, just splitting the combined -e+--event (syscalls
3901 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3902 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3903 *
3904 * It'd be better to introduce a parse_options() variant that would return a
3905 * list with the terms it didn't match to an event...
3906 */
3907static int trace__parse_events_option(const struct option *opt, const char *str,
3908 int unset __maybe_unused)
3909{
3910 struct trace *trace = (struct trace *)opt->value;
3911 const char *s = str;
3912 char *sep = NULL, *lists[2] = { NULL, NULL, };
3913 int len = strlen(str) + 1, err = -1, list, idx;
3914 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3915 char group_name[PATH_MAX];
3916 struct syscall_fmt *fmt;
3917
3918 if (strace_groups_dir == NULL)
3919 return -1;
3920
3921 if (*s == '!') {
3922 ++s;
3923 trace->not_ev_qualifier = true;
3924 }
3925
3926 while (1) {
3927 if ((sep = strchr(s, ',')) != NULL)
3928 *sep = '\0';
3929
3930 list = 0;
3931 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3932 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3933 list = 1;
3934 goto do_concat;
3935 }
3936
3937 fmt = syscall_fmt__find_by_alias(s);
3938 if (fmt != NULL) {
3939 list = 1;
3940 s = fmt->name;
3941 } else {
3942 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3943 if (access(group_name, R_OK) == 0)
3944 list = 1;
3945 }
3946do_concat:
3947 if (lists[list]) {
3948 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3949 } else {
3950 lists[list] = malloc(len);
3951 if (lists[list] == NULL)
3952 goto out;
3953 strcpy(lists[list], s);
3954 }
3955
3956 if (!sep)
3957 break;
3958
3959 *sep = ',';
3960 s = sep + 1;
3961 }
3962
3963 if (lists[1] != NULL) {
3964 struct strlist_config slist_config = {
3965 .dirname = strace_groups_dir,
3966 };
3967
3968 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3969 if (trace->ev_qualifier == NULL) {
3970 fputs("Not enough memory to parse event qualifier", trace->output);
3971 goto out;
3972 }
3973
3974 if (trace__validate_ev_qualifier(trace))
3975 goto out;
3976 trace->trace_syscalls = true;
3977 }
3978
3979 err = 0;
3980
3981 if (lists[0]) {
3982 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3983 "event selector. use 'perf list' to list available events",
3984 parse_events_option);
3985 err = parse_events_option(&o, lists[0], 0);
3986 }
3987out:
3988 if (sep)
3989 *sep = ',';
3990
3991 return err;
3992}
3993
3994static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3995{
3996 struct trace *trace = opt->value;
3997
3998 if (!list_empty(&trace->evlist->core.entries))
3999 return parse_cgroups(opt, str, unset);
4000
4001 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
4002
4003 return 0;
4004}
4005
4006static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace, const char *name)
4007{
4008 if (trace->bpf_obj == NULL)
4009 return NULL;
4010
4011 return bpf_object__find_map_by_name(trace->bpf_obj, name);
4012}
4013
4014static void trace__set_bpf_map_filtered_pids(struct trace *trace)
4015{
4016 trace->filter_pids.map = trace__find_bpf_map_by_name(trace, "pids_filtered");
4017}
4018
4019static void trace__set_bpf_map_syscalls(struct trace *trace)
4020{
4021 trace->syscalls.map = trace__find_bpf_map_by_name(trace, "syscalls");
4022 trace->syscalls.prog_array.sys_enter = trace__find_bpf_map_by_name(trace, "syscalls_sys_enter");
4023 trace->syscalls.prog_array.sys_exit = trace__find_bpf_map_by_name(trace, "syscalls_sys_exit");
4024}
4025
4026static int trace__config(const char *var, const char *value, void *arg)
4027{
4028 struct trace *trace = arg;
4029 int err = 0;
4030
4031 if (!strcmp(var, "trace.add_events")) {
4032 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
4033 "event selector. use 'perf list' to list available events",
4034 parse_events_option);
4035 /*
4036 * We can't propagate parse_event_option() return, as it is 1
4037 * for failure while perf_config() expects -1.
4038 */
4039 if (parse_events_option(&o, value, 0))
4040 err = -1;
4041 } else if (!strcmp(var, "trace.show_timestamp")) {
4042 trace->show_tstamp = perf_config_bool(var, value);
4043 } else if (!strcmp(var, "trace.show_duration")) {
4044 trace->show_duration = perf_config_bool(var, value);
4045 } else if (!strcmp(var, "trace.show_arg_names")) {
4046 trace->show_arg_names = perf_config_bool(var, value);
4047 if (!trace->show_arg_names)
4048 trace->show_zeros = true;
4049 } else if (!strcmp(var, "trace.show_zeros")) {
4050 bool new_show_zeros = perf_config_bool(var, value);
4051 if (!trace->show_arg_names && !new_show_zeros) {
4052 pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
4053 goto out;
4054 }
4055 trace->show_zeros = new_show_zeros;
4056 } else if (!strcmp(var, "trace.show_prefix")) {
4057 trace->show_string_prefix = perf_config_bool(var, value);
4058 } else if (!strcmp(var, "trace.no_inherit")) {
4059 trace->opts.no_inherit = perf_config_bool(var, value);
4060 } else if (!strcmp(var, "trace.args_alignment")) {
4061 int args_alignment = 0;
4062 if (perf_config_int(&args_alignment, var, value) == 0)
4063 trace->args_alignment = args_alignment;
4064 }
4065out:
4066 return err;
4067}
4068
4069int cmd_trace(int argc, const char **argv)
4070{
4071 const char *trace_usage[] = {
4072 "perf trace [<options>] [<command>]",
4073 "perf trace [<options>] -- <command> [<options>]",
4074 "perf trace record [<options>] [<command>]",
4075 "perf trace record [<options>] -- <command> [<options>]",
4076 NULL
4077 };
4078 struct trace trace = {
4079 .opts = {
4080 .target = {
4081 .uid = UINT_MAX,
4082 .uses_mmap = true,
4083 },
4084 .user_freq = UINT_MAX,
4085 .user_interval = ULLONG_MAX,
4086 .no_buffering = true,
4087 .mmap_pages = UINT_MAX,
4088 },
4089 .output = stderr,
4090 .show_comm = true,
4091 .show_tstamp = true,
4092 .show_duration = true,
4093 .show_arg_names = true,
4094 .args_alignment = 70,
4095 .trace_syscalls = false,
4096 .kernel_syscallchains = false,
4097 .max_stack = UINT_MAX,
4098 .max_events = ULONG_MAX,
4099 };
4100 const char *map_dump_str = NULL;
4101 const char *output_name = NULL;
4102 const struct option trace_options[] = {
4103 OPT_CALLBACK('e', "event", &trace, "event",
4104 "event/syscall selector. use 'perf list' to list available events",
4105 trace__parse_events_option),
4106 OPT_BOOLEAN(0, "comm", &trace.show_comm,
4107 "show the thread COMM next to its id"),
4108 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
4109 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
4110 trace__parse_events_option),
4111 OPT_STRING('o', "output", &output_name, "file", "output file name"),
4112 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
4113 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
4114 "trace events on existing process id"),
4115 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
4116 "trace events on existing thread id"),
4117 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
4118 "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
4119 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
4120 "system-wide collection from all CPUs"),
4121 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
4122 "list of cpus to monitor"),
4123 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
4124 "child tasks do not inherit counters"),
4125 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
4126 "number of mmap data pages",
4127 perf_evlist__parse_mmap_pages),
4128 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
4129 "user to profile"),
4130 OPT_CALLBACK(0, "duration", &trace, "float",
4131 "show only events with duration > N.M ms",
4132 trace__set_duration),
4133#ifdef HAVE_LIBBPF_SUPPORT
4134 OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
4135#endif
4136 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
4137 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
4138 OPT_BOOLEAN('T', "time", &trace.full_time,
4139 "Show full timestamp, not time relative to first start"),
4140 OPT_BOOLEAN(0, "failure", &trace.failure_only,
4141 "Show only syscalls that failed"),
4142 OPT_BOOLEAN('s', "summary", &trace.summary_only,
4143 "Show only syscall summary with statistics"),
4144 OPT_BOOLEAN('S', "with-summary", &trace.summary,
4145 "Show all syscalls and summary with statistics"),
4146 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
4147 "Trace pagefaults", parse_pagefaults, "maj"),
4148 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
4149 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
4150 OPT_CALLBACK(0, "call-graph", &trace.opts,
4151 "record_mode[,record_size]", record_callchain_help,
4152 &record_parse_callchain_opt),
4153 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
4154 "Show the kernel callchains on the syscall exit path"),
4155 OPT_ULONG(0, "max-events", &trace.max_events,
4156 "Set the maximum number of events to print, exit after that is reached. "),
4157 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
4158 "Set the minimum stack depth when parsing the callchain, "
4159 "anything below the specified depth will be ignored."),
4160 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
4161 "Set the maximum stack depth when parsing the callchain, "
4162 "anything beyond the specified depth will be ignored. "
4163 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
4164 OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
4165 "Sort batch of events before processing, use if getting out of order events"),
4166 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
4167 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
4168 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
4169 "per thread proc mmap processing timeout in ms"),
4170 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
4171 trace__parse_cgroups),
4172 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
4173 "ms to wait before starting measurement after program "
4174 "start"),
4175 OPTS_EVSWITCH(&trace.evswitch),
4176 OPT_END()
4177 };
4178 bool __maybe_unused max_stack_user_set = true;
4179 bool mmap_pages_user_set = true;
4180 struct evsel *evsel;
4181 const char * const trace_subcommands[] = { "record", NULL };
4182 int err = -1;
4183 char bf[BUFSIZ];
4184
4185 signal(SIGSEGV, sighandler_dump_stack);
4186 signal(SIGFPE, sighandler_dump_stack);
4187
4188 trace.evlist = evlist__new();
4189 trace.sctbl = syscalltbl__new();
4190
4191 if (trace.evlist == NULL || trace.sctbl == NULL) {
4192 pr_err("Not enough memory to run!\n");
4193 err = -ENOMEM;
4194 goto out;
4195 }
4196
4197 /*
4198 * Parsing .perfconfig may entail creating a BPF event, that may need
4199 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
4200 * is too small. This affects just this process, not touching the
4201 * global setting. If it fails we'll get something in 'perf trace -v'
4202 * to help diagnose the problem.
4203 */
4204 rlimit__bump_memlock();
4205
4206 err = perf_config(trace__config, &trace);
4207 if (err)
4208 goto out;
4209
4210 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
4211 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
4212
4213 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
4214 usage_with_options_msg(trace_usage, trace_options,
4215 "cgroup monitoring only available in system-wide mode");
4216 }
4217
4218 evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
4219 if (IS_ERR(evsel)) {
4220 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
4221 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
4222 goto out;
4223 }
4224
4225 if (evsel) {
4226 trace.syscalls.events.augmented = evsel;
4227
4228 evsel = perf_evlist__find_tracepoint_by_name(trace.evlist, "raw_syscalls:sys_enter");
4229 if (evsel == NULL) {
4230 pr_err("ERROR: raw_syscalls:sys_enter not found in the augmented BPF object\n");
4231 goto out;
4232 }
4233
4234 if (evsel->bpf_obj == NULL) {
4235 pr_err("ERROR: raw_syscalls:sys_enter not associated to a BPF object\n");
4236 goto out;
4237 }
4238
4239 trace.bpf_obj = evsel->bpf_obj;
4240
4241 trace__set_bpf_map_filtered_pids(&trace);
4242 trace__set_bpf_map_syscalls(&trace);
4243 trace.syscalls.unaugmented_prog = trace__find_bpf_program_by_title(&trace, "!raw_syscalls:unaugmented");
4244 }
4245
4246 err = bpf__setup_stdout(trace.evlist);
4247 if (err) {
4248 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
4249 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
4250 goto out;
4251 }
4252
4253 err = -1;
4254
4255 if (map_dump_str) {
4256 trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str);
4257 if (trace.dump.map == NULL) {
4258 pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
4259 goto out;
4260 }
4261 }
4262
4263 if (trace.trace_pgfaults) {
4264 trace.opts.sample_address = true;
4265 trace.opts.sample_time = true;
4266 }
4267
4268 if (trace.opts.mmap_pages == UINT_MAX)
4269 mmap_pages_user_set = false;
4270
4271 if (trace.max_stack == UINT_MAX) {
4272 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
4273 max_stack_user_set = false;
4274 }
4275
4276#ifdef HAVE_DWARF_UNWIND_SUPPORT
4277 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
4278 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
4279 }
4280#endif
4281
4282 if (callchain_param.enabled) {
4283 if (!mmap_pages_user_set && geteuid() == 0)
4284 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
4285
4286 symbol_conf.use_callchain = true;
4287 }
4288
4289 if (trace.evlist->core.nr_entries > 0) {
4290 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
4291 if (evlist__set_syscall_tp_fields(trace.evlist)) {
4292 perror("failed to set syscalls:* tracepoint fields");
4293 goto out;
4294 }
4295 }
4296
4297 if (trace.sort_events) {
4298 ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
4299 ordered_events__set_copy_on_queue(&trace.oe.data, true);
4300 }
4301
4302 /*
4303 * If we are augmenting syscalls, then combine what we put in the
4304 * __augmented_syscalls__ BPF map with what is in the
4305 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
4306 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
4307 *
4308 * We'll switch to look at two BPF maps, one for sys_enter and the
4309 * other for sys_exit when we start augmenting the sys_exit paths with
4310 * buffers that are being copied from kernel to userspace, think 'read'
4311 * syscall.
4312 */
4313 if (trace.syscalls.events.augmented) {
4314 evlist__for_each_entry(trace.evlist, evsel) {
4315 bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
4316
4317 if (raw_syscalls_sys_exit) {
4318 trace.raw_augmented_syscalls = true;
4319 goto init_augmented_syscall_tp;
4320 }
4321
4322 if (trace.syscalls.events.augmented->priv == NULL &&
4323 strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) {
4324 struct evsel *augmented = trace.syscalls.events.augmented;
4325 if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
4326 perf_evsel__init_augmented_syscall_tp_args(augmented))
4327 goto out;
4328 /*
4329 * Augmented is __augmented_syscalls__ BPF_OUTPUT event
4330 * Above we made sure we can get from the payload the tp fields
4331 * that we get from syscalls:sys_enter tracefs format file.
4332 */
4333 augmented->handler = trace__sys_enter;
4334 /*
4335 * Now we do the same for the *syscalls:sys_enter event so that
4336 * if we handle it directly, i.e. if the BPF prog returns 0 so
4337 * as not to filter it, then we'll handle it just like we would
4338 * for the BPF_OUTPUT one:
4339 */
4340 if (perf_evsel__init_augmented_syscall_tp(evsel, evsel) ||
4341 perf_evsel__init_augmented_syscall_tp_args(evsel))
4342 goto out;
4343 evsel->handler = trace__sys_enter;
4344 }
4345
4346 if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
4347 struct syscall_tp *sc;
4348init_augmented_syscall_tp:
4349 if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
4350 goto out;
4351 sc = evsel->priv;
4352 /*
4353 * For now with BPF raw_augmented we hook into
4354 * raw_syscalls:sys_enter and there we get all
4355 * 6 syscall args plus the tracepoint common
4356 * fields and the syscall_nr (another long).
4357 * So we check if that is the case and if so
4358 * don't look after the sc->args_size but
4359 * always after the full raw_syscalls:sys_enter
4360 * payload, which is fixed.
4361 *
4362 * We'll revisit this later to pass
4363 * s->args_size to the BPF augmenter (now
4364 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
4365 * so that it copies only what we need for each
4366 * syscall, like what happens when we use
4367 * syscalls:sys_enter_NAME, so that we reduce
4368 * the kernel/userspace traffic to just what is
4369 * needed for each syscall.
4370 */
4371 if (trace.raw_augmented_syscalls)
4372 trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
4373 perf_evsel__init_augmented_syscall_tp_ret(evsel);
4374 evsel->handler = trace__sys_exit;
4375 }
4376 }
4377 }
4378
4379 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
4380 return trace__record(&trace, argc-1, &argv[1]);
4381
4382 /* summary_only implies summary option, but don't overwrite summary if set */
4383 if (trace.summary_only)
4384 trace.summary = trace.summary_only;
4385
4386 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
4387 trace.evlist->core.nr_entries == 0 /* Was --events used? */) {
4388 trace.trace_syscalls = true;
4389 }
4390
4391 if (output_name != NULL) {
4392 err = trace__open_output(&trace, output_name);
4393 if (err < 0) {
4394 perror("failed to create output file");
4395 goto out;
4396 }
4397 }
4398
4399 err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
4400 if (err)
4401 goto out_close;
4402
4403 err = target__validate(&trace.opts.target);
4404 if (err) {
4405 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
4406 fprintf(trace.output, "%s", bf);
4407 goto out_close;
4408 }
4409
4410 err = target__parse_uid(&trace.opts.target);
4411 if (err) {
4412 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
4413 fprintf(trace.output, "%s", bf);
4414 goto out_close;
4415 }
4416
4417 if (!argc && target__none(&trace.opts.target))
4418 trace.opts.target.system_wide = true;
4419
4420 if (input_name)
4421 err = trace__replay(&trace);
4422 else
4423 err = trace__run(&trace, argc, argv);
4424
4425out_close:
4426 if (output_name != NULL)
4427 fclose(trace.output);
4428out:
4429 return err;
4430}
1/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 *
16 * Released under the GPL v2. (and only v2, not any later version)
17 */
18
19#include <traceevent/event-parse.h>
20#include <api/fs/tracing_path.h>
21#include "builtin.h"
22#include "util/color.h"
23#include "util/debug.h"
24#include "util/evlist.h"
25#include <subcmd/exec-cmd.h>
26#include "util/machine.h"
27#include "util/session.h"
28#include "util/thread.h"
29#include <subcmd/parse-options.h>
30#include "util/strlist.h"
31#include "util/intlist.h"
32#include "util/thread_map.h"
33#include "util/stat.h"
34#include "trace-event.h"
35#include "util/parse-events.h"
36#include "util/bpf-loader.h"
37
38#include <libaudit.h>
39#include <stdlib.h>
40#include <sys/mman.h>
41#include <linux/futex.h>
42#include <linux/err.h>
43
/*
 * Fallbacks for older distros: each constant below is defined here only
 * when the headers included so far did not already provide it.
 */
#if !defined(MAP_STACK)
# define MAP_STACK		0x20000
#endif

#if !defined(MADV_HWPOISON)
# define MADV_HWPOISON		100
#endif

#if !defined(MADV_MERGEABLE)
# define MADV_MERGEABLE		12
#endif

#if !defined(MADV_UNMERGEABLE)
# define MADV_UNMERGEABLE	13
#endif

#if !defined(EFD_SEMAPHORE)
# define EFD_SEMAPHORE		1
#endif

#if !defined(EFD_NONBLOCK)
# define EFD_NONBLOCK		00004000
#endif

#if !defined(EFD_CLOEXEC)
# define EFD_CLOEXEC		02000000
#endif

#if !defined(O_CLOEXEC)
# define O_CLOEXEC		02000000
#endif

#if !defined(SOCK_DCCP)
# define SOCK_DCCP		6
#endif

#if !defined(SOCK_CLOEXEC)
# define SOCK_CLOEXEC		02000000
#endif

#if !defined(SOCK_NONBLOCK)
# define SOCK_NONBLOCK		00004000
#endif

#if !defined(MSG_CMSG_CLOEXEC)
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

#if !defined(PERF_FLAG_FD_NO_GROUP)
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif

#if !defined(PERF_FLAG_FD_OUTPUT)
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif

#if !defined(PERF_FLAG_PID_CGROUP)
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#if !defined(PERF_FLAG_FD_CLOEXEC)
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif
109
110
111struct tp_field {
112 int offset;
113 union {
114 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
115 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
116 };
117};
118
119#define TP_UINT_FIELD(bits) \
120static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
121{ \
122 u##bits value; \
123 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
124 return value; \
125}
126
127TP_UINT_FIELD(8);
128TP_UINT_FIELD(16);
129TP_UINT_FIELD(32);
130TP_UINT_FIELD(64);
131
132#define TP_UINT_FIELD__SWAPPED(bits) \
133static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
134{ \
135 u##bits value; \
136 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
137 return bswap_##bits(value);\
138}
139
140TP_UINT_FIELD__SWAPPED(16);
141TP_UINT_FIELD__SWAPPED(32);
142TP_UINT_FIELD__SWAPPED(64);
143
144static int tp_field__init_uint(struct tp_field *field,
145 struct format_field *format_field,
146 bool needs_swap)
147{
148 field->offset = format_field->offset;
149
150 switch (format_field->size) {
151 case 1:
152 field->integer = tp_field__u8;
153 break;
154 case 2:
155 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
156 break;
157 case 4:
158 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
159 break;
160 case 8:
161 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
162 break;
163 default:
164 return -1;
165 }
166
167 return 0;
168}
169
170static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
171{
172 return sample->raw_data + field->offset;
173}
174
175static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
176{
177 field->offset = format_field->offset;
178 field->pointer = tp_field__ptr;
179 return 0;
180}
181
182struct syscall_tp {
183 struct tp_field id;
184 union {
185 struct tp_field args, ret;
186 };
187};
188
189static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
190 struct tp_field *field,
191 const char *name)
192{
193 struct format_field *format_field = perf_evsel__field(evsel, name);
194
195 if (format_field == NULL)
196 return -1;
197
198 return tp_field__init_uint(field, format_field, evsel->needs_swap);
199}
200
/*
 * Initialise the integer accessor for member @name of the struct syscall_tp
 * hanging off evsel->priv, looking the field up by the stringified @name.
 * The evsel argument is parenthesized so that any pointer expression can be
 * passed, not just a plain identifier.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
204
/*
 * Look up the tracepoint field called @name on @evsel and initialise
 * @field to return a pointer to it.
 *
 * Return: -1 when the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	if (fmt != NULL)
		return tp_field__init_ptr(field, fmt);

	return -1;
}
216
/*
 * Initialise the pointer accessor for member @name of the struct syscall_tp
 * hanging off evsel->priv, looking the field up by the stringified @name.
 * The evsel argument is parenthesized so that any pointer expression can be
 * passed, not just a plain identifier.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
220
221static void perf_evsel__delete_priv(struct perf_evsel *evsel)
222{
223 zfree(&evsel->priv);
224 perf_evsel__delete(evsel);
225}
226
227static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
228{
229 evsel->priv = malloc(sizeof(struct syscall_tp));
230 if (evsel->priv != NULL) {
231 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
232 goto out_delete;
233
234 evsel->handler = handler;
235 return 0;
236 }
237
238 return -ENOMEM;
239
240out_delete:
241 zfree(&evsel->priv);
242 return -ENOENT;
243}
244
/*
 * Create the evsel for the raw_syscalls:@direction tracepoint, falling back
 * to syscalls:@direction, and initialise it with @handler.
 *
 * Return: the new evsel, or NULL when neither tracepoint can be created or
 * the syscall_tp initialisation fails (the evsel is deleted in that case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
265
/*
 * Read member @name of the evsel's struct syscall_tp from @sample through
 * its integer accessor.  The evsel argument is parenthesized so that any
 * pointer expression can be passed.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above, but through the pointer accessor: yields the address of
 * the field inside @sample.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, sample); })
273
274struct syscall_arg {
275 unsigned long val;
276 struct thread *thread;
277 struct trace *trace;
278 void *parm;
279 u8 idx;
280 u8 mask;
281};
282
/*
 * Table mapping integer values to names: entries[i] is the name for the
 * value (offset + i), for i in [0, nr_entries).
 */
struct strarray {
	int offset, nr_entries;
	const char **entries;
};
288
/*
 * Define strarray__<array>, a struct strarray wrapping the string array
 * <array>; the offset member is left to its default initialisation.
 */
#define DEFINE_STRARRAY(array) \
struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
}

/* Same, for arrays whose first entry names the value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off) \
struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
	.offset	    = off, \
}
299
300static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
301 const char *intfmt,
302 struct syscall_arg *arg)
303{
304 struct strarray *sa = arg->parm;
305 int idx = arg->val - sa->offset;
306
307 if (idx < 0 || idx >= sa->nr_entries)
308 return scnprintf(bf, size, intfmt, arg->val);
309
310 return scnprintf(bf, size, "%s", sa->entries[idx]);
311}
312
/*
 * strarray beautifier: values without a table entry are printed as decimal.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
320
#if defined(__i386__) || defined(__x86_64__)
/*
 * strarray beautifier whose fallback for unknown values is hexadecimal.
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
334
335static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
336 struct syscall_arg *arg);
337
338#define SCA_FD syscall_arg__scnprintf_fd
339
340static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
341 struct syscall_arg *arg)
342{
343 int fd = arg->val;
344
345 if (fd == AT_FDCWD)
346 return scnprintf(bf, size, "CWD");
347
348 return syscall_arg__scnprintf_fd(bf, size, arg);
349}
350
351#define SCA_FDAT syscall_arg__scnprintf_fd_at
352
/* Forward declaration of the beautifier installed as SCA_CLOSE_FD. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
357
358static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
359 struct syscall_arg *arg)
360{
361 return scnprintf(bf, size, "%#lx", arg->val);
362}
363
364#define SCA_HEX syscall_arg__scnprintf_hex
365
366static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
367 struct syscall_arg *arg)
368{
369 return scnprintf(bf, size, "%d", arg->val);
370}
371
372#define SCA_INT syscall_arg__scnprintf_int
373
374static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
375 struct syscall_arg *arg)
376{
377 int printed = 0, prot = arg->val;
378
379 if (prot == PROT_NONE)
380 return scnprintf(bf, size, "NONE");
381#define P_MMAP_PROT(n) \
382 if (prot & PROT_##n) { \
383 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
384 prot &= ~PROT_##n; \
385 }
386
387 P_MMAP_PROT(EXEC);
388 P_MMAP_PROT(READ);
389 P_MMAP_PROT(WRITE);
390#ifdef PROT_SEM
391 P_MMAP_PROT(SEM);
392#endif
393 P_MMAP_PROT(GROWSDOWN);
394 P_MMAP_PROT(GROWSUP);
395#undef P_MMAP_PROT
396
397 if (prot)
398 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
399
400 return printed;
401}
402
403#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
404
405static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
406 struct syscall_arg *arg)
407{
408 int printed = 0, flags = arg->val;
409
410#define P_MMAP_FLAG(n) \
411 if (flags & MAP_##n) { \
412 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
413 flags &= ~MAP_##n; \
414 }
415
416 P_MMAP_FLAG(SHARED);
417 P_MMAP_FLAG(PRIVATE);
418#ifdef MAP_32BIT
419 P_MMAP_FLAG(32BIT);
420#endif
421 P_MMAP_FLAG(ANONYMOUS);
422 P_MMAP_FLAG(DENYWRITE);
423 P_MMAP_FLAG(EXECUTABLE);
424 P_MMAP_FLAG(FILE);
425 P_MMAP_FLAG(FIXED);
426 P_MMAP_FLAG(GROWSDOWN);
427#ifdef MAP_HUGETLB
428 P_MMAP_FLAG(HUGETLB);
429#endif
430 P_MMAP_FLAG(LOCKED);
431 P_MMAP_FLAG(NONBLOCK);
432 P_MMAP_FLAG(NORESERVE);
433 P_MMAP_FLAG(POPULATE);
434 P_MMAP_FLAG(STACK);
435#ifdef MAP_UNINITIALIZED
436 P_MMAP_FLAG(UNINITIALIZED);
437#endif
438#undef P_MMAP_FLAG
439
440 if (flags)
441 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
442
443 return printed;
444}
445
446#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
447
448static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
449 struct syscall_arg *arg)
450{
451 int printed = 0, flags = arg->val;
452
453#define P_MREMAP_FLAG(n) \
454 if (flags & MREMAP_##n) { \
455 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
456 flags &= ~MREMAP_##n; \
457 }
458
459 P_MREMAP_FLAG(MAYMOVE);
460#ifdef MREMAP_FIXED
461 P_MREMAP_FLAG(FIXED);
462#endif
463#undef P_MREMAP_FLAG
464
465 if (flags)
466 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
467
468 return printed;
469}
470
471#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
472
473static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
474 struct syscall_arg *arg)
475{
476 int behavior = arg->val;
477
478 switch (behavior) {
479#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
480 P_MADV_BHV(NORMAL);
481 P_MADV_BHV(RANDOM);
482 P_MADV_BHV(SEQUENTIAL);
483 P_MADV_BHV(WILLNEED);
484 P_MADV_BHV(DONTNEED);
485 P_MADV_BHV(REMOVE);
486 P_MADV_BHV(DONTFORK);
487 P_MADV_BHV(DOFORK);
488 P_MADV_BHV(HWPOISON);
489#ifdef MADV_SOFT_OFFLINE
490 P_MADV_BHV(SOFT_OFFLINE);
491#endif
492 P_MADV_BHV(MERGEABLE);
493 P_MADV_BHV(UNMERGEABLE);
494#ifdef MADV_HUGEPAGE
495 P_MADV_BHV(HUGEPAGE);
496#endif
497#ifdef MADV_NOHUGEPAGE
498 P_MADV_BHV(NOHUGEPAGE);
499#endif
500#ifdef MADV_DONTDUMP
501 P_MADV_BHV(DONTDUMP);
502#endif
503#ifdef MADV_DODUMP
504 P_MADV_BHV(DODUMP);
505#endif
506#undef P_MADV_PHV
507 default: break;
508 }
509
510 return scnprintf(bf, size, "%#x", behavior);
511}
512
513#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
514
515static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
516 struct syscall_arg *arg)
517{
518 int printed = 0, op = arg->val;
519
520 if (op == 0)
521 return scnprintf(bf, size, "NONE");
522#define P_CMD(cmd) \
523 if ((op & LOCK_##cmd) == LOCK_##cmd) { \
524 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
525 op &= ~LOCK_##cmd; \
526 }
527
528 P_CMD(SH);
529 P_CMD(EX);
530 P_CMD(NB);
531 P_CMD(UN);
532 P_CMD(MAND);
533 P_CMD(RW);
534 P_CMD(READ);
535 P_CMD(WRITE);
536#undef P_OP
537
538 if (op)
539 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
540
541 return printed;
542}
543
544#define SCA_FLOCK syscall_arg__scnprintf_flock
545
546static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
547{
548 enum syscall_futex_args {
549 SCF_UADDR = (1 << 0),
550 SCF_OP = (1 << 1),
551 SCF_VAL = (1 << 2),
552 SCF_TIMEOUT = (1 << 3),
553 SCF_UADDR2 = (1 << 4),
554 SCF_VAL3 = (1 << 5),
555 };
556 int op = arg->val;
557 int cmd = op & FUTEX_CMD_MASK;
558 size_t printed = 0;
559
560 switch (cmd) {
561#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
562 P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
563 P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
564 P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
565 P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break;
566 P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break;
567 P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break;
568 P_FUTEX_OP(WAKE_OP); break;
569 P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
570 P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
571 P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
572 P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break;
573 P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break;
574 P_FUTEX_OP(WAIT_REQUEUE_PI); break;
575 default: printed = scnprintf(bf, size, "%#x", cmd); break;
576 }
577
578 if (op & FUTEX_PRIVATE_FLAG)
579 printed += scnprintf(bf + printed, size - printed, "|PRIV");
580
581 if (op & FUTEX_CLOCK_REALTIME)
582 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
583
584 return printed;
585}
586
587#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op
588
589static const char *bpf_cmd[] = {
590 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
591 "MAP_GET_NEXT_KEY", "PROG_LOAD",
592};
593static DEFINE_STRARRAY(bpf_cmd);
594
595static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
596static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
597
598static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
599static DEFINE_STRARRAY(itimers);
600
601static const char *keyctl_options[] = {
602 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
603 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
604 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
605 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
606 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
607};
608static DEFINE_STRARRAY(keyctl_options);
609
610static const char *whences[] = { "SET", "CUR", "END",
611#ifdef SEEK_DATA
612"DATA",
613#endif
614#ifdef SEEK_HOLE
615"HOLE",
616#endif
617};
618static DEFINE_STRARRAY(whences);
619
620static const char *fcntl_cmds[] = {
621 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
622 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
623 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
624 "F_GETOWNER_UIDS",
625};
626static DEFINE_STRARRAY(fcntl_cmds);
627
628static const char *rlimit_resources[] = {
629 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
630 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
631 "RTTIME",
632};
633static DEFINE_STRARRAY(rlimit_resources);
634
635static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
636static DEFINE_STRARRAY(sighow);
637
638static const char *clockid[] = {
639 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
640 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
641 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
642};
643static DEFINE_STRARRAY(clockid);
644
645static const char *socket_families[] = {
646 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
647 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
648 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
649 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
650 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
651 "ALG", "NFC", "VSOCK",
652};
653static DEFINE_STRARRAY(socket_families);
654
655#ifndef SOCK_TYPE_MASK
656#define SOCK_TYPE_MASK 0xf
657#endif
658
659static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
660 struct syscall_arg *arg)
661{
662 size_t printed;
663 int type = arg->val,
664 flags = type & ~SOCK_TYPE_MASK;
665
666 type &= SOCK_TYPE_MASK;
667 /*
668 * Can't use a strarray, MIPS may override for ABI reasons.
669 */
670 switch (type) {
671#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
672 P_SK_TYPE(STREAM);
673 P_SK_TYPE(DGRAM);
674 P_SK_TYPE(RAW);
675 P_SK_TYPE(RDM);
676 P_SK_TYPE(SEQPACKET);
677 P_SK_TYPE(DCCP);
678 P_SK_TYPE(PACKET);
679#undef P_SK_TYPE
680 default:
681 printed = scnprintf(bf, size, "%#x", type);
682 }
683
684#define P_SK_FLAG(n) \
685 if (flags & SOCK_##n) { \
686 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
687 flags &= ~SOCK_##n; \
688 }
689
690 P_SK_FLAG(CLOEXEC);
691 P_SK_FLAG(NONBLOCK);
692#undef P_SK_FLAG
693
694 if (flags)
695 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
696
697 return printed;
698}
699
700#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
701
702#ifndef MSG_PROBE
703#define MSG_PROBE 0x10
704#endif
705#ifndef MSG_WAITFORONE
706#define MSG_WAITFORONE 0x10000
707#endif
708#ifndef MSG_SENDPAGE_NOTLAST
709#define MSG_SENDPAGE_NOTLAST 0x20000
710#endif
711#ifndef MSG_FASTOPEN
712#define MSG_FASTOPEN 0x20000000
713#endif
714
715static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
716 struct syscall_arg *arg)
717{
718 int printed = 0, flags = arg->val;
719
720 if (flags == 0)
721 return scnprintf(bf, size, "NONE");
722#define P_MSG_FLAG(n) \
723 if (flags & MSG_##n) { \
724 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
725 flags &= ~MSG_##n; \
726 }
727
728 P_MSG_FLAG(OOB);
729 P_MSG_FLAG(PEEK);
730 P_MSG_FLAG(DONTROUTE);
731 P_MSG_FLAG(TRYHARD);
732 P_MSG_FLAG(CTRUNC);
733 P_MSG_FLAG(PROBE);
734 P_MSG_FLAG(TRUNC);
735 P_MSG_FLAG(DONTWAIT);
736 P_MSG_FLAG(EOR);
737 P_MSG_FLAG(WAITALL);
738 P_MSG_FLAG(FIN);
739 P_MSG_FLAG(SYN);
740 P_MSG_FLAG(CONFIRM);
741 P_MSG_FLAG(RST);
742 P_MSG_FLAG(ERRQUEUE);
743 P_MSG_FLAG(NOSIGNAL);
744 P_MSG_FLAG(MORE);
745 P_MSG_FLAG(WAITFORONE);
746 P_MSG_FLAG(SENDPAGE_NOTLAST);
747 P_MSG_FLAG(FASTOPEN);
748 P_MSG_FLAG(CMSG_CLOEXEC);
749#undef P_MSG_FLAG
750
751 if (flags)
752 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
753
754 return printed;
755}
756
757#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
758
759static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
760 struct syscall_arg *arg)
761{
762 size_t printed = 0;
763 int mode = arg->val;
764
765 if (mode == F_OK) /* 0 */
766 return scnprintf(bf, size, "F");
767#define P_MODE(n) \
768 if (mode & n##_OK) { \
769 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
770 mode &= ~n##_OK; \
771 }
772
773 P_MODE(R);
774 P_MODE(W);
775 P_MODE(X);
776#undef P_MODE
777
778 if (mode)
779 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
780
781 return printed;
782}
783
784#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
785
/*
 * Forward declaration: the definition comes later in this file, but the
 * syscall_fmts table needs SCA_FILENAME before that point.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
790
791static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
792 struct syscall_arg *arg)
793{
794 int printed = 0, flags = arg->val;
795
796 if (!(flags & O_CREAT))
797 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
798
799 if (flags == 0)
800 return scnprintf(bf, size, "RDONLY");
801#define P_FLAG(n) \
802 if (flags & O_##n) { \
803 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
804 flags &= ~O_##n; \
805 }
806
807 P_FLAG(APPEND);
808 P_FLAG(ASYNC);
809 P_FLAG(CLOEXEC);
810 P_FLAG(CREAT);
811 P_FLAG(DIRECT);
812 P_FLAG(DIRECTORY);
813 P_FLAG(EXCL);
814 P_FLAG(LARGEFILE);
815 P_FLAG(NOATIME);
816 P_FLAG(NOCTTY);
817#ifdef O_NONBLOCK
818 P_FLAG(NONBLOCK);
819#elif O_NDELAY
820 P_FLAG(NDELAY);
821#endif
822#ifdef O_PATH
823 P_FLAG(PATH);
824#endif
825 P_FLAG(RDWR);
826#ifdef O_DSYNC
827 if ((flags & O_SYNC) == O_SYNC)
828 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
829 else {
830 P_FLAG(DSYNC);
831 }
832#else
833 P_FLAG(SYNC);
834#endif
835 P_FLAG(TRUNC);
836 P_FLAG(WRONLY);
837#undef P_FLAG
838
839 if (flags)
840 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
841
842 return printed;
843}
844
845#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
846
847static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
848 struct syscall_arg *arg)
849{
850 int printed = 0, flags = arg->val;
851
852 if (flags == 0)
853 return 0;
854
855#define P_FLAG(n) \
856 if (flags & PERF_FLAG_##n) { \
857 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
858 flags &= ~PERF_FLAG_##n; \
859 }
860
861 P_FLAG(FD_NO_GROUP);
862 P_FLAG(FD_OUTPUT);
863 P_FLAG(PID_CGROUP);
864 P_FLAG(FD_CLOEXEC);
865#undef P_FLAG
866
867 if (flags)
868 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
869
870 return printed;
871}
872
873#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
874
875static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
876 struct syscall_arg *arg)
877{
878 int printed = 0, flags = arg->val;
879
880 if (flags == 0)
881 return scnprintf(bf, size, "NONE");
882#define P_FLAG(n) \
883 if (flags & EFD_##n) { \
884 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
885 flags &= ~EFD_##n; \
886 }
887
888 P_FLAG(SEMAPHORE);
889 P_FLAG(CLOEXEC);
890 P_FLAG(NONBLOCK);
891#undef P_FLAG
892
893 if (flags)
894 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
895
896 return printed;
897}
898
899#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
900
901static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
902 struct syscall_arg *arg)
903{
904 int printed = 0, flags = arg->val;
905
906#define P_FLAG(n) \
907 if (flags & O_##n) { \
908 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
909 flags &= ~O_##n; \
910 }
911
912 P_FLAG(CLOEXEC);
913 P_FLAG(NONBLOCK);
914#undef P_FLAG
915
916 if (flags)
917 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
918
919 return printed;
920}
921
922#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
923
924static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
925{
926 int sig = arg->val;
927
928 switch (sig) {
929#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
930 P_SIGNUM(HUP);
931 P_SIGNUM(INT);
932 P_SIGNUM(QUIT);
933 P_SIGNUM(ILL);
934 P_SIGNUM(TRAP);
935 P_SIGNUM(ABRT);
936 P_SIGNUM(BUS);
937 P_SIGNUM(FPE);
938 P_SIGNUM(KILL);
939 P_SIGNUM(USR1);
940 P_SIGNUM(SEGV);
941 P_SIGNUM(USR2);
942 P_SIGNUM(PIPE);
943 P_SIGNUM(ALRM);
944 P_SIGNUM(TERM);
945 P_SIGNUM(CHLD);
946 P_SIGNUM(CONT);
947 P_SIGNUM(STOP);
948 P_SIGNUM(TSTP);
949 P_SIGNUM(TTIN);
950 P_SIGNUM(TTOU);
951 P_SIGNUM(URG);
952 P_SIGNUM(XCPU);
953 P_SIGNUM(XFSZ);
954 P_SIGNUM(VTALRM);
955 P_SIGNUM(PROF);
956 P_SIGNUM(WINCH);
957 P_SIGNUM(IO);
958 P_SIGNUM(PWR);
959 P_SIGNUM(SYS);
960#ifdef SIGEMT
961 P_SIGNUM(EMT);
962#endif
963#ifdef SIGSTKFLT
964 P_SIGNUM(STKFLT);
965#endif
966#ifdef SIGSWI
967 P_SIGNUM(SWI);
968#endif
969 default: break;
970 }
971
972 return scnprintf(bf, size, "%#x", sig);
973}
974
975#define SCA_SIGNUM syscall_arg__scnprintf_signum
976
977#if defined(__i386__) || defined(__x86_64__)
978/*
979 * FIXME: Make this available to all arches.
980 */
981#define TCGETS 0x5401
982
983static const char *tioctls[] = {
984 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
985 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
986 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
987 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
988 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
989 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
990 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
991 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
992 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
993 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
994 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
995 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
996 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
997 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
998 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
999};
1000
1001static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
1002#endif /* defined(__i386__) || defined(__x86_64__) */
1003
/*
 * Initializer fragment for a syscall_fmt entry whose argument number @arg is
 * printed by SCA_STRARRAY using strarray__<array>.  @name is not used in the
 * expansion; callers pass the argument's name there.
 */
#define STRARRAY(arg, name, array)			\
	.arg_scnprintf = { [arg] = SCA_STRARRAY, },	\
	.arg_parm      = { [arg] = &strarray__##array, }
1007
1008static struct syscall_fmt {
1009 const char *name;
1010 const char *alias;
1011 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1012 void *arg_parm[6];
1013 bool errmsg;
1014 bool timeout;
1015 bool hexret;
1016} syscall_fmts[] = {
1017 { .name = "access", .errmsg = true,
1018 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1019 [1] = SCA_ACCMODE, /* mode */ }, },
1020 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
1021 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1022 { .name = "brk", .hexret = true,
1023 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1024 { .name = "chdir", .errmsg = true,
1025 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1026 { .name = "chmod", .errmsg = true,
1027 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1028 { .name = "chroot", .errmsg = true,
1029 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1030 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
1031 { .name = "close", .errmsg = true,
1032 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1033 { .name = "connect", .errmsg = true, },
1034 { .name = "creat", .errmsg = true,
1035 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1036 { .name = "dup", .errmsg = true,
1037 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1038 { .name = "dup2", .errmsg = true,
1039 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1040 { .name = "dup3", .errmsg = true,
1041 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1042 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1043 { .name = "eventfd2", .errmsg = true,
1044 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1045 { .name = "faccessat", .errmsg = true,
1046 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1047 [1] = SCA_FILENAME, /* filename */ }, },
1048 { .name = "fadvise64", .errmsg = true,
1049 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1050 { .name = "fallocate", .errmsg = true,
1051 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1052 { .name = "fchdir", .errmsg = true,
1053 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1054 { .name = "fchmod", .errmsg = true,
1055 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1056 { .name = "fchmodat", .errmsg = true,
1057 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1058 [1] = SCA_FILENAME, /* filename */ }, },
1059 { .name = "fchown", .errmsg = true,
1060 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061 { .name = "fchownat", .errmsg = true,
1062 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1063 [1] = SCA_FILENAME, /* filename */ }, },
1064 { .name = "fcntl", .errmsg = true,
1065 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1066 [1] = SCA_STRARRAY, /* cmd */ },
1067 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1068 { .name = "fdatasync", .errmsg = true,
1069 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1070 { .name = "flock", .errmsg = true,
1071 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1072 [1] = SCA_FLOCK, /* cmd */ }, },
1073 { .name = "fsetxattr", .errmsg = true,
1074 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1075 { .name = "fstat", .errmsg = true, .alias = "newfstat",
1076 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1077 { .name = "fstatat", .errmsg = true, .alias = "newfstatat",
1078 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1079 [1] = SCA_FILENAME, /* filename */ }, },
1080 { .name = "fstatfs", .errmsg = true,
1081 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1082 { .name = "fsync", .errmsg = true,
1083 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1084 { .name = "ftruncate", .errmsg = true,
1085 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1086 { .name = "futex", .errmsg = true,
1087 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1088 { .name = "futimesat", .errmsg = true,
1089 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1090 [1] = SCA_FILENAME, /* filename */ }, },
1091 { .name = "getdents", .errmsg = true,
1092 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1093 { .name = "getdents64", .errmsg = true,
1094 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1095 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
1096 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1097 { .name = "getxattr", .errmsg = true,
1098 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1099 { .name = "inotify_add_watch", .errmsg = true,
1100 .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1101 { .name = "ioctl", .errmsg = true,
1102 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1103#if defined(__i386__) || defined(__x86_64__)
1104/*
1105 * FIXME: Make this available to all arches.
1106 */
1107 [1] = SCA_STRHEXARRAY, /* cmd */
1108 [2] = SCA_HEX, /* arg */ },
1109 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
1110#else
1111 [2] = SCA_HEX, /* arg */ }, },
1112#endif
1113 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
1114 { .name = "kill", .errmsg = true,
1115 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1116 { .name = "lchown", .errmsg = true,
1117 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1118 { .name = "lgetxattr", .errmsg = true,
1119 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1120 { .name = "linkat", .errmsg = true,
1121 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1122 { .name = "listxattr", .errmsg = true,
1123 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1124 { .name = "llistxattr", .errmsg = true,
1125 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1126 { .name = "lremovexattr", .errmsg = true,
1127 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1128 { .name = "lseek", .errmsg = true,
1129 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1130 [2] = SCA_STRARRAY, /* whence */ },
1131 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
1132 { .name = "lsetxattr", .errmsg = true,
1133 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1134 { .name = "lstat", .errmsg = true, .alias = "newlstat",
1135 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1136 { .name = "lsxattr", .errmsg = true,
1137 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1138 { .name = "madvise", .errmsg = true,
1139 .arg_scnprintf = { [0] = SCA_HEX, /* start */
1140 [2] = SCA_MADV_BHV, /* behavior */ }, },
1141 { .name = "mkdir", .errmsg = true,
1142 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1143 { .name = "mkdirat", .errmsg = true,
1144 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1145 [1] = SCA_FILENAME, /* pathname */ }, },
1146 { .name = "mknod", .errmsg = true,
1147 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1148 { .name = "mknodat", .errmsg = true,
1149 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1150 [1] = SCA_FILENAME, /* filename */ }, },
1151 { .name = "mlock", .errmsg = true,
1152 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1153 { .name = "mlockall", .errmsg = true,
1154 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1155 { .name = "mmap", .hexret = true,
1156 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1157 [2] = SCA_MMAP_PROT, /* prot */
1158 [3] = SCA_MMAP_FLAGS, /* flags */
1159 [4] = SCA_FD, /* fd */ }, },
1160 { .name = "mprotect", .errmsg = true,
1161 .arg_scnprintf = { [0] = SCA_HEX, /* start */
1162 [2] = SCA_MMAP_PROT, /* prot */ }, },
1163 { .name = "mq_unlink", .errmsg = true,
1164 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1165 { .name = "mremap", .hexret = true,
1166 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1167 [3] = SCA_MREMAP_FLAGS, /* flags */
1168 [4] = SCA_HEX, /* new_addr */ }, },
1169 { .name = "munlock", .errmsg = true,
1170 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1171 { .name = "munmap", .errmsg = true,
1172 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1173 { .name = "name_to_handle_at", .errmsg = true,
1174 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1175 { .name = "newfstatat", .errmsg = true,
1176 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1177 [1] = SCA_FILENAME, /* filename */ }, },
1178 { .name = "open", .errmsg = true,
1179 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1180 [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1181 { .name = "open_by_handle_at", .errmsg = true,
1182 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1183 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1184 { .name = "openat", .errmsg = true,
1185 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1186 [1] = SCA_FILENAME, /* filename */
1187 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1188 { .name = "perf_event_open", .errmsg = true,
1189 .arg_scnprintf = { [1] = SCA_INT, /* pid */
1190 [2] = SCA_INT, /* cpu */
1191 [3] = SCA_FD, /* group_fd */
1192 [4] = SCA_PERF_FLAGS, /* flags */ }, },
1193 { .name = "pipe2", .errmsg = true,
1194 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1195 { .name = "poll", .errmsg = true, .timeout = true, },
1196 { .name = "ppoll", .errmsg = true, .timeout = true, },
1197 { .name = "pread", .errmsg = true, .alias = "pread64",
1198 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1199 { .name = "preadv", .errmsg = true, .alias = "pread",
1200 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1201 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1202 { .name = "pwrite", .errmsg = true, .alias = "pwrite64",
1203 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1204 { .name = "pwritev", .errmsg = true,
1205 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1206 { .name = "read", .errmsg = true,
1207 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1208 { .name = "readlink", .errmsg = true,
1209 .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1210 { .name = "readlinkat", .errmsg = true,
1211 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1212 [1] = SCA_FILENAME, /* pathname */ }, },
1213 { .name = "readv", .errmsg = true,
1214 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1215 { .name = "recvfrom", .errmsg = true,
1216 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1217 [3] = SCA_MSG_FLAGS, /* flags */ }, },
1218 { .name = "recvmmsg", .errmsg = true,
1219 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1220 [3] = SCA_MSG_FLAGS, /* flags */ }, },
1221 { .name = "recvmsg", .errmsg = true,
1222 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1223 [2] = SCA_MSG_FLAGS, /* flags */ }, },
1224 { .name = "removexattr", .errmsg = true,
1225 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1226 { .name = "renameat", .errmsg = true,
1227 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1228 { .name = "rmdir", .errmsg = true,
1229 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1230 { .name = "rt_sigaction", .errmsg = true,
1231 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1232 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
1233 { .name = "rt_sigqueueinfo", .errmsg = true,
1234 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1235 { .name = "rt_tgsigqueueinfo", .errmsg = true,
1236 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1237 { .name = "select", .errmsg = true, .timeout = true, },
1238 { .name = "sendmmsg", .errmsg = true,
1239 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1240 [3] = SCA_MSG_FLAGS, /* flags */ }, },
1241 { .name = "sendmsg", .errmsg = true,
1242 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1243 [2] = SCA_MSG_FLAGS, /* flags */ }, },
1244 { .name = "sendto", .errmsg = true,
1245 .arg_scnprintf = { [0] = SCA_FD, /* fd */
1246 [3] = SCA_MSG_FLAGS, /* flags */ }, },
1247 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
1248 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1249 { .name = "setxattr", .errmsg = true,
1250 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1251 { .name = "shutdown", .errmsg = true,
1252 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1253 { .name = "socket", .errmsg = true,
1254 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1255 [1] = SCA_SK_TYPE, /* type */ },
1256 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
1257 { .name = "socketpair", .errmsg = true,
1258 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1259 [1] = SCA_SK_TYPE, /* type */ },
1260 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
1261 { .name = "stat", .errmsg = true, .alias = "newstat",
1262 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1263 { .name = "statfs", .errmsg = true,
1264 .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1265 { .name = "swapoff", .errmsg = true,
1266 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1267 { .name = "swapon", .errmsg = true,
1268 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1269 { .name = "symlinkat", .errmsg = true,
1270 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1271 { .name = "tgkill", .errmsg = true,
1272 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1273 { .name = "tkill", .errmsg = true,
1274 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1275 { .name = "truncate", .errmsg = true,
1276 .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1277 { .name = "uname", .errmsg = true, .alias = "newuname", },
1278 { .name = "unlinkat", .errmsg = true,
1279 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1280 [1] = SCA_FILENAME, /* pathname */ }, },
1281 { .name = "utime", .errmsg = true,
1282 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1283 { .name = "utimensat", .errmsg = true,
1284 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1285 [1] = SCA_FILENAME, /* filename */ }, },
1286 { .name = "utimes", .errmsg = true,
1287 .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1288 { .name = "vmsplice", .errmsg = true,
1289 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1290 { .name = "write", .errmsg = true,
1291 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1292 { .name = "writev", .errmsg = true,
1293 .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1294};
1295
1296static int syscall_fmt__cmp(const void *name, const void *fmtp)
1297{
1298 const struct syscall_fmt *fmt = fmtp;
1299 return strcmp(name, fmt->name);
1300}
1301
1302static struct syscall_fmt *syscall_fmt__find(const char *name)
1303{
1304 const int nmemb = ARRAY_SIZE(syscall_fmts);
1305 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1306}
1307
/*
 * Per-syscall state kept in the trace's syscall table.
 *
 * @name is the syscall name and @fmt the optional syscall_fmts entry found
 * for it; @arg_scnprintf / @arg_parm hold the per-argument formatters and
 * their private data, and @args is walked as a linked list of fields when
 * those formatters are chosen.
 */
struct syscall {
	struct event_format  *tp_format;
	int		     nr_args;
	struct format_field  *args;
	const char	     *name;
	bool		     is_exit;
	struct syscall_fmt   *fmt;
	size_t		     (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		     **arg_parm;
};
1318
1319static size_t fprintf_duration(unsigned long t, FILE *fp)
1320{
1321 double duration = (double)t / NSEC_PER_MSEC;
1322 size_t printed = fprintf(fp, "(");
1323
1324 if (duration >= 1.0)
1325 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1326 else if (duration >= 0.01)
1327 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1328 else
1329 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1330 return printed + fprintf(fp, "): ");
1331}
1332
1333/**
1334 * filename.ptr: The filename char pointer that will be vfs_getname'd
1335 * filename.entry_str_pos: Where to insert the string translated from
1336 * filename.ptr by the vfs_getname tracepoint/kprobe.
1337 */
1338struct thread_trace {
1339 u64 entry_time;
1340 u64 exit_time;
1341 bool entry_pending;
1342 unsigned long nr_events;
1343 unsigned long pfmaj, pfmin;
1344 char *entry_str;
1345 double runtime_ms;
1346 struct {
1347 unsigned long ptr;
1348 short int entry_str_pos;
1349 bool pending_open;
1350 unsigned int namelen;
1351 char *name;
1352 } filename;
1353 struct {
1354 int max;
1355 char **table;
1356 } paths;
1357
1358 struct intlist *syscall_stats;
1359};
1360
1361static struct thread_trace *thread_trace__new(void)
1362{
1363 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1364
1365 if (ttrace)
1366 ttrace->paths.max = -1;
1367
1368 ttrace->syscall_stats = intlist__new(NULL);
1369
1370 return ttrace;
1371}
1372
1373static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1374{
1375 struct thread_trace *ttrace;
1376
1377 if (thread == NULL)
1378 goto fail;
1379
1380 if (thread__priv(thread) == NULL)
1381 thread__set_priv(thread, thread_trace__new());
1382
1383 if (thread__priv(thread) == NULL)
1384 goto fail;
1385
1386 ttrace = thread__priv(thread);
1387 ++ttrace->nr_events;
1388
1389 return ttrace;
1390fail:
1391 color_fprintf(fp, PERF_COLOR_RED,
1392 "WARNING: not enough memory, dropping samples!\n");
1393 return NULL;
1394}
1395
/* Bit flags selecting major / minor page faults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size, in bytes, used for the per-thread entry string buffer. */
static const size_t trace__entry_str_size = 2048;
1400
1401struct trace {
1402 struct perf_tool tool;
1403 struct {
1404 int machine;
1405 int open_id;
1406 } audit;
1407 struct {
1408 int max;
1409 struct syscall *table;
1410 struct {
1411 struct perf_evsel *sys_enter,
1412 *sys_exit;
1413 } events;
1414 } syscalls;
1415 struct record_opts opts;
1416 struct perf_evlist *evlist;
1417 struct machine *host;
1418 struct thread *current;
1419 u64 base_time;
1420 FILE *output;
1421 unsigned long nr_events;
1422 struct strlist *ev_qualifier;
1423 struct {
1424 size_t nr;
1425 int *entries;
1426 } ev_qualifier_ids;
1427 struct intlist *tid_list;
1428 struct intlist *pid_list;
1429 struct {
1430 size_t nr;
1431 pid_t *entries;
1432 } filter_pids;
1433 double duration_filter;
1434 double runtime_ms;
1435 struct {
1436 u64 vfs_getname,
1437 proc_getname;
1438 } stats;
1439 bool not_ev_qualifier;
1440 bool live;
1441 bool full_time;
1442 bool sched;
1443 bool multiple_threads;
1444 bool summary;
1445 bool summary_only;
1446 bool show_comm;
1447 bool show_tool_stats;
1448 bool trace_syscalls;
1449 bool force;
1450 bool vfs_getname;
1451 int trace_pgfaults;
1452};
1453
1454static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1455{
1456 struct thread_trace *ttrace = thread__priv(thread);
1457
1458 if (fd > ttrace->paths.max) {
1459 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1460
1461 if (npath == NULL)
1462 return -1;
1463
1464 if (ttrace->paths.max != -1) {
1465 memset(npath + ttrace->paths.max + 1, 0,
1466 (fd - ttrace->paths.max) * sizeof(char *));
1467 } else {
1468 memset(npath, 0, (fd + 1) * sizeof(char *));
1469 }
1470
1471 ttrace->paths.table = npath;
1472 ttrace->paths.max = fd;
1473 }
1474
1475 ttrace->paths.table[fd] = strdup(pathname);
1476
1477 return ttrace->paths.table[fd] != NULL ? 0 : -1;
1478}
1479
1480static int thread__read_fd_path(struct thread *thread, int fd)
1481{
1482 char linkname[PATH_MAX], pathname[PATH_MAX];
1483 struct stat st;
1484 int ret;
1485
1486 if (thread->pid_ == thread->tid) {
1487 scnprintf(linkname, sizeof(linkname),
1488 "/proc/%d/fd/%d", thread->pid_, fd);
1489 } else {
1490 scnprintf(linkname, sizeof(linkname),
1491 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1492 }
1493
1494 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1495 return -1;
1496
1497 ret = readlink(linkname, pathname, sizeof(pathname));
1498
1499 if (ret < 0 || ret > st.st_size)
1500 return -1;
1501
1502 pathname[ret] = '\0';
1503 return trace__set_fd_pathname(thread, fd, pathname);
1504}
1505
1506static const char *thread__fd_path(struct thread *thread, int fd,
1507 struct trace *trace)
1508{
1509 struct thread_trace *ttrace = thread__priv(thread);
1510
1511 if (ttrace == NULL)
1512 return NULL;
1513
1514 if (fd < 0)
1515 return NULL;
1516
1517 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1518 if (!trace->live)
1519 return NULL;
1520 ++trace->stats.proc_getname;
1521 if (thread__read_fd_path(thread, fd))
1522 return NULL;
1523 }
1524
1525 return ttrace->paths.table[fd];
1526}
1527
1528static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1529 struct syscall_arg *arg)
1530{
1531 int fd = arg->val;
1532 size_t printed = scnprintf(bf, size, "%d", fd);
1533 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1534
1535 if (path)
1536 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1537
1538 return printed;
1539}
1540
1541static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1542 struct syscall_arg *arg)
1543{
1544 int fd = arg->val;
1545 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1546 struct thread_trace *ttrace = thread__priv(arg->thread);
1547
1548 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1549 zfree(&ttrace->paths.table[fd]);
1550
1551 return printed;
1552}
1553
1554static void thread__set_filename_pos(struct thread *thread, const char *bf,
1555 unsigned long ptr)
1556{
1557 struct thread_trace *ttrace = thread__priv(thread);
1558
1559 ttrace->filename.ptr = ptr;
1560 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1561}
1562
1563static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1564 struct syscall_arg *arg)
1565{
1566 unsigned long ptr = arg->val;
1567
1568 if (!arg->trace->vfs_getname)
1569 return scnprintf(bf, size, "%#x", ptr);
1570
1571 thread__set_filename_pos(arg->thread, bf, ptr);
1572 return 0;
1573}
1574
1575static bool trace__filter_duration(struct trace *trace, double t)
1576{
1577 return t < (trace->duration_filter * NSEC_PER_MSEC);
1578}
1579
1580static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1581{
1582 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1583
1584 return fprintf(fp, "%10.3f ", ts);
1585}
1586
/*
 * Flags set from the signal handler.  They are volatile sig_atomic_t because
 * that is the only object type a handler may portably write to, and so the
 * compiler does not cache them in code polling for termination.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the signal was
 * SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1595
1596static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1597 u64 duration, u64 tstamp, FILE *fp)
1598{
1599 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1600 printed += fprintf_duration(duration, fp);
1601
1602 if (trace->multiple_threads) {
1603 if (trace->show_comm)
1604 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1605 printed += fprintf(fp, "%d ", thread->tid);
1606 }
1607
1608 return printed;
1609}
1610
1611static int trace__process_event(struct trace *trace, struct machine *machine,
1612 union perf_event *event, struct perf_sample *sample)
1613{
1614 int ret = 0;
1615
1616 switch (event->header.type) {
1617 case PERF_RECORD_LOST:
1618 color_fprintf(trace->output, PERF_COLOR_RED,
1619 "LOST %" PRIu64 " events!\n", event->lost.lost);
1620 ret = machine__process_lost_event(machine, event, sample);
1621 default:
1622 ret = machine__process_event(machine, event, sample);
1623 break;
1624 }
1625
1626 return ret;
1627}
1628
1629static int trace__tool_process(struct perf_tool *tool,
1630 union perf_event *event,
1631 struct perf_sample *sample,
1632 struct machine *machine)
1633{
1634 struct trace *trace = container_of(tool, struct trace, tool);
1635 return trace__process_event(trace, machine, event, sample);
1636}
1637
1638static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1639{
1640 int err = symbol__init(NULL);
1641
1642 if (err)
1643 return err;
1644
1645 trace->host = machine__new_host();
1646 if (trace->host == NULL)
1647 return -ENOMEM;
1648
1649 if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1650 return -errno;
1651
1652 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1653 evlist->threads, trace__tool_process, false,
1654 trace->opts.proc_map_timeout);
1655 if (err)
1656 symbol__exit();
1657
1658 return err;
1659}
1660
1661static int syscall__set_arg_fmts(struct syscall *sc)
1662{
1663 struct format_field *field;
1664 int idx = 0;
1665
1666 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1667 if (sc->arg_scnprintf == NULL)
1668 return -1;
1669
1670 if (sc->fmt)
1671 sc->arg_parm = sc->fmt->arg_parm;
1672
1673 for (field = sc->args; field; field = field->next) {
1674 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1675 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1676 else if (field->flags & FIELD_IS_POINTER)
1677 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1678 ++idx;
1679 }
1680
1681 return 0;
1682}
1683
1684static int trace__read_syscall_info(struct trace *trace, int id)
1685{
1686 char tp_name[128];
1687 struct syscall *sc;
1688 const char *name = audit_syscall_to_name(id, trace->audit.machine);
1689
1690 if (name == NULL)
1691 return -1;
1692
1693 if (id > trace->syscalls.max) {
1694 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1695
1696 if (nsyscalls == NULL)
1697 return -1;
1698
1699 if (trace->syscalls.max != -1) {
1700 memset(nsyscalls + trace->syscalls.max + 1, 0,
1701 (id - trace->syscalls.max) * sizeof(*sc));
1702 } else {
1703 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1704 }
1705
1706 trace->syscalls.table = nsyscalls;
1707 trace->syscalls.max = id;
1708 }
1709
1710 sc = trace->syscalls.table + id;
1711 sc->name = name;
1712
1713 sc->fmt = syscall_fmt__find(sc->name);
1714
1715 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1716 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1717
1718 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1719 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1720 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1721 }
1722
1723 if (IS_ERR(sc->tp_format))
1724 return -1;
1725
1726 sc->args = sc->tp_format->format.fields;
1727 sc->nr_args = sc->tp_format->format.nr_fields;
1728 /*
1729 * We need to check and discard the first variable '__syscall_nr'
1730 * or 'nr' that mean the syscall number. It is needless here.
1731 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1732 */
1733 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1734 sc->args = sc->args->next;
1735 --sc->nr_args;
1736 }
1737
1738 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1739
1740 return syscall__set_arg_fmts(sc);
1741}
1742
1743static int trace__validate_ev_qualifier(struct trace *trace)
1744{
1745 int err = 0, i;
1746 struct str_node *pos;
1747
1748 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1749 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1750 sizeof(trace->ev_qualifier_ids.entries[0]));
1751
1752 if (trace->ev_qualifier_ids.entries == NULL) {
1753 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1754 trace->output);
1755 err = -EINVAL;
1756 goto out;
1757 }
1758
1759 i = 0;
1760
1761 strlist__for_each(pos, trace->ev_qualifier) {
1762 const char *sc = pos->s;
1763 int id = audit_name_to_syscall(sc, trace->audit.machine);
1764
1765 if (id < 0) {
1766 if (err == 0) {
1767 fputs("Error:\tInvalid syscall ", trace->output);
1768 err = -EINVAL;
1769 } else {
1770 fputs(", ", trace->output);
1771 }
1772
1773 fputs(sc, trace->output);
1774 }
1775
1776 trace->ev_qualifier_ids.entries[i++] = id;
1777 }
1778
1779 if (err < 0) {
1780 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1781 "\nHint:\tand: 'man syscalls'\n", trace->output);
1782 zfree(&trace->ev_qualifier_ids.entries);
1783 trace->ev_qualifier_ids.nr = 0;
1784 }
1785out:
1786 return err;
1787}
1788
1789/*
1790 * args is to be interpreted as a series of longs but we need to handle
1791 * 8-byte unaligned accesses. args points to raw_data within the event
1792 * and raw_data is guaranteed to be 8-byte unaligned because it is
1793 * preceded by raw_size which is a u32. So we need to copy args to a temp
1794 * variable to read it. Most notably this avoids extended load instructions
1795 * on unaligned addresses
1796 */
1797
1798static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1799 unsigned char *args, struct trace *trace,
1800 struct thread *thread)
1801{
1802 size_t printed = 0;
1803 unsigned char *p;
1804 unsigned long val;
1805
1806 if (sc->args != NULL) {
1807 struct format_field *field;
1808 u8 bit = 1;
1809 struct syscall_arg arg = {
1810 .idx = 0,
1811 .mask = 0,
1812 .trace = trace,
1813 .thread = thread,
1814 };
1815
1816 for (field = sc->args; field;
1817 field = field->next, ++arg.idx, bit <<= 1) {
1818 if (arg.mask & bit)
1819 continue;
1820
1821 /* special care for unaligned accesses */
1822 p = args + sizeof(unsigned long) * arg.idx;
1823 memcpy(&val, p, sizeof(val));
1824
1825 /*
1826 * Suppress this argument if its value is zero and
1827 * and we don't have a string associated in an
1828 * strarray for it.
1829 */
1830 if (val == 0 &&
1831 !(sc->arg_scnprintf &&
1832 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1833 sc->arg_parm[arg.idx]))
1834 continue;
1835
1836 printed += scnprintf(bf + printed, size - printed,
1837 "%s%s: ", printed ? ", " : "", field->name);
1838 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1839 arg.val = val;
1840 if (sc->arg_parm)
1841 arg.parm = sc->arg_parm[arg.idx];
1842 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1843 size - printed, &arg);
1844 } else {
1845 printed += scnprintf(bf + printed, size - printed,
1846 "%ld", val);
1847 }
1848 }
1849 } else {
1850 int i = 0;
1851
1852 while (i < 6) {
1853 /* special care for unaligned accesses */
1854 p = args + sizeof(unsigned long) * i;
1855 memcpy(&val, p, sizeof(val));
1856 printed += scnprintf(bf + printed, size - printed,
1857 "%sarg%d: %ld",
1858 printed ? ", " : "", i, val);
1859 ++i;
1860 }
1861 }
1862
1863 return printed;
1864}
1865
/*
 * Signature of the per event callbacks stored in evsel->handler and
 * invoked for each sample of that event.
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1869
1870static struct syscall *trace__syscall_info(struct trace *trace,
1871 struct perf_evsel *evsel, int id)
1872{
1873
1874 if (id < 0) {
1875
1876 /*
1877 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1878 * before that, leaving at a higher verbosity level till that is
1879 * explained. Reproduced with plain ftrace with:
1880 *
1881 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1882 * grep "NR -1 " /t/trace_pipe
1883 *
1884 * After generating some load on the machine.
1885 */
1886 if (verbose > 1) {
1887 static u64 n;
1888 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1889 id, perf_evsel__name(evsel), ++n);
1890 }
1891 return NULL;
1892 }
1893
1894 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1895 trace__read_syscall_info(trace, id))
1896 goto out_cant_read;
1897
1898 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1899 goto out_cant_read;
1900
1901 return &trace->syscalls.table[id];
1902
1903out_cant_read:
1904 if (verbose) {
1905 fprintf(trace->output, "Problems reading syscall %d", id);
1906 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1907 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1908 fputs(" information\n", trace->output);
1909 }
1910 return NULL;
1911}
1912
1913static void thread__update_stats(struct thread_trace *ttrace,
1914 int id, struct perf_sample *sample)
1915{
1916 struct int_node *inode;
1917 struct stats *stats;
1918 u64 duration = 0;
1919
1920 inode = intlist__findnew(ttrace->syscall_stats, id);
1921 if (inode == NULL)
1922 return;
1923
1924 stats = inode->priv;
1925 if (stats == NULL) {
1926 stats = malloc(sizeof(struct stats));
1927 if (stats == NULL)
1928 return;
1929 init_stats(stats);
1930 inode->priv = stats;
1931 }
1932
1933 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1934 duration = sample->time - ttrace->entry_time;
1935
1936 update_stats(stats, duration);
1937}
1938
1939static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1940{
1941 struct thread_trace *ttrace;
1942 u64 duration;
1943 size_t printed;
1944
1945 if (trace->current == NULL)
1946 return 0;
1947
1948 ttrace = thread__priv(trace->current);
1949
1950 if (!ttrace->entry_pending)
1951 return 0;
1952
1953 duration = sample->time - ttrace->entry_time;
1954
1955 printed = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1956 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1957 ttrace->entry_pending = false;
1958
1959 return printed;
1960}
1961
1962static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1963 union perf_event *event __maybe_unused,
1964 struct perf_sample *sample)
1965{
1966 char *msg;
1967 void *args;
1968 size_t printed = 0;
1969 struct thread *thread;
1970 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1971 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1972 struct thread_trace *ttrace;
1973
1974 if (sc == NULL)
1975 return -1;
1976
1977 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978 ttrace = thread__trace(thread, trace->output);
1979 if (ttrace == NULL)
1980 goto out_put;
1981
1982 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1983
1984 if (ttrace->entry_str == NULL) {
1985 ttrace->entry_str = malloc(trace__entry_str_size);
1986 if (!ttrace->entry_str)
1987 goto out_put;
1988 }
1989
1990 if (!trace->summary_only)
1991 trace__printf_interrupted_entry(trace, sample);
1992
1993 ttrace->entry_time = sample->time;
1994 msg = ttrace->entry_str;
1995 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1996
1997 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1998 args, trace, thread);
1999
2000 if (sc->is_exit) {
2001 if (!trace->duration_filter && !trace->summary_only) {
2002 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
2003 fprintf(trace->output, "%-70s\n", ttrace->entry_str);
2004 }
2005 } else {
2006 ttrace->entry_pending = true;
2007 /* See trace__vfs_getname & trace__sys_exit */
2008 ttrace->filename.pending_open = false;
2009 }
2010
2011 if (trace->current != thread) {
2012 thread__put(trace->current);
2013 trace->current = thread__get(thread);
2014 }
2015 err = 0;
2016out_put:
2017 thread__put(thread);
2018 return err;
2019}
2020
2021static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2022 union perf_event *event __maybe_unused,
2023 struct perf_sample *sample)
2024{
2025 long ret;
2026 u64 duration = 0;
2027 struct thread *thread;
2028 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2029 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2030 struct thread_trace *ttrace;
2031
2032 if (sc == NULL)
2033 return -1;
2034
2035 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2036 ttrace = thread__trace(thread, trace->output);
2037 if (ttrace == NULL)
2038 goto out_put;
2039
2040 if (trace->summary)
2041 thread__update_stats(ttrace, id, sample);
2042
2043 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2044
2045 if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
2046 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2047 ttrace->filename.pending_open = false;
2048 ++trace->stats.vfs_getname;
2049 }
2050
2051 ttrace->exit_time = sample->time;
2052
2053 if (ttrace->entry_time) {
2054 duration = sample->time - ttrace->entry_time;
2055 if (trace__filter_duration(trace, duration))
2056 goto out;
2057 } else if (trace->duration_filter)
2058 goto out;
2059
2060 if (trace->summary_only)
2061 goto out;
2062
2063 trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2064
2065 if (ttrace->entry_pending) {
2066 fprintf(trace->output, "%-70s", ttrace->entry_str);
2067 } else {
2068 fprintf(trace->output, " ... [");
2069 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2070 fprintf(trace->output, "]: %s()", sc->name);
2071 }
2072
2073 if (sc->fmt == NULL) {
2074signed_print:
2075 fprintf(trace->output, ") = %ld", ret);
2076 } else if (ret < 0 && sc->fmt->errmsg) {
2077 char bf[STRERR_BUFSIZE];
2078 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2079 *e = audit_errno_to_name(-ret);
2080
2081 fprintf(trace->output, ") = -1 %s %s", e, emsg);
2082 } else if (ret == 0 && sc->fmt->timeout)
2083 fprintf(trace->output, ") = 0 Timeout");
2084 else if (sc->fmt->hexret)
2085 fprintf(trace->output, ") = %#lx", ret);
2086 else
2087 goto signed_print;
2088
2089 fputc('\n', trace->output);
2090out:
2091 ttrace->entry_pending = false;
2092 err = 0;
2093out_put:
2094 thread__put(thread);
2095 return err;
2096}
2097
2098static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2099 union perf_event *event __maybe_unused,
2100 struct perf_sample *sample)
2101{
2102 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2103 struct thread_trace *ttrace;
2104 size_t filename_len, entry_str_len, to_move;
2105 ssize_t remaining_space;
2106 char *pos;
2107 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2108
2109 if (!thread)
2110 goto out;
2111
2112 ttrace = thread__priv(thread);
2113 if (!ttrace)
2114 goto out;
2115
2116 filename_len = strlen(filename);
2117
2118 if (ttrace->filename.namelen < filename_len) {
2119 char *f = realloc(ttrace->filename.name, filename_len + 1);
2120
2121 if (f == NULL)
2122 goto out;
2123
2124 ttrace->filename.namelen = filename_len;
2125 ttrace->filename.name = f;
2126 }
2127
2128 strcpy(ttrace->filename.name, filename);
2129 ttrace->filename.pending_open = true;
2130
2131 if (!ttrace->filename.ptr)
2132 goto out;
2133
2134 entry_str_len = strlen(ttrace->entry_str);
2135 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2136 if (remaining_space <= 0)
2137 goto out;
2138
2139 if (filename_len > (size_t)remaining_space) {
2140 filename += filename_len - remaining_space;
2141 filename_len = remaining_space;
2142 }
2143
2144 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2145 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2146 memmove(pos + filename_len, pos, to_move);
2147 memcpy(pos, filename, filename_len);
2148
2149 ttrace->filename.ptr = 0;
2150 ttrace->filename.entry_str_pos = 0;
2151out:
2152 return 0;
2153}
2154
2155static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2156 union perf_event *event __maybe_unused,
2157 struct perf_sample *sample)
2158{
2159 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2160 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2161 struct thread *thread = machine__findnew_thread(trace->host,
2162 sample->pid,
2163 sample->tid);
2164 struct thread_trace *ttrace = thread__trace(thread, trace->output);
2165
2166 if (ttrace == NULL)
2167 goto out_dump;
2168
2169 ttrace->runtime_ms += runtime_ms;
2170 trace->runtime_ms += runtime_ms;
2171 thread__put(thread);
2172 return 0;
2173
2174out_dump:
2175 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2176 evsel->name,
2177 perf_evsel__strval(evsel, sample, "comm"),
2178 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2179 runtime,
2180 perf_evsel__intval(evsel, sample, "vruntime"));
2181 thread__put(thread);
2182 return 0;
2183}
2184
2185static void bpf_output__printer(enum binary_printer_ops op,
2186 unsigned int val, void *extra)
2187{
2188 FILE *output = extra;
2189 unsigned char ch = (unsigned char)val;
2190
2191 switch (op) {
2192 case BINARY_PRINT_CHAR_DATA:
2193 fprintf(output, "%c", isprint(ch) ? ch : '.');
2194 break;
2195 case BINARY_PRINT_DATA_BEGIN:
2196 case BINARY_PRINT_LINE_BEGIN:
2197 case BINARY_PRINT_ADDR:
2198 case BINARY_PRINT_NUM_DATA:
2199 case BINARY_PRINT_NUM_PAD:
2200 case BINARY_PRINT_SEP:
2201 case BINARY_PRINT_CHAR_PAD:
2202 case BINARY_PRINT_LINE_END:
2203 case BINARY_PRINT_DATA_END:
2204 default:
2205 break;
2206 }
2207}
2208
2209static void bpf_output__fprintf(struct trace *trace,
2210 struct perf_sample *sample)
2211{
2212 print_binary(sample->raw_data, sample->raw_size, 8,
2213 bpf_output__printer, trace->output);
2214}
2215
2216static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2217 union perf_event *event __maybe_unused,
2218 struct perf_sample *sample)
2219{
2220 trace__printf_interrupted_entry(trace, sample);
2221 trace__fprintf_tstamp(trace, sample->time, trace->output);
2222
2223 if (trace->trace_syscalls)
2224 fprintf(trace->output, "( ): ");
2225
2226 fprintf(trace->output, "%s:", evsel->name);
2227
2228 if (perf_evsel__is_bpf_output(evsel)) {
2229 bpf_output__fprintf(trace, sample);
2230 } else if (evsel->tp_format) {
2231 event_format__fprintf(evsel->tp_format, sample->cpu,
2232 sample->raw_data, sample->raw_size,
2233 trace->output);
2234 }
2235
2236 fprintf(trace->output, ")\n");
2237 return 0;
2238}
2239
2240static void print_location(FILE *f, struct perf_sample *sample,
2241 struct addr_location *al,
2242 bool print_dso, bool print_sym)
2243{
2244
2245 if ((verbose || print_dso) && al->map)
2246 fprintf(f, "%s@", al->map->dso->long_name);
2247
2248 if ((verbose || print_sym) && al->sym)
2249 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2250 al->addr - al->sym->start);
2251 else if (al->map)
2252 fprintf(f, "0x%" PRIx64, al->addr);
2253 else
2254 fprintf(f, "0x%" PRIx64, sample->addr);
2255}
2256
2257static int trace__pgfault(struct trace *trace,
2258 struct perf_evsel *evsel,
2259 union perf_event *event __maybe_unused,
2260 struct perf_sample *sample)
2261{
2262 struct thread *thread;
2263 struct addr_location al;
2264 char map_type = 'd';
2265 struct thread_trace *ttrace;
2266 int err = -1;
2267
2268 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2269 ttrace = thread__trace(thread, trace->output);
2270 if (ttrace == NULL)
2271 goto out_put;
2272
2273 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2274 ttrace->pfmaj++;
2275 else
2276 ttrace->pfmin++;
2277
2278 if (trace->summary_only)
2279 goto out;
2280
2281 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2282 sample->ip, &al);
2283
2284 trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2285
2286 fprintf(trace->output, "%sfault [",
2287 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2288 "maj" : "min");
2289
2290 print_location(trace->output, sample, &al, false, true);
2291
2292 fprintf(trace->output, "] => ");
2293
2294 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2295 sample->addr, &al);
2296
2297 if (!al.map) {
2298 thread__find_addr_location(thread, sample->cpumode,
2299 MAP__FUNCTION, sample->addr, &al);
2300
2301 if (al.map)
2302 map_type = 'x';
2303 else
2304 map_type = '?';
2305 }
2306
2307 print_location(trace->output, sample, &al, true, false);
2308
2309 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2310out:
2311 err = 0;
2312out_put:
2313 thread__put(thread);
2314 return err;
2315}
2316
2317static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2318{
2319 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2320 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2321 return false;
2322
2323 if (trace->pid_list || trace->tid_list)
2324 return true;
2325
2326 return false;
2327}
2328
2329static int trace__process_sample(struct perf_tool *tool,
2330 union perf_event *event,
2331 struct perf_sample *sample,
2332 struct perf_evsel *evsel,
2333 struct machine *machine __maybe_unused)
2334{
2335 struct trace *trace = container_of(tool, struct trace, tool);
2336 int err = 0;
2337
2338 tracepoint_handler handler = evsel->handler;
2339
2340 if (skip_sample(trace, sample))
2341 return 0;
2342
2343 if (!trace->full_time && trace->base_time == 0)
2344 trace->base_time = sample->time;
2345
2346 if (handler) {
2347 ++trace->nr_events;
2348 handler(trace, evsel, event, sample);
2349 }
2350
2351 return err;
2352}
2353
2354static int parse_target_str(struct trace *trace)
2355{
2356 if (trace->opts.target.pid) {
2357 trace->pid_list = intlist__new(trace->opts.target.pid);
2358 if (trace->pid_list == NULL) {
2359 pr_err("Error parsing process id string\n");
2360 return -EINVAL;
2361 }
2362 }
2363
2364 if (trace->opts.target.tid) {
2365 trace->tid_list = intlist__new(trace->opts.target.tid);
2366 if (trace->tid_list == NULL) {
2367 pr_err("Error parsing thread id string\n");
2368 return -EINVAL;
2369 }
2370 }
2371
2372 return 0;
2373}
2374
2375static int trace__record(struct trace *trace, int argc, const char **argv)
2376{
2377 unsigned int rec_argc, i, j;
2378 const char **rec_argv;
2379 const char * const record_args[] = {
2380 "record",
2381 "-R",
2382 "-m", "1024",
2383 "-c", "1",
2384 };
2385
2386 const char * const sc_args[] = { "-e", };
2387 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2388 const char * const majpf_args[] = { "-e", "major-faults" };
2389 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2390 const char * const minpf_args[] = { "-e", "minor-faults" };
2391 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2392
2393 /* +1 is for the event string below */
2394 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2395 majpf_args_nr + minpf_args_nr + argc;
2396 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2397
2398 if (rec_argv == NULL)
2399 return -ENOMEM;
2400
2401 j = 0;
2402 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2403 rec_argv[j++] = record_args[i];
2404
2405 if (trace->trace_syscalls) {
2406 for (i = 0; i < sc_args_nr; i++)
2407 rec_argv[j++] = sc_args[i];
2408
2409 /* event string may be different for older kernels - e.g., RHEL6 */
2410 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2411 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2412 else if (is_valid_tracepoint("syscalls:sys_enter"))
2413 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2414 else {
2415 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2416 return -1;
2417 }
2418 }
2419
2420 if (trace->trace_pgfaults & TRACE_PFMAJ)
2421 for (i = 0; i < majpf_args_nr; i++)
2422 rec_argv[j++] = majpf_args[i];
2423
2424 if (trace->trace_pgfaults & TRACE_PFMIN)
2425 for (i = 0; i < minpf_args_nr; i++)
2426 rec_argv[j++] = minpf_args[i];
2427
2428 for (i = 0; i < (unsigned int)argc; i++)
2429 rec_argv[j++] = argv[i];
2430
2431 return cmd_record(j, rec_argv, NULL);
2432}
2433
2434static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2435
2436static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2437{
2438 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2439
2440 if (IS_ERR(evsel))
2441 return false;
2442
2443 if (perf_evsel__field(evsel, "pathname") == NULL) {
2444 perf_evsel__delete(evsel);
2445 return false;
2446 }
2447
2448 evsel->handler = trace__vfs_getname;
2449 perf_evlist__add(evlist, evsel);
2450 return true;
2451}
2452
2453static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2454 u64 config)
2455{
2456 struct perf_evsel *evsel;
2457 struct perf_event_attr attr = {
2458 .type = PERF_TYPE_SOFTWARE,
2459 .mmap_data = 1,
2460 };
2461
2462 attr.config = config;
2463 attr.sample_period = 1;
2464
2465 event_attr_init(&attr);
2466
2467 evsel = perf_evsel__new(&attr);
2468 if (!evsel)
2469 return -ENOMEM;
2470
2471 evsel->handler = trace__pgfault;
2472 perf_evlist__add(evlist, evsel);
2473
2474 return 0;
2475}
2476
2477static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2478{
2479 const u32 type = event->header.type;
2480 struct perf_evsel *evsel;
2481
2482 if (!trace->full_time && trace->base_time == 0)
2483 trace->base_time = sample->time;
2484
2485 if (type != PERF_RECORD_SAMPLE) {
2486 trace__process_event(trace, trace->host, event, sample);
2487 return;
2488 }
2489
2490 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2491 if (evsel == NULL) {
2492 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2493 return;
2494 }
2495
2496 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2497 sample->raw_data == NULL) {
2498 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2499 perf_evsel__name(evsel), sample->tid,
2500 sample->cpu, sample->raw_size);
2501 } else {
2502 tracepoint_handler handler = evsel->handler;
2503 handler(trace, evsel, event, sample);
2504 }
2505}
2506
2507static int trace__add_syscall_newtp(struct trace *trace)
2508{
2509 int ret = -1;
2510 struct perf_evlist *evlist = trace->evlist;
2511 struct perf_evsel *sys_enter, *sys_exit;
2512
2513 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2514 if (sys_enter == NULL)
2515 goto out;
2516
2517 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2518 goto out_delete_sys_enter;
2519
2520 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2521 if (sys_exit == NULL)
2522 goto out_delete_sys_enter;
2523
2524 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2525 goto out_delete_sys_exit;
2526
2527 perf_evlist__add(evlist, sys_enter);
2528 perf_evlist__add(evlist, sys_exit);
2529
2530 trace->syscalls.events.sys_enter = sys_enter;
2531 trace->syscalls.events.sys_exit = sys_exit;
2532
2533 ret = 0;
2534out:
2535 return ret;
2536
2537out_delete_sys_exit:
2538 perf_evsel__delete_priv(sys_exit);
2539out_delete_sys_enter:
2540 perf_evsel__delete_priv(sys_enter);
2541 goto out;
2542}
2543
2544static int trace__set_ev_qualifier_filter(struct trace *trace)
2545{
2546 int err = -1;
2547 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2548 trace->ev_qualifier_ids.nr,
2549 trace->ev_qualifier_ids.entries);
2550
2551 if (filter == NULL)
2552 goto out_enomem;
2553
2554 if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2555 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2556
2557 free(filter);
2558out:
2559 return err;
2560out_enomem:
2561 errno = ENOMEM;
2562 goto out;
2563}
2564
2565static int trace__run(struct trace *trace, int argc, const char **argv)
2566{
2567 struct perf_evlist *evlist = trace->evlist;
2568 struct perf_evsel *evsel;
2569 int err = -1, i;
2570 unsigned long before;
2571 const bool forks = argc > 0;
2572 bool draining = false;
2573
2574 trace->live = true;
2575
2576 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2577 goto out_error_raw_syscalls;
2578
2579 if (trace->trace_syscalls)
2580 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2581
2582 if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2583 perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2584 goto out_error_mem;
2585 }
2586
2587 if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2588 perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2589 goto out_error_mem;
2590
2591 if (trace->sched &&
2592 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2593 trace__sched_stat_runtime))
2594 goto out_error_sched_stat_runtime;
2595
2596 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2597 if (err < 0) {
2598 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2599 goto out_delete_evlist;
2600 }
2601
2602 err = trace__symbols_init(trace, evlist);
2603 if (err < 0) {
2604 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2605 goto out_delete_evlist;
2606 }
2607
2608 perf_evlist__config(evlist, &trace->opts);
2609
2610 signal(SIGCHLD, sig_handler);
2611 signal(SIGINT, sig_handler);
2612
2613 if (forks) {
2614 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2615 argv, false, NULL);
2616 if (err < 0) {
2617 fprintf(trace->output, "Couldn't run the workload!\n");
2618 goto out_delete_evlist;
2619 }
2620 }
2621
2622 err = perf_evlist__open(evlist);
2623 if (err < 0)
2624 goto out_error_open;
2625
2626 err = bpf__apply_obj_config();
2627 if (err) {
2628 char errbuf[BUFSIZ];
2629
2630 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2631 pr_err("ERROR: Apply config to BPF failed: %s\n",
2632 errbuf);
2633 goto out_error_open;
2634 }
2635
2636 /*
2637 * Better not use !target__has_task() here because we need to cover the
2638 * case where no threads were specified in the command line, but a
2639 * workload was, and in that case we will fill in the thread_map when
2640 * we fork the workload in perf_evlist__prepare_workload.
2641 */
2642 if (trace->filter_pids.nr > 0)
2643 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2644 else if (thread_map__pid(evlist->threads, 0) == -1)
2645 err = perf_evlist__set_filter_pid(evlist, getpid());
2646
2647 if (err < 0)
2648 goto out_error_mem;
2649
2650 if (trace->ev_qualifier_ids.nr > 0) {
2651 err = trace__set_ev_qualifier_filter(trace);
2652 if (err < 0)
2653 goto out_errno;
2654
2655 pr_debug("event qualifier tracepoint filter: %s\n",
2656 trace->syscalls.events.sys_exit->filter);
2657 }
2658
2659 err = perf_evlist__apply_filters(evlist, &evsel);
2660 if (err < 0)
2661 goto out_error_apply_filters;
2662
2663 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2664 if (err < 0)
2665 goto out_error_mmap;
2666
2667 if (!target__none(&trace->opts.target))
2668 perf_evlist__enable(evlist);
2669
2670 if (forks)
2671 perf_evlist__start_workload(evlist);
2672
2673 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2674 evlist->threads->nr > 1 ||
2675 perf_evlist__first(evlist)->attr.inherit;
2676again:
2677 before = trace->nr_events;
2678
2679 for (i = 0; i < evlist->nr_mmaps; i++) {
2680 union perf_event *event;
2681
2682 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2683 struct perf_sample sample;
2684
2685 ++trace->nr_events;
2686
2687 err = perf_evlist__parse_sample(evlist, event, &sample);
2688 if (err) {
2689 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2690 goto next_event;
2691 }
2692
2693 trace__handle_event(trace, event, &sample);
2694next_event:
2695 perf_evlist__mmap_consume(evlist, i);
2696
2697 if (interrupted)
2698 goto out_disable;
2699
2700 if (done && !draining) {
2701 perf_evlist__disable(evlist);
2702 draining = true;
2703 }
2704 }
2705 }
2706
2707 if (trace->nr_events == before) {
2708 int timeout = done ? 100 : -1;
2709
2710 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2711 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2712 draining = true;
2713
2714 goto again;
2715 }
2716 } else {
2717 goto again;
2718 }
2719
2720out_disable:
2721 thread__zput(trace->current);
2722
2723 perf_evlist__disable(evlist);
2724
2725 if (!err) {
2726 if (trace->summary)
2727 trace__fprintf_thread_summary(trace, trace->output);
2728
2729 if (trace->show_tool_stats) {
2730 fprintf(trace->output, "Stats:\n "
2731 " vfs_getname : %" PRIu64 "\n"
2732 " proc_getname: %" PRIu64 "\n",
2733 trace->stats.vfs_getname,
2734 trace->stats.proc_getname);
2735 }
2736 }
2737
2738out_delete_evlist:
2739 perf_evlist__delete(evlist);
2740 trace->evlist = NULL;
2741 trace->live = false;
2742 return err;
2743{
2744 char errbuf[BUFSIZ];
2745
2746out_error_sched_stat_runtime:
2747 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2748 goto out_error;
2749
2750out_error_raw_syscalls:
2751 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2752 goto out_error;
2753
2754out_error_mmap:
2755 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2756 goto out_error;
2757
2758out_error_open:
2759 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2760
2761out_error:
2762 fprintf(trace->output, "%s\n", errbuf);
2763 goto out_delete_evlist;
2764
2765out_error_apply_filters:
2766 fprintf(trace->output,
2767 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2768 evsel->filter, perf_evsel__name(evsel), errno,
2769 strerror_r(errno, errbuf, sizeof(errbuf)));
2770 goto out_delete_evlist;
2771}
2772out_error_mem:
2773 fprintf(trace->output, "Not enough memory to run!\n");
2774 goto out_delete_evlist;
2775
2776out_errno:
2777 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2778 goto out_delete_evlist;
2779}
2780
2781static int trace__replay(struct trace *trace)
2782{
2783 const struct perf_evsel_str_handler handlers[] = {
2784 { "probe:vfs_getname", trace__vfs_getname, },
2785 };
2786 struct perf_data_file file = {
2787 .path = input_name,
2788 .mode = PERF_DATA_MODE_READ,
2789 .force = trace->force,
2790 };
2791 struct perf_session *session;
2792 struct perf_evsel *evsel;
2793 int err = -1;
2794
2795 trace->tool.sample = trace__process_sample;
2796 trace->tool.mmap = perf_event__process_mmap;
2797 trace->tool.mmap2 = perf_event__process_mmap2;
2798 trace->tool.comm = perf_event__process_comm;
2799 trace->tool.exit = perf_event__process_exit;
2800 trace->tool.fork = perf_event__process_fork;
2801 trace->tool.attr = perf_event__process_attr;
2802 trace->tool.tracing_data = perf_event__process_tracing_data;
2803 trace->tool.build_id = perf_event__process_build_id;
2804
2805 trace->tool.ordered_events = true;
2806 trace->tool.ordering_requires_timestamps = true;
2807
2808 /* add tid to output */
2809 trace->multiple_threads = true;
2810
2811 session = perf_session__new(&file, false, &trace->tool);
2812 if (session == NULL)
2813 return -1;
2814
2815 if (symbol__init(&session->header.env) < 0)
2816 goto out;
2817
2818 trace->host = &session->machines.host;
2819
2820 err = perf_session__set_tracepoints_handlers(session, handlers);
2821 if (err)
2822 goto out;
2823
2824 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2825 "raw_syscalls:sys_enter");
2826 /* older kernels have syscalls tp versus raw_syscalls */
2827 if (evsel == NULL)
2828 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2829 "syscalls:sys_enter");
2830
2831 if (evsel &&
2832 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2833 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2834 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2835 goto out;
2836 }
2837
2838 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2839 "raw_syscalls:sys_exit");
2840 if (evsel == NULL)
2841 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2842 "syscalls:sys_exit");
2843 if (evsel &&
2844 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2845 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2846 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2847 goto out;
2848 }
2849
2850 evlist__for_each(session->evlist, evsel) {
2851 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2852 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2853 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2854 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2855 evsel->handler = trace__pgfault;
2856 }
2857
2858 err = parse_target_str(trace);
2859 if (err != 0)
2860 goto out;
2861
2862 setup_pager();
2863
2864 err = perf_session__process_events(session);
2865 if (err)
2866 pr_err("Failed to process events, error %d", err);
2867
2868 else if (trace->summary)
2869 trace__fprintf_thread_summary(trace, trace->output);
2870
2871out:
2872 perf_session__delete(session);
2873
2874 return err;
2875}
2876
/*
 * Write the banner that precedes the per-thread summary to @fp.
 *
 * Returns the value reported by fprintf() for the banner.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2885
2886static size_t thread__dump_stats(struct thread_trace *ttrace,
2887 struct trace *trace, FILE *fp)
2888{
2889 struct stats *stats;
2890 size_t printed = 0;
2891 struct syscall *sc;
2892 struct int_node *inode = intlist__first(ttrace->syscall_stats);
2893
2894 if (inode == NULL)
2895 return 0;
2896
2897 printed += fprintf(fp, "\n");
2898
2899 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2900 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2901 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2902
2903 /* each int_node is a syscall */
2904 while (inode) {
2905 stats = inode->priv;
2906 if (stats) {
2907 double min = (double)(stats->min) / NSEC_PER_MSEC;
2908 double max = (double)(stats->max) / NSEC_PER_MSEC;
2909 double avg = avg_stats(stats);
2910 double pct;
2911 u64 n = (u64) stats->n;
2912
2913 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2914 avg /= NSEC_PER_MSEC;
2915
2916 sc = &trace->syscalls.table[inode->i];
2917 printed += fprintf(fp, " %-15s", sc->name);
2918 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2919 n, avg * n, min, avg);
2920 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2921 }
2922
2923 inode = intlist__next(inode);
2924 }
2925
2926 printed += fprintf(fp, "\n\n");
2927
2928 return printed;
2929}
2930
/*
 * Context handed to the per-thread summary callback through its
 * void *priv argument.
 */
struct summary_data {
	FILE		*fp;		/* stream the summary is written to */
	struct trace	*trace;		/* trace instance being summarized */
	size_t		printed;	/* running total of fprintf() return values */
};
2937
2938static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2939{
2940 struct summary_data *data = priv;
2941 FILE *fp = data->fp;
2942 size_t printed = data->printed;
2943 struct trace *trace = data->trace;
2944 struct thread_trace *ttrace = thread__priv(thread);
2945 double ratio;
2946
2947 if (ttrace == NULL)
2948 return 0;
2949
2950 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2951
2952 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2953 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2954 printed += fprintf(fp, "%.1f%%", ratio);
2955 if (ttrace->pfmaj)
2956 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2957 if (ttrace->pfmin)
2958 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2959 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2960 printed += thread__dump_stats(ttrace, trace, fp);
2961
2962 data->printed += printed;
2963
2964 return 0;
2965}
2966
2967static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2968{
2969 struct summary_data data = {
2970 .fp = fp,
2971 .trace = trace
2972 };
2973 data.printed = trace__fprintf_threads_header(fp);
2974
2975 machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2976
2977 return data.printed;
2978}
2979
2980static int trace__set_duration(const struct option *opt, const char *str,
2981 int unset __maybe_unused)
2982{
2983 struct trace *trace = opt->value;
2984
2985 trace->duration_filter = atof(str);
2986 return 0;
2987}
2988
2989static int trace__set_filter_pids(const struct option *opt, const char *str,
2990 int unset __maybe_unused)
2991{
2992 int ret = -1;
2993 size_t i;
2994 struct trace *trace = opt->value;
2995 /*
2996 * FIXME: introduce a intarray class, plain parse csv and create a
2997 * { int nr, int entries[] } struct...
2998 */
2999 struct intlist *list = intlist__new(str);
3000
3001 if (list == NULL)
3002 return -1;
3003
3004 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3005 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3006
3007 if (trace->filter_pids.entries == NULL)
3008 goto out;
3009
3010 trace->filter_pids.entries[0] = getpid();
3011
3012 for (i = 1; i < trace->filter_pids.nr; ++i)
3013 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3014
3015 intlist__delete(list);
3016 ret = 0;
3017out:
3018 return ret;
3019}
3020
3021static int trace__open_output(struct trace *trace, const char *filename)
3022{
3023 struct stat st;
3024
3025 if (!stat(filename, &st) && st.st_size) {
3026 char oldname[PATH_MAX];
3027
3028 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3029 unlink(oldname);
3030 rename(filename, oldname);
3031 }
3032
3033 trace->output = fopen(filename, "w");
3034
3035 return trace->output == NULL ? -errno : 0;
3036}
3037
3038static int parse_pagefaults(const struct option *opt, const char *str,
3039 int unset __maybe_unused)
3040{
3041 int *trace_pgfaults = opt->value;
3042
3043 if (strcmp(str, "all") == 0)
3044 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3045 else if (strcmp(str, "maj") == 0)
3046 *trace_pgfaults |= TRACE_PFMAJ;
3047 else if (strcmp(str, "min") == 0)
3048 *trace_pgfaults |= TRACE_PFMIN;
3049 else
3050 return -1;
3051
3052 return 0;
3053}
3054
3055static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3056{
3057 struct perf_evsel *evsel;
3058
3059 evlist__for_each(evlist, evsel)
3060 evsel->handler = handler;
3061}
3062
3063int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3064{
3065 const char *trace_usage[] = {
3066 "perf trace [<options>] [<command>]",
3067 "perf trace [<options>] -- <command> [<options>]",
3068 "perf trace record [<options>] [<command>]",
3069 "perf trace record [<options>] -- <command> [<options>]",
3070 NULL
3071 };
3072 struct trace trace = {
3073 .audit = {
3074 .machine = audit_detect_machine(),
3075 .open_id = audit_name_to_syscall("open", trace.audit.machine),
3076 },
3077 .syscalls = {
3078 . max = -1,
3079 },
3080 .opts = {
3081 .target = {
3082 .uid = UINT_MAX,
3083 .uses_mmap = true,
3084 },
3085 .user_freq = UINT_MAX,
3086 .user_interval = ULLONG_MAX,
3087 .no_buffering = true,
3088 .mmap_pages = UINT_MAX,
3089 .proc_map_timeout = 500,
3090 },
3091 .output = stderr,
3092 .show_comm = true,
3093 .trace_syscalls = true,
3094 };
3095 const char *output_name = NULL;
3096 const char *ev_qualifier_str = NULL;
3097 const struct option trace_options[] = {
3098 OPT_CALLBACK(0, "event", &trace.evlist, "event",
3099 "event selector. use 'perf list' to list available events",
3100 parse_events_option),
3101 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3102 "show the thread COMM next to its id"),
3103 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3104 OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3105 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3106 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3107 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3108 "trace events on existing process id"),
3109 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3110 "trace events on existing thread id"),
3111 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3112 "pids to filter (by the kernel)", trace__set_filter_pids),
3113 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3114 "system-wide collection from all CPUs"),
3115 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3116 "list of cpus to monitor"),
3117 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3118 "child tasks do not inherit counters"),
3119 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3120 "number of mmap data pages",
3121 perf_evlist__parse_mmap_pages),
3122 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3123 "user to profile"),
3124 OPT_CALLBACK(0, "duration", &trace, "float",
3125 "show only events with duration > N.M ms",
3126 trace__set_duration),
3127 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3128 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3129 OPT_BOOLEAN('T', "time", &trace.full_time,
3130 "Show full timestamp, not time relative to first start"),
3131 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3132 "Show only syscall summary with statistics"),
3133 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3134 "Show all syscalls and summary with statistics"),
3135 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3136 "Trace pagefaults", parse_pagefaults, "maj"),
3137 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3138 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3139 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3140 "per thread proc mmap processing timeout in ms"),
3141 OPT_END()
3142 };
3143 const char * const trace_subcommands[] = { "record", NULL };
3144 int err;
3145 char bf[BUFSIZ];
3146
3147 signal(SIGSEGV, sighandler_dump_stack);
3148 signal(SIGFPE, sighandler_dump_stack);
3149
3150 trace.evlist = perf_evlist__new();
3151
3152 if (trace.evlist == NULL) {
3153 pr_err("Not enough memory to run!\n");
3154 err = -ENOMEM;
3155 goto out;
3156 }
3157
3158 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3159 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3160
3161 if (trace.trace_pgfaults) {
3162 trace.opts.sample_address = true;
3163 trace.opts.sample_time = true;
3164 }
3165
3166 if (trace.evlist->nr_entries > 0)
3167 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3168
3169 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3170 return trace__record(&trace, argc-1, &argv[1]);
3171
3172 /* summary_only implies summary option, but don't overwrite summary if set */
3173 if (trace.summary_only)
3174 trace.summary = trace.summary_only;
3175
3176 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3177 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3178 pr_err("Please specify something to trace.\n");
3179 return -1;
3180 }
3181
3182 if (output_name != NULL) {
3183 err = trace__open_output(&trace, output_name);
3184 if (err < 0) {
3185 perror("failed to create output file");
3186 goto out;
3187 }
3188 }
3189
3190 if (ev_qualifier_str != NULL) {
3191 const char *s = ev_qualifier_str;
3192 struct strlist_config slist_config = {
3193 .dirname = system_path(STRACE_GROUPS_DIR),
3194 };
3195
3196 trace.not_ev_qualifier = *s == '!';
3197 if (trace.not_ev_qualifier)
3198 ++s;
3199 trace.ev_qualifier = strlist__new(s, &slist_config);
3200 if (trace.ev_qualifier == NULL) {
3201 fputs("Not enough memory to parse event qualifier",
3202 trace.output);
3203 err = -ENOMEM;
3204 goto out_close;
3205 }
3206
3207 err = trace__validate_ev_qualifier(&trace);
3208 if (err)
3209 goto out_close;
3210 }
3211
3212 err = target__validate(&trace.opts.target);
3213 if (err) {
3214 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3215 fprintf(trace.output, "%s", bf);
3216 goto out_close;
3217 }
3218
3219 err = target__parse_uid(&trace.opts.target);
3220 if (err) {
3221 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3222 fprintf(trace.output, "%s", bf);
3223 goto out_close;
3224 }
3225
3226 if (!argc && target__none(&trace.opts.target))
3227 trace.opts.target.system_wide = true;
3228
3229 if (input_name)
3230 err = trace__replay(&trace);
3231 else
3232 err = trace__run(&trace, argc, argv);
3233
3234out_close:
3235 if (output_name != NULL)
3236 fclose(trace.output);
3237out:
3238 return err;
3239}