1/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 */
16
17#include "util/record.h"
18#include <api/fs/tracing_path.h>
19#ifdef HAVE_LIBBPF_SUPPORT
20#include <bpf/bpf.h>
21#include <bpf/libbpf.h>
22#include <bpf/btf.h>
23#ifdef HAVE_BPF_SKEL
24#include "bpf_skel/augmented_raw_syscalls.skel.h"
25#endif
26#endif
27#include "util/bpf_map.h"
28#include "util/rlimit.h"
29#include "builtin.h"
30#include "util/cgroup.h"
31#include "util/color.h"
32#include "util/config.h"
33#include "util/debug.h"
34#include "util/dso.h"
35#include "util/env.h"
36#include "util/event.h"
37#include "util/evsel.h"
38#include "util/evsel_fprintf.h"
39#include "util/synthetic-events.h"
40#include "util/evlist.h"
41#include "util/evswitch.h"
42#include "util/mmap.h"
43#include <subcmd/pager.h>
44#include <subcmd/exec-cmd.h>
45#include "util/machine.h"
46#include "util/map.h"
47#include "util/symbol.h"
48#include "util/path.h"
49#include "util/session.h"
50#include "util/thread.h"
51#include <subcmd/parse-options.h>
52#include "util/strlist.h"
53#include "util/intlist.h"
54#include "util/thread_map.h"
55#include "util/stat.h"
56#include "util/tool.h"
57#include "util/util.h"
58#include "trace/beauty/beauty.h"
59#include "trace-event.h"
60#include "util/parse-events.h"
61#include "util/tracepoint.h"
62#include "callchain.h"
63#include "print_binary.h"
64#include "string2.h"
65#include "syscalltbl.h"
66#include "rb_resort.h"
67#include "../perf.h"
68#include "trace_augment.h"
69
70#include <errno.h>
71#include <inttypes.h>
72#include <poll.h>
73#include <signal.h>
74#include <stdlib.h>
75#include <string.h>
76#include <linux/err.h>
77#include <linux/filter.h>
78#include <linux/kernel.h>
79#include <linux/list_sort.h>
80#include <linux/random.h>
81#include <linux/stringify.h>
82#include <linux/time64.h>
83#include <linux/zalloc.h>
84#include <fcntl.h>
85#include <sys/sysmacros.h>
86
87#include <linux/ctype.h>
88#include <perf/mmap.h>
89
90#ifdef HAVE_LIBTRACEEVENT
91#include <event-parse.h>
92#endif
93
94#ifndef O_CLOEXEC
95# define O_CLOEXEC 02000000
96#endif
97
98#ifndef F_LINUX_SPECIFIC_BASE
99# define F_LINUX_SPECIFIC_BASE 1024
100#endif
101
102#define RAW_SYSCALL_ARGS_NUM 6
103
/*
 * Per-argument pretty-printer/parser hooks for one syscall argument.
 *
 * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100
 *
 * We have to explicitly mark the direction of the flow of data, if from the
 * kernel to user space or the other way around, since the BPF collector we
 * have so far copies only from user to kernel space, mark the arguments that
 * go that direction, so that we don't end up collecting the previous contents
 * for syscall args that goes from kernel to user space.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); /* value -> string */
	bool	   (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val); /* string -> value */
	unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val);
	void	   *parm;	/* opaque parameter for the callbacks, e.g. a strarray */
	const char *name;
	u16	   nr_entries; // for arrays
	bool	   from_user;	/* data flows from user to kernel space, see comment above */
	bool	   show_zero;
#ifdef HAVE_LIBBPF_SUPPORT
	const struct btf_type *type;
	int	   type_id; /* used in btf_dump */
#endif
};
127
/*
 * Per-syscall formatting overrides: a pretty-printer per argument plus flags
 * controlling how the return value is interpreted/rendered.
 */
struct syscall_fmt {
	const char *name;
	const char *alias;	/* alternative syscall name, if any */
	struct {
		const char *sys_enter,
			   *sys_exit;
	} bpf_prog_name;	/* BPF augmenter program names for enter/exit */
	struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM];
	u8	   nr_args;
	bool	   errpid;	/* set e.g. for "clone" in syscall_fmts */
	bool	   timeout;
	bool	   hexret;	/* set e.g. for "brk" — see syscall_fmts */
};
141
/*
 * Global state for one 'perf trace' session: tool callbacks, syscall tables,
 * the evlist being monitored, filters, output-formatting knobs and counters.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		struct syscall  *table;
		struct {
			struct evsel *sys_enter,
				     *sys_exit,
				     *bpf_output;
		} events;
	} syscalls;
#ifdef HAVE_BPF_SKEL
	struct augmented_raw_syscalls_bpf *skel;
#endif
#ifdef HAVE_LIBBPF_SUPPORT
	struct btf		*btf;	/* lazily loaded, see trace__load_vmlinux_btf() */
#endif
	struct record_opts	opts;
	struct evlist		*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	unsigned long		nr_events_printed;
	unsigned long		max_events;
	struct evswitch		evswitch;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
		struct bpf_map  *map;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	int			raw_augmented_syscalls_args_size;
	bool			raw_augmented_syscalls;
	bool			fd_path_disabled;
	bool			sort_events;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			errno_summary;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			libtraceevent_print;
	bool			kernel_syscallchains;
	s16			args_alignment;
	bool			show_tstamp;
	bool			show_duration;
	bool			show_zeros;
	bool			show_arg_names;
	bool			show_string_prefix;
	bool			force;
	bool			vfs_getname;
	bool			force_btf;
	int			trace_pgfaults;
	char			*perfconfig_events;
	struct {
		struct ordered_events	data;
		u64			last;
	} oe;	/* event ordering state, used when sort_events is set */
};
223
224static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused)
225{
226#ifdef HAVE_LIBBPF_SUPPORT
227 if (trace->btf != NULL)
228 return;
229
230 trace->btf = btf__load_vmlinux_btf();
231 if (verbose > 0) {
232 fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" :
233 "Failed to load vmlinux BTF\n");
234 }
235#endif
236}
237
/*
 * Accessor for one tracepoint field inside the raw sample payload: the
 * field's byte offset plus a fetch callback that returns it either as an
 * integer or as a pointer into the raw data.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/* Generate tp_field__u{8,16,32,64}(): host-endian integer loads from raw_data. */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/* Byte-swapping variants, selected when evsel->needs_swap is set. */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
270
271static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
272{
273 field->offset = offset;
274
275 switch (size) {
276 case 1:
277 field->integer = tp_field__u8;
278 break;
279 case 2:
280 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
281 break;
282 case 4:
283 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
284 break;
285 case 8:
286 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
287 break;
288 default:
289 return -1;
290 }
291
292 return 0;
293}
294
/* Bind @field as an integer accessor using @format_field's size/offset. */
static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
{
	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}

/* Return a pointer into the raw sample data at the field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Bind @field as a pointer accessor at @offset; always succeeds. */
static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
	field->offset = offset;
	field->pointer = tp_field__ptr;
	return 0;
}

/* Bind @field as a pointer accessor using @format_field's offset. */
static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
{
	return __tp_field__init_ptr(field, format_field->offset);
}
316
/*
 * Fields of interest in the raw_syscalls/syscalls tracepoints: the syscall
 * id plus either the entry arguments or the exit return value (an event is
 * one or the other, hence the union).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

/*
 * The evsel->priv as used by 'perf trace'
 * sc: for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
 * fmt: for all the other tracepoints
 */
struct evsel_trace {
	struct syscall_tp	sc;
	struct syscall_arg_fmt  *fmt;
};
333
334static struct evsel_trace *evsel_trace__new(void)
335{
336 return zalloc(sizeof(struct evsel_trace));
337}
338
339static void evsel_trace__delete(struct evsel_trace *et)
340{
341 if (et == NULL)
342 return;
343
344 zfree(&et->fmt);
345 free(et);
346}
347
348/*
349 * Used with raw_syscalls:sys_{enter,exit} and with the
350 * syscalls:sys_{enter,exit}_SYSCALL tracepoints
351 */
352static inline struct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
353{
354 struct evsel_trace *et = evsel->priv;
355
356 return &et->sc;
357}
358
359static struct syscall_tp *evsel__syscall_tp(struct evsel *evsel)
360{
361 if (evsel->priv == NULL) {
362 evsel->priv = evsel_trace__new();
363 if (evsel->priv == NULL)
364 return NULL;
365 }
366
367 return __evsel__syscall_tp(evsel);
368}
369
370/*
371 * Used with all the other tracepoints.
372 */
373static inline struct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
374{
375 struct evsel_trace *et = evsel->priv;
376
377 return et->fmt;
378}
379
380static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel)
381{
382 struct evsel_trace *et = evsel->priv;
383
384 if (evsel->priv == NULL) {
385 et = evsel->priv = evsel_trace__new();
386
387 if (et == NULL)
388 return NULL;
389 }
390
391 if (et->fmt == NULL) {
392 et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt));
393 if (et->fmt == NULL)
394 goto out_delete;
395 }
396
397 return __evsel__syscall_arg_fmt(evsel);
398
399out_delete:
400 evsel_trace__delete(evsel->priv);
401 evsel->priv = NULL;
402 return NULL;
403}
404
405static int evsel__init_tp_uint_field(struct evsel *evsel, struct tp_field *field, const char *name)
406{
407 struct tep_format_field *format_field = evsel__field(evsel, name);
408
409 if (format_field == NULL)
410 return -1;
411
412 return tp_field__init_uint(field, format_field, evsel->needs_swap);
413}
414
/* Bind tracepoint field #name of @evsel as an integer in its syscall_tp. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
	   evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/* Look up tracepoint field @name on @evsel and bind it as a pointer; -1 if absent. */
static int evsel__init_tp_ptr_field(struct evsel *evsel, struct tp_field *field, const char *name)
{
	struct tep_format_field *format_field = evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

/* Bind tracepoint field #name of @evsel as a pointer in its syscall_tp. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = __evsel__syscall_tp(evsel);\
	   evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
432
433static void evsel__delete_priv(struct evsel *evsel)
434{
435 zfree(&evsel->priv);
436 evsel__delete(evsel);
437}
438
439static int evsel__init_syscall_tp(struct evsel *evsel)
440{
441 struct syscall_tp *sc = evsel__syscall_tp(evsel);
442
443 if (sc != NULL) {
444 if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
445 evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
446 return -ENOENT;
447
448 return 0;
449 }
450
451 return -ENOMEM;
452}
453
454static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp)
455{
456 struct syscall_tp *sc = evsel__syscall_tp(evsel);
457
458 if (sc != NULL) {
459 struct tep_format_field *syscall_id = evsel__field(tp, "id");
460 if (syscall_id == NULL)
461 syscall_id = evsel__field(tp, "__syscall_nr");
462 if (syscall_id == NULL ||
463 __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
464 return -EINVAL;
465
466 return 0;
467 }
468
469 return -ENOMEM;
470}
471
472static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel)
473{
474 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
475
476 return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
477}
478
479static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel)
480{
481 struct syscall_tp *sc = __evsel__syscall_tp(evsel);
482
483 return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
484}
485
486static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler)
487{
488 if (evsel__syscall_tp(evsel) != NULL) {
489 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
490 return -ENOENT;
491
492 evsel->handler = handler;
493 return 0;
494 }
495
496 return -ENOMEM;
497}
498
499static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
500{
501 struct evsel *evsel = evsel__newtp("raw_syscalls", direction);
502
503 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
504 if (IS_ERR(evsel))
505 evsel = evsel__newtp("syscalls", direction);
506
507 if (IS_ERR(evsel))
508 return NULL;
509
510 if (evsel__init_raw_syscall_tp(evsel, handler))
511 goto out_delete;
512
513 return evsel;
514
515out_delete:
516 evsel__delete_priv(evsel);
517 return NULL;
518}
519
/* Fetch syscall_tp field "name" from @sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
	   fields->name.integer(&fields->name, sample); })

/* Fetch syscall_tp field "name" from @sample as a pointer into the raw data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
	   fields->name.pointer(&fields->name, sample); })
527
528size_t strarray__scnprintf_suffix(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_suffix, int val)
529{
530 int idx = val - sa->offset;
531
532 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
533 size_t printed = scnprintf(bf, size, intfmt, val);
534 if (show_suffix)
535 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
536 return printed;
537 }
538
539 return scnprintf(bf, size, "%s%s", sa->entries[idx], show_suffix ? sa->prefix : "");
540}
541
542size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
543{
544 int idx = val - sa->offset;
545
546 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
547 size_t printed = scnprintf(bf, size, intfmt, val);
548 if (show_prefix)
549 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
550 return printed;
551 }
552
553 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
554}
555
/* Print a strarray-backed argument, using @intfmt for out-of-range values. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
570
/* string -> value lookup, delegating to the strarray hung off arg->parm. */
bool syscall_arg__strtoul_strarray(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarray__strtoul(arg->parm, bf, size, ret);
}

/* As above, for '|'-separated flag lists. */
bool syscall_arg__strtoul_strarray_flags(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarray__strtoul_flags(arg->parm, bf, size, ret);
}

/* As above, searching a set of strarrays (arg->parm is a struct strarrays). */
bool syscall_arg__strtoul_strarrays(char *bf, size_t size, struct syscall_arg *arg, u64 *ret)
{
	return strarrays__strtoul(arg->parm, bf, size, ret);
}

/* value -> string for flag-style strarray arguments. */
size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
{
	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
}
590
591size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
592{
593 size_t printed;
594 int i;
595
596 for (i = 0; i < sas->nr_entries; ++i) {
597 struct strarray *sa = sas->entries[i];
598 int idx = val - sa->offset;
599
600 if (idx >= 0 && idx < sa->nr_entries) {
601 if (sa->entries[idx] == NULL)
602 break;
603 return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
604 }
605 }
606
607 printed = scnprintf(bf, size, intfmt, val);
608 if (show_prefix)
609 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
610 return printed;
611}
612
613bool strarray__strtoul(struct strarray *sa, char *bf, size_t size, u64 *ret)
614{
615 int i;
616
617 for (i = 0; i < sa->nr_entries; ++i) {
618 if (sa->entries[i] && strncmp(sa->entries[i], bf, size) == 0 && sa->entries[i][size] == '\0') {
619 *ret = sa->offset + i;
620 return true;
621 }
622 }
623
624 return false;
625}
626
/*
 * Parse a '|'-separated list of flag names/numbers (e.g. "CLOEXEC|NONBLOCK")
 * into a bitmask in *ret.  Identifier-looking tokens are resolved via
 * strarray__strtoul(), everything else via strtoul(), and each value is
 * folded in as (1 << (val - 1)).
 */
bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *ret)
{
	u64 val = 0;
	char *tok = bf, *sep, *end;

	*ret = 0;

	while (size != 0) {
		int toklen = size;

		sep = memchr(tok, '|', size);
		if (sep != NULL) {
			size -= sep - tok + 1;

			/* trim trailing whitespace off the token */
			end = sep - 1;
			while (end > tok && isspace(*end))
				--end;

			toklen = end - tok + 1;
		}

		/* skip leading whitespace */
		while (isspace(*tok))
			++tok;

		if (isalpha(*tok) || *tok == '_') {
			if (!strarray__strtoul(sa, tok, toklen, &val))
				return false;
		} else
			val = strtoul(tok, NULL, 0);

		/* NOTE(review): treats values as 1-based bit ordinals; shifts by val-1 — confirm for each strarray user */
		*ret |= (1 << (val - 1));

		if (sep == NULL)
			break;
		tok = sep + 1;
	}

	return true;
}
666
667bool strarrays__strtoul(struct strarrays *sas, char *bf, size_t size, u64 *ret)
668{
669 int i;
670
671 for (i = 0; i < sas->nr_entries; ++i) {
672 struct strarray *sa = sas->entries[i];
673
674 if (strarray__strtoul(sa, bf, size, ret))
675 return true;
676 }
677
678 return false;
679}
680
/* Print a strarrays-backed argument, falling back to "%d" when unnamed. */
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD -100
#endif

/* Print a dirfd argument: AT_FDCWD gets its symbolic name, otherwise a plain fd. */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;
	const char *prefix = "AT_FD";

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at
704
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Print the argument as a hex value. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* Like hex, but a zero value is rendered as "NULL". */
size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
{
	if (arg->val == 0)
		return scnprintf(bf, size, "NULL");
	return syscall_arg__scnprintf_hex(bf, size, arg);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

/* Print a fixed-size char array (nr_entries wide, or arg->len when unset) quoted. */
static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
{
	// XXX Hey, maybe for sched:sched_switch prev/next comm fields we can
	// fill missing comms using thread__set_comm()...
	// here or in a special syscall_arg__scnprintf_pid_sched_tp...
	return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
}

#define SCA_CHAR_ARRAY syscall_arg__scnprintf_char_array
741
/*
 * String tables used by SCA_STRARRAY & friends to map syscall argument
 * values to their symbolic names (the DEFINE_STRARRAY prefix is the common
 * part stripped from each name, e.g. "BPF_" + "MAP_CREATE").
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH",
	"PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID",
	"PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD",
	"PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID",
	"TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE",
	"BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH",
	"MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE",
	"LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE",
	"LINK_DETACH", "PROG_BIND_MAP",
};
static DEFINE_STRARRAY(bpf_cmd, "BPF_");

static const char *fsmount_flags[] = {
	[1] = "CLOEXEC",
};
static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");

#include "trace/beauty/generated/fsconfig_arrays.c"

static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");

/* EPOLL_CTL_* values start at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers, "ITIMER_");

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences, "SEEK_");

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds, "F_");

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

/* These commands live at F_LINUX_SPECIFIC_BASE (1024), hence the offset. */
static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow, "SIG_");

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid, "CLOCK_");
829
/* Render an access(2)-style mode: "F" for F_OK, else R/W/X with "_OK" suffix. */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *suffix = "_OK";
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* any leftover unknown bits, in hex */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
858
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

// 'argname' is just documentational at this point, to remove the previous comment with that info
#define SCA_FILENAME_FROM_USER(argname) \
	  { .scnprintf	= SCA_FILENAME, \
	    .from_user	= true, }

static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_BUF syscall_arg__scnprintf_buf
872
/* Render a pipe flags argument as a '|'-separated list of O_* names. */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "O_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* any remaining unknown bits, in hex */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
897
/* Fallbacks for libcs whose headers predate getrandom(2). */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/* Render getrandom(2)'s flags argument as a '|'-separated list of GRND_* names. */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "GRND_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* any remaining unknown bits, in hex */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
929
930#ifdef HAVE_LIBBPF_SUPPORT
931static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
932{
933 int id;
934
935 type = strstr(type, "enum ");
936 if (type == NULL)
937 return;
938
939 type += 5; // skip "enum " to get the enumeration name
940
941 id = btf__find_by_name(btf, type);
942 if (id < 0)
943 return;
944
945 arg_fmt->type = btf__type_by_id(btf, id);
946}
947
948static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
949{
950 const struct btf_type *bt = arg->fmt->type;
951 struct btf *btf = arg->trace->btf;
952 struct btf_enum *be = btf_enum(bt);
953
954 for (int i = 0; i < btf_vlen(bt); ++i, ++be) {
955 const char *name = btf__name_by_offset(btf, be->name_off);
956 int max_len = max(size, strlen(name));
957
958 if (strncmp(name, bf, max_len) == 0) {
959 *val = be->val;
960 return true;
961 }
962 }
963
964 return false;
965}
966
967static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val)
968{
969 const struct btf_type *bt;
970 char *type = arg->type_name;
971 struct btf *btf;
972
973 trace__load_vmlinux_btf(arg->trace);
974
975 btf = arg->trace->btf;
976 if (btf == NULL)
977 return false;
978
979 if (arg->fmt->type == NULL) {
980 // See if this is an enum
981 syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type);
982 }
983
984 // Now let's see if we have a BTF type resolved
985 bt = arg->fmt->type;
986 if (bt == NULL)
987 return false;
988
989 // If it is an enum:
990 if (btf_is_enum(arg->fmt->type))
991 return syscall_arg__strtoul_btf_enum(bf, size, arg, val);
992
993 return false;
994}
995
996static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val)
997{
998 struct btf_enum *be = btf_enum(type);
999 const int nr_entries = btf_vlen(type);
1000
1001 for (int i = 0; i < nr_entries; ++i, ++be) {
1002 if (be->val == val) {
1003 return scnprintf(bf, size, "%s",
1004 btf__name_by_offset(btf, be->name_off));
1005 }
1006 }
1007
1008 return 0;
1009}
1010
/* Accumulator for btf_dump output redirected into a fixed-size buffer. */
struct trace_btf_dump_snprintf_ctx {
	char   *bf;
	size_t printed, size;
};

/* btf_dump printf callback: append the formatted output to ctx->bf. */
static void trace__btf_dump_snprintf(void *vctx, const char *fmt, va_list args)
{
	struct trace_btf_dump_snprintf_ctx *ctx = vctx;

	ctx->printed += vscnprintf(ctx->bf + ctx->printed, ctx->size - ctx->printed, fmt, args);
}
1022
1023static size_t btf_struct_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, struct syscall_arg *arg)
1024{
1025 struct trace_btf_dump_snprintf_ctx ctx = {
1026 .bf = bf,
1027 .size = size,
1028 };
1029 struct augmented_arg *augmented_arg = arg->augmented.args;
1030 int type_id = arg->fmt->type_id, consumed;
1031 struct btf_dump *btf_dump;
1032
1033 LIBBPF_OPTS(btf_dump_opts, dump_opts);
1034 LIBBPF_OPTS(btf_dump_type_data_opts, dump_data_opts);
1035
1036 if (arg == NULL || arg->augmented.args == NULL)
1037 return 0;
1038
1039 dump_data_opts.compact = true;
1040 dump_data_opts.skip_names = !arg->trace->show_arg_names;
1041
1042 btf_dump = btf_dump__new(btf, trace__btf_dump_snprintf, &ctx, &dump_opts);
1043 if (btf_dump == NULL)
1044 return 0;
1045
1046 /* pretty print the struct data here */
1047 if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0)
1048 return 0;
1049
1050 consumed = sizeof(*augmented_arg) + augmented_arg->size;
1051 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1052 arg->augmented.size -= consumed;
1053
1054 btf_dump__free(btf_dump);
1055
1056 return ctx.printed;
1057}
1058
1059static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf,
1060 size_t size, int val, char *type)
1061{
1062 struct syscall_arg_fmt *arg_fmt = arg->fmt;
1063
1064 if (trace->btf == NULL)
1065 return 0;
1066
1067 if (arg_fmt->type == NULL) {
1068 // Check if this is an enum and if we have the BTF type for it.
1069 syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type);
1070 }
1071
1072 // Did we manage to find a BTF type for the syscall/tracepoint argument?
1073 if (arg_fmt->type == NULL)
1074 return 0;
1075
1076 if (btf_is_enum(arg_fmt->type))
1077 return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val);
1078 else if (btf_is_struct(arg_fmt->type) || btf_is_union(arg_fmt->type))
1079 return btf_struct_scnprintf(arg_fmt->type, trace->btf, bf, size, arg);
1080
1081 return 0;
1082}
1083
#else // HAVE_LIBBPF_SUPPORT
/* Stubs when built without libbpf: no BTF-based pretty-printing/parsing. */
static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused,
				   char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused,
				   char *type __maybe_unused)
{
	return 0;
}

static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused,
					  struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused)
{
	return false;
}
#endif // HAVE_LIBBPF_SUPPORT

#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type
1100
/* Shorthand initializers wiring a syscall_arg_fmt to a strarray. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .strtoul	= STUL_STRARRAY, \
	    .parm	= &strarray__##array, }

/* As STRARRAY, but for '|'-combinable flag values. */
#define STRARRAY_FLAGS(name, array) \
	  { .scnprintf	= SCA_STRARRAY_FLAGS, \
	    .strtoul	= STUL_STRARRAY_FLAGS, \
	    .parm	= &strarray__##array, }
1110
1111#include "trace/beauty/arch_errno_names.c"
1112#include "trace/beauty/eventfd.c"
1113#include "trace/beauty/futex_op.c"
1114#include "trace/beauty/futex_val3.c"
1115#include "trace/beauty/mmap.c"
1116#include "trace/beauty/mode_t.c"
1117#include "trace/beauty/msg_flags.c"
1118#include "trace/beauty/open_flags.c"
1119#include "trace/beauty/perf_event_open.c"
1120#include "trace/beauty/pid.c"
1121#include "trace/beauty/sched_policy.c"
1122#include "trace/beauty/seccomp.c"
1123#include "trace/beauty/signum.c"
1124#include "trace/beauty/socket_type.c"
1125#include "trace/beauty/waitid_options.c"
1126
1127static const struct syscall_fmt syscall_fmts[] = {
1128 { .name = "access",
1129 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
1130 { .name = "arch_prctl",
1131 .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
1132 [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
1133 { .name = "bind",
1134 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
1135 [1] = SCA_SOCKADDR_FROM_USER(umyaddr),
1136 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
1137 { .name = "bpf",
1138 .arg = { [0] = STRARRAY(cmd, bpf_cmd),
1139 [1] = { .from_user = true /* attr */, }, } },
1140 { .name = "brk", .hexret = true,
1141 .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
1142 { .name = "clock_gettime",
1143 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
1144 { .name = "clock_nanosleep",
1145 .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, },
1146 { .name = "clone", .errpid = true, .nr_args = 5,
1147 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
1148 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
1149 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
1150 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
1151 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
1152 { .name = "close",
1153 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
1154 { .name = "connect",
1155 .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ },
1156 [1] = SCA_SOCKADDR_FROM_USER(servaddr),
1157 [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, },
1158 { .name = "epoll_ctl",
1159 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
1160 { .name = "eventfd2",
1161 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
1162 { .name = "faccessat",
1163 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ },
1164 [1] = SCA_FILENAME_FROM_USER(pathname),
1165 [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
1166 { .name = "faccessat2",
1167 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ },
1168 [1] = SCA_FILENAME_FROM_USER(pathname),
1169 [2] = { .scnprintf = SCA_ACCMODE, /* mode */ },
1170 [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, },
1171 { .name = "fchmodat",
1172 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1173 { .name = "fchownat",
1174 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1175 { .name = "fcntl",
1176 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
1177 .strtoul = STUL_STRARRAYS,
1178 .parm = &strarrays__fcntl_cmds_arrays,
1179 .show_zero = true, },
1180 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
1181 { .name = "flock",
1182 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
1183 { .name = "fsconfig",
1184 .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
1185 { .name = "fsmount",
1186 .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
1187 [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
1188 { .name = "fspick",
1189 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1190 [1] = SCA_FILENAME_FROM_USER(path),
1191 [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
1192 { .name = "fstat", .alias = "newfstat", },
1193 { .name = "futex",
1194 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
1195 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
1196 { .name = "futimesat",
1197 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1198 { .name = "getitimer",
1199 .arg = { [0] = STRARRAY(which, itimers), }, },
1200 { .name = "getpid", .errpid = true, },
1201 { .name = "getpgid", .errpid = true, },
1202 { .name = "getppid", .errpid = true, },
1203 { .name = "getrandom",
1204 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
1205 { .name = "getrlimit",
1206 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
1207 { .name = "getsockopt",
1208 .arg = { [1] = STRARRAY(level, socket_level), }, },
1209 { .name = "gettid", .errpid = true, },
1210 { .name = "ioctl",
1211 .arg = {
1212#if defined(__i386__) || defined(__x86_64__)
1213/*
1214 * FIXME: Make this available to all arches.
1215 */
1216 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
1217 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
1218#else
1219 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
1220#endif
1221 { .name = "kcmp", .nr_args = 5,
1222 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
1223 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
1224 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
1225 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
1226 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
1227 { .name = "keyctl",
1228 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
1229 { .name = "kill",
1230 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1231 { .name = "linkat",
1232 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1233 { .name = "lseek",
1234 .arg = { [2] = STRARRAY(whence, whences), }, },
1235 { .name = "lstat", .alias = "newlstat", },
1236 { .name = "madvise",
1237 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
1238 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
1239 { .name = "mkdirat",
1240 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1241 { .name = "mknodat",
1242 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
1243 { .name = "mmap", .hexret = true,
1244/* The standard mmap maps to old_mmap on s390x */
1245#if defined(__s390x__)
1246 .alias = "old_mmap",
1247#endif
1248 .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
1249 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */
1250 .strtoul = STUL_STRARRAY_FLAGS,
1251 .parm = &strarray__mmap_flags, },
1252 [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, },
1253 { .name = "mount",
1254 .arg = { [0] = SCA_FILENAME_FROM_USER(devname),
1255 [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
1256 .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
1257 { .name = "move_mount",
1258 .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ },
1259 [1] = SCA_FILENAME_FROM_USER(pathname),
1260 [2] = { .scnprintf = SCA_FDAT, /* to_dfd */ },
1261 [3] = SCA_FILENAME_FROM_USER(pathname),
1262 [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
1263 { .name = "mprotect",
1264 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
1265 [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, },
1266 { .name = "mq_unlink",
1267 .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, },
1268 { .name = "mremap", .hexret = true,
1269 .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
1270 { .name = "name_to_handle_at",
1271 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1272 { .name = "nanosleep",
1273 .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, },
1274 { .name = "newfstatat", .alias = "fstatat",
1275 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ },
1276 [1] = SCA_FILENAME_FROM_USER(pathname),
1277 [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
1278 { .name = "open",
1279 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1280 { .name = "open_by_handle_at",
1281 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1282 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1283 { .name = "openat",
1284 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1285 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
1286 { .name = "perf_event_open",
1287 .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr),
1288 [2] = { .scnprintf = SCA_INT, /* cpu */ },
1289 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
1290 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
1291 { .name = "pipe2",
1292 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
1293 { .name = "pkey_alloc",
1294 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
1295 { .name = "pkey_free",
1296 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
1297 { .name = "pkey_mprotect",
1298 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
1299 [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ },
1300 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
1301 { .name = "poll", .timeout = true, },
1302 { .name = "ppoll", .timeout = true, },
1303 { .name = "prctl",
1304 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */
1305 .strtoul = STUL_STRARRAY,
1306 .parm = &strarray__prctl_options, },
1307 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
1308 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
1309 { .name = "pread", .alias = "pread64", },
1310 { .name = "preadv", .alias = "pread", },
1311 { .name = "prlimit64",
1312 .arg = { [1] = STRARRAY(resource, rlimit_resources),
1313 [2] = { .from_user = true /* new_rlim */, }, }, },
1314 { .name = "pwrite", .alias = "pwrite64", },
1315 { .name = "readlinkat",
1316 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1317 { .name = "recvfrom",
1318 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1319 { .name = "recvmmsg",
1320 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1321 { .name = "recvmsg",
1322 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1323 { .name = "renameat",
1324 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1325 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
1326 { .name = "renameat2",
1327 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
1328 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
1329 [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
1330 { .name = "rseq", .errpid = true,
1331 .arg = { [0] = { .from_user = true /* rseq */, }, }, },
1332 { .name = "rt_sigaction",
1333 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1334 { .name = "rt_sigprocmask",
1335 .arg = { [0] = STRARRAY(how, sighow), }, },
1336 { .name = "rt_sigqueueinfo",
1337 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1338 { .name = "rt_tgsigqueueinfo",
1339 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1340 { .name = "sched_setscheduler",
1341 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
1342 { .name = "seccomp",
1343 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
1344 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
1345 { .name = "select", .timeout = true, },
1346 { .name = "sendfile", .alias = "sendfile64", },
1347 { .name = "sendmmsg",
1348 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1349 { .name = "sendmsg",
1350 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
1351 { .name = "sendto",
1352 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
1353 [4] = SCA_SOCKADDR_FROM_USER(addr), }, },
1354 { .name = "set_robust_list", .errpid = true,
1355 .arg = { [0] = { .from_user = true /* head */, }, }, },
1356 { .name = "set_tid_address", .errpid = true, },
1357 { .name = "setitimer",
1358 .arg = { [0] = STRARRAY(which, itimers), }, },
1359 { .name = "setrlimit",
1360 .arg = { [0] = STRARRAY(resource, rlimit_resources),
1361 [1] = { .from_user = true /* rlim */, }, }, },
1362 { .name = "setsockopt",
1363 .arg = { [1] = STRARRAY(level, socket_level), }, },
1364 { .name = "socket",
1365 .arg = { [0] = STRARRAY(family, socket_families),
1366 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1367 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1368 { .name = "socketpair",
1369 .arg = { [0] = STRARRAY(family, socket_families),
1370 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
1371 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
1372 { .name = "stat", .alias = "newstat", },
1373 { .name = "statx",
1374 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
1375 [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } ,
1376 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
1377 { .name = "swapoff",
1378 .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
1379 { .name = "swapon",
1380 .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, },
1381 { .name = "symlinkat",
1382 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
1383 { .name = "sync_file_range",
1384 .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
1385 { .name = "tgkill",
1386 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1387 { .name = "tkill",
1388 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
1389 { .name = "umount2", .alias = "umount",
1390 .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, },
1391 { .name = "uname", .alias = "newuname", },
1392 { .name = "unlinkat",
1393 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
1394 [1] = SCA_FILENAME_FROM_USER(pathname),
1395 [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, },
1396 { .name = "utimensat",
1397 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
1398 { .name = "wait4", .errpid = true,
1399 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1400 { .name = "waitid", .errpid = true,
1401 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
1402 { .name = "write",
1403 .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, },
1404};
1405
1406static int syscall_fmt__cmp(const void *name, const void *fmtp)
1407{
1408 const struct syscall_fmt *fmt = fmtp;
1409 return strcmp(name, fmt->name);
1410}
1411
1412static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts,
1413 const int nmemb,
1414 const char *name)
1415{
1416 return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1417}
1418
1419static const struct syscall_fmt *syscall_fmt__find(const char *name)
1420{
1421 const int nmemb = ARRAY_SIZE(syscall_fmts);
1422 return __syscall_fmt__find(syscall_fmts, nmemb, name);
1423}
1424
1425static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts,
1426 const int nmemb, const char *alias)
1427{
1428 int i;
1429
1430 for (i = 0; i < nmemb; ++i) {
1431 if (fmts[i].alias && strcmp(fmts[i].alias, alias) == 0)
1432 return &fmts[i];
1433 }
1434
1435 return NULL;
1436}
1437
1438static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
1439{
1440 const int nmemb = ARRAY_SIZE(syscall_fmts);
1441 return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
1442}
1443
1444/*
1445 * is_exit: is this "exit" or "exit_group"?
1446 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
1447 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
1448 * nonexistent: Just a hole in the syscall table, syscall id not allocated
1449 */
struct syscall {
	struct tep_event    *tp_format;	// tracepoint format for this syscall's events
	int		    nr_args;	// number of arguments
	int		    args_size;	// see comment block above
	struct {
		struct bpf_program *sys_enter,	// per-syscall BPF augmenter programs
				   *sys_exit;
	} bpf_prog;
	bool		    is_exit;		// see comment block above
	bool		    is_open;		// see comment block above
	bool		    nonexistent;	// see comment block above
	bool		    use_btf;		// pretty print some args via BTF type info
	struct tep_format_field *args;		// linked list of arg fields from tp_format
	const char	    *name;
	const struct syscall_fmt *fmt;		// formatting overrides from syscall_fmts[], if any
	struct syscall_arg_fmt *arg_fmt;	// per-argument formatters, nr_args entries
};
1467
1468/*
1469 * We need to have this 'calculated' boolean because in some cases we really
1470 * don't know what is the duration of a syscall, for instance, when we start
1471 * a session and some threads are waiting for a syscall to finish, say 'poll',
1472 * in which case all we can do is to print "( ? ) for duration and for the
1473 * start timestamp.
1474 */
1475static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
1476{
1477 double duration = (double)t / NSEC_PER_MSEC;
1478 size_t printed = fprintf(fp, "(");
1479
1480 if (!calculated)
1481 printed += fprintf(fp, " ");
1482 else if (duration >= 1.0)
1483 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1484 else if (duration >= 0.01)
1485 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1486 else
1487 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1488 return printed + fprintf(fp, "): ");
1489}
1490
1491/**
1492 * filename.ptr: The filename char pointer that will be vfs_getname'd
1493 * filename.entry_str_pos: Where to insert the string translated from
1494 * filename.ptr by the vfs_getname tracepoint/kprobe.
1495 * ret_scnprintf: syscall args may set this to a different syscall return
1496 * formatter, for instance, fcntl may return fds, file flags, etc.
1497 */
struct thread_trace {
	u64		  entry_time;		// timestamp of the current sys_enter
	bool		  entry_pending;	// sys_enter formatted, awaiting its sys_exit
	unsigned long	  nr_events;		// events seen for this thread
	unsigned long	  pfmaj, pfmin;		// major/minor page fault counters
	char		  *entry_str;		// buffer for the formatted sys_enter line
	double		  runtime_ms;		// accumulated runtime, in milliseconds
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;		// see comment block above
		short int     entry_str_pos;	// see comment block above
		bool	      pending_open;	// open/openat awaiting its vfs_getname
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	      max;		// highest fd with a slot in 'table', -1 when empty
		struct file   *table;		// indexed by fd, grown on demand
	} files;

	struct intlist *syscall_stats;		// per-syscall-id stats, keyed by syscall number
};
1520
1521static struct thread_trace *thread_trace__new(void)
1522{
1523 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1524
1525 if (ttrace) {
1526 ttrace->files.max = -1;
1527 ttrace->syscall_stats = intlist__new(NULL);
1528 }
1529
1530 return ttrace;
1531}
1532
1533static void thread_trace__free_files(struct thread_trace *ttrace);
1534
1535static void thread_trace__delete(void *pttrace)
1536{
1537 struct thread_trace *ttrace = pttrace;
1538
1539 if (!ttrace)
1540 return;
1541
1542 intlist__delete(ttrace->syscall_stats);
1543 ttrace->syscall_stats = NULL;
1544 thread_trace__free_files(ttrace);
1545 zfree(&ttrace->entry_str);
1546 free(ttrace);
1547}
1548
1549static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1550{
1551 struct thread_trace *ttrace;
1552
1553 if (thread == NULL)
1554 goto fail;
1555
1556 if (thread__priv(thread) == NULL)
1557 thread__set_priv(thread, thread_trace__new());
1558
1559 if (thread__priv(thread) == NULL)
1560 goto fail;
1561
1562 ttrace = thread__priv(thread);
1563 ++ttrace->nr_events;
1564
1565 return ttrace;
1566fail:
1567 color_fprintf(fp, PERF_COLOR_RED,
1568 "WARNING: not enough memory, dropping samples!\n");
1569 return NULL;
1570}
1571
1572
/*
 * Let an argument beautifier override how this syscall's return value will
 * be formatted, e.g. fcntl(F_DUPFD) returns an fd. The override is stored in
 * the thread's trace state, to be consumed at sys_exit time.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
1580
/* Bits for selecting which page fault kinds to trace (--pf maj/min/all). */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of thread_trace->entry_str, the buffer for one formatted sys_enter. */
static const size_t trace__entry_str_size = 2048;
1585
1586static void thread_trace__free_files(struct thread_trace *ttrace)
1587{
1588 for (int i = 0; i < ttrace->files.max; ++i) {
1589 struct file *file = ttrace->files.table + i;
1590 zfree(&file->pathname);
1591 }
1592
1593 zfree(&ttrace->files.table);
1594 ttrace->files.max = -1;
1595}
1596
/*
 * Return the 'struct file' slot for 'fd', growing the per-thread table on
 * demand. Returns NULL for negative fds or when growing the table fails.
 * Invariant: files.max is the highest valid index, the table holds
 * files.max + 1 zero-initialized entries.
 */
static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
{
	if (fd < 0)
		return NULL;

	if (fd > ttrace->files.max) {
		// Grow to fd + 1 entries, preserving the existing ones.
		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));

		if (nfiles == NULL)
			return NULL;

		if (ttrace->files.max != -1) {
			// Zero only the newly added tail: entries max+1 .. fd.
			memset(nfiles + ttrace->files.max + 1, 0,
			       (fd - ttrace->files.max) * sizeof(struct file));
		} else {
			// First allocation: zero the whole table.
			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
		}

		ttrace->files.table = nfiles;
		ttrace->files.max   = fd;
	}

	return ttrace->files.table + fd;
}
1621
/* Public wrapper: the thread's private area holds its struct thread_trace. */
struct file *thread__files_entry(struct thread *thread, int fd)
{
	return thread_trace__files_entry(thread__priv(thread), fd);
}
1626
1627static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1628{
1629 struct thread_trace *ttrace = thread__priv(thread);
1630 struct file *file = thread_trace__files_entry(ttrace, fd);
1631
1632 if (file != NULL) {
1633 struct stat st;
1634 if (stat(pathname, &st) == 0)
1635 file->dev_maj = major(st.st_rdev);
1636 file->pathname = strdup(pathname);
1637 if (file->pathname)
1638 return 0;
1639 }
1640
1641 return -1;
1642}
1643
/*
 * Resolve 'fd' to a pathname by reading the /proc fd symlink of the live
 * thread, then cache it via trace__set_fd_pathname(). Returns 0 on success,
 * -1 on any failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	// The group leader's fds live in /proc/PID/fd, other threads' in /proc/PID/task/TID/fd.
	if (thread__pid(thread) == thread__tid(thread)) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread__pid(thread), fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d",
			  thread__pid(thread), thread__tid(thread), fd);
	}

	// st_size of a symlink is the length of its target; bail if it won't fit.
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	// The link may have changed between lstat() and readlink(); reject growth.
	if (ret < 0 || ret > st.st_size)
		return -1;

	// readlink() does not NUL terminate.
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
1670
/*
 * Return the cached pathname for 'fd' in 'thread', resolving it via /proc
 * on a cache miss when tracing a live system. Returns NULL when disabled,
 * unknown or unresolvable.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL || trace->fd_path_disabled)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
		// /proc is only meaningful while the traced processes are alive.
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;	// account the /proc lookup
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->files.table[fd].pathname;
}
1692
1693size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1694{
1695 int fd = arg->val;
1696 size_t printed = scnprintf(bf, size, "%d", fd);
1697 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1698
1699 if (path)
1700 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1701
1702 return printed;
1703}
1704
1705size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1706{
1707 size_t printed = scnprintf(bf, size, "%d", fd);
1708 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1709
1710 if (thread) {
1711 const char *path = thread__fd_path(thread, fd, trace);
1712
1713 if (path)
1714 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1715
1716 thread__put(thread);
1717 }
1718
1719 return printed;
1720}
1721
1722static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1723 struct syscall_arg *arg)
1724{
1725 int fd = arg->val;
1726 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1727 struct thread_trace *ttrace = thread__priv(arg->thread);
1728
1729 if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1730 zfree(&ttrace->files.table[fd].pathname);
1731
1732 return printed;
1733}
1734
/*
 * Remember where in the thread's entry_str the filename should be spliced in
 * once the vfs_getname tracepoint/kprobe delivers the string for 'ptr'.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;	// offset into entry_str
}
1743
/*
 * Print a string payload that was copied into the sample by the syscall
 * augmenter: a size-prefixed blob in arg->augmented.args.
 */
static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
{
	struct augmented_arg *augmented_arg = arg->augmented.args;
	size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
	/*
	 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
	 * we would have two strings, each prefixed by its size.
	 */
	int consumed = sizeof(*augmented_arg) + augmented_arg->size;

	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
	arg->augmented.size -= consumed;

	return printed;
}
1759
1760static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1761 struct syscall_arg *arg)
1762{
1763 unsigned long ptr = arg->val;
1764
1765 if (arg->augmented.args)
1766 return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1767
1768 if (!arg->trace->vfs_getname)
1769 return scnprintf(bf, size, "%#x", ptr);
1770
1771 thread__set_filename_pos(arg->thread, bf, ptr);
1772 return 0;
1773}
1774
/* ASCII boundaries used below: <= 31 are control chars, >= 127 is DEL/non-ASCII. */
#define MAX_CONTROL_CHAR 31
#define MAX_ASCII 127
1777
1778static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg)
1779{
1780 struct augmented_arg *augmented_arg = arg->augmented.args;
1781 unsigned char *orig = (unsigned char *)augmented_arg->value;
1782 size_t printed = 0;
1783 int consumed;
1784
1785 if (augmented_arg == NULL)
1786 return 0;
1787
1788 for (int j = 0; j < augmented_arg->size; ++j) {
1789 bool control_char = orig[j] <= MAX_CONTROL_CHAR || orig[j] >= MAX_ASCII;
1790 /* print control characters (0~31 and 127), and non-ascii characters in \(digits) */
1791 printed += scnprintf(bf + printed, size - printed, control_char ? "\\%d" : "%c", (int)orig[j]);
1792 }
1793
1794 consumed = sizeof(*augmented_arg) + augmented_arg->size;
1795 arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1796 arg->augmented.size -= consumed;
1797
1798 return printed;
1799}
1800
/*
 * Return true when the syscall's duration 't' (nanoseconds) is below the
 * --duration filter (milliseconds), i.e. when the event should be filtered out.
 */
static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}
1805
/* Print the timestamp in milliseconds, relative to the session's base_time. */
static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}
1812
1813/*
1814 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1815 * using ttrace->entry_time for a thread that receives a sys_exit without
1816 * first having received a sys_enter ("poll" issued before tracing session
1817 * starts, lost sys_enter exit due to ring buffer overflow).
1818 */
1819static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1820{
1821 if (tstamp > 0)
1822 return __trace__fprintf_tstamp(trace, tstamp, fp);
1823
1824 return fprintf(fp, " ? ");
1825}
1826
// pid of the forked workload, set where the workload is started — so that
// sighandler_chld() only reacts to *our* child exiting.
static pid_t workload_pid = -1;
// Main-loop exit flags; sig_atomic_t + volatile for async-signal-safe access.
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/* SIGINT & co: stop the main loop and remember it was a user interruption. */
static void sighandler_interrupt(int sig __maybe_unused)
{
	done = interrupted = true;
}

/* SIGCHLD: stop the main loop only when it was the traced workload that exited. */
static void sighandler_chld(int sig __maybe_unused, siginfo_t *info,
			    void *context __maybe_unused)
{
	if (info->si_pid == workload_pid)
		done = true;
}
1842
1843static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1844{
1845 size_t printed = 0;
1846
1847 if (trace->multiple_threads) {
1848 if (trace->show_comm)
1849 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1850 printed += fprintf(fp, "%d ", thread__tid(thread));
1851 }
1852
1853 return printed;
1854}
1855
1856static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1857 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1858{
1859 size_t printed = 0;
1860
1861 if (trace->show_tstamp)
1862 printed = trace__fprintf_tstamp(trace, tstamp, fp);
1863 if (trace->show_duration)
1864 printed += fprintf_duration(duration, duration_calculated, fp);
1865 return printed + trace__fprintf_comm_tid(trace, thread, fp);
1866}
1867
1868static int trace__process_event(struct trace *trace, struct machine *machine,
1869 union perf_event *event, struct perf_sample *sample)
1870{
1871 int ret = 0;
1872
1873 switch (event->header.type) {
1874 case PERF_RECORD_LOST:
1875 color_fprintf(trace->output, PERF_COLOR_RED,
1876 "LOST %" PRIu64 " events!\n", (u64)event->lost.lost);
1877 ret = machine__process_lost_event(machine, event, sample);
1878 break;
1879 default:
1880 ret = machine__process_event(machine, event, sample);
1881 break;
1882 }
1883
1884 return ret;
1885}
1886
/*
 * perf_tool callback: recover our struct trace from the embedded tool and
 * forward to trace__process_event().
 */
static int trace__tool_process(const struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
1895
1896static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1897{
1898 struct machine *machine = vmachine;
1899
1900 if (machine->kptr_restrict_warned)
1901 return NULL;
1902
1903 if (symbol_conf.kptr_restrict) {
1904 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1905 "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1906 "Kernel samples will not be resolved.\n");
1907 machine->kptr_restrict_warned = true;
1908 return NULL;
1909 }
1910
1911 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1912}
1913
/*
 * Set up symbol resolution and the host machine representation, then
 * synthesize events for the threads that already exist. Returns 0 on
 * success, a negative error otherwise (undoing symbol__init()).
 */
static int trace__symbols_init(struct trace *trace, struct evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	// Frees each thread's thread_trace when the thread goes away.
	thread__set_priv_destructor(thread_trace__delete);

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	// Synthesize COMM/MMAP etc. events for already-running target threads.
	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->core.threads, trace__tool_process,
					    true, false, 1);
out:
	if (err)
		symbol__exit();

	return err;
}
1940
/* Tear down what trace__symbols_init() set up: host machine first, then symbols. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1948
1949static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1950{
1951 int idx;
1952
1953 if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0)
1954 nr_args = sc->fmt->nr_args;
1955
1956 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1957 if (sc->arg_fmt == NULL)
1958 return -1;
1959
1960 for (idx = 0; idx < nr_args; ++idx) {
1961 if (sc->fmt)
1962 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1963 }
1964
1965 sc->nr_args = nr_args;
1966 return 0;
1967}
1968
/*
 * Formatters selected purely by argument *name* (any event), used as a last
 * resort in syscall_arg_fmt__init_array(). Keep sorted by ->name: it is
 * bsearch()ed with syscall_arg_fmt__cmp().
 */
static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = {
	{ .name = "msr",	.scnprintf = SCA_X86_MSR,	  .strtoul = STUL_X86_MSR,	   },
	{ .name = "vector",	.scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, },
};
1973
/* bsearch() comparator: key is an arg name, element a struct syscall_arg_fmt. */
static int syscall_arg_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_arg_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}
1979
/* Binary search a name-sorted syscall_arg_fmt table; NULL when not found. */
static const struct syscall_arg_fmt *
__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb,
				const char *name)
{
	return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp);
}
1986
/* Look up a by-name formatter in the global syscall_arg_fmts__by_name table. */
static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name);
	return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name);
}
1992
/*
 * Pick default formatters for each tracepoint/syscall argument, using
 * heuristics over the field's type and name, for args that don't already
 * have an explicit formatter. Sets *use_btf (when non-NULL) for enum args
 * that could be pretty printed via BTF. Returns the last field, so callers
 * can compute the total size of the raw args payload.
 */
static struct tep_format_field *
syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field,
			    bool *use_btf)
{
	struct tep_format_field *last_field = NULL;
	int len;

	for (; field; field = field->next, ++arg) {
		last_field = field;

		if (arg->scnprintf)	// already set, e.g. from syscall_fmts[]
			continue;

		len = strlen(field->name);

		// As far as heuristics (or intention) goes this seems to hold true, and makes sense!
		if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const "))
			arg->from_user = true;

		if (strcmp(field->type, "const char *") == 0 &&
		    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
		     strstr(field->name, "path") != NULL)) {
			// "...name" or "...path..." string pointers are treated as filenames
			arg->scnprintf = SCA_FILENAME;
		} else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
			arg->scnprintf = SCA_PTR;
		else if (strcmp(field->type, "pid_t") == 0)
			arg->scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			arg->scnprintf = SCA_MODE_T;
		else if ((field->flags & TEP_FIELD_IS_ARRAY) && strstr(field->type, "char")) {
			arg->scnprintf = SCA_CHAR_ARRAY;
			arg->nr_entries = field->arraylen;
		} else if ((strcmp(field->type, "int") == 0 ||
			    strcmp(field->type, "unsigned int") == 0 ||
			    strcmp(field->type, "long") == 0) &&
			   len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			arg->scnprintf = SCA_FD;
		} else if (strstr(field->type, "enum") && use_btf != NULL) {
			// Defer to BTF at print time to resolve the enumerator name.
			*use_btf = true;
			arg->strtoul = STUL_BTF_TYPE;
		} else {
			// Last resort: match purely on the argument's name.
			const struct syscall_arg_fmt *fmt =
				syscall_arg_fmt__find_by_name(field->name);

			if (fmt) {
				arg->scnprintf = fmt->scnprintf;
				arg->strtoul   = fmt->strtoul;
			}
		}
	}

	return last_field;
}
2053
2054static int syscall__set_arg_fmts(struct syscall *sc)
2055{
2056 struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args,
2057 &sc->use_btf);
2058
2059 if (last_field)
2060 sc->args_size = last_field->offset + last_field->size;
2061
2062 return 0;
2063}
2064
2065static int trace__read_syscall_info(struct trace *trace, int id)
2066{
2067 char tp_name[128];
2068 struct syscall *sc;
2069 const char *name = syscalltbl__name(trace->sctbl, id);
2070 int err;
2071
2072#ifdef HAVE_SYSCALL_TABLE_SUPPORT
2073 if (trace->syscalls.table == NULL) {
2074 trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
2075 if (trace->syscalls.table == NULL)
2076 return -ENOMEM;
2077 }
2078#else
2079 if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) {
2080 // When using libaudit we don't know beforehand what is the max syscall id
2081 struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
2082
2083 if (table == NULL)
2084 return -ENOMEM;
2085
2086 // Need to memset from offset 0 and +1 members if brand new
2087 if (trace->syscalls.table == NULL)
2088 memset(table, 0, (id + 1) * sizeof(*sc));
2089 else
2090 memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc));
2091
2092 trace->syscalls.table = table;
2093 trace->sctbl->syscalls.max_id = id;
2094 }
2095#endif
2096 sc = trace->syscalls.table + id;
2097 if (sc->nonexistent)
2098 return -EEXIST;
2099
2100 if (name == NULL) {
2101 sc->nonexistent = true;
2102 return -EEXIST;
2103 }
2104
2105 sc->name = name;
2106 sc->fmt = syscall_fmt__find(sc->name);
2107
2108 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
2109 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2110
2111 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
2112 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
2113 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
2114 }
2115
2116 /*
2117 * Fails to read trace point format via sysfs node, so the trace point
2118 * doesn't exist. Set the 'nonexistent' flag as true.
2119 */
2120 if (IS_ERR(sc->tp_format)) {
2121 sc->nonexistent = true;
2122 return PTR_ERR(sc->tp_format);
2123 }
2124
2125 /*
2126 * The tracepoint format contains __syscall_nr field, so it's one more
2127 * than the actual number of syscall arguments.
2128 */
2129 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
2130 RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields - 1))
2131 return -ENOMEM;
2132
2133 sc->args = sc->tp_format->format.fields;
2134 /*
2135 * We need to check and discard the first variable '__syscall_nr'
2136 * or 'nr' that mean the syscall number. It is needless here.
2137 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
2138 */
2139 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
2140 sc->args = sc->args->next;
2141 --sc->nr_args;
2142 }
2143
2144 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
2145 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
2146
2147 err = syscall__set_arg_fmts(sc);
2148
2149 /* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */
2150 if (sc->use_btf)
2151 trace__load_vmlinux_btf(trace);
2152
2153 return err;
2154}
2155
2156static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf)
2157{
2158 struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);
2159
2160 if (fmt != NULL) {
2161 syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, use_btf);
2162 return 0;
2163 }
2164
2165 return -ENOMEM;
2166}
2167
/*
 * qsort()/bsearch() comparator for ints.
 *
 * Use explicit comparisons instead of "*one - *another": the subtraction can
 * overflow a signed int (undefined behavior) when the operands are far
 * apart, e.g. INT_MIN vs INT_MAX. The relational form yields the same
 * -1/0/1 ordering for all inputs.
 */
static int intcmp(const void *a, const void *b)
{
	const int *one = a, *another = b;

	return (*one > *another) - (*one < *another);
}
2174
/*
 * Translate the -e/--expr syscall name list (trace->ev_qualifier) into a
 * sorted array of syscall ids (trace->ev_qualifier_ids) suitable for
 * bsearch() in trace__syscall_enabled(). Names may be globs, each possibly
 * matching several syscalls. Unknown names are reported only at debug
 * verbosity and skipped. Returns 0 on success, negative on failure.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0;
	bool printed_invalid_prefix = false;
	struct str_node *pos;
	/* Initial guess: one id per name; grown below when globs match more. */
	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);

	trace->ev_qualifier_ids.entries = malloc(nr_allocated *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name: try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (!printed_invalid_prefix) {
				pr_debug("Skipping unknown syscalls: ");
				printed_invalid_prefix = true;
			} else {
				pr_debug(", ");
			}

			pr_debug("%s", sc);
			continue;
		}
matches:
		trace->ev_qualifier_ids.entries[nr_used++] = id;
		if (match_next == -1)
			continue;

		/* The glob matched once; collect every further id it matches. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == nr_used) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.entries[nr_used++] = id;
		}
	}

	trace->ev_qualifier_ids.nr = nr_used;
	/* Keep sorted so trace__syscall_enabled() can bsearch(). */
	qsort(trace->ev_qualifier_ids.entries, nr_used, sizeof(int), intcmp);
out:
	if (printed_invalid_prefix)
		pr_debug("\n");
	return err;
out_free:
	zfree(&trace->ev_qualifier_ids.entries);
	trace->ev_qualifier_ids.nr = 0;
	goto out;
}
2248
2249static __maybe_unused bool trace__syscall_enabled(struct trace *trace, int id)
2250{
2251 bool in_ev_qualifier;
2252
2253 if (trace->ev_qualifier_ids.nr == 0)
2254 return true;
2255
2256 in_ev_qualifier = bsearch(&id, trace->ev_qualifier_ids.entries,
2257 trace->ev_qualifier_ids.nr, sizeof(int), intcmp) != NULL;
2258
2259 if (in_ev_qualifier)
2260 return !trace->not_ev_qualifier;
2261
2262 return trace->not_ev_qualifier;
2263}
2264
2265/*
2266 * args is to be interpreted as a series of longs but we need to handle
2267 * 8-byte unaligned accesses. args points to raw_data within the event
2268 * and raw_data is guaranteed to be 8-byte unaligned because it is
2269 * preceded by raw_size which is a u32. So we need to copy args to a temp
2270 * variable to read it. Most notably this avoids extended load instructions
2271 * on unaligned addresses
2272 */
2273unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
2274{
2275 unsigned long val;
2276 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
2277
2278 memcpy(&val, p, sizeof(val));
2279 return val;
2280}
2281
2282static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
2283 struct syscall_arg *arg)
2284{
2285 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
2286 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
2287
2288 return scnprintf(bf, size, "arg%d: ", arg->idx);
2289}
2290
2291/*
2292 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
2293 * as mount 'flags' argument that needs ignoring some magic flag, see comment
2294 * in tools/perf/trace/beauty/mount_flags.c
2295 */
2296static unsigned long syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsigned long val)
2297{
2298 if (fmt && fmt->mask_val)
2299 return fmt->mask_val(arg, val);
2300
2301 return val;
2302}
2303
2304static size_t syscall_arg_fmt__scnprintf_val(struct syscall_arg_fmt *fmt, char *bf, size_t size,
2305 struct syscall_arg *arg, unsigned long val)
2306{
2307 if (fmt && fmt->scnprintf) {
2308 arg->val = val;
2309 if (fmt->parm)
2310 arg->parm = fmt->parm;
2311 return fmt->scnprintf(bf, size, arg);
2312 }
2313 return scnprintf(bf, size, "%ld", val);
2314}
2315
/*
 * Format all of syscall @sc's arguments from the raw @args payload into @bf,
 * using the per-arg formatters, BTF pretty-printing when forced or when no
 * better formatter exists, and the augmented (BPF-copied) payload for
 * pointer contents. Zero-valued args are suppressed unless show_zeros is
 * set, the arg's show_zero property is set, or the arg has a BTF enum type.
 * Returns the number of characters written to @bf.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, void *augmented_args, int augmented_args_size,
				      struct trace *trace, struct thread *thread)
{
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;	/* mirrors arg.idx for the arg.mask skip test */
	struct syscall_arg arg = {
		.args = args,
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx = 0,
		.mask = 0,
		.trace = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};
	struct thread_trace *ttrace = thread__priv(thread);
	void *default_scnprintf;

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct tep_format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Already consumed by a previous arg's formatter? */
			if (arg.mask & bit)
				continue;

			arg.fmt = &sc->arg_fmt[arg.idx];
			val = syscall_arg__val(&arg, arg.idx);
			/*
			 * Some syscall args need some mask, most don't and
			 * return val untouched.
			 */
			val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);

			/*
			 * Suppress this argument if its value is zero and show_zero
			 * property isn't set.
			 *
			 * If it has a BTF type, then override the zero suppression knob
			 * as the common case is for zero in an enum to have an associated entry.
			 */
			if (val == 0 && !trace->show_zeros &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) &&
			    !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE))
				continue;

			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

			if (trace->show_arg_names)
				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

			default_scnprintf = sc->arg_fmt[arg.idx].scnprintf;

			/* Try BTF first when forced, or when no specific formatter exists. */
			if (trace->force_btf || default_scnprintf == NULL || default_scnprintf == SCA_PTR) {
				btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed,
								   size - printed, val, field->type);
				if (btf_printed) {
					printed += btf_printed;
					continue;
				}
			}

			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
								  bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
2414
/*
 * Signature of per-event sample handlers (e.g. trace__sys_enter(),
 * trace__sys_exit()) invoked by the sample dispatching code.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
2418
/*
 * Return the struct syscall for @id, reading its info lazily on first use
 * via trace__read_syscall_info(). Returns NULL for invalid ids, ids beyond
 * the table, or syscalls whose tracepoint format couldn't be read, printing
 * diagnostics at the appropriate verbosity levels.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct evsel *evsel, int id)
{
	int err = 0;

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, evsel__name(evsel), ++n);
		}
		return NULL;
	}

	err = -EINVAL;

	/*
	 * Careful: the two preprocessor branches below share the closing
	 * brace. With HAVE_SYSCALL_TABLE_SUPPORT the #else body (including
	 * the 'if (err)') is compiled out, so out-of-range ids jump straight
	 * to out_cant_read.
	 */
#ifdef HAVE_SYSCALL_TABLE_SUPPORT
	if (id > trace->sctbl->syscalls.max_id) {
#else
	if (id >= trace->sctbl->syscalls.max_id) {
		/*
		 * With libaudit we don't know beforehand what is the max_id,
		 * so we let trace__read_syscall_info() figure that out as we
		 * go on reading syscalls.
		 */
		err = trace__read_syscall_info(trace, id);
		if (err)
#endif
			goto out_cant_read;
	}

	if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
	    (err = trace__read_syscall_info(trace, id)) != 0)
		goto out_cant_read;

	if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		char sbuf[STRERR_BUFSIZE];
		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
		/* NOTE(review): assumes syscalls.table != NULL whenever id <= max_id — confirm for early allocation-failure paths */
		if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
2480
/* Per-(thread, syscall id) accounting used by --summary / --errno-summary. */
struct syscall_stats {
	struct stats stats;	/* duration samples, fed via update_stats() */
	u64 nr_failures;	/* calls that returned a negative value */
	int max_errno;		/* number of slots in errnos[] */
	u32 *errnos;		/* hit count per errno, indexed by errno - 1 */
};
2487
/*
 * Account one completed syscall for --summary: update the per-(thread,
 * syscall id) duration stats and, when @errno_summary is set and the call
 * failed, bump the counter for its errno, growing the errnos[] array on
 * demand. Allocation failures drop the sample (with a debug message for the
 * errno array case).
 */
static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace,
				 int id, struct perf_sample *sample, long err, bool errno_summary)
{
	struct int_node *inode;
	struct syscall_stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		/* First sample for this syscall on this thread. */
		stats = zalloc(sizeof(*stats));
		if (stats == NULL)
			return;

		init_stats(&stats->stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(&stats->stats, duration);

	if (err < 0) {
		++stats->nr_failures;

		if (!errno_summary)
			return;

		err = -err;
		if (err > stats->max_errno) {
			/* Grow errnos[] to cover this errno; zero the new slots. */
			u32 *new_errnos = realloc(stats->errnos, err * sizeof(u32));

			if (new_errnos) {
				memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32));
			} else {
				pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n",
					 thread__comm_str(thread), thread__pid(thread),
					 thread__tid(thread));
				return;
			}

			stats->errnos = new_errnos;
			stats->max_errno = err;
		}

		++stats->errnos[err - 1];
	}
}
2540
2541static int trace__printf_interrupted_entry(struct trace *trace)
2542{
2543 struct thread_trace *ttrace;
2544 size_t printed;
2545 int len;
2546
2547 if (trace->failure_only || trace->current == NULL)
2548 return 0;
2549
2550 ttrace = thread__priv(trace->current);
2551
2552 if (!ttrace->entry_pending)
2553 return 0;
2554
2555 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
2556 printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
2557
2558 if (len < trace->args_alignment - 4)
2559 printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
2560
2561 printed += fprintf(trace->output, " ...\n");
2562
2563 ttrace->entry_pending = false;
2564 ++trace->nr_events_printed;
2565
2566 return printed;
2567}
2568
2569static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel,
2570 struct perf_sample *sample, struct thread *thread)
2571{
2572 int printed = 0;
2573
2574 if (trace->print_sample) {
2575 double ts = (double)sample->time / NSEC_PER_MSEC;
2576
2577 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
2578 evsel__name(evsel), ts,
2579 thread__comm_str(thread),
2580 sample->pid, sample->tid, sample->cpu);
2581 }
2582
2583 return printed;
2584}
2585
2586static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
2587{
2588 void *augmented_args = NULL;
2589 /*
2590 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
2591 * and there we get all 6 syscall args plus the tracepoint common fields
2592 * that gets calculated at the start and the syscall_nr (another long).
2593 * So we check if that is the case and if so don't look after the
2594 * sc->args_size but always after the full raw_syscalls:sys_enter payload,
2595 * which is fixed.
2596 *
2597 * We'll revisit this later to pass s->args_size to the BPF augmenter
2598 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
2599 * copies only what we need for each syscall, like what happens when we
2600 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
2601 * traffic to just what is needed for each syscall.
2602 */
2603 int args_size = raw_augmented_args_size ?: sc->args_size;
2604
2605 *augmented_args_size = sample->raw_size - args_size;
2606 if (*augmented_args_size > 0)
2607 augmented_args = sample->raw_data + args_size;
2608
2609 return augmented_args;
2610}
2611
2612static void syscall__exit(struct syscall *sc)
2613{
2614 if (!sc)
2615 return;
2616
2617 zfree(&sc->arg_fmt);
2618}
2619
/*
 * Handle a sys_enter sample: format "name(args" into the thread's
 * entry_str. The line is not printed yet — trace__sys_exit() completes it
 * with the return value — except for exit-like syscalls that never return,
 * which are printed immediately with "= ?". A previously pending entry from
 * another interleaved event is flushed first. Returns 0 on success, -1 when
 * the syscall or per-thread state can't be obtained.
 */
static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	int printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	int augmented_args_size = 0;
	void *augmented_args = NULL;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer holding the pending entry line. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);
	/*
	 * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
	 * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
	 * this breaks syscall__augmented_args() check for augmented args, as we calculate
	 * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
	 * so when handling, say the openat syscall, we end up getting 6 args for the
	 * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
	 * thinking that the extra 2 u64 args are the augmented filename, so just check
	 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
	 */
	if (evsel != trace->syscalls.events.sys_enter)
		augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, augmented_args, augmented_args_size, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() never return: print "name(args)= ?" now. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			int alignment = 0;

			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			printed = fprintf(trace->output, "%s)", ttrace->entry_str);
			if (trace->args_alignment > printed)
				alignment = trace->args_alignment - printed;
			fprintf(trace->output, "%*s= ?\n", alignment, " ");
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2698
/*
 * Format and print a sys_enter sample's arguments directly to trace->output,
 * without the pending-entry machinery of trace__sys_enter(). Returns 0 on
 * success, -1 when the syscall or per-thread state can't be obtained.
 */
static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args, *augmented_args = NULL;
	int augmented_args_size;	/* set by syscall__augmented_args() */
	size_t printed = 0;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
	printed += syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
	fprintf(trace->output, "%.*s", (int)printed, msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2732
2733static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
2734 struct perf_sample *sample,
2735 struct callchain_cursor *cursor)
2736{
2737 struct addr_location al;
2738 int max_stack = evsel->core.attr.sample_max_stack ?
2739 evsel->core.attr.sample_max_stack :
2740 trace->max_stack;
2741 int err = -1;
2742
2743 addr_location__init(&al);
2744 if (machine__resolve(trace->host, &al, sample) < 0)
2745 goto out;
2746
2747 err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2748out:
2749 addr_location__exit(&al);
2750 return err;
2751}
2752
2753static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2754{
2755 /* TODO: user-configurable print_opts */
2756 const unsigned int print_opts = EVSEL__PRINT_SYM |
2757 EVSEL__PRINT_DSO |
2758 EVSEL__PRINT_UNKNOWN_AS_ADDR;
2759
2760 return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
2761}
2762
/* Map an errno value to its name ("ENOENT", ...) for the evsel's arch. */
static const char *errno_to_name(struct evsel *evsel, int err)
{
	return perf_env__arch_strerrno(evsel__env(evsel), err);
}
2769
/*
 * Handle a sys_exit sample: complete the pending "name(args" line for the
 * thread (or print a "... [continued]: name()" stub when the entry wasn't
 * seen), compute the syscall duration, apply duration filtering, update
 * summary stats, map freshly opened fds to paths, pretty-print the return
 * value and optionally the callchain. Returns 0 on success, -1 when the
 * syscall or per-thread state can't be obtained.
 */
static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
	int alignment = trace->args_alignment;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	if (trace->summary)
		thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary);

	/* open()/openat() succeeded: remember the fd -> path mapping. */
	if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		printed = fprintf(trace->output, "%s", ttrace->entry_str);
	} else {
		/* The matching sys_enter wasn't seen (e.g. attach mid-syscall). */
		printed += fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		printed += 9;
		printed += fprintf(trace->output, "]: %s()", sc->name);
	}

	printed++; /* the closing ')' */

	if (alignment > printed)
		alignment -= printed;
	else
		alignment = 0;

	fprintf(trace->output, ")%*s= ", alignment, " ");

	/*
	 * Return value formatting: note the label-into-else-if flow below —
	 * signed_print and errno_print are jumped to from the sc->fmt == NULL
	 * branch and from the final else.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, "%ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, "-1 %s (%s)", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, "0 (Timeout)");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter installed at enter time (e.g. by fcntl). */
		char bf[1024];
		struct syscall_arg arg = {
			.val = ret,
			.thread = thread,
			.trace = trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, "%s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, "%#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (e.g. wait4/fork): show its comm too. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, "%ld", ret);
			if (thread__comm_set(child))
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	/*
	 * We only consider an 'event' for the sake of --max-events a non-filtered
	 * sys_enter + sys_exit and other tracepoint events.
	 */
	if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2905
/*
 * Handle a vfs_getname probe sample: remember the pathname being resolved so
 * a successful open()/openat() return can map the new fd to it (see
 * trace__sys_exit()), and splice the pathname into the thread's pending
 * entry string at the position recorded in ttrace->filename.entry_str_pos,
 * truncating the filename (keeping its tail) when the buffer is short.
 * Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the cached name buffer when this filename doesn't fit. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No pointer placeholder waiting in the entry string? Done. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the filename when it doesn't fit whole. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at entry_str_pos and copy the filename into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
2966
/*
 * Accumulate per-thread and global on-CPU time from sched:sched_stat_runtime
 * samples (shown in the summary). When per-thread state can't be obtained,
 * dump the raw event fields to trace->output instead. Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
		evsel->name,
		evsel__strval(evsel, sample, "comm"),
		(pid_t)evsel__intval(evsel, sample, "pid"),
		runtime,
		evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
2996
2997static int bpf_output__printer(enum binary_printer_ops op,
2998 unsigned int val, void *extra __maybe_unused, FILE *fp)
2999{
3000 unsigned char ch = (unsigned char)val;
3001
3002 switch (op) {
3003 case BINARY_PRINT_CHAR_DATA:
3004 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
3005 case BINARY_PRINT_DATA_BEGIN:
3006 case BINARY_PRINT_LINE_BEGIN:
3007 case BINARY_PRINT_ADDR:
3008 case BINARY_PRINT_NUM_DATA:
3009 case BINARY_PRINT_NUM_PAD:
3010 case BINARY_PRINT_SEP:
3011 case BINARY_PRINT_CHAR_PAD:
3012 case BINARY_PRINT_LINE_END:
3013 case BINARY_PRINT_DATA_END:
3014 default:
3015 break;
3016 }
3017
3018 return 0;
3019}
3020
/*
 * Dump a BPF output event's raw payload to trace->output, 8 bytes per line,
 * using bpf_output__printer() (printable chars or '.'), and account for it
 * in the printed-events counter used by --max-events.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
	++trace->nr_events_printed;
}
3028
/*
 * Format all fields of a tracepoint sample into a local buffer and write it
 * to trace->output, reusing the syscall argument beautifiers (and optional
 * BTF pretty-printing) for each field.
 *
 * Returns the number of characters produced (buffer + output), following the
 * fprintf-style convention used by the other trace__fprintf_* helpers.
 */
static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, struct perf_sample *sample,
				       struct thread *thread, void *augmented_args, int augmented_args_size)
{
	char bf[2048];
	size_t size = sizeof(bf);
	struct tep_format_field *field = evsel->tp_format->format.fields;
	struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel);
	size_t printed = 0, btf_printed;
	unsigned long val;
	u8 bit = 1;	/* one bit per field, consumed against syscall_arg.mask */
	struct syscall_arg syscall_arg = {
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};

	/* Walk format fields and their arg formatters in lockstep. */
	for (; field && arg; field = field->next, ++syscall_arg.idx, bit <<= 1, ++arg) {
		/* A previous beautifier may have already consumed this field. */
		if (syscall_arg.mask & bit)
			continue;

		syscall_arg.len = 0;
		syscall_arg.fmt = arg;
		if (field->flags & TEP_FIELD_IS_ARRAY) {
			int offset = field->offset;

			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
				/* __data_loc encoding: high 16 bits = len, low 16 bits = offset. */
				offset = format_field__intval(field, sample, evsel->needs_swap);
				syscall_arg.len = offset >> 16;
				offset &= 0xffff;
				if (tep_field_is_relative(field->flags))
					offset += field->offset + field->size;
			}

			/* Arrays are passed as a pointer into the raw payload. */
			val = (uintptr_t)(sample->raw_data + offset);
		} else
			val = format_field__intval(field, sample, evsel->needs_swap);
		/*
		 * Some syscall args need some mask, most don't and
		 * return val untouched.
		 */
		val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);

		/* Suppress this argument if its value is zero and show_zero property isn't set. */
		if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE)
			continue;

		/* Separate from the previous printed field, if any. */
		printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

		if (trace->show_arg_names)
			printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

		/* Prefer BTF-based pretty-printing when type info is available. */
		btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type);
		if (btf_printed) {
			printed += btf_printed;
			continue;
		}

		printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
	}

	return printed + fprintf(trace->output, "%.*s", (int)printed, bf);
}
3097
/*
 * Generic handler for non-raw_syscalls events (--event): prints a timestamped
 * strace-like line with the event name and its fields, optionally followed by
 * a callchain, honoring the per-evsel --max-events limit.
 */
static int trace__event_handler(struct trace *trace, struct evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	struct thread *thread;
	int callchain_ret = 0;

	/* Per-evsel --max-events limit already reached: nothing to do. */
	if (evsel->nr_events_printed >= evsel->max_events)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			/* Skip events whose stacks are shallower than --min-stack. */
			if (cursor->nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with syscall lines, which show a duration. */
	if (trace->trace_syscalls && trace->show_duration)
		fprintf(trace->output, "( ): ");

	if (thread)
		trace__fprintf_comm_tid(trace, thread, trace->output);

	if (evsel == trace->syscalls.events.bpf_output) {
		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
		struct syscall *sc = trace__syscall_info(trace, evsel, id);

		/* Augmented syscall payload: print it like a sys_enter line. */
		if (sc) {
			fprintf(trace->output, "%s(", sc->name);
			trace__fprintf_sys_enter(trace, evsel, sample);
			fputc(')', trace->output);
			goto newline;
		}

		/*
		 * XXX: Not having the associated syscall info or not finding/adding
		 * the thread should never happen, but if it does...
		 * fall thru and print it as a bpf_output event.
		 */
	}

	fprintf(trace->output, "%s(", evsel->name);

	if (evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/*
		 * sys_enter_* tracepoints get the syscall-args beautifier first;
		 * everything else (or a failed beautify) is printed field by field.
		 */
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			if (trace->libtraceevent_print) {
				event_format__fprintf(evsel->tp_format, sample->cpu,
						      sample->raw_data, sample->raw_size,
						      trace->output);
			} else {
				trace__fprintf_tp_fields(trace, evsel, sample, thread, NULL, 0);
			}
		}
	}

newline:
	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));

	++trace->nr_events_printed;

	/* Limit reached for this evsel: stop producing events for it. */
	if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
		evsel__disable(evsel);
		evsel__close(evsel);
	}
out:
	thread__put(thread);
	return 0;
}
3183
3184static void print_location(FILE *f, struct perf_sample *sample,
3185 struct addr_location *al,
3186 bool print_dso, bool print_sym)
3187{
3188
3189 if ((verbose > 0 || print_dso) && al->map)
3190 fprintf(f, "%s@", dso__long_name(map__dso(al->map)));
3191
3192 if ((verbose > 0 || print_sym) && al->sym)
3193 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
3194 al->addr - al->sym->start);
3195 else if (al->map)
3196 fprintf(f, "0x%" PRIx64, al->addr);
3197 else
3198 fprintf(f, "0x%" PRIx64, sample->addr);
3199}
3200
/*
 * Handler for page fault software events (--pf maj/min/all): counts per-thread
 * major/minor faults and, unless --summary-only, prints the faulting IP and
 * target address with their resolved symbols.
 */
static int trace__pgfault(struct trace *trace,
			  struct evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata, 'x'ecutable or '?' unknown */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	addr_location__init(&al);
	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		struct callchain_cursor *cursor = get_tls_callchain_cursor();

		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event. */
			if (cursor->nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	/* Counts updated; with --summary-only there is nothing to print. */
	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction pointer. */
	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the faulting data address. */
	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) {
		/*
		 * NOTE(review): this retries the exact same lookup as above, so
		 * al.map can't change and map_type seemingly always ends up '?'
		 * here; looks like a leftover from when function and variable
		 * maps were looked up separately — confirm upstream intent.
		 */
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));

	++trace->nr_events_printed;
out:
	err = 0;
out_put:
	thread__put(thread);
	addr_location__exit(&al);
	return err;
}
3279
3280static void trace__set_base_time(struct trace *trace,
3281 struct evsel *evsel,
3282 struct perf_sample *sample)
3283{
3284 /*
3285 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
3286 * and don't use sample->time unconditionally, we may end up having
3287 * some other event in the future without PERF_SAMPLE_TIME for good
3288 * reason, i.e. we may not be interested in its timestamps, just in
3289 * it taking place, picking some piece of information when it
3290 * appears in our event stream (vfs_getname comes to mind).
3291 */
3292 if (trace->base_time == 0 && !trace->full_time &&
3293 (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
3294 trace->base_time = sample->time;
3295}
3296
3297static int trace__process_sample(const struct perf_tool *tool,
3298 union perf_event *event,
3299 struct perf_sample *sample,
3300 struct evsel *evsel,
3301 struct machine *machine __maybe_unused)
3302{
3303 struct trace *trace = container_of(tool, struct trace, tool);
3304 struct thread *thread;
3305 int err = 0;
3306
3307 tracepoint_handler handler = evsel->handler;
3308
3309 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
3310 if (thread && thread__is_filtered(thread))
3311 goto out;
3312
3313 trace__set_base_time(trace, evsel, sample);
3314
3315 if (handler) {
3316 ++trace->nr_events;
3317 handler(trace, evsel, event, sample);
3318 }
3319out:
3320 thread__put(thread);
3321 return err;
3322}
3323
/*
 * Implements 'perf trace record': builds an argv for 'perf record' with the
 * raw_syscalls (or legacy syscalls) tracepoints, optional page fault events,
 * a filter excluding our own pid, and the user's extra arguments, then hands
 * off to cmd_record().
 */
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};
	pid_t pid = getpid();
	/* Filter out our own pid so we don't trace ourselves recording. */
	char *filter = asprintf__tp_filter_pids(1, &pid);
	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
	int err = -1;

	/* +3 is for the event string below and the pid filter */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 3 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL || filter == NULL)
		goto out_free;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			goto out_free;
		}
	}

	rec_argv[j++] = "--filter";
	rec_argv[j++] = filter;

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	/* Append the user's own arguments verbatim. */
	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	err = cmd_record(j, rec_argv);
out_free:
	free(filter);
	free(rec_argv);
	return err;
}
3391
3392static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
3393
3394static bool evlist__add_vfs_getname(struct evlist *evlist)
3395{
3396 bool found = false;
3397 struct evsel *evsel, *tmp;
3398 struct parse_events_error err;
3399 int ret;
3400
3401 parse_events_error__init(&err);
3402 ret = parse_events(evlist, "probe:vfs_getname*", &err);
3403 parse_events_error__exit(&err);
3404 if (ret)
3405 return false;
3406
3407 evlist__for_each_entry_safe(evlist, evsel, tmp) {
3408 if (!strstarts(evsel__name(evsel), "probe:vfs_getname"))
3409 continue;
3410
3411 if (evsel__field(evsel, "pathname")) {
3412 evsel->handler = trace__vfs_getname;
3413 found = true;
3414 continue;
3415 }
3416
3417 list_del_init(&evsel->core.node);
3418 evsel->evlist = NULL;
3419 evsel__delete(evsel);
3420 }
3421
3422 return found;
3423}
3424
3425static struct evsel *evsel__new_pgfault(u64 config)
3426{
3427 struct evsel *evsel;
3428 struct perf_event_attr attr = {
3429 .type = PERF_TYPE_SOFTWARE,
3430 .mmap_data = 1,
3431 };
3432
3433 attr.config = config;
3434 attr.sample_period = 1;
3435
3436 event_attr_init(&attr);
3437
3438 evsel = evsel__new(&attr);
3439 if (evsel)
3440 evsel->handler = trace__pgfault;
3441
3442 return evsel;
3443}
3444
3445static void evlist__free_syscall_tp_fields(struct evlist *evlist)
3446{
3447 struct evsel *evsel;
3448
3449 evlist__for_each_entry(evlist, evsel) {
3450 evsel_trace__delete(evsel->priv);
3451 evsel->priv = NULL;
3452 }
3453}
3454
/*
 * Dispatch one event from the ring buffer: non-sample records go to the
 * generic machinery, samples are routed to their evsel's handler, and the
 * global --max-events budget is enforced afterwards.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		/* fork/exit/mmap/etc.: thread bookkeeping, not user output */
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	/* --switch-on/--switch-off gating */
	if (evswitch__discard(&trace->evswitch, evsel))
		return;

	/* Must happen before the handler so durations are relative to it. */
	trace__set_base_time(trace, evsel, sample);

	if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
			evsel__name(evsel), sample->tid,
			sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}

	/* Global --max-events budget exhausted: stop the main loop. */
	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;
}
3489
/*
 * Set up the raw_syscalls:sys_enter/sys_exit tracepoint evsels, wire their
 * handlers and the fields they need (args pointer, return value), configure
 * callchains, and add them to the evlist.  Returns 0 on success, -1 on
 * failure (with any partially-created evsels cleaned up).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct evlist *evlist = trace->evlist;
	struct evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" payload field for fast access. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Cache the offset of the "ret" field likewise. */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	evlist__add(evlist, sys_enter);
	evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->core.attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	evsel__delete_priv(sys_enter);
	goto out;
}
3538
3539static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
3540{
3541 int err = -1;
3542 struct evsel *sys_exit;
3543 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
3544 trace->ev_qualifier_ids.nr,
3545 trace->ev_qualifier_ids.entries);
3546
3547 if (filter == NULL)
3548 goto out_enomem;
3549
3550 if (!evsel__append_tp_filter(trace->syscalls.events.sys_enter, filter)) {
3551 sys_exit = trace->syscalls.events.sys_exit;
3552 err = evsel__append_tp_filter(sys_exit, filter);
3553 }
3554
3555 free(filter);
3556out:
3557 return err;
3558out_enomem:
3559 errno = ENOMEM;
3560 goto out;
3561}
3562
3563#ifdef HAVE_BPF_SKEL
/*
 * Look up 'type' in the given BTF and cache the resulting btf_type and id in
 * the arg format.  Returns 0 on success, -1 if the type is already cached or
 * the name is not found in BTF.
 */
static int syscall_arg_fmt__cache_btf_struct(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type)
{
	int id;

	/* Already resolved for this arg: nothing to do. */
	if (arg_fmt->type != NULL)
		return -1;

	id = btf__find_by_name(btf, type);
	if (id < 0)
		return -1;

	arg_fmt->type = btf__type_by_id(btf, id);
	arg_fmt->type_id = id;

	return 0;
}
3580
3581static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
3582{
3583 struct bpf_program *pos, *prog = NULL;
3584 const char *sec_name;
3585
3586 if (trace->skel->obj == NULL)
3587 return NULL;
3588
3589 bpf_object__for_each_program(pos, trace->skel->obj) {
3590 sec_name = bpf_program__section_name(pos);
3591 if (sec_name && !strcmp(sec_name, name)) {
3592 prog = pos;
3593 break;
3594 }
3595 }
3596
3597 return prog;
3598}
3599
3600static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
3601 const char *prog_name, const char *type)
3602{
3603 struct bpf_program *prog;
3604
3605 if (prog_name == NULL) {
3606 char default_prog_name[256];
3607 scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name);
3608 prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3609 if (prog != NULL)
3610 goto out_found;
3611 if (sc->fmt && sc->fmt->alias) {
3612 scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
3613 prog = trace__find_bpf_program_by_title(trace, default_prog_name);
3614 if (prog != NULL)
3615 goto out_found;
3616 }
3617 goto out_unaugmented;
3618 }
3619
3620 prog = trace__find_bpf_program_by_title(trace, prog_name);
3621
3622 if (prog != NULL) {
3623out_found:
3624 return prog;
3625 }
3626
3627 pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
3628 prog_name, type, sc->name);
3629out_unaugmented:
3630 return trace->skel->progs.syscall_unaugmented;
3631}
3632
3633static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
3634{
3635 struct syscall *sc = trace__syscall_info(trace, NULL, id);
3636
3637 if (sc == NULL)
3638 return;
3639
3640 sc->bpf_prog.sys_enter = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_enter : NULL, "enter");
3641 sc->bpf_prog.sys_exit = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit : NULL, "exit");
3642}
3643
3644static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
3645{
3646 struct syscall *sc = trace__syscall_info(trace, NULL, id);
3647 return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3648}
3649
3650static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
3651{
3652 struct syscall *sc = trace__syscall_info(trace, NULL, id);
3653 return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
3654}
3655
3656static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int key, unsigned int *beauty_array)
3657{
3658 struct tep_format_field *field;
3659 struct syscall *sc = trace__syscall_info(trace, NULL, key);
3660 const struct btf_type *bt;
3661 char *struct_offset, *tmp, name[32];
3662 bool can_augment = false;
3663 int i, cnt;
3664
3665 if (sc == NULL)
3666 return -1;
3667
3668 trace__load_vmlinux_btf(trace);
3669 if (trace->btf == NULL)
3670 return -1;
3671
3672 for (i = 0, field = sc->args; field; ++i, field = field->next) {
3673 // XXX We're only collecting pointer payloads _from_ user space
3674 if (!sc->arg_fmt[i].from_user)
3675 continue;
3676
3677 struct_offset = strstr(field->type, "struct ");
3678 if (struct_offset == NULL)
3679 struct_offset = strstr(field->type, "union ");
3680 else
3681 struct_offset++; // "union" is shorter
3682
3683 if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */
3684 struct_offset += 6;
3685
3686 /* for 'struct foo *', we only want 'foo' */
3687 for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) {
3688 }
3689
3690 strncpy(name, struct_offset, cnt);
3691 name[cnt] = '\0';
3692
3693 /* cache struct's btf_type and type_id */
3694 if (syscall_arg_fmt__cache_btf_struct(&sc->arg_fmt[i], trace->btf, name))
3695 continue;
3696
3697 bt = sc->arg_fmt[i].type;
3698 beauty_array[i] = bt->size;
3699 can_augment = true;
3700 } else if (field->flags & TEP_FIELD_IS_POINTER && /* string */
3701 strcmp(field->type, "const char *") == 0 &&
3702 (strstr(field->name, "name") ||
3703 strstr(field->name, "path") ||
3704 strstr(field->name, "file") ||
3705 strstr(field->name, "root") ||
3706 strstr(field->name, "key") ||
3707 strstr(field->name, "special") ||
3708 strstr(field->name, "type") ||
3709 strstr(field->name, "description"))) {
3710 beauty_array[i] = 1;
3711 can_augment = true;
3712 } else if (field->flags & TEP_FIELD_IS_POINTER && /* buffer */
3713 strstr(field->type, "char *") &&
3714 (strstr(field->name, "buf") ||
3715 strstr(field->name, "val") ||
3716 strstr(field->name, "msg"))) {
3717 int j;
3718 struct tep_format_field *field_tmp;
3719
3720 /* find the size of the buffer that appears in pairs with buf */
3721 for (j = 0, field_tmp = sc->args; field_tmp; ++j, field_tmp = field_tmp->next) {
3722 if (!(field_tmp->flags & TEP_FIELD_IS_POINTER) && /* only integers */
3723 (strstr(field_tmp->name, "count") ||
3724 strstr(field_tmp->name, "siz") || /* size, bufsiz */
3725 (strstr(field_tmp->name, "len") && strcmp(field_tmp->name, "filename")))) {
3726 /* filename's got 'len' in it, we don't want that */
3727 beauty_array[i] = -(j + 1);
3728 can_augment = true;
3729 break;
3730 }
3731 }
3732 }
3733 }
3734
3735 if (can_augment)
3736 return 0;
3737
3738 return -1;
3739}
3740
/*
 * For a syscall 'sc' that has pointer args but no dedicated augmenter, scan
 * the syscall table for another syscall whose signature matches closely
 * enough (same pointer positions and types, no extra pointers) that its
 * sys_enter augmenter can be safely reused.  Returns the reusable program or
 * NULL if no candidate qualifies.
 */
static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
{
	struct tep_format_field *field, *candidate_field;
	/*
	 * We're only interested in syscalls that have a pointer:
	 */
	for (field = sc->args; field; field = field->next) {
		if (field->flags & TEP_FIELD_IS_POINTER)
			goto try_to_find_pair;
	}

	return NULL;

try_to_find_pair:
	for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
		int id = syscalltbl__id_at_idx(trace->sctbl, i);
		struct syscall *pair = trace__syscall_info(trace, NULL, id);
		struct bpf_program *pair_prog;
		bool is_candidate = false;

		/* Skip ourselves and syscalls without a real augmenter. */
		if (pair == NULL || pair == sc ||
		    pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
			continue;

		/* Compare the two signatures argument by argument. */
		for (field = sc->args, candidate_field = pair->args;
		     field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
			bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
			     candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;

			if (is_pointer) {
				if (!candidate_is_pointer) {
					// The candidate doesn't copy our pointer arg, but might copy other pointers we want.
					continue;
				}
			} else {
				if (candidate_is_pointer) {
					// The candidate might copy a pointer we don't have, skip it.
					goto next_candidate;
				}
				continue;
			}

			if (strcmp(field->type, candidate_field->type))
				goto next_candidate;

			/*
			 * This is limited in the BPF program but sys_write
			 * uses "const char *" for its "buf" arg so we need to
			 * use some heuristic that is kinda future proof...
			 */
			if (strcmp(field->type, "const char *") == 0 &&
			    !(strstr(field->name, "name") ||
			      strstr(field->name, "path") ||
			      strstr(field->name, "file") ||
			      strstr(field->name, "root") ||
			      strstr(field->name, "description")))
				goto next_candidate;

			is_candidate = true;
		}

		if (!is_candidate)
			goto next_candidate;

		/*
		 * Check if the tentative pair syscall augmenter has more pointers, if it has,
		 * then it may be collecting that and we then can't use it, as it would collect
		 * more than what is common to the two syscalls.
		 */
		if (candidate_field) {
			for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next)
				if (candidate_field->flags & TEP_FIELD_IS_POINTER)
					goto next_candidate;
		}

		pair_prog = pair->bpf_prog.sys_enter;
		/*
		 * If the pair isn't enabled, then its bpf_prog.sys_enter will not
		 * have been searched for, so search it here and if it returns the
		 * unaugmented one, then ignore it, otherwise we'll reuse that BPF
		 * program for a filtered syscall on a non-filtered one.
		 *
		 * For instance, we have "!syscalls:sys_enter_renameat" and that is
		 * useful for "renameat2".
		 */
		if (pair_prog == NULL) {
			pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
			if (pair_prog == trace->skel->progs.syscall_unaugmented)
				goto next_candidate;
		}

		pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
		return pair_prog;
	next_candidate:
		continue;
	}

	return NULL;
}
3840
3841static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
3842{
3843 int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
3844 int map_exit_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
3845 int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter);
3846 int err = 0;
3847 unsigned int beauty_array[6];
3848
3849 for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3850 int prog_fd, key = syscalltbl__id_at_idx(trace->sctbl, i);
3851
3852 if (!trace__syscall_enabled(trace, key))
3853 continue;
3854
3855 trace__init_syscall_bpf_progs(trace, key);
3856
3857 // It'll get at least the "!raw_syscalls:unaugmented"
3858 prog_fd = trace__bpf_prog_sys_enter_fd(trace, key);
3859 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3860 if (err)
3861 break;
3862 prog_fd = trace__bpf_prog_sys_exit_fd(trace, key);
3863 err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY);
3864 if (err)
3865 break;
3866
3867 /* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */
3868 memset(beauty_array, 0, sizeof(beauty_array));
3869 err = trace__bpf_sys_enter_beauty_map(trace, key, (unsigned int *)beauty_array);
3870 if (err)
3871 continue;
3872 err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY);
3873 if (err)
3874 break;
3875 }
3876
3877 /*
3878 * Now lets do a second pass looking for enabled syscalls without
3879 * an augmenter that have a signature that is a superset of another
3880 * syscall with an augmenter so that we can auto-reuse it.
3881 *
3882 * I.e. if we have an augmenter for the "open" syscall that has
3883 * this signature:
3884 *
3885 * int open(const char *pathname, int flags, mode_t mode);
3886 *
3887 * I.e. that will collect just the first string argument, then we
3888 * can reuse it for the 'creat' syscall, that has this signature:
3889 *
3890 * int creat(const char *pathname, mode_t mode);
3891 *
3892 * and for:
3893 *
3894 * int stat(const char *pathname, struct stat *statbuf);
3895 * int lstat(const char *pathname, struct stat *statbuf);
3896 *
3897 * Because the 'open' augmenter will collect the first arg as a string,
3898 * and leave alone all the other args, which already helps with
3899 * beautifying 'stat' and 'lstat''s pathname arg.
3900 *
3901 * Then, in time, when 'stat' gets an augmenter that collects both
3902 * first and second arg (this one on the raw_syscalls:sys_exit prog
3903 * array tail call, then that one will be used.
3904 */
3905 for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
3906 int key = syscalltbl__id_at_idx(trace->sctbl, i);
3907 struct syscall *sc = trace__syscall_info(trace, NULL, key);
3908 struct bpf_program *pair_prog;
3909 int prog_fd;
3910
3911 if (sc == NULL || sc->bpf_prog.sys_enter == NULL)
3912 continue;
3913
3914 /*
3915 * For now we're just reusing the sys_enter prog, and if it
3916 * already has an augmenter, we don't need to find one.
3917 */
3918 if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
3919 continue;
3920
3921 /*
3922 * Look at all the other syscalls for one that has a signature
3923 * that is close enough that we can share:
3924 */
3925 pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
3926 if (pair_prog == NULL)
3927 continue;
3928
3929 sc->bpf_prog.sys_enter = pair_prog;
3930
3931 /*
3932 * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter
3933 * with the fd for the program we're reusing:
3934 */
3935 prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
3936 err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY);
3937 if (err)
3938 break;
3939 }
3940
3941 return err;
3942}
3943#endif // HAVE_BPF_SKEL
3944
3945static int trace__set_ev_qualifier_filter(struct trace *trace)
3946{
3947 if (trace->syscalls.events.sys_enter)
3948 return trace__set_ev_qualifier_tp_filter(trace);
3949 return 0;
3950}
3951
3952static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
3953 size_t npids __maybe_unused, pid_t *pids __maybe_unused)
3954{
3955 int err = 0;
3956#ifdef HAVE_LIBBPF_SUPPORT
3957 bool value = true;
3958 int map_fd = bpf_map__fd(map);
3959 size_t i;
3960
3961 for (i = 0; i < npids; ++i) {
3962 err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
3963 if (err)
3964 break;
3965 }
3966#endif
3967 return err;
3968}
3969
3970static int trace__set_filter_loop_pids(struct trace *trace)
3971{
3972 unsigned int nr = 1, err;
3973 pid_t pids[32] = {
3974 getpid(),
3975 };
3976 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
3977
3978 while (thread && nr < ARRAY_SIZE(pids)) {
3979 struct thread *parent = machine__find_thread(trace->host,
3980 thread__ppid(thread),
3981 thread__ppid(thread));
3982
3983 if (parent == NULL)
3984 break;
3985
3986 if (!strcmp(thread__comm_str(parent), "sshd") ||
3987 strstarts(thread__comm_str(parent), "gnome-terminal")) {
3988 pids[nr++] = thread__tid(parent);
3989 break;
3990 }
3991 thread = parent;
3992 }
3993
3994 err = evlist__append_tp_filter_pids(trace->evlist, nr, pids);
3995 if (!err && trace->filter_pids.map)
3996 err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
3997
3998 return err;
3999}
4000
/*
 * Install the pid filters: either the explicit --filter-pids list (applied to
 * both the tracepoint filters and the BPF map) or, when tracing system-wide
 * with no target threads, the anti-feedback-loop set from
 * trace__set_filter_loop_pids().
 */
static int trace__set_filter_pids(struct trace *trace)
{
	int err = 0;
	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0) {
		err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
						    trace->filter_pids.entries);
		if (!err && trace->filter_pids.map) {
			err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
						       trace->filter_pids.entries);
		}
	} else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
		/* No specific target pid: system wide, filter ourselves out. */
		err = trace__set_filter_loop_pids(trace);
	}

	return err;
}
4023
4024static int __trace__deliver_event(struct trace *trace, union perf_event *event)
4025{
4026 struct evlist *evlist = trace->evlist;
4027 struct perf_sample sample;
4028 int err = evlist__parse_sample(evlist, event, &sample);
4029
4030 if (err)
4031 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
4032 else
4033 trace__handle_event(trace, event, &sample);
4034
4035 return 0;
4036}
4037
4038static int __trace__flush_events(struct trace *trace)
4039{
4040 u64 first = ordered_events__first_time(&trace->oe.data);
4041 u64 flush = trace->oe.last - NSEC_PER_SEC;
4042
4043 /* Is there some thing to flush.. */
4044 if (first && first < flush)
4045 return ordered_events__flush_time(&trace->oe.data, flush);
4046
4047 return 0;
4048}
4049
4050static int trace__flush_events(struct trace *trace)
4051{
4052 return !trace->sort_events ? 0 : __trace__flush_events(trace);
4053}
4054
4055static int trace__deliver_event(struct trace *trace, union perf_event *event)
4056{
4057 int err;
4058
4059 if (!trace->sort_events)
4060 return __trace__deliver_event(trace, event);
4061
4062 err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
4063 if (err && err != -1)
4064 return err;
4065
4066 err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL);
4067 if (err)
4068 return err;
4069
4070 return trace__flush_events(trace);
4071}
4072
/*
 * ordered_events delivery callback: hand a now time-sorted event to the
 * regular delivery path.
 */
static int ordered_events__deliver_event(struct ordered_events *oe,
					 struct ordered_event *event)
{
	struct trace *trace = container_of(oe, struct trace, oe.data);

	return __trace__deliver_event(trace, event->event);
}
4080
4081static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg,
4082 char **type)
4083{
4084 struct tep_format_field *field;
4085 struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel);
4086
4087 if (evsel->tp_format == NULL || fmt == NULL)
4088 return NULL;
4089
4090 for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt)
4091 if (strcmp(field->name, arg) == 0) {
4092 *type = field->type;
4093 return fmt;
4094 }
4095
4096 return NULL;
4097}
4098
4099static int trace__expand_filter(struct trace *trace, struct evsel *evsel)
4100{
4101 char *tok, *left = evsel->filter, *new_filter = evsel->filter;
4102
4103 while ((tok = strpbrk(left, "=<>!")) != NULL) {
4104 char *right = tok + 1, *right_end;
4105
4106 if (*right == '=')
4107 ++right;
4108
4109 while (isspace(*right))
4110 ++right;
4111
4112 if (*right == '\0')
4113 break;
4114
4115 while (!isalpha(*left))
4116 if (++left == tok) {
4117 /*
4118 * Bail out, can't find the name of the argument that is being
4119 * used in the filter, let it try to set this filter, will fail later.
4120 */
4121 return 0;
4122 }
4123
4124 right_end = right + 1;
4125 while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
4126 ++right_end;
4127
4128 if (isalpha(*right)) {
4129 struct syscall_arg_fmt *fmt;
4130 int left_size = tok - left,
4131 right_size = right_end - right;
4132 char arg[128], *type;
4133
4134 while (isspace(left[left_size - 1]))
4135 --left_size;
4136
4137 scnprintf(arg, sizeof(arg), "%.*s", left_size, left);
4138
4139 fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type);
4140 if (fmt == NULL) {
4141 pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
4142 arg, evsel->name, evsel->filter);
4143 return -1;
4144 }
4145
4146 pr_debug2("trying to expand \"%s\" \"%.*s\" \"%.*s\" -> ",
4147 arg, (int)(right - tok), tok, right_size, right);
4148
4149 if (fmt->strtoul) {
4150 u64 val;
4151 struct syscall_arg syscall_arg = {
4152 .trace = trace,
4153 .fmt = fmt,
4154 .type_name = type,
4155 .parm = fmt->parm,
4156 };
4157
4158 if (fmt->strtoul(right, right_size, &syscall_arg, &val)) {
4159 char *n, expansion[19];
4160 int expansion_lenght = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val);
4161 int expansion_offset = right - new_filter;
4162
4163 pr_debug("%s", expansion);
4164
4165 if (asprintf(&n, "%.*s%s%s", expansion_offset, new_filter, expansion, right_end) < 0) {
4166 pr_debug(" out of memory!\n");
4167 free(new_filter);
4168 return -1;
4169 }
4170 if (new_filter != evsel->filter)
4171 free(new_filter);
4172 left = n + expansion_offset + expansion_lenght;
4173 new_filter = n;
4174 } else {
4175 pr_err("\"%.*s\" not found for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4176 right_size, right, arg, evsel->name, evsel->filter);
4177 return -1;
4178 }
4179 } else {
4180 pr_err("No resolver (strtoul) for \"%s\" in \"%s\", can't set filter \"%s\"\n",
4181 arg, evsel->name, evsel->filter);
4182 return -1;
4183 }
4184
4185 pr_debug("\n");
4186 } else {
4187 left = right_end;
4188 }
4189 }
4190
4191 if (new_filter != evsel->filter) {
4192 pr_debug("New filter for %s: %s\n", evsel->name, new_filter);
4193 evsel__set_filter(evsel, new_filter);
4194 free(new_filter);
4195 }
4196
4197 return 0;
4198}
4199
4200static int trace__expand_filters(struct trace *trace, struct evsel **err_evsel)
4201{
4202 struct evlist *evlist = trace->evlist;
4203 struct evsel *evsel;
4204
4205 evlist__for_each_entry(evlist, evsel) {
4206 if (evsel->filter == NULL)
4207 continue;
4208
4209 if (trace__expand_filter(trace, evsel)) {
4210 *err_evsel = evsel;
4211 return -1;
4212 }
4213 }
4214
4215 return 0;
4216}
4217
/*
 * Live mode: set up the syscall/pagefault/sched events, fork and start the
 * workload when one was given on the command line, then consume the mmap'd
 * ring buffers until the workload exits or the user interrupts, optionally
 * printing a per-thread summary at the end.  Returns 0 or a negative error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct evlist *evlist = trace->evlist;
	struct evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (!trace->raw_augmented_syscalls) {
		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
			goto out_error_raw_syscalls;

		if (trace->trace_syscalls)
			trace->vfs_getname = evlist__add_vfs_getname(evlist);
	}

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		evlist__add(evlist, pgfault_min);
	}

	/* Enable ignoring missing threads when -u/-p option is defined. */
	trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;

	if (trace->sched &&
	    evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;
	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	evlist__config(evlist, &trace->opts, &callchain_param);

	if (forks) {
		err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
		workload_pid = evlist->workload.pid;
	}

	err = evlist__open(evlist);
	if (err < 0)
		goto out_error_open;
#ifdef HAVE_BPF_SKEL
	if (trace->syscalls.events.bpf_output) {
		struct perf_cpu cpu;

		/*
		 * Set up the __augmented_syscalls__ BPF map to hold for each
		 * CPU the bpf-output event's file descriptor.
		 */
		perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
			bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
					&cpu.cpu, sizeof(int),
					xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
						       cpu.cpu, 0),
					sizeof(__u32), BPF_ANY);
		}
	}

	if (trace->skel)
		trace->filter_pids.map = trace->skel->maps.pids_filtered;
#endif
	err = trace__set_filter_pids(trace);
	if (err < 0)
		goto out_error_mem;

#ifdef HAVE_BPF_SKEL
	if (trace->skel && trace->skel->progs.sys_enter)
		trace__init_syscalls_bpf_prog_array_maps(trace);
#endif

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		if (trace->syscalls.events.sys_exit) {
			pr_debug("event qualifier tracepoint filter: %s\n",
				 trace->syscalls.events.sys_exit->filter);
		}
	}

	/*
	 * If the "close" syscall is not traced, then we will not have the
	 * opportunity to, in syscall_arg__scnprintf_close_fd() invalidate the
	 * fd->pathname table and were ending up showing the last value set by
	 * syscalls opening a pathname and associating it with a descriptor or
	 * reading it from /proc/pid/fd/ in cases where that doesn't make
	 * sense.
	 *
	 * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is
	 * not in use.
	 */
	trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close"));

	err = trace__expand_filters(trace, &evsel);
	if (err)
		goto out_delete_evlist;
	err = evlist__apply_filters(evlist, &evsel, &trace->opts.target);
	if (err < 0)
		goto out_error_apply_filters;

	err = evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
		evlist__enable(evlist);

	if (forks)
		evlist__start_workload(evlist);

	/* --delay: let the workload run a bit before enabling the events. */
	if (trace->opts.target.initial_delay) {
		usleep(trace->opts.target.initial_delay * 1000);
		evlist__enable(evlist);
	}

	/* Show the tid in the output when more than one thread may be traced. */
	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
				  perf_thread_map__nr(evlist->core.threads) > 1 ||
				  evlist__first(evlist)->core.attr.inherit;

	/*
	 * Now that we already used evsel->core.attr to ask the kernel to setup the
	 * events, lets reuse evsel->core.attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->core.attr.sample_max_stack == 0)
			evsel->core.attr.sample_max_stack = trace->max_stack;
	}
	/* Main event loop: drain every ring buffer, then poll for more. */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		union perf_event *event;
		struct mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(&md->core) < 0)
			continue;

		while ((event = perf_mmap__read_event(&md->core)) != NULL) {
			++trace->nr_events;

			err = trace__deliver_event(trace, event);
			if (err)
				goto out_disable;

			perf_mmap__consume(&md->core);

			if (interrupted)
				goto out_disable;

			/* Session is done: stop producing, keep consuming what's left. */
			if (done && !draining) {
				evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(&md->core);
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && evlist__poll(evlist, timeout) > 0) {
			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
				draining = true;

			goto again;
		} else {
			if (trace__flush_events(trace))
				goto out_disable;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	evlist__disable(evlist);

	if (trace->sort_events)
		ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);
	evlist__free_syscall_tp_fields(evlist);
	evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The error labels below share 'errbuf', declared inside this brace block
 * so it is scoped to the error handling code only.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
4516
/*
 * Replay mode (perf trace -i perf.data): run a previously recorded session
 * through the same sys_enter/sys_exit/pgfault handlers as live mode.
 */
static int trace__replay(struct trace *trace)
{
	const struct evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data data = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct evsel *evsel;
	int err = -1;

	/* Route the session's events through perf trace's tool callbacks. */
	trace->tool.sample = trace__process_sample;
	trace->tool.mmap = perf_event__process_mmap;
	trace->tool.mmap2 = perf_event__process_mmap2;
	trace->tool.comm = perf_event__process_comm;
	trace->tool.exit = perf_event__process_exit;
	trace->tool.fork = perf_event__process_fork;
	trace->tool.attr = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id = perf_event__process_build_id;
	trace->tool.namespaces = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, &trace->tool);
	if (IS_ERR(session))
		return PTR_ERR(session);

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter");
	trace->syscalls.events.sys_enter = evsel;
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter");

	if (evsel &&
	    (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit");
	trace->syscalls.events.sys_exit = evsel;
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit");
	if (evsel &&
	    (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Dispatch page fault software events to the pgfault beautifier. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
4613
/* Banner printed once before the per-thread summary entries. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
4622
/*
 * Re-sort the per-thread syscall stats intlist into an rb-tree ordered by
 * total time spent ('msecs', via the a->msecs > b->msecs comparison) for
 * the summary table printed by thread__dump_stats().
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct syscall_stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct syscall_stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	/* total time = number of calls * average duration, in milliseconds */
	entry->msecs = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0;
}
4636
/*
 * Print one thread's syscall statistics table (calls, errors, total/min/avg/
 * max times and relative stddev), sorted by total time via the syscall_stats
 * resort rb-tree defined above.  Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct syscall_stats *stats = syscall_stats_entry->stats;
		if (stats) {
			double min = (double)(stats->stats.min) / NSEC_PER_MSEC;
			double max = (double)(stats->stats.max) / NSEC_PER_MSEC;
			double avg = avg_stats(&stats->stats);
			double pct;
			u64 n = (u64)stats->stats.n;

			/* stddev as a percentage of the average duration */
			pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
					   n, stats->nr_failures, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);

			/* With --errno-summary, break the failures down per errno. */
			if (trace->errno_summary && stats->nr_failures) {
				int e;

				for (e = 0; e < stats->max_errno; ++e) {
					if (stats->errnos[e] != 0)
						fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]);
				}
			}
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
4688
4689static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
4690{
4691 size_t printed = 0;
4692 struct thread_trace *ttrace = thread__priv(thread);
4693 double ratio;
4694
4695 if (ttrace == NULL)
4696 return 0;
4697
4698 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
4699
4700 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread));
4701 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
4702 printed += fprintf(fp, "%.1f%%", ratio);
4703 if (ttrace->pfmaj)
4704 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
4705 if (ttrace->pfmin)
4706 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
4707 if (trace->sched)
4708 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
4709 else if (fputc('\n', fp) != EOF)
4710 ++printed;
4711
4712 printed += thread__dump_stats(ttrace, trace, fp);
4713
4714 return printed;
4715}
4716
4717static unsigned long thread__nr_events(struct thread_trace *ttrace)
4718{
4719 return ttrace ? ttrace->nr_events : 0;
4720}
4721
4722static int trace_nr_events_cmp(void *priv __maybe_unused,
4723 const struct list_head *la,
4724 const struct list_head *lb)
4725{
4726 struct thread_list *a = list_entry(la, struct thread_list, list);
4727 struct thread_list *b = list_entry(lb, struct thread_list, list);
4728 unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread));
4729 unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread));
4730
4731 if (a_nr_events != b_nr_events)
4732 return a_nr_events < b_nr_events ? -1 : 1;
4733
4734 /* Identical number of threads, place smaller tids first. */
4735 return thread__tid(a->thread) < thread__tid(b->thread)
4736 ? -1
4737 : (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0);
4738}
4739
4740static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
4741{
4742 size_t printed = trace__fprintf_threads_header(fp);
4743 LIST_HEAD(threads);
4744
4745 if (machine__thread_list(trace->host, &threads) == 0) {
4746 struct thread_list *pos;
4747
4748 list_sort(NULL, &threads, trace_nr_events_cmp);
4749
4750 list_for_each_entry(pos, &threads, list)
4751 printed += trace__fprintf_thread(fp, pos->thread, trace);
4752 }
4753 thread_list__delete(&threads);
4754 return printed;
4755}
4756
/* --duration option callback: parse the minimum duration filter value. */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	/* NOTE(review): atof() does no error checking — invalid input silently yields 0.0. */
	trace->duration_filter = atof(str);
	return 0;
}
4765
4766static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
4767 int unset __maybe_unused)
4768{
4769 int ret = -1;
4770 size_t i;
4771 struct trace *trace = opt->value;
4772 /*
4773 * FIXME: introduce a intarray class, plain parse csv and create a
4774 * { int nr, int entries[] } struct...
4775 */
4776 struct intlist *list = intlist__new(str);
4777
4778 if (list == NULL)
4779 return -1;
4780
4781 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
4782 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
4783
4784 if (trace->filter_pids.entries == NULL)
4785 goto out;
4786
4787 trace->filter_pids.entries[0] = getpid();
4788
4789 for (i = 1; i < trace->filter_pids.nr; ++i)
4790 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
4791
4792 intlist__delete(list);
4793 ret = 0;
4794out:
4795 return ret;
4796}
4797
4798static int trace__open_output(struct trace *trace, const char *filename)
4799{
4800 struct stat st;
4801
4802 if (!stat(filename, &st) && st.st_size) {
4803 char oldname[PATH_MAX];
4804
4805 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
4806 unlink(oldname);
4807 rename(filename, oldname);
4808 }
4809
4810 trace->output = fopen(filename, "w");
4811
4812 return trace->output == NULL ? -errno : 0;
4813}
4814
4815static int parse_pagefaults(const struct option *opt, const char *str,
4816 int unset __maybe_unused)
4817{
4818 int *trace_pgfaults = opt->value;
4819
4820 if (strcmp(str, "all") == 0)
4821 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
4822 else if (strcmp(str, "maj") == 0)
4823 *trace_pgfaults |= TRACE_PFMAJ;
4824 else if (strcmp(str, "min") == 0)
4825 *trace_pgfaults |= TRACE_PFMIN;
4826 else
4827 return -1;
4828
4829 return 0;
4830}
4831
4832static void evlist__set_default_evsel_handler(struct evlist *evlist, void *handler)
4833{
4834 struct evsel *evsel;
4835
4836 evlist__for_each_entry(evlist, evsel) {
4837 if (evsel->handler == NULL)
4838 evsel->handler = handler;
4839 }
4840}
4841
/*
 * Copy the beautifier arg formatters for the syscall 'name' into this
 * evsel's per-field fmt array, skipping the leading syscall number field
 * ("__syscall_nr" or "nr") when the tracepoint carries one.
 */
static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name)
{
	struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel);

	if (fmt) {
		const struct syscall_fmt *scfmt = syscall_fmt__find(name);

		if (scfmt) {
			int skip = 0;

			if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 ||
			    strcmp(evsel->tp_format->format.fields->name, "nr") == 0)
				++skip;

			memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt));
		}
	}
}
4860
/*
 * Initialize per-evsel tracepoint payload accessors: generic per-arg
 * scnprintf beautifiers for non-syscall tracepoints, and the sys_enter
 * args / sys_exit ret fields for the syscalls:* ones.
 */
static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf)
{
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		/* Already initialized, or no tracepoint format to work with. */
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls")) {
			evsel__init_tp_arg_scnprintf(evsel, use_btf);
			continue;
		}

		if (evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = __evsel__syscall_tp(evsel);

			/* The args payload sits right after the u64 syscall id. */
			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;

			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1);
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = __evsel__syscall_tp(evsel);

			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;

			evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1);
		}
	}

	return 0;
}
4896
/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];
	const struct syscall_fmt *fmt;

	if (strace_groups_dir == NULL)
		return -1;

	/* A leading '!' negates the whole syscall qualifier list. */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	/*
	 * Walk the comma separated terms: lists[1] collects syscall names,
	 * aliases and strace group files (becoming the ev_qualifier), while
	 * lists[0] collects everything else for parse_events_option().
	 */
	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
			goto do_concat;
		}

		fmt = syscall_fmt__find_by_alias(s);
		if (fmt != NULL) {
			list = 1;
			s = fmt->name;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}
do_concat:
		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	if (lists[0]) {
		struct parse_events_option_args parse_events_option_args = {
			.evlistp = &trace->evlist,
		};
		struct option o = {
			.value = &parse_events_option_args,
		};
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	free(strace_groups_dir);
	free(lists[0]);
	free(lists[1]);
	/* Restore the ',' clobbered above in case we bailed out mid-walk. */
	if (sep)
		*sep = ',';

	return err;
}
4997
4998static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
4999{
5000 struct trace *trace = opt->value;
5001
5002 if (!list_empty(&trace->evlist->core.entries)) {
5003 struct option o = {
5004 .value = &trace->evlist,
5005 };
5006 return parse_cgroups(&o, str, unset);
5007 }
5008 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
5009
5010 return 0;
5011}
5012
/*
 * perf_config() callback: apply the 'trace.*' variables from the user's
 * perfconfig file to this trace session.
 */
static int trace__config(const char *var, const char *value, void *arg)
{
	struct trace *trace = arg;
	int err = 0;

	if (!strcmp(var, "trace.add_events")) {
		trace->perfconfig_events = strdup(value);
		if (trace->perfconfig_events == NULL) {
			pr_err("Not enough memory for %s\n", "trace.add_events");
			return -1;
		}
	} else if (!strcmp(var, "trace.show_timestamp")) {
		trace->show_tstamp = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.show_duration")) {
		trace->show_duration = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.show_arg_names")) {
		trace->show_arg_names = perf_config_bool(var, value);
		/* Force show_zeros: mirrors the constraint enforced in the branch below. */
		if (!trace->show_arg_names)
			trace->show_zeros = true;
	} else if (!strcmp(var, "trace.show_zeros")) {
		bool new_show_zeros = perf_config_bool(var, value);
		/* show_zeros=no is only valid while arg names are being shown. */
		if (!trace->show_arg_names && !new_show_zeros) {
			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
			goto out;
		}
		trace->show_zeros = new_show_zeros;
	} else if (!strcmp(var, "trace.show_prefix")) {
		trace->show_string_prefix = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.no_inherit")) {
		trace->opts.no_inherit = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.args_alignment")) {
		int args_alignment = 0;
		if (perf_config_int(&args_alignment, var, value) == 0)
			trace->args_alignment = args_alignment;
	} else if (!strcmp(var, "trace.tracepoint_beautifiers")) {
		if (strcasecmp(value, "libtraceevent") == 0)
			trace->libtraceevent_print = true;
		else if (strcasecmp(value, "libbeauty") == 0)
			trace->libtraceevent_print = false;
	}
out:
	return err;
}
5056
5057static void trace__exit(struct trace *trace)
5058{
5059 int i;
5060
5061 strlist__delete(trace->ev_qualifier);
5062 zfree(&trace->ev_qualifier_ids.entries);
5063 if (trace->syscalls.table) {
5064 for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
5065 syscall__exit(&trace->syscalls.table[i]);
5066 zfree(&trace->syscalls.table);
5067 }
5068 syscalltbl__delete(trace->sctbl);
5069 zfree(&trace->perfconfig_events);
5070}
5071
#ifdef HAVE_BPF_SKEL
/*
 * Add the bpf-output event that acts as the sink for augmented syscall
 * payloads: the BPF programs write into it and cmd_trace() later looks
 * it up by the "__augmented_syscalls__" name.
 */
static int bpf__setup_bpf_output(struct evlist *evlist)
{
	int ret = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");

	if (ret != 0)
		pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");

	return ret;
}
#endif
5083
5084int cmd_trace(int argc, const char **argv)
5085{
5086 const char *trace_usage[] = {
5087 "perf trace [<options>] [<command>]",
5088 "perf trace [<options>] -- <command> [<options>]",
5089 "perf trace record [<options>] [<command>]",
5090 "perf trace record [<options>] -- <command> [<options>]",
5091 NULL
5092 };
5093 struct trace trace = {
5094 .opts = {
5095 .target = {
5096 .uid = UINT_MAX,
5097 .uses_mmap = true,
5098 },
5099 .user_freq = UINT_MAX,
5100 .user_interval = ULLONG_MAX,
5101 .no_buffering = true,
5102 .mmap_pages = UINT_MAX,
5103 },
5104 .output = stderr,
5105 .show_comm = true,
5106 .show_tstamp = true,
5107 .show_duration = true,
5108 .show_arg_names = true,
5109 .args_alignment = 70,
5110 .trace_syscalls = false,
5111 .kernel_syscallchains = false,
5112 .max_stack = UINT_MAX,
5113 .max_events = ULONG_MAX,
5114 };
5115 const char *output_name = NULL;
5116 const struct option trace_options[] = {
5117 OPT_CALLBACK('e', "event", &trace, "event",
5118 "event/syscall selector. use 'perf list' to list available events",
5119 trace__parse_events_option),
5120 OPT_CALLBACK(0, "filter", &trace.evlist, "filter",
5121 "event filter", parse_filter),
5122 OPT_BOOLEAN(0, "comm", &trace.show_comm,
5123 "show the thread COMM next to its id"),
5124 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
5125 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
5126 trace__parse_events_option),
5127 OPT_STRING('o', "output", &output_name, "file", "output file name"),
5128 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
5129 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
5130 "trace events on existing process id"),
5131 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
5132 "trace events on existing thread id"),
5133 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
5134 "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
5135 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
5136 "system-wide collection from all CPUs"),
5137 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
5138 "list of cpus to monitor"),
5139 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
5140 "child tasks do not inherit counters"),
5141 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
5142 "number of mmap data pages", evlist__parse_mmap_pages),
5143 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
5144 "user to profile"),
5145 OPT_CALLBACK(0, "duration", &trace, "float",
5146 "show only events with duration > N.M ms",
5147 trace__set_duration),
5148 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
5149 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
5150 OPT_BOOLEAN('T', "time", &trace.full_time,
5151 "Show full timestamp, not time relative to first start"),
5152 OPT_BOOLEAN(0, "failure", &trace.failure_only,
5153 "Show only syscalls that failed"),
5154 OPT_BOOLEAN('s', "summary", &trace.summary_only,
5155 "Show only syscall summary with statistics"),
5156 OPT_BOOLEAN('S', "with-summary", &trace.summary,
5157 "Show all syscalls and summary with statistics"),
5158 OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
5159 "Show errno stats per syscall, use with -s or -S"),
5160 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
5161 "Trace pagefaults", parse_pagefaults, "maj"),
5162 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
5163 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
5164 OPT_CALLBACK(0, "call-graph", &trace.opts,
5165 "record_mode[,record_size]", record_callchain_help,
5166 &record_parse_callchain_opt),
5167 OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
5168 "Use libtraceevent to print the tracepoint arguments."),
5169 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
5170 "Show the kernel callchains on the syscall exit path"),
5171 OPT_ULONG(0, "max-events", &trace.max_events,
5172 "Set the maximum number of events to print, exit after that is reached. "),
5173 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
5174 "Set the minimum stack depth when parsing the callchain, "
5175 "anything below the specified depth will be ignored."),
5176 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
5177 "Set the maximum stack depth when parsing the callchain, "
5178 "anything beyond the specified depth will be ignored. "
5179 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
5180 OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
5181 "Sort batch of events before processing, use if getting out of order events"),
5182 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
5183 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
5184 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
5185 "per thread proc mmap processing timeout in ms"),
5186 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
5187 trace__parse_cgroups),
5188 OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay,
5189 "ms to wait before starting measurement after program "
5190 "start"),
5191 OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
5192 "to customized ones"),
5193 OPTS_EVSWITCH(&trace.evswitch),
5194 OPT_END()
5195 };
5196 bool __maybe_unused max_stack_user_set = true;
5197 bool mmap_pages_user_set = true;
5198 struct evsel *evsel;
5199 const char * const trace_subcommands[] = { "record", NULL };
5200 int err = -1;
5201 char bf[BUFSIZ];
5202 struct sigaction sigchld_act;
5203
5204 signal(SIGSEGV, sighandler_dump_stack);
5205 signal(SIGFPE, sighandler_dump_stack);
5206 signal(SIGINT, sighandler_interrupt);
5207
5208 memset(&sigchld_act, 0, sizeof(sigchld_act));
5209 sigchld_act.sa_flags = SA_SIGINFO;
5210 sigchld_act.sa_sigaction = sighandler_chld;
5211 sigaction(SIGCHLD, &sigchld_act, NULL);
5212
5213 trace.evlist = evlist__new();
5214 trace.sctbl = syscalltbl__new();
5215
5216 if (trace.evlist == NULL || trace.sctbl == NULL) {
5217 pr_err("Not enough memory to run!\n");
5218 err = -ENOMEM;
5219 goto out;
5220 }
5221
5222 /*
5223 * Parsing .perfconfig may entail creating a BPF event, that may need
5224 * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting
5225 * is too small. This affects just this process, not touching the
5226 * global setting. If it fails we'll get something in 'perf trace -v'
5227 * to help diagnose the problem.
5228 */
5229 rlimit__bump_memlock();
5230
5231 err = perf_config(trace__config, &trace);
5232 if (err)
5233 goto out;
5234
5235 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
5236 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
5237
5238 /*
5239 * Here we already passed thru trace__parse_events_option() and it has
5240 * already figured out if -e syscall_name, if not but if --event
5241 * foo:bar was used, the user is interested _just_ in those, say,
5242 * tracepoint events, not in the strace-like syscall-name-based mode.
5243 *
5244 * This is important because we need to check if strace-like mode is
5245 * needed to decided if we should filter out the eBPF
5246 * __augmented_syscalls__ code, if it is in the mix, say, via
5247 * .perfconfig trace.add_events, and filter those out.
5248 */
5249 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
5250 trace.evlist->core.nr_entries == 0 /* Was --events used? */) {
5251 trace.trace_syscalls = true;
5252 }
5253 /*
5254 * Now that we have --verbose figured out, lets see if we need to parse
5255 * events from .perfconfig, so that if those events fail parsing, say some
5256 * BPF program fails, then we'll be able to use --verbose to see what went
5257 * wrong in more detail.
5258 */
5259 if (trace.perfconfig_events != NULL) {
5260 struct parse_events_error parse_err;
5261
5262 parse_events_error__init(&parse_err);
5263 err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err);
5264 if (err)
5265 parse_events_error__print(&parse_err, trace.perfconfig_events);
5266 parse_events_error__exit(&parse_err);
5267 if (err)
5268 goto out;
5269 }
5270
5271 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
5272 usage_with_options_msg(trace_usage, trace_options,
5273 "cgroup monitoring only available in system-wide mode");
5274 }
5275
5276#ifdef HAVE_BPF_SKEL
5277 if (!trace.trace_syscalls)
5278 goto skip_augmentation;
5279
5280 if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) {
5281 pr_debug("Syscall augmentation fails with record, disabling augmentation");
5282 goto skip_augmentation;
5283 }
5284
5285 trace.skel = augmented_raw_syscalls_bpf__open();
5286 if (!trace.skel) {
5287 pr_debug("Failed to open augmented syscalls BPF skeleton");
5288 } else {
5289 /*
5290 * Disable attaching the BPF programs except for sys_enter and
5291 * sys_exit that tail call into this as necessary.
5292 */
5293 struct bpf_program *prog;
5294
5295 bpf_object__for_each_program(prog, trace.skel->obj) {
5296 if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit)
5297 bpf_program__set_autoattach(prog, /*autoattach=*/false);
5298 }
5299
5300 err = augmented_raw_syscalls_bpf__load(trace.skel);
5301
5302 if (err < 0) {
5303 libbpf_strerror(err, bf, sizeof(bf));
5304 pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf);
5305 } else {
5306 augmented_raw_syscalls_bpf__attach(trace.skel);
5307 trace__add_syscall_newtp(&trace);
5308 }
5309 }
5310
5311 err = bpf__setup_bpf_output(trace.evlist);
5312 if (err) {
5313 libbpf_strerror(err, bf, sizeof(bf));
5314 pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
5315 goto out;
5316 }
5317 trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
5318 assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__"));
5319skip_augmentation:
5320#endif
5321 err = -1;
5322
5323 if (trace.trace_pgfaults) {
5324 trace.opts.sample_address = true;
5325 trace.opts.sample_time = true;
5326 }
5327
5328 if (trace.opts.mmap_pages == UINT_MAX)
5329 mmap_pages_user_set = false;
5330
5331 if (trace.max_stack == UINT_MAX) {
5332 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
5333 max_stack_user_set = false;
5334 }
5335
5336#ifdef HAVE_DWARF_UNWIND_SUPPORT
5337 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
5338 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
5339 }
5340#endif
5341
5342 if (callchain_param.enabled) {
5343 if (!mmap_pages_user_set && geteuid() == 0)
5344 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
5345
5346 symbol_conf.use_callchain = true;
5347 }
5348
5349 if (trace.evlist->core.nr_entries > 0) {
5350 bool use_btf = false;
5351
5352 evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
5353 if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) {
5354 perror("failed to set syscalls:* tracepoint fields");
5355 goto out;
5356 }
5357
5358 if (use_btf)
5359 trace__load_vmlinux_btf(&trace);
5360 }
5361
5362 if (trace.sort_events) {
5363 ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
5364 ordered_events__set_copy_on_queue(&trace.oe.data, true);
5365 }
5366
5367 /*
5368 * If we are augmenting syscalls, then combine what we put in the
5369 * __augmented_syscalls__ BPF map with what is in the
5370 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
5371 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
5372 *
5373 * We'll switch to look at two BPF maps, one for sys_enter and the
5374 * other for sys_exit when we start augmenting the sys_exit paths with
5375 * buffers that are being copied from kernel to userspace, think 'read'
5376 * syscall.
5377 */
5378 if (trace.syscalls.events.bpf_output) {
5379 evlist__for_each_entry(trace.evlist, evsel) {
5380 bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit");
5381
5382 if (raw_syscalls_sys_exit) {
5383 trace.raw_augmented_syscalls = true;
5384 goto init_augmented_syscall_tp;
5385 }
5386
5387 if (trace.syscalls.events.bpf_output->priv == NULL &&
5388 strstr(evsel__name(evsel), "syscalls:sys_enter")) {
5389 struct evsel *augmented = trace.syscalls.events.bpf_output;
5390 if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
5391 evsel__init_augmented_syscall_tp_args(augmented))
5392 goto out;
5393 /*
5394 * Augmented is __augmented_syscalls__ BPF_OUTPUT event
5395 * Above we made sure we can get from the payload the tp fields
5396 * that we get from syscalls:sys_enter tracefs format file.
5397 */
5398 augmented->handler = trace__sys_enter;
5399 /*
5400 * Now we do the same for the *syscalls:sys_enter event so that
5401 * if we handle it directly, i.e. if the BPF prog returns 0 so
5402 * as not to filter it, then we'll handle it just like we would
5403 * for the BPF_OUTPUT one:
5404 */
5405 if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
5406 evsel__init_augmented_syscall_tp_args(evsel))
5407 goto out;
5408 evsel->handler = trace__sys_enter;
5409 }
5410
5411 if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) {
5412 struct syscall_tp *sc;
5413init_augmented_syscall_tp:
5414 if (evsel__init_augmented_syscall_tp(evsel, evsel))
5415 goto out;
5416 sc = __evsel__syscall_tp(evsel);
5417 /*
5418 * For now with BPF raw_augmented we hook into
5419 * raw_syscalls:sys_enter and there we get all
5420 * 6 syscall args plus the tracepoint common
5421 * fields and the syscall_nr (another long).
5422 * So we check if that is the case and if so
5423 * don't look after the sc->args_size but
5424 * always after the full raw_syscalls:sys_enter
5425 * payload, which is fixed.
5426 *
5427 * We'll revisit this later to pass
5428 * s->args_size to the BPF augmenter (now
5429 * tools/perf/examples/bpf/augmented_raw_syscalls.c,
5430 * so that it copies only what we need for each
5431 * syscall, like what happens when we use
5432 * syscalls:sys_enter_NAME, so that we reduce
5433 * the kernel/userspace traffic to just what is
5434 * needed for each syscall.
5435 */
5436 if (trace.raw_augmented_syscalls)
5437 trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
5438 evsel__init_augmented_syscall_tp_ret(evsel);
5439 evsel->handler = trace__sys_exit;
5440 }
5441 }
5442 }
5443
5444 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
5445 return trace__record(&trace, argc-1, &argv[1]);
5446
5447 /* Using just --errno-summary will trigger --summary */
5448 if (trace.errno_summary && !trace.summary && !trace.summary_only)
5449 trace.summary_only = true;
5450
5451 /* summary_only implies summary option, but don't overwrite summary if set */
5452 if (trace.summary_only)
5453 trace.summary = trace.summary_only;
5454
5455 /* Keep exited threads, otherwise information might be lost for summary */
5456 if (trace.summary)
5457 symbol_conf.keep_exited_threads = true;
5458
5459 if (output_name != NULL) {
5460 err = trace__open_output(&trace, output_name);
5461 if (err < 0) {
5462 perror("failed to create output file");
5463 goto out;
5464 }
5465 }
5466
5467 err = evswitch__init(&trace.evswitch, trace.evlist, stderr);
5468 if (err)
5469 goto out_close;
5470
5471 err = target__validate(&trace.opts.target);
5472 if (err) {
5473 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5474 fprintf(trace.output, "%s", bf);
5475 goto out_close;
5476 }
5477
5478 err = target__parse_uid(&trace.opts.target);
5479 if (err) {
5480 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
5481 fprintf(trace.output, "%s", bf);
5482 goto out_close;
5483 }
5484
5485 if (!argc && target__none(&trace.opts.target))
5486 trace.opts.target.system_wide = true;
5487
5488 if (input_name)
5489 err = trace__replay(&trace);
5490 else
5491 err = trace__run(&trace, argc, argv);
5492
5493out_close:
5494 if (output_name != NULL)
5495 fclose(trace.output);
5496out:
5497 trace__exit(&trace);
5498#ifdef HAVE_BPF_SKEL
5499 augmented_raw_syscalls_bpf__destroy(trace.skel);
5500#endif
5501 return err;
5502}
1#include <traceevent/event-parse.h>
2#include "builtin.h"
3#include "util/color.h"
4#include "util/debug.h"
5#include "util/evlist.h"
6#include "util/machine.h"
7#include "util/session.h"
8#include "util/thread.h"
9#include "util/parse-options.h"
10#include "util/strlist.h"
11#include "util/intlist.h"
12#include "util/thread_map.h"
13#include "util/stat.h"
14#include "trace-event.h"
15#include "util/parse-events.h"
16
17#include <libaudit.h>
18#include <stdlib.h>
19#include <sys/eventfd.h>
20#include <sys/mman.h>
21#include <linux/futex.h>
22
/* For older distros: */
#ifndef MAP_STACK
# define MAP_STACK 0x20000
#endif

/* madvise() behaviors possibly missing from older <sys/mman.h> */
#ifndef MADV_HWPOISON
# define MADV_HWPOISON 100
#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE 12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE 13
#endif

/* eventfd2() flag possibly missing from older <sys/eventfd.h> */
#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE 1
#endif
43
/*
 * Accessor for one tracepoint field inside a sample's raw payload:
 * 'offset' is the byte offset in sample->raw_data, and exactly one of
 * the union callbacks is installed depending on whether the field is
 * read as an unsigned integer or dereferenced as a pointer.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
51
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned
 * integer of the given bit width from the sample payload at
 * field->offset.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	return *(u##bits *)(sample->raw_data + field->offset); \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
62
/*
 * Same as TP_UINT_FIELD() but byte-swapping the value, for samples
 * recorded on a machine of the opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value = *(u##bits *)(sample->raw_data + field->offset); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
73
74static int tp_field__init_uint(struct tp_field *field,
75 struct format_field *format_field,
76 bool needs_swap)
77{
78 field->offset = format_field->offset;
79
80 switch (format_field->size) {
81 case 1:
82 field->integer = tp_field__u8;
83 break;
84 case 2:
85 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
86 break;
87 case 4:
88 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
89 break;
90 case 8:
91 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
92 break;
93 default:
94 return -1;
95 }
96
97 return 0;
98}
99
/* Return a pointer into the raw sample payload at this field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
104
/* Bind the pointer reader to 'field'; always succeeds (returns 0). */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
111
/*
 * Per-evsel state for the raw_syscalls tracepoints: 'id' is the
 * syscall number field; the union holds 'args' for sys_enter and
 * 'ret' for sys_exit -- each evsel uses only one of them.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
118
119static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
120 struct tp_field *field,
121 const char *name)
122{
123 struct format_field *format_field = perf_evsel__field(evsel, name);
124
125 if (format_field == NULL)
126 return -1;
127
128 return tp_field__init_uint(field, format_field, evsel->needs_swap);
129}
130
/*
 * Convenience wrapper: initialize the syscall_tp member 'name' (stored
 * in evsel->priv) as an integer field of the same tracepoint name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
134
135static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
136 struct tp_field *field,
137 const char *name)
138{
139 struct format_field *format_field = perf_evsel__field(evsel, name);
140
141 if (format_field == NULL)
142 return -1;
143
144 return tp_field__init_ptr(field, format_field);
145}
146
/*
 * Convenience wrapper: initialize the syscall_tp member 'name' (stored
 * in evsel->priv) as a pointer field of the same tracepoint name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
150
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
156
157static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
158{
159 evsel->priv = malloc(sizeof(struct syscall_tp));
160 if (evsel->priv != NULL) {
161 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
162 goto out_delete;
163
164 evsel->handler = handler;
165 return 0;
166 }
167
168 return -ENOMEM;
169
170out_delete:
171 zfree(&evsel->priv);
172 return -ENOENT;
173}
174
175static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
176{
177 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
178
179 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
180 if (evsel == NULL)
181 evsel = perf_evsel__newtp("syscalls", direction);
182
183 if (evsel) {
184 if (perf_evsel__init_syscall_tp(evsel, handler))
185 goto out_delete;
186 }
187
188 return evsel;
189
190out_delete:
191 perf_evsel__delete_priv(evsel);
192 return NULL;
193}
194
/* Read the syscall_tp member 'name' from a sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Read the syscall_tp member 'name' from a sample as a pointer. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
202
203static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
204 void *sys_enter_handler,
205 void *sys_exit_handler)
206{
207 int ret = -1;
208 struct perf_evsel *sys_enter, *sys_exit;
209
210 sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
211 if (sys_enter == NULL)
212 goto out;
213
214 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
215 goto out_delete_sys_enter;
216
217 sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
218 if (sys_exit == NULL)
219 goto out_delete_sys_enter;
220
221 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
222 goto out_delete_sys_exit;
223
224 perf_evlist__add(evlist, sys_enter);
225 perf_evlist__add(evlist, sys_exit);
226
227 ret = 0;
228out:
229 return ret;
230
231out_delete_sys_exit:
232 perf_evsel__delete_priv(sys_exit);
233out_delete_sys_enter:
234 perf_evsel__delete_priv(sys_enter);
235 goto out;
236}
237
238
/*
 * Context handed to each syscall argument beautifier: the raw value,
 * the originating thread and trace instance, an optional beautifier
 * parameter (e.g. a strarray), the argument index and a bitmask of
 * arguments that should be suppressed.
 */
struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;
	u8	      idx;
	u8	      mask;
};

/*
 * Maps small integer values (optionally offset by 'offset') to their
 * symbolic names, e.g. epoll_ctl operations.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

/* Define a strarray whose valid values start at 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Define a strarray whose first entry corresponds to the value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
264
265static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
266 const char *intfmt,
267 struct syscall_arg *arg)
268{
269 struct strarray *sa = arg->parm;
270 int idx = arg->val - sa->offset;
271
272 if (idx < 0 || idx >= sa->nr_entries)
273 return scnprintf(bf, size, intfmt, arg->val);
274
275 return scnprintf(bf, size, "%s", sa->entries[idx]);
276}
277
/* strarray beautifier with a decimal fallback for out-of-range values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
285
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* strarray beautifier with a hex fallback for out-of-range values. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
299
/* Forward declaration: the fd beautifier is defined further down. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
304
305static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
306 struct syscall_arg *arg)
307{
308 int fd = arg->val;
309
310 if (fd == AT_FDCWD)
311 return scnprintf(bf, size, "CWD");
312
313 return syscall_arg__scnprintf_fd(bf, size, arg);
314}
315
316#define SCA_FDAT syscall_arg__scnprintf_fd_at
317
/* Forward declaration: the close(2) fd beautifier is defined further down. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
322
/* Print an argument as hex, used for pointers and opaque values. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
330
/*
 * Beautify the mmap() 'prot' argument as a '|'-separated list of
 * PROT_* names; leftover unknown bits are appended in hex.
 */
static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
					       struct syscall_arg *arg)
{
	int printed = 0, prot = arg->val;

	if (prot == PROT_NONE)
		return scnprintf(bf, size, "NONE");
/* print the flag name and clear the bit so the final hex fallback sees only leftovers */
#define	P_MMAP_PROT(n) \
	if (prot & PROT_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		prot &= ~PROT_##n; \
	}

	P_MMAP_PROT(EXEC);
	P_MMAP_PROT(READ);
	P_MMAP_PROT(WRITE);
#ifdef PROT_SEM
	P_MMAP_PROT(SEM);
#endif
	P_MMAP_PROT(GROWSDOWN);
	P_MMAP_PROT(GROWSUP);
#undef P_MMAP_PROT

	if (prot)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);

	return printed;
}

#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
361
/*
 * Beautify the mmap() 'flags' argument as a '|'-separated list of
 * MAP_* names; leftover unknown bits are appended in hex.
 */
static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* print the flag name and clear the bit so the final hex fallback sees only leftovers */
#define	P_MMAP_FLAG(n) \
	if (flags & MAP_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~MAP_##n; \
	}

	P_MMAP_FLAG(SHARED);
	P_MMAP_FLAG(PRIVATE);
#ifdef MAP_32BIT
	P_MMAP_FLAG(32BIT);
#endif
	P_MMAP_FLAG(ANONYMOUS);
	P_MMAP_FLAG(DENYWRITE);
	P_MMAP_FLAG(EXECUTABLE);
	P_MMAP_FLAG(FILE);
	P_MMAP_FLAG(FIXED);
	P_MMAP_FLAG(GROWSDOWN);
#ifdef MAP_HUGETLB
	P_MMAP_FLAG(HUGETLB);
#endif
	P_MMAP_FLAG(LOCKED);
	P_MMAP_FLAG(NONBLOCK);
	P_MMAP_FLAG(NORESERVE);
	P_MMAP_FLAG(POPULATE);
	P_MMAP_FLAG(STACK);
#ifdef MAP_UNINITIALIZED
	P_MMAP_FLAG(UNINITIALIZED);
#endif
#undef P_MMAP_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
404
405static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
406 struct syscall_arg *arg)
407{
408 int behavior = arg->val;
409
410 switch (behavior) {
411#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
412 P_MADV_BHV(NORMAL);
413 P_MADV_BHV(RANDOM);
414 P_MADV_BHV(SEQUENTIAL);
415 P_MADV_BHV(WILLNEED);
416 P_MADV_BHV(DONTNEED);
417 P_MADV_BHV(REMOVE);
418 P_MADV_BHV(DONTFORK);
419 P_MADV_BHV(DOFORK);
420 P_MADV_BHV(HWPOISON);
421#ifdef MADV_SOFT_OFFLINE
422 P_MADV_BHV(SOFT_OFFLINE);
423#endif
424 P_MADV_BHV(MERGEABLE);
425 P_MADV_BHV(UNMERGEABLE);
426#ifdef MADV_HUGEPAGE
427 P_MADV_BHV(HUGEPAGE);
428#endif
429#ifdef MADV_NOHUGEPAGE
430 P_MADV_BHV(NOHUGEPAGE);
431#endif
432#ifdef MADV_DONTDUMP
433 P_MADV_BHV(DONTDUMP);
434#endif
435#ifdef MADV_DODUMP
436 P_MADV_BHV(DODUMP);
437#endif
438#undef P_MADV_PHV
439 default: break;
440 }
441
442 return scnprintf(bf, size, "%#x", behavior);
443}
444
445#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
446
447static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
448 struct syscall_arg *arg)
449{
450 int printed = 0, op = arg->val;
451
452 if (op == 0)
453 return scnprintf(bf, size, "NONE");
454#define P_CMD(cmd) \
455 if ((op & LOCK_##cmd) == LOCK_##cmd) { \
456 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
457 op &= ~LOCK_##cmd; \
458 }
459
460 P_CMD(SH);
461 P_CMD(EX);
462 P_CMD(NB);
463 P_CMD(UN);
464 P_CMD(MAND);
465 P_CMD(RW);
466 P_CMD(READ);
467 P_CMD(WRITE);
468#undef P_OP
469
470 if (op)
471 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
472
473 return printed;
474}
475
476#define SCA_FLOCK syscall_arg__scnprintf_flock
477
/*
 * Beautify the futex() 'op' argument: print the FUTEX_ command name,
 * mark the arguments this command ignores in arg->mask so they are
 * suppressed, and append the PRIVATE/CLOCK_REALTIME modifier flags.
 */
static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
{
	/* bit per futex() argument position, used to suppress unused args */
	enum syscall_futex_args {
		SCF_UADDR = (1 << 0),
		SCF_OP = (1 << 1),
		SCF_VAL = (1 << 2),
		SCF_TIMEOUT = (1 << 3),
		SCF_UADDR2 = (1 << 4),
		SCF_VAL3 = (1 << 5),
	};
	int op = arg->val;
	int cmd = op & FUTEX_CMD_MASK;
	size_t printed = 0;

	switch (cmd) {
/* expands to 'case FUTEX_n:' plus printing; the per-case statements then set the mask */
#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
	P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
	P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break;
	P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break;
	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break;
	P_FUTEX_OP(WAKE_OP); break;
	P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
	P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
	P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break;
	P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break;
	P_FUTEX_OP(WAIT_REQUEUE_PI); break;
	default: printed = scnprintf(bf, size, "%#x", cmd); break;
	}

	if (op & FUTEX_PRIVATE_FLAG)
		printed += scnprintf(bf + printed, size - printed, "|PRIV");

	if (op & FUTEX_CLOCK_REALTIME)
		printed += scnprintf(bf + printed, size - printed, "|CLKRT");

	return printed;
}

#define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
520
/* epoll_ctl() ops; offset 1 because EPOLL_CTL_ADD starts at 1 */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* setitimer()/getitimer() 'which' values */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* lseek() 'whence' values; DATA/HOLE only where the headers define them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl() commands, in F_* value order */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* getrlimit()/setrlimit() RLIMIT_* resources */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* sigprocmask() 'how' values */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime() et al CLOCK_* ids */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
};
static DEFINE_STRARRAY(clockid);

/* socket() AF_* address families, in value order */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

#ifndef SOCK_TYPE_MASK
#define SOCK_TYPE_MASK 0xf
#endif
574
575static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
576 struct syscall_arg *arg)
577{
578 size_t printed;
579 int type = arg->val,
580 flags = type & ~SOCK_TYPE_MASK;
581
582 type &= SOCK_TYPE_MASK;
583 /*
584 * Can't use a strarray, MIPS may override for ABI reasons.
585 */
586 switch (type) {
587#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
588 P_SK_TYPE(STREAM);
589 P_SK_TYPE(DGRAM);
590 P_SK_TYPE(RAW);
591 P_SK_TYPE(RDM);
592 P_SK_TYPE(SEQPACKET);
593 P_SK_TYPE(DCCP);
594 P_SK_TYPE(PACKET);
595#undef P_SK_TYPE
596 default:
597 printed = scnprintf(bf, size, "%#x", type);
598 }
599
600#define P_SK_FLAG(n) \
601 if (flags & SOCK_##n) { \
602 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
603 flags &= ~SOCK_##n; \
604 }
605
606 P_SK_FLAG(CLOEXEC);
607 P_SK_FLAG(NONBLOCK);
608#undef P_SK_FLAG
609
610 if (flags)
611 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
612
613 return printed;
614}
615
616#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
617
618#ifndef MSG_PROBE
619#define MSG_PROBE 0x10
620#endif
621#ifndef MSG_WAITFORONE
622#define MSG_WAITFORONE 0x10000
623#endif
624#ifndef MSG_SENDPAGE_NOTLAST
625#define MSG_SENDPAGE_NOTLAST 0x20000
626#endif
627#ifndef MSG_FASTOPEN
628#define MSG_FASTOPEN 0x20000000
629#endif
630
/*
 * Pretty print send/recv 'flags' (MSG_*) as a '|'-separated name list,
 * e.g. "PEEK|DONTWAIT".  Bits are consumed in the fixed order below so
 * any bits left over at the end are unknown and printed in hex.  A zero
 * argument prints as "NONE".
 */
static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
					       struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

	if (flags == 0)
		return scnprintf(bf, size, "NONE");
#define	P_MSG_FLAG(n) \
	if (flags & MSG_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~MSG_##n; \
	}

	P_MSG_FLAG(OOB);
	P_MSG_FLAG(PEEK);
	P_MSG_FLAG(DONTROUTE);
	P_MSG_FLAG(TRYHARD);
	P_MSG_FLAG(CTRUNC);
	P_MSG_FLAG(PROBE);
	P_MSG_FLAG(TRUNC);
	P_MSG_FLAG(DONTWAIT);
	P_MSG_FLAG(EOR);
	P_MSG_FLAG(WAITALL);
	P_MSG_FLAG(FIN);
	P_MSG_FLAG(SYN);
	P_MSG_FLAG(CONFIRM);
	P_MSG_FLAG(RST);
	P_MSG_FLAG(ERRQUEUE);
	P_MSG_FLAG(NOSIGNAL);
	P_MSG_FLAG(MORE);
	P_MSG_FLAG(WAITFORONE);
	P_MSG_FLAG(SENDPAGE_NOTLAST);
	P_MSG_FLAG(FASTOPEN);
	P_MSG_FLAG(CMSG_CLOEXEC);
#undef P_MSG_FLAG

	/* Whatever bits we didn't recognize above. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
672
673#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
674
675static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
676 struct syscall_arg *arg)
677{
678 size_t printed = 0;
679 int mode = arg->val;
680
681 if (mode == F_OK) /* 0 */
682 return scnprintf(bf, size, "F");
683#define P_MODE(n) \
684 if (mode & n##_OK) { \
685 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
686 mode &= ~n##_OK; \
687 }
688
689 P_MODE(R);
690 P_MODE(W);
691 P_MODE(X);
692#undef P_MODE
693
694 if (mode)
695 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
696
697 return printed;
698}
699
700#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
701
/*
 * Pretty print an open(2)/openat(2) 'flags' argument as a '|'-separated
 * list of O_* names; leftover unknown bits are printed in hex.
 *
 * Side effect: when O_CREAT is absent the following 'mode' argument is
 * meaningless, so its bit is set in arg->mask to suppress it in the
 * formatted output.
 */
static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
					       struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

	if (!(flags & O_CREAT))
		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */

	/* O_RDONLY is 0, so no bits at all means a read-only open. */
	if (flags == 0)
		return scnprintf(bf, size, "RDONLY");
#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(APPEND);
	P_FLAG(ASYNC);
	P_FLAG(CLOEXEC);
	P_FLAG(CREAT);
	P_FLAG(DIRECT);
	P_FLAG(DIRECTORY);
	P_FLAG(EXCL);
	P_FLAG(LARGEFILE);
	P_FLAG(NOATIME);
	P_FLAG(NOCTTY);
#ifdef O_NONBLOCK
	P_FLAG(NONBLOCK);
#elif O_NDELAY
	P_FLAG(NDELAY);
#endif
#ifdef O_PATH
	P_FLAG(PATH);
#endif
	P_FLAG(RDWR);
#ifdef O_DSYNC
	/* O_SYNC is O_DSYNC plus an extra bit: check the full mask first. */
	if ((flags & O_SYNC) == O_SYNC)
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
	else {
		P_FLAG(DSYNC);
	}
#else
	P_FLAG(SYNC);
#endif
	P_FLAG(TRUNC);
	P_FLAG(WRONLY);
#undef P_FLAG

	/* Whatever bits we didn't recognize above. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
755
756#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
757
758static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
759 struct syscall_arg *arg)
760{
761 int printed = 0, flags = arg->val;
762
763 if (flags == 0)
764 return scnprintf(bf, size, "NONE");
765#define P_FLAG(n) \
766 if (flags & EFD_##n) { \
767 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
768 flags &= ~EFD_##n; \
769 }
770
771 P_FLAG(SEMAPHORE);
772 P_FLAG(CLOEXEC);
773 P_FLAG(NONBLOCK);
774#undef P_FLAG
775
776 if (flags)
777 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
778
779 return printed;
780}
781
782#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
783
784static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
785 struct syscall_arg *arg)
786{
787 int printed = 0, flags = arg->val;
788
789#define P_FLAG(n) \
790 if (flags & O_##n) { \
791 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
792 flags &= ~O_##n; \
793 }
794
795 P_FLAG(CLOEXEC);
796 P_FLAG(NONBLOCK);
797#undef P_FLAG
798
799 if (flags)
800 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
801
802 return printed;
803}
804
805#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
806
/*
 * Pretty print a signal number argument (kill, tgkill, rt_sigaction, ...)
 * using the SIG* name without the prefix, falling back to hex for values
 * we don't know about.  SIGEMT/SIGSTKFLT/SIGSWI are arch-dependent.
 */
static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
{
	int sig = arg->val;

	switch (sig) {
#define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
	P_SIGNUM(HUP);
	P_SIGNUM(INT);
	P_SIGNUM(QUIT);
	P_SIGNUM(ILL);
	P_SIGNUM(TRAP);
	P_SIGNUM(ABRT);
	P_SIGNUM(BUS);
	P_SIGNUM(FPE);
	P_SIGNUM(KILL);
	P_SIGNUM(USR1);
	P_SIGNUM(SEGV);
	P_SIGNUM(USR2);
	P_SIGNUM(PIPE);
	P_SIGNUM(ALRM);
	P_SIGNUM(TERM);
	P_SIGNUM(CHLD);
	P_SIGNUM(CONT);
	P_SIGNUM(STOP);
	P_SIGNUM(TSTP);
	P_SIGNUM(TTIN);
	P_SIGNUM(TTOU);
	P_SIGNUM(URG);
	P_SIGNUM(XCPU);
	P_SIGNUM(XFSZ);
	P_SIGNUM(VTALRM);
	P_SIGNUM(PROF);
	P_SIGNUM(WINCH);
	P_SIGNUM(IO);
	P_SIGNUM(PWR);
	P_SIGNUM(SYS);
#ifdef SIGEMT
	P_SIGNUM(EMT);
#endif
#ifdef SIGSTKFLT
	P_SIGNUM(STKFLT);
#endif
#ifdef SIGSWI
	P_SIGNUM(SWI);
#endif
	default: break;
	}

	return scnprintf(bf, size, "%#x", sig);
}
857
858#define SCA_SIGNUM syscall_arg__scnprintf_signum
859
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * ioctl(2) request names, indexed from TCGETS (see the STRARRAY_OFFSET
 * below).  The designated initializers ([0x27], [0x50], [0x60]) skip the
 * holes in the request number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
886
/* Shorthand for "print argument 'arg' via the given strarray table". */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm = { [arg] = &strarray__##array, }

/*
 * Per-syscall formatting hints, looked up by name via bsearch() in
 * syscall_fmt__find(), so this table MUST be kept sorted by .name
 * (strcmp() order).
 */
static struct syscall_fmt {
	const char *name;	/* canonical syscall name (sort key) */
	const char *alias;	/* alternate tracepoint name to try */
	size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void *arg_parm[6];	/* per-argument parm (e.g. a strarray) */
	bool errmsg;		/* negative return is an errno to decode */
	bool timeout;		/* last arg is a timeout */
	bool hexret;		/* print return value in hex (e.g. mmap) */
} syscall_fmts[] = {
	{ .name = "access", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
	{ .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name = "brk", .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name = "close", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name = "connect", .errmsg = true, },
	{ .name = "dup", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "dup2", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "dup3", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name = "eventfd2", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name = "faccessat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "fadvise64", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fallocate", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchdir", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchmod", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchmodat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name = "fchown", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fchownat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name = "fcntl", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name = "fdatasync", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "flock", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name = "fsetxattr", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fstat", .errmsg = true, .alias = "newfstat",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fstatat", .errmsg = true, .alias = "newfstatat",
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "fstatfs", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "fsync", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "ftruncate", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "futex", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name = "getdents", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "getdents64", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name = "ioctl", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name = "kill", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "linkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name = "lseek", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
			     [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
	{ .name = "lstat", .errmsg = true, .alias = "newlstat", },
	{ .name = "madvise", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name = "mkdirat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name = "mknodat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name = "mlock", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "mlockall", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "mmap", .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */
			     [4] = SCA_FD, /* fd */ }, },
	{ .name = "mprotect", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name = "mremap", .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name = "munlock", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "munmap", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "open", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name = "openat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name = "pipe2", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name = "poll", .errmsg = true, .timeout = true, },
	{ .name = "ppoll", .errmsg = true, .timeout = true, },
	{ .name = "pread", .errmsg = true, .alias = "pread64",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "preadv", .errmsg = true, .alias = "pread",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name = "pwrite", .errmsg = true, .alias = "pwrite64",
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "pwritev", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "read", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "readv", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "recvfrom", .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "recvmmsg", .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "recvmsg", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "renameat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "select", .errmsg = true, .timeout = true, },
	{ .name = "sendmmsg", .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "sendmsg", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "sendto", .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name = "shutdown", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "socket", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name = "stat", .errmsg = true, .alias = "newstat", },
	{ .name = "symlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "tgkill", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "tkill", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name = "uname", .errmsg = true, .alias = "newuname", },
	{ .name = "unlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name = "utimensat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name = "write", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
	{ .name = "writev", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
};
1094
1095static int syscall_fmt__cmp(const void *name, const void *fmtp)
1096{
1097 const struct syscall_fmt *fmt = fmtp;
1098 return strcmp(name, fmt->name);
1099}
1100
1101static struct syscall_fmt *syscall_fmt__find(const char *name)
1102{
1103 const int nmemb = ARRAY_SIZE(syscall_fmts);
1104 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1105}
1106
/* Per-syscall-id state, lazily filled in by trace__read_syscall_info(). */
struct syscall {
	struct event_format *tp_format;	/* sys_enter_<name> tracepoint format */
	const char *name;		/* from audit_syscall_to_name() */
	bool filtered;			/* excluded by the -e qualifier list */
	struct syscall_fmt *fmt;	/* formatting hints, may be NULL */
	size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void **arg_parm;		/* per-argument parm for arg_scnprintf */
};
1115
1116static size_t fprintf_duration(unsigned long t, FILE *fp)
1117{
1118 double duration = (double)t / NSEC_PER_MSEC;
1119 size_t printed = fprintf(fp, "(");
1120
1121 if (duration >= 1.0)
1122 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1123 else if (duration >= 0.01)
1124 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1125 else
1126 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1127 return printed + fprintf(fp, "): ");
1128}
1129
/* Per-thread state, hung off struct thread's priv pointer. */
struct thread_trace {
	u64 entry_time;			/* timestamp of the last sys_enter */
	u64 exit_time;			/* timestamp of the last sys_exit */
	bool entry_pending;		/* sys_enter printed, awaiting exit */
	unsigned long nr_events;	/* samples seen for this thread */
	char *entry_str;		/* formatted sys_enter line (1024 bytes) */
	double runtime_ms;		/* accumulated sched runtime */
	struct {
		int max;		/* highest fd cached, -1 = empty */
		char **table;		/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id struct stats */
};
1144
1145static struct thread_trace *thread_trace__new(void)
1146{
1147 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
1148
1149 if (ttrace)
1150 ttrace->paths.max = -1;
1151
1152 ttrace->syscall_stats = intlist__new(NULL);
1153
1154 return ttrace;
1155}
1156
1157static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1158{
1159 struct thread_trace *ttrace;
1160
1161 if (thread == NULL)
1162 goto fail;
1163
1164 if (thread->priv == NULL)
1165 thread->priv = thread_trace__new();
1166
1167 if (thread->priv == NULL)
1168 goto fail;
1169
1170 ttrace = thread->priv;
1171 ++ttrace->nr_events;
1172
1173 return ttrace;
1174fail:
1175 color_fprintf(fp, PERF_COLOR_RED,
1176 "WARNING: not enough memory, dropping samples!\n");
1177 return NULL;
1178}
1179
/* Top-level state for one 'perf trace' run. */
struct trace {
	struct perf_tool tool;		/* event delivery callbacks */
	struct {
		int machine;		/* audit machine type for name lookups */
		int open_id;		/* syscall id of open(2) */
	} audit;
	struct {
		int max;		/* highest syscall id seen, -1 = empty */
		struct syscall *table;	/* indexed by syscall id */
	} syscalls;
	struct record_opts opts;
	struct machine *host;
	u64 base_time;			/* first sample time, for timestamps */
	FILE *output;
	unsigned long nr_events;
	struct strlist *ev_qualifier;	/* -e syscall name list */
	const char *last_vfs_getname;
	struct intlist *tid_list;	/* -t filter */
	struct intlist *pid_list;	/* -p filter */
	double duration_filter;		/* --duration, in milliseconds */
	double runtime_ms;
	struct {
		u64 vfs_getname,	/* pathnames from the tracepoint */
		    proc_getname;	/* pathnames read from /proc */
	} stats;
	bool not_ev_qualifier;		/* -e was given a '!' list */
	bool live;			/* live mode vs perf.data replay */
	bool full_time;
	bool sched;
	bool multiple_threads;
	bool summary;
	bool summary_only;
	bool show_comm;
	bool show_tool_stats;
};
1215
/*
 * Cache 'pathname' as the path of 'fd' in the per-thread fd->path table,
 * growing the table as needed and zeroing the newly exposed slots.
 *
 * Returns 0 on success, -1 on allocation failure; a failed realloc()
 * leaves the previous table intact.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread->priv;

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		/* Zero only the slots beyond the old max (all, on first use). */
		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
1241
1242static int thread__read_fd_path(struct thread *thread, int fd)
1243{
1244 char linkname[PATH_MAX], pathname[PATH_MAX];
1245 struct stat st;
1246 int ret;
1247
1248 if (thread->pid_ == thread->tid) {
1249 scnprintf(linkname, sizeof(linkname),
1250 "/proc/%d/fd/%d", thread->pid_, fd);
1251 } else {
1252 scnprintf(linkname, sizeof(linkname),
1253 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1254 }
1255
1256 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1257 return -1;
1258
1259 ret = readlink(linkname, pathname, sizeof(pathname));
1260
1261 if (ret < 0 || ret > st.st_size)
1262 return -1;
1263
1264 pathname[ret] = '\0';
1265 return trace__set_fd_pathname(thread, fd, pathname);
1266}
1267
1268static const char *thread__fd_path(struct thread *thread, int fd,
1269 struct trace *trace)
1270{
1271 struct thread_trace *ttrace = thread->priv;
1272
1273 if (ttrace == NULL)
1274 return NULL;
1275
1276 if (fd < 0)
1277 return NULL;
1278
1279 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL))
1280 if (!trace->live)
1281 return NULL;
1282 ++trace->stats.proc_getname;
1283 if (thread__read_fd_path(thread, fd)) {
1284 return NULL;
1285 }
1286
1287 return ttrace->paths.table[fd];
1288}
1289
1290static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1291 struct syscall_arg *arg)
1292{
1293 int fd = arg->val;
1294 size_t printed = scnprintf(bf, size, "%d", fd);
1295 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1296
1297 if (path)
1298 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1299
1300 return printed;
1301}
1302
1303static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1304 struct syscall_arg *arg)
1305{
1306 int fd = arg->val;
1307 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1308 struct thread_trace *ttrace = arg->thread->priv;
1309
1310 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1311 zfree(&ttrace->paths.table[fd]);
1312
1313 return printed;
1314}
1315
1316static bool trace__filter_duration(struct trace *trace, double t)
1317{
1318 return t < (trace->duration_filter * NSEC_PER_MSEC);
1319}
1320
1321static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1322{
1323 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1324
1325 return fprintf(fp, "%10.3f ", ts);
1326}
1327
/*
 * Main-loop termination flags, written by sig_handler() and polled
 * elsewhere.  'interrupted' distinguishes SIGINT from other signals.
 *
 * NOTE(review): these are plain bools written from a signal handler;
 * strictly they should be volatile sig_atomic_t — confirm before
 * depending on them in new code.
 */
static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1336
1337static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1338 u64 duration, u64 tstamp, FILE *fp)
1339{
1340 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1341 printed += fprintf_duration(duration, fp);
1342
1343 if (trace->multiple_threads) {
1344 if (trace->show_comm)
1345 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1346 printed += fprintf(fp, "%d ", thread->tid);
1347 }
1348
1349 return printed;
1350}
1351
1352static int trace__process_event(struct trace *trace, struct machine *machine,
1353 union perf_event *event, struct perf_sample *sample)
1354{
1355 int ret = 0;
1356
1357 switch (event->header.type) {
1358 case PERF_RECORD_LOST:
1359 color_fprintf(trace->output, PERF_COLOR_RED,
1360 "LOST %" PRIu64 " events!\n", event->lost.lost);
1361 ret = machine__process_lost_event(machine, event, sample);
1362 default:
1363 ret = machine__process_event(machine, event, sample);
1364 break;
1365 }
1366
1367 return ret;
1368}
1369
1370static int trace__tool_process(struct perf_tool *tool,
1371 union perf_event *event,
1372 struct perf_sample *sample,
1373 struct machine *machine)
1374{
1375 struct trace *trace = container_of(tool, struct trace, tool);
1376 return trace__process_event(trace, machine, event, sample);
1377}
1378
/*
 * Bring up the symbol subsystem and the host machine representation, then
 * synthesize the already-running threads so early samples can be resolved.
 *
 * Returns 0 on success; on synthesis failure the symbol subsystem is torn
 * down again and the error propagated.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init();

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Pre-existing threads won't fork/comm, read them from /proc now. */
	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false);
	if (err)
		symbol__exit();

	return err;
}
1397
/*
 * Build the per-argument formatter table for 'sc' from its tracepoint
 * format: explicit formatters in syscall_fmts take precedence, otherwise
 * pointer-typed fields default to hex printing.  The leading common field
 * is skipped, hence nr_fields - 1 slots.
 *
 * Returns 0 on success, -1 if the table allocation fails.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0;

	sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	/* fields->next skips the leading common field. */
	for (field = sc->tp_format->format.fields->next; field; field = field->next) {
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		++idx;
	}

	return 0;
}
1420
/*
 * Lazily fill in trace->syscalls.table[id]: resolve the syscall name via
 * audit, grow the table if needed, apply the -e qualifier filter, and
 * look up the tracepoint format plus formatting hints.
 *
 * Returns 0 on success (including the "filtered out" case), -1 on
 * unknown id, allocation failure or missing tracepoint.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = audit_syscall_to_name(id, trace->audit.machine);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* Zero only the newly exposed entries (all, on first use). */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	if (trace->ev_qualifier) {
		bool in = strlist__find(trace->ev_qualifier, name) != NULL;

		/* not_ev_qualifier inverts the -e list into an exclude list. */
		if (!(in ^ trace->not_ev_qualifier)) {
			sc->filtered = true;
			/*
			 * No need to do read tracepoint information since this will be
			 * filtered out.
			 */
			return 0;
		}
	}

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls' tracepoints use an alias (e.g. fstat -> newfstat). */
	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (sc->tp_format == NULL)
		return -1;

	return syscall__set_arg_fmts(sc);
}
1478
/*
 * Format the six syscall arguments in 'args' into 'bf' as
 * "name: value, name: value, ...", using the per-argument formatters when
 * available.  Without tracepoint format info, fall back to printing all
 * six raw values as "argN: %ld".
 *
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned long *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;

	if (sc->tp_format != NULL) {
		struct format_field *field;
		u8 bit = 1;		/* bit per argument, for arg.mask */
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		/* fields->next skips the common leading field. */
		for (field = sc->tp_format->format.fields->next; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may mask later args (e.g. open w/o O_CREAT). */
			if (arg.mask & bit)
				continue;
			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (args[arg.idx] == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = args[arg.idx];
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", args[arg.idx]);
			}
		}
	} else {
		int i = 0;

		while (i < 6) {
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, args[i]);
			++i;
		}
	}

	return printed;
}
1536
/* Signature shared by the sys_enter/sys_exit tracepoint handlers. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  struct perf_sample *sample);

/*
 * Return the struct syscall for 'id', lazily reading its info on first
 * use.  Returns NULL (with diagnostics at sufficient verbosity) for
 * invalid ids or when the syscall info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
 		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed silently. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1582
1583static void thread__update_stats(struct thread_trace *ttrace,
1584 int id, struct perf_sample *sample)
1585{
1586 struct int_node *inode;
1587 struct stats *stats;
1588 u64 duration = 0;
1589
1590 inode = intlist__findnew(ttrace->syscall_stats, id);
1591 if (inode == NULL)
1592 return;
1593
1594 stats = inode->priv;
1595 if (stats == NULL) {
1596 stats = malloc(sizeof(struct stats));
1597 if (stats == NULL)
1598 return;
1599 init_stats(stats);
1600 inode->priv = stats;
1601 }
1602
1603 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1604 duration = sample->time - ttrace->entry_time;
1605
1606 update_stats(stats, duration);
1607}
1608
1609static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1610 struct perf_sample *sample)
1611{
1612 char *msg;
1613 void *args;
1614 size_t printed = 0;
1615 struct thread *thread;
1616 int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1617 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1618 struct thread_trace *ttrace;
1619
1620 if (sc == NULL)
1621 return -1;
1622
1623 if (sc->filtered)
1624 return 0;
1625
1626 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1627 ttrace = thread__trace(thread, trace->output);
1628 if (ttrace == NULL)
1629 return -1;
1630
1631 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1632 ttrace = thread->priv;
1633
1634 if (ttrace->entry_str == NULL) {
1635 ttrace->entry_str = malloc(1024);
1636 if (!ttrace->entry_str)
1637 return -1;
1638 }
1639
1640 ttrace->entry_time = sample->time;
1641 msg = ttrace->entry_str;
1642 printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1643
1644 printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1645 args, trace, thread);
1646
1647 if (!strcmp(sc->name, "exit_group") || !strcmp(sc->name, "exit")) {
1648 if (!trace->duration_filter && !trace->summary_only) {
1649 trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1650 fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1651 }
1652 } else
1653 ttrace->entry_pending = true;
1654
1655 return 0;
1656}
1657
/*
 * raw_syscalls:sys_exit handler: complete (or reconstruct) the line begun
 * by trace__sys_enter(), apply the duration filter, update per-thread
 * statistics and decorate the return value per the syscall's format.
 *
 * Returns 0 on success (or when the syscall is filtered), -1 on error.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   struct perf_sample *sample)
{
	int ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	if (sc->filtered)
		return 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		return -1;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	/*
	 * NOTE(review): the tracepoint field is u64 but ret is int; large
	 * returns (e.g. mmap addresses) get truncated here — confirm.
	 */
	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* A successful open() consumes the path saved by trace__vfs_getname(). */
	if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
		trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
		trace->last_vfs_getname = NULL;
		++trace->stats.vfs_getname;
	}

	ttrace = thread->priv;

	ttrace->exit_time = sample->time;

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out;	/* no entry seen: can't compute duration, so filter out */

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was printed earlier (or lost): mark as a continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Format the return value: errno name, timeout, hex or plain signed. */
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %d", ret);
	} else if (ret < 0 && sc->fmt->errmsg) {
		char bf[256];
		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#x", ret);
	else
		goto signed_print;

	fputc('\n', trace->output);
out:
	ttrace->entry_pending = false;

	return 0;
}
1736
1737static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1738 struct perf_sample *sample)
1739{
1740 trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1741 return 0;
1742}
1743
1744static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1745 struct perf_sample *sample)
1746{
1747 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1748 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1749 struct thread *thread = machine__findnew_thread(trace->host,
1750 sample->pid,
1751 sample->tid);
1752 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1753
1754 if (ttrace == NULL)
1755 goto out_dump;
1756
1757 ttrace->runtime_ms += runtime_ms;
1758 trace->runtime_ms += runtime_ms;
1759 return 0;
1760
1761out_dump:
1762 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1763 evsel->name,
1764 perf_evsel__strval(evsel, sample, "comm"),
1765 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1766 runtime,
1767 perf_evsel__intval(evsel, sample, "vruntime"));
1768 return 0;
1769}
1770
1771static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1772{
1773 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1774 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1775 return false;
1776
1777 if (trace->pid_list || trace->tid_list)
1778 return true;
1779
1780 return false;
1781}
1782
1783static int trace__process_sample(struct perf_tool *tool,
1784 union perf_event *event __maybe_unused,
1785 struct perf_sample *sample,
1786 struct perf_evsel *evsel,
1787 struct machine *machine __maybe_unused)
1788{
1789 struct trace *trace = container_of(tool, struct trace, tool);
1790 int err = 0;
1791
1792 tracepoint_handler handler = evsel->handler;
1793
1794 if (skip_sample(trace, sample))
1795 return 0;
1796
1797 if (!trace->full_time && trace->base_time == 0)
1798 trace->base_time = sample->time;
1799
1800 if (handler) {
1801 ++trace->nr_events;
1802 handler(trace, evsel, sample);
1803 }
1804
1805 return err;
1806}
1807
1808static int parse_target_str(struct trace *trace)
1809{
1810 if (trace->opts.target.pid) {
1811 trace->pid_list = intlist__new(trace->opts.target.pid);
1812 if (trace->pid_list == NULL) {
1813 pr_err("Error parsing process id string\n");
1814 return -EINVAL;
1815 }
1816 }
1817
1818 if (trace->opts.target.tid) {
1819 trace->tid_list = intlist__new(trace->opts.target.tid);
1820 if (trace->tid_list == NULL) {
1821 pr_err("Error parsing thread id string\n");
1822 return -EINVAL;
1823 }
1824 }
1825
1826 return 0;
1827}
1828
/*
 * 'perf trace record': build a 'perf record' argv with the raw syscall
 * tracepoints appended and hand off to cmd_record().
 *
 * Fixes a leak in the previous version: rec_argv was never freed, neither
 * on the "no syscall tracepoints" error path nor after cmd_record().
 */
static int trace__record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	int err;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e",
	};

	/* +1 is for the event string below */
	rec_argc = ARRAY_SIZE(record_args) + 1 + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = record_args[i];

	/* event string may be different for older kernels - e.g., RHEL6 */
	if (is_valid_tracepoint("raw_syscalls:sys_enter"))
		rec_argv[i] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
	else if (is_valid_tracepoint("syscalls:sys_enter"))
		rec_argv[i] = "syscalls:sys_enter,syscalls:sys_exit";
	else {
		pr_err("Neither raw_syscalls nor syscalls events exist.\n");
		free(rec_argv);		/* was leaked on this error path */
		return -1;
	}
	i++;

	/* Append the user's own arguments after ours. */
	for (j = 0; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	err = cmd_record(i, rec_argv, NULL);
	free(rec_argv);
	return err;
}
1867
1868static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
1869
1870static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
1871{
1872 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
1873 if (evsel == NULL)
1874 return;
1875
1876 if (perf_evsel__field(evsel, "pathname") == NULL) {
1877 perf_evsel__delete(evsel);
1878 return;
1879 }
1880
1881 evsel->handler = trace__vfs_getname;
1882 perf_evlist__add(evlist, evsel);
1883}
1884
/*
 * Live tracing main loop: set up the evlist (syscall tracepoints, optional
 * vfs_getname and sched events), optionally fork the workload, then pump
 * the mmap ring buffers until interrupted or the workload exits.
 *
 * Returns 0 on success, negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = perf_evlist__new();
	struct perf_evsel *evsel;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* remaining argv is a workload to launch */

	trace->live = true;

	if (evlist == NULL) {
		fprintf(trace->output, "Not enough memory to run!\n");
		goto out;
	}

	if (perf_evlist__add_syscall_newtp(evlist, trace__sys_enter, trace__sys_exit))
		goto out_error_tp;

	perf_evlist__add_vfs_getname(evlist);

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_tp;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		/* Fork now; the child waits to exec until start_workload(). */
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0) {
		fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
		goto out_delete_evlist;
	}

	perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* Tracing more than one thread? Then the output shows tids. */
	trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			const u32 type = event->header.type;
			tracepoint_handler handler;
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			/* First sample establishes the relative-time origin. */
			if (!trace->full_time && trace->base_time == 0)
				trace->base_time = sample.time;

			/* Side-band events (mmap, comm, exit, ...) update the machine. */
			if (type != PERF_RECORD_SAMPLE) {
				trace__process_event(trace, trace->host, event, &sample);
				continue;
			}

			evsel = perf_evlist__id2evsel(evlist, sample.id);
			if (evsel == NULL) {
				fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id);
				goto next_event;
			}

			if (sample.raw_data == NULL) {
				fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
				       perf_evsel__name(evsel), sample.tid,
				       sample.cpu, sample.raw_size);
				goto next_event;
			}

			handler = evsel->handler;
			handler(trace, evsel, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;
		}
	}

	/*
	 * Nothing consumed this pass: poll for more (with a short timeout
	 * when 'done' so we drain remaining events and then fall through).
	 */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (poll(evlist->pollfd, evlist->nr_fds, timeout) > 0)
			goto again;
	} else {
		goto again;
	}

out_disable:
	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
out:
	trace->live = false;
	return err;
/*
 * Out-of-line error handling: this block after the return is reachable
 * only through its goto labels; the braces merely scope errbuf.
 */
{
	char errbuf[BUFSIZ];

out_error_tp:
	perf_evlist__strerror_tp(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;
}
}
2047
2048static int trace__replay(struct trace *trace)
2049{
2050 const struct perf_evsel_str_handler handlers[] = {
2051 { "probe:vfs_getname", trace__vfs_getname, },
2052 };
2053 struct perf_data_file file = {
2054 .path = input_name,
2055 .mode = PERF_DATA_MODE_READ,
2056 };
2057 struct perf_session *session;
2058 struct perf_evsel *evsel;
2059 int err = -1;
2060
2061 trace->tool.sample = trace__process_sample;
2062 trace->tool.mmap = perf_event__process_mmap;
2063 trace->tool.mmap2 = perf_event__process_mmap2;
2064 trace->tool.comm = perf_event__process_comm;
2065 trace->tool.exit = perf_event__process_exit;
2066 trace->tool.fork = perf_event__process_fork;
2067 trace->tool.attr = perf_event__process_attr;
2068 trace->tool.tracing_data = perf_event__process_tracing_data;
2069 trace->tool.build_id = perf_event__process_build_id;
2070
2071 trace->tool.ordered_samples = true;
2072 trace->tool.ordering_requires_timestamps = true;
2073
2074 /* add tid to output */
2075 trace->multiple_threads = true;
2076
2077 if (symbol__init() < 0)
2078 return -1;
2079
2080 session = perf_session__new(&file, false, &trace->tool);
2081 if (session == NULL)
2082 return -ENOMEM;
2083
2084 trace->host = &session->machines.host;
2085
2086 err = perf_session__set_tracepoints_handlers(session, handlers);
2087 if (err)
2088 goto out;
2089
2090 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2091 "raw_syscalls:sys_enter");
2092 /* older kernels have syscalls tp versus raw_syscalls */
2093 if (evsel == NULL)
2094 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2095 "syscalls:sys_enter");
2096 if (evsel == NULL) {
2097 pr_err("Data file does not have raw_syscalls:sys_enter event\n");
2098 goto out;
2099 }
2100
2101 if (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2102 perf_evsel__init_sc_tp_ptr_field(evsel, args)) {
2103 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2104 goto out;
2105 }
2106
2107 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2108 "raw_syscalls:sys_exit");
2109 if (evsel == NULL)
2110 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2111 "syscalls:sys_exit");
2112 if (evsel == NULL) {
2113 pr_err("Data file does not have raw_syscalls:sys_exit event\n");
2114 goto out;
2115 }
2116
2117 if (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2118 perf_evsel__init_sc_tp_uint_field(evsel, ret)) {
2119 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2120 goto out;
2121 }
2122
2123 err = parse_target_str(trace);
2124 if (err != 0)
2125 goto out;
2126
2127 setup_pager();
2128
2129 err = perf_session__process_events(session, &trace->tool);
2130 if (err)
2131 pr_err("Failed to process events, error %d", err);
2132
2133 else if (trace->summary)
2134 trace__fprintf_thread_summary(trace, trace->output);
2135
2136out:
2137 perf_session__delete(session);
2138
2139 return err;
2140}
2141
/* Print the banner preceding the per-thread summary; returns chars written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2150
2151static size_t thread__dump_stats(struct thread_trace *ttrace,
2152 struct trace *trace, FILE *fp)
2153{
2154 struct stats *stats;
2155 size_t printed = 0;
2156 struct syscall *sc;
2157 struct int_node *inode = intlist__first(ttrace->syscall_stats);
2158
2159 if (inode == NULL)
2160 return 0;
2161
2162 printed += fprintf(fp, "\n");
2163
2164 printed += fprintf(fp, " syscall calls min avg max stddev\n");
2165 printed += fprintf(fp, " (msec) (msec) (msec) (%%)\n");
2166 printed += fprintf(fp, " --------------- -------- --------- --------- --------- ------\n");
2167
2168 /* each int_node is a syscall */
2169 while (inode) {
2170 stats = inode->priv;
2171 if (stats) {
2172 double min = (double)(stats->min) / NSEC_PER_MSEC;
2173 double max = (double)(stats->max) / NSEC_PER_MSEC;
2174 double avg = avg_stats(stats);
2175 double pct;
2176 u64 n = (u64) stats->n;
2177
2178 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2179 avg /= NSEC_PER_MSEC;
2180
2181 sc = &trace->syscalls.table[inode->i];
2182 printed += fprintf(fp, " %-15s", sc->name);
2183 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2184 n, min, avg);
2185 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2186 }
2187
2188 inode = intlist__next(inode);
2189 }
2190
2191 printed += fprintf(fp, "\n\n");
2192
2193 return printed;
2194}
2195
/* struct used to pass data to per-thread function */
struct summary_data {
	FILE *fp;		/* destination stream for the summary */
	struct trace *trace;	/* global state: event totals, syscall table */
	size_t printed;		/* running count of characters printed */
};
2202
2203static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2204{
2205 struct summary_data *data = priv;
2206 FILE *fp = data->fp;
2207 size_t printed = data->printed;
2208 struct trace *trace = data->trace;
2209 struct thread_trace *ttrace = thread->priv;
2210 double ratio;
2211
2212 if (ttrace == NULL)
2213 return 0;
2214
2215 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2216
2217 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2218 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2219 printed += fprintf(fp, "%.1f%%", ratio);
2220 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2221 printed += thread__dump_stats(ttrace, trace, fp);
2222
2223 data->printed += printed;
2224
2225 return 0;
2226}
2227
2228static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2229{
2230 struct summary_data data = {
2231 .fp = fp,
2232 .trace = trace
2233 };
2234 data.printed = trace__fprintf_threads_header(fp);
2235
2236 machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2237
2238 return data.printed;
2239}
2240
2241static int trace__set_duration(const struct option *opt, const char *str,
2242 int unset __maybe_unused)
2243{
2244 struct trace *trace = opt->value;
2245
2246 trace->duration_filter = atof(str);
2247 return 0;
2248}
2249
2250static int trace__open_output(struct trace *trace, const char *filename)
2251{
2252 struct stat st;
2253
2254 if (!stat(filename, &st) && st.st_size) {
2255 char oldname[PATH_MAX];
2256
2257 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2258 unlink(oldname);
2259 rename(filename, oldname);
2260 }
2261
2262 trace->output = fopen(filename, "w");
2263
2264 return trace->output == NULL ? -errno : 0;
2265}
2266
/*
 * Entry point for 'perf trace': parse options, open the output file,
 * build the event qualifier list, validate the target and dispatch to
 * either live tracing (trace__run) or replay (trace__replay).
 */
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char * const trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.audit = {
			.machine = audit_detect_machine(),
			.open_id = audit_name_to_syscall("open", trace.audit.machine),
		},
		.syscalls = {
			/* -1: the per-id table hasn't been populated yet */
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = 1024,
		},
		.output = stdout,
		.show_comm = true,
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
		    "list of events to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_END()
	};
	int err;
	char bf[BUFSIZ];

	/* 'perf trace record ...' delegates to 'perf record'. */
	if ((argc > 1) && (strcmp(argv[1], "record") == 0))
		return trace__record(argc-2, &argv[2]);

	argc = parse_options(argc, argv, trace_options, trace_usage, 0);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* -e '!...' negates the qualifier: trace everything but the list. */
	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;

		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(true, s);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target specified: default to system wide. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
2397}