v3.1
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#define _FILE_OFFSET_BITS 64

#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

static u64			user_interval			= ULLONG_MAX;
static u64			default_interval		=      0;

static unsigned int		page_size;
static unsigned int		mmap_pages			= UINT_MAX;
static unsigned int		user_freq			= UINT_MAX;
static int			freq				=   1000;
static int			output;
static int			pipe_output			=      0;
static const char		*output_name			= NULL;
static bool			group				=  false;
static int			realtime_prio			=      0;
static bool			nodelay				=  false;
static bool			raw_samples			=  false;
static bool			sample_id_all_avail		=   true;
static bool			system_wide			=  false;
static pid_t			target_pid			=     -1;
static pid_t			target_tid			=     -1;
static pid_t			child_pid			=     -1;
static bool			no_inherit			=  false;
static enum write_mode_t	write_mode			= WRITE_FORCE;
static bool			call_graph			=  false;
static bool			inherit_stat			=  false;
static bool			no_samples			=  false;
static bool			sample_address			=  false;
static bool			sample_time			=  false;
static bool			no_buildid			=  false;
static bool			no_buildid_cache		=  false;
static struct perf_evlist	*evsel_list;

static long			samples				=      0;
static u64			bytes_written			=      0;

static int			file_new			=      1;
static off_t			post_processing_offset;

static struct perf_session	*session;
static const char		*cpu_list;

static void advance_output(size_t size)
{
	bytes_written += size;
}

static void write_output(void *buf, size_t size)
{
	while (size) {
		int ret = write(output, buf, size);

		if (ret < 0)
			die("failed to write");

		size -= ret;
		buf += ret;

		bytes_written += ret;
	}
}

static int process_synthesized_event(union perf_event *event,
				     struct perf_sample *sample __used,
				     struct perf_session *self __used)
{
	write_output(event, event->header.size);
	return 0;
}

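/*
 * Drain one mmap'd ring buffer into the output file. The kernel
 * advances 'head' as it produces samples; md->prev is the position we
 * last consumed up to. When the unread region wraps past the end of
 * the buffer, it is written out in two chunks. Writing the tail back
 * tells the kernel that the space can be reused.
 */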
static void mmap_read(struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;

	if (old == head)
		return;

	samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		write_output(buf, size);
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	write_output(buf, size);

	md->prev = old;
	perf_mmap__write_tail(md, old);
}

static volatile int done = 0;
static volatile int signr = -1;

static void sig_handler(int sig)
{
	done = 1;
	signr = sig;
}

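/*
 * atexit handler: tear down the forked workload, then re-raise any
 * fatal signal with its default disposition so that our exit status
 * reflects how we were killed. SIGUSR1 is the child telling us its
 * exec failed, so it is not re-raised.
 */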
static void sig_atexit(void)
{
	if (child_pid > 0)
		kill(child_pid, SIGTERM);

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
{
	struct perf_event_attr *attr = &evsel->attr;
	int track = !evsel->idx; /* only the first counter needs these */

	attr->disabled		= 1;
	attr->inherit		= !no_inherit;
	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
				  PERF_FORMAT_TOTAL_TIME_RUNNING |
				  PERF_FORMAT_ID;

	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;

	if (evlist->nr_entries > 1)
		attr->sample_type |= PERF_SAMPLE_ID;

	/*
	 * We default some events to a 1 default interval. But keep
	 * it a weak assumption overridable by the user.
	 */
	if (!attr->sample_period || (user_freq != UINT_MAX &&
				     user_interval != ULLONG_MAX)) {
		if (freq) {
			attr->sample_type	|= PERF_SAMPLE_PERIOD;
			attr->freq		= 1;
			attr->sample_freq	= freq;
		} else {
			attr->sample_period = default_interval;
		}
	}

	if (no_samples)
		attr->sample_freq = 0;

	if (inherit_stat)
		attr->inherit_stat = 1;

	if (sample_address) {
		attr->sample_type	|= PERF_SAMPLE_ADDR;
		attr->mmap_data = track;
	}

	if (call_graph)
		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;

	if (system_wide)
		attr->sample_type	|= PERF_SAMPLE_CPU;

	if (sample_id_all_avail &&
	    (sample_time || system_wide || !no_inherit || cpu_list))
		attr->sample_type	|= PERF_SAMPLE_TIME;

	if (raw_samples) {
		attr->sample_type	|= PERF_SAMPLE_TIME;
		attr->sample_type	|= PERF_SAMPLE_RAW;
		attr->sample_type	|= PERF_SAMPLE_CPU;
	}

	if (nodelay) {
		attr->watermark = 0;
		attr->wakeup_events = 1;
	}

	attr->mmap		= track;
	attr->comm		= track;

	if (target_pid == -1 && target_tid == -1 && !system_wide) {
		attr->disabled = 1;
		attr->enable_on_exec = 1;
	}
}

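/*
 * Compare event attributes pairwise so that an append (-A) only
 * proceeds when the new event list matches the one already recorded
 * in the perf.data header.
 */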
static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

	pair = list_entry(other->entries.next, struct perf_evsel, node);

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
			return false;
		pair = list_entry(pair->node.next, struct perf_evsel, node);
	}

	return true;
}

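/*
 * Configure and open one fd per event/cpu/thread, degrading
 * gracefully: older kernels without attr->sample_id_all get a retry
 * with that bit cleared, and an unsupported hardware cycles event
 * falls back to the hrtimer-based cpu-clock software counter.
 */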
static void open_counters(struct perf_evlist *evlist)
{
	struct perf_evsel *pos;

	if (evlist->cpus->map[0] < 0)
		no_inherit = true;

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

		config_attr(pos, evlist);
retry_sample_id:
		attr->sample_id_all = sample_id_all_avail ? 1 : 0;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__warning_paranoid();
				exit(EXIT_FAILURE);
			} else if (err == ENODEV && cpu_list) {
				die("No such device - did you specify"
					" an out-of-range profile CPU?\n");
			} else if (err == EINVAL && sample_id_all_avail) {
				/*
				 * Old kernel, no attr->sample_id_type_all field
				 */
				sample_id_all_avail = false;
				if (!sample_time && !raw_samples && !time_needed)
					attr->sample_type &= ~PERF_SAMPLE_TIME;

				goto retry_sample_id;
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support:
			 */
			if (attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				goto try_again;
			}

			if (err == ENOENT) {
				ui__warning("The %s event is not supported.\n",
					    event_name(pos));
				exit(EXIT_FAILURE);
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
			      err, strerror(err));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
				die("No hardware sampling interrupt available."
				    " No APIC? If so then you can boot the kernel"
				    " with the \"lapic\" boot parameter to"
				    " force-enable it.\n");
#endif

			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
		}
	}

	if (perf_evlist__set_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		exit(-1);
	}

	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
		die("failed to mmap with %d (%s)\n", errno, strerror(errno));

	if (file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			exit(-1);
		}
	}

	perf_session__update_sample_type(session);
}

static int process_buildids(void)
{
	u64 size = lseek(output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	session->fd = output;
	return __perf_session__process_events(session, post_processing_offset,
					      size - post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

static void atexit_header(void)
{
	if (!pipe_output) {
		session->header.data_size += bytes_written;

		if (!no_buildid)
			process_buildids();
		perf_session__write_header(session, evsel_list, output, true);
		perf_session__delete(session);
		perf_evlist__delete(evsel_list);
		symbol__exit();
	}
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_session *psession = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(process_synthesized_event,
					     psession, machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
						 psession, machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
							 psession, machine,
							 "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void mmap_read_all(void)
{
	int i;

	for (i = 0; i < evsel_list->nr_mmaps; i++) {
		if (evsel_list->mmap[i].base)
			mmap_read(&evsel_list->mmap[i]);
	}

	if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
		write_output(&finished_round_event, sizeof(finished_round_event));
}

static int __cmd_record(int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err;
	unsigned long waking = 0;
	int child_ready_pipe[2], go_pipe[2];
	const bool forks = argc > 0;
	char buf;
	struct machine *machine;

	page_size = sysconf(_SC_PAGE_SIZE);

	atexit(sig_atexit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
		perror("failed to create pipes");
		exit(-1);
	}

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			pipe_output = 1;
		else
			output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			pipe_output = 1;
		else if (!stat(output_name, &st) && st.st_size) {
			if (write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (write_mode == WRITE_APPEND) {
			write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (write_mode == WRITE_APPEND)
		file_new = 0;
	else
		flags |= O_TRUNC;

	if (pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

	session = perf_session__new(output_name, O_WRONLY,
				    write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	if (!no_buildid)
		perf_header__set_feat(&session->header, HEADER_BUILD_ID);

	if (!file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (have_tracepoints(&evsel_list->entries))
		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);

	/* 512 kiB: default amount of unprivileged mlocked memory */
	if (mmap_pages == UINT_MAX)
		mmap_pages = (512 * 1024) / page_size;

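	/*
	 * Fork the workload first but keep it parked: the child signals
	 * readiness by closing child_ready_pipe and then blocks reading
	 * go_pipe, which the parent closes only after the counters are
	 * set up (see "Let the child rip" below), so with
	 * enable_on_exec no events are lost at startup.
	 */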
	if (forks) {
		child_pid = fork();
		if (child_pid < 0) {
			perror("failed to fork");
			exit(-1);
		}

		if (!child_pid) {
			if (pipe_output)
				dup2(2, 1);
			close(child_ready_pipe[0]);
			close(go_pipe[1]);
			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);

			/*
			 * Do a dummy execvp to get the PLT entry resolved,
			 * so we avoid the resolver overhead on the real
			 * execvp call.
			 */
			execvp("", (char **)argv);

			/*
			 * Tell the parent we're ready to go
			 */
			close(child_ready_pipe[1]);

			/*
			 * Wait until the parent tells us to go.
			 */
			if (read(go_pipe[0], &buf, 1) == -1)
				perror("unable to read pipe");

			execvp(argv[0], (char **)argv);

			perror(argv[0]);
			kill(getppid(), SIGUSR1);
			exit(-1);
		}

		if (!system_wide && target_tid == -1 && target_pid == -1)
			evsel_list->threads->map[0] = child_pid;

		close(child_ready_pipe[1]);
		close(go_pipe[0]);
		/*
		 * wait for child to settle
		 */
		if (read(child_ready_pipe[0], &buf, 1) == -1) {
			perror("unable to read pipe");
			exit(-1);
		}
		close(child_ready_pipe[0]);
	}

	open_counters(evsel_list);

	/*
	 * perf_session__delete(session) will be called at atexit_header()
	 */
	atexit(atexit_header);

	if (pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			return err;
	} else if (file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			return err;
	}

	post_processing_offset = lseek(output, 0, SEEK_CUR);

	if (pipe_output) {
		err = perf_session__synthesize_attrs(session,
						     process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			return err;
		}

		err = perf_event__synthesize_event_types(process_synthesized_event,
							 session);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			return err;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(output, evsel_list,
								  process_synthesized_event,
								  session);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				return err;
			}
			advance_output(err);
		}
	}

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		return -1;
	}

	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
						 session, machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
							 session, machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(process_synthesized_event,
					     session, machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session,
					       perf_event__synthesize_guest_os);

	if (!system_wide)
		perf_event__synthesize_thread_map(evsel_list->threads,
						  process_synthesized_event,
						  session);
	else
		perf_event__synthesize_threads(process_synthesized_event,
					       session);

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		close(go_pipe[1]);

	for (;;) {
		int hits = samples;

		mmap_read_all();

		if (hits == samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		if (done)
			perf_evlist__disable(evsel_list);
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
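	/*
	 * (Roughly: an 8-byte perf_event_header plus an 8-byte IP and a
	 * 4-byte pid and tid for the default IP|TID sample layout.)
	 */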
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)bytes_written / 1024.0 / 1024.0,
		output_name,
		bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

static bool force, append_file;

const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &evsel_list, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
	OPT_INTEGER('p', "pid", &target_pid,
		    "record events on existing process id"),
	OPT_INTEGER('t', "tid", &target_tid,
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &nodelay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &user_interval, "event period to sample"),
	OPT_STRING('o', "output", &output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &group,
		    "put the counters into a counter group"),
	OPT_BOOLEAN('g', "call-graph", &call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
	OPT_BOOLEAN('n', "no-samples", &no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_END()
};

int cmd_record(int argc, const char **argv, const char *prefix __used)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && target_pid == -1 && target_tid == -1 &&
		!system_wide && !cpu_list)
		usage_with_options(record_usage, record_options);

	if (force && append_file) {
		fprintf(stderr, "Can't overwrite and append at the same time."
				" You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (append_file) {
		write_mode = WRITE_APPEND;
	} else {
		write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !system_wide) {
		fprintf(stderr, "cgroup monitoring only available in"
			" system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (no_buildid_cache || no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	if (target_pid != -1)
		target_tid = target_pid;

	if (perf_evlist__create_maps(evsel_list, target_pid,
				     target_tid, cpu_list) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
					 evsel_list->threads->nr) < 0)
			goto out_free_fd;
		if (perf_header__push_event(pos->attr.config, event_name(pos)))
			goto out_free_fd;
	}

	if (perf_evlist__alloc_pollfd(evsel_list) < 0)
		goto out_free_fd;

	if (user_interval != ULLONG_MAX)
		default_interval = user_interval;
	if (user_freq != UINT_MAX)
		freq = user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (default_interval)
		freq = 0;
	else if (freq) {
		default_interval = freq;
	} else {
		fprintf(stderr, "frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}

v3.5.6
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#define _FILE_OFFSET_BITS 64

#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

struct perf_record {
	struct perf_tool	tool;
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;
	bool			append_file;
	long			samples;
	off_t			post_processing_offset;
};

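/*
 * In this version the former file-scope globals of v3.1 live in a
 * single perf_record instance that is passed around explicitly;
 * callbacks recover it from the embedded perf_tool via container_of()
 * (see process_synthesized_event below).
 */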
static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}

static void write_output(struct perf_record *rec, void *buf, size_t size)
{
	while (size) {
		int ret = write(rec->output, buf, size);

		if (ret < 0)
			die("failed to write");

		size -= ret;
		buf += ret;

		rec->bytes_written += ret;
	}
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __used,
				     struct machine *machine __used)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	write_output(rec, event, event->header.size);
	return 0;
}

static void perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;

	if (old == head)
		return;

	rec->samples++;

	size = head - old;

	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		write_output(rec, buf, size);
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	write_output(rec, buf, size);

	md->prev = old;
	perf_mmap__write_tail(md, old);
}

static volatile int done = 0;
static volatile int signr = -1;
static volatile int child_finished = 0;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
	signr = sig;
}

static void perf_record__sig_exit(int exit_status __used, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

	pair = list_entry(other->entries.next, struct perf_evsel, node);

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
			return false;
		pair = list_entry(pair->node.next, struct perf_evsel, node);
	}

	return true;
}

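/*
 * As in the v3.1 open_counters() above, open one fd per
 * event/cpu/thread with a ladder of fallbacks for older kernels; this
 * version additionally retries without the exclude_guest/exclude_host
 * bits and handles the PPC ENXIO case for the cycles event.
 */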
static void perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos, *first;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;

	first = list_entry(evlist->entries.next, struct perf_evsel, node);

	perf_evlist__config_attrs(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		struct xyarray *group_fd = NULL;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

		if (opts->group && pos != first)
			group_fd = first->fd;
fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
				     opts->group, group_fd) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__error_paranoid();
				exit(EXIT_FAILURE);
			} else if (err == ENODEV && opts->target.cpu_list) {
				die("No such device - did you specify"
					" an out-of-range profile CPU?\n");
			} else if (err == EINVAL) {
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
			 */
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
				goto try_again;
			}

			if (err == ENOENT) {
				ui__error("The %s event is not supported.\n",
					    event_name(pos));
				exit(EXIT_FAILURE);
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
			      err, strerror(err));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
				die("No hardware sampling interrupt available."
				    " No APIC? If so then you can boot the kernel"
				    " with the \"lapic\" boot parameter to"
				    " force-enable it.\n");
#endif

			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
		}
	}

	if (perf_evlist__set_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		exit(-1);
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM)
			die("Permission error mapping pages.\n"
			    "Consider increasing "
			    "/proc/sys/kernel/perf_event_mlock_kb,\n"
			    "or try again with a smaller value of -m/--mmap_pages.\n"
			    "(current value: %d)\n", opts->mmap_pages);
		else if (!is_power_of_2(opts->mmap_pages))
			die("--mmap_pages/-m value must be a power of two.");

		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
	}

	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			exit(-1);
		}
	}

	perf_session__update_sample_type(session);
}

static int process_buildids(struct perf_record *rec)
{
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

static void perf_record__exit(int status __used, void *arg)
{
	struct perf_record *rec = arg;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	if (machine__is_host(machine))
		return;

	/*
	 * As for guest kernel when processing subcommand record&report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static void perf_record__mmap_read_all(struct perf_record *rec)
{
	int i;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base)
			perf_record__mmap_read(rec, &rec->evlist->mmap[i]);
	}

	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
		write_output(rec, &finished_round_event, sizeof(finished_round_event));
}

static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

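	/*
	 * Start with every header feature bit set, then clear the ones
	 * this session will not actually produce.
	 */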
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	perf_record__open(rec);

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			return err;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			return err;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		return -1;
	}

	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		return -1;
	}

	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			return err;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			return err;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				return err;
			}
			advance_output(rec, err);
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	if (!opts->target.system_wide)
		perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

	for (;;) {
		int hits = rec->samples;

		perf_record__mmap_read_all(rec);

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		if (done)
			perf_evlist__disable(evsel_list);
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

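/*
 * Parse a comma-separated list of branch filter names (e.g.
 * "any_call,u") into a PERF_SAMPLE_BRANCH_* mask for -b/-j. If only
 * privilege-level bits (u/k/hv) are given, the branch type defaults
 * to "any".
 */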
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/*
 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
 * because we need to have access to it in perf_record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'ing 'perf record',
 * using pipes, etc.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};

int cmd_record(int argc, const char **argv, const char *prefix __used)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	perf_header__set_cmdline(argc, argv);

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, event_name(pos)))
			goto out_free_fd;
	}

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}