v4.10.11
    1/*
    2 * Performance events core code:
    3 *
    4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
    5 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
    6 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
    7 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
    8 *
    9 * For licensing details see kernel-base/COPYING
   10 */
   11
   12#include <linux/fs.h>
   13#include <linux/mm.h>
   14#include <linux/cpu.h>
   15#include <linux/smp.h>
   16#include <linux/idr.h>
   17#include <linux/file.h>
   18#include <linux/poll.h>
   19#include <linux/slab.h>
   20#include <linux/hash.h>
   21#include <linux/tick.h>
   22#include <linux/sysfs.h>
   23#include <linux/dcache.h>
   24#include <linux/percpu.h>
   25#include <linux/ptrace.h>
   26#include <linux/reboot.h>
   27#include <linux/vmstat.h>
   28#include <linux/device.h>
   29#include <linux/export.h>
   30#include <linux/vmalloc.h>
   31#include <linux/hardirq.h>
   32#include <linux/rculist.h>
   33#include <linux/uaccess.h>
   34#include <linux/syscalls.h>
   35#include <linux/anon_inodes.h>
   36#include <linux/kernel_stat.h>
   37#include <linux/cgroup.h>
   38#include <linux/perf_event.h>
   39#include <linux/trace_events.h>
   40#include <linux/hw_breakpoint.h>
   41#include <linux/mm_types.h>
   42#include <linux/module.h>
   43#include <linux/mman.h>
   44#include <linux/compat.h>
   45#include <linux/bpf.h>
   46#include <linux/filter.h>
   47#include <linux/namei.h>
   48#include <linux/parser.h>
   49
   50#include "internal.h"
   51
   52#include <asm/irq_regs.h>
   53
   54typedef int (*remote_function_f)(void *);
   55
   56struct remote_function_call {
   57	struct task_struct	*p;
   58	remote_function_f	func;
   59	void			*info;
   60	int			ret;
   61};
   62
   63static void remote_function(void *data)
   64{
   65	struct remote_function_call *tfc = data;
   66	struct task_struct *p = tfc->p;
   67
   68	if (p) {
   69		/* -EAGAIN */
   70		if (task_cpu(p) != smp_processor_id())
   71			return;
   72
   73		/*
    74		 * Now that we're on the right CPU with IRQs disabled, we can test
   75		 * if we hit the right task without races.
   76		 */
   77
   78		tfc->ret = -ESRCH; /* No such (running) process */
   79		if (p != current)
   80			return;
   81	}
   82
   83	tfc->ret = tfc->func(tfc->info);
   84}
   85
   86/**
   87 * task_function_call - call a function on the cpu on which a task runs
   88 * @p:		the task to evaluate
   89 * @func:	the function to be called
   90 * @info:	the function call argument
   91 *
   92 * Calls the function @func when the task is currently running. This might
    93 * be on the current CPU, in which case the function is called directly.
   94 *
   95 * returns: @func return value, or
   96 *	    -ESRCH  - when the process isn't running
   97 *	    -EAGAIN - when the process moved away
   98 */
   99static int
  100task_function_call(struct task_struct *p, remote_function_f func, void *info)
  101{
  102	struct remote_function_call data = {
  103		.p	= p,
  104		.func	= func,
  105		.info	= info,
  106		.ret	= -EAGAIN,
  107	};
  108	int ret;
  109
  110	do {
  111		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
  112		if (!ret)
  113			ret = data.ret;
  114	} while (ret == -EAGAIN);
  115
  116	return ret;
  117}
  118
  119/**
  120 * cpu_function_call - call a function on the cpu
  121 * @func:	the function to be called
  122 * @info:	the function call argument
  123 *
  124 * Calls the function @func on the remote cpu.
  125 *
  126 * returns: @func return value or -ENXIO when the cpu is offline
  127 */
  128static int cpu_function_call(int cpu, remote_function_f func, void *info)
  129{
  130	struct remote_function_call data = {
  131		.p	= NULL,
  132		.func	= func,
  133		.info	= info,
  134		.ret	= -ENXIO, /* No such CPU */
  135	};
  136
  137	smp_call_function_single(cpu, remote_function, &data, 1);
  138
  139	return data.ret;
  140}
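
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a caller wraps its work in a remote_function_f callback and lets the two
 * helpers above pick the target CPU. Both the callback and the wrapper
 * below are hypothetical.
 */
static int example_remote_cb(void *info)
{
	int *counter = info;

	/* Runs on the target CPU, in IPI context with IRQs disabled. */
	(*counter)++;
	return 0;
}

static void __maybe_unused example_remote_calls(struct task_struct *p, int cpu)
{
	int counter = 0;

	/* Run the callback on whichever CPU @p is currently running on. */
	task_function_call(p, example_remote_cb, &counter);

	/* Run the callback on a fixed CPU; -ENXIO is returned if it is offline. */
	cpu_function_call(cpu, example_remote_cb, &counter);
}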
  141
  142static inline struct perf_cpu_context *
  143__get_cpu_context(struct perf_event_context *ctx)
  144{
  145	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
  146}
  147
  148static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
  149			  struct perf_event_context *ctx)
  150{
  151	raw_spin_lock(&cpuctx->ctx.lock);
  152	if (ctx)
  153		raw_spin_lock(&ctx->lock);
  154}
  155
  156static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
  157			    struct perf_event_context *ctx)
  158{
  159	if (ctx)
  160		raw_spin_unlock(&ctx->lock);
  161	raw_spin_unlock(&cpuctx->ctx.lock);
  162}
  163
  164#define TASK_TOMBSTONE ((void *)-1L)
  165
  166static bool is_kernel_event(struct perf_event *event)
  167{
  168	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
  169}
  170
  171/*
  172 * On task ctx scheduling...
  173 *
  174 * When !ctx->nr_events a task context will not be scheduled. This means
  175 * we can disable the scheduler hooks (for performance) without leaving
  176 * pending task ctx state.
  177 *
  178 * This however results in two special cases:
  179 *
   180 *  - removing the last event from a task ctx; this is relatively
   181 *    straightforward and is done in __perf_remove_from_context.
  182 *
  183 *  - adding the first event to a task ctx; this is tricky because we cannot
  184 *    rely on ctx->is_active and therefore cannot use event_function_call().
  185 *    See perf_install_in_context().
  186 *
  187 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
  188 */
  189
  190typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
  191			struct perf_event_context *, void *);
  192
  193struct event_function_struct {
  194	struct perf_event *event;
  195	event_f func;
  196	void *data;
  197};
  198
  199static int event_function(void *info)
  200{
  201	struct event_function_struct *efs = info;
  202	struct perf_event *event = efs->event;
  203	struct perf_event_context *ctx = event->ctx;
  204	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  205	struct perf_event_context *task_ctx = cpuctx->task_ctx;
  206	int ret = 0;
  207
  208	WARN_ON_ONCE(!irqs_disabled());
  209
  210	perf_ctx_lock(cpuctx, task_ctx);
  211	/*
  212	 * Since we do the IPI call without holding ctx->lock things can have
  213	 * changed, double check we hit the task we set out to hit.
  214	 */
  215	if (ctx->task) {
  216		if (ctx->task != current) {
  217			ret = -ESRCH;
  218			goto unlock;
  219		}
  220
  221		/*
  222		 * We only use event_function_call() on established contexts,
  223		 * and event_function() is only ever called when active (or
  224		 * rather, we'll have bailed in task_function_call() or the
  225		 * above ctx->task != current test), therefore we must have
  226		 * ctx->is_active here.
  227		 */
  228		WARN_ON_ONCE(!ctx->is_active);
  229		/*
  230		 * And since we have ctx->is_active, cpuctx->task_ctx must
  231		 * match.
  232		 */
  233		WARN_ON_ONCE(task_ctx != ctx);
  234	} else {
  235		WARN_ON_ONCE(&cpuctx->ctx != ctx);
  236	}
  237
  238	efs->func(event, cpuctx, ctx, efs->data);
  239unlock:
  240	perf_ctx_unlock(cpuctx, task_ctx);
  241
  242	return ret;
  243}
  244
  245static void event_function_call(struct perf_event *event, event_f func, void *data)
  246{
  247	struct perf_event_context *ctx = event->ctx;
  248	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
  249	struct event_function_struct efs = {
  250		.event = event,
  251		.func = func,
  252		.data = data,
  253	};
  254
  255	if (!event->parent) {
  256		/*
  257		 * If this is a !child event, we must hold ctx::mutex to
   258		 * stabilize the event->ctx relation. See
  259		 * perf_event_ctx_lock().
  260		 */
  261		lockdep_assert_held(&ctx->mutex);
  262	}
  263
  264	if (!task) {
  265		cpu_function_call(event->cpu, event_function, &efs);
  266		return;
  267	}
  268
  269	if (task == TASK_TOMBSTONE)
  270		return;
  271
  272again:
  273	if (!task_function_call(task, event_function, &efs))
  274		return;
  275
  276	raw_spin_lock_irq(&ctx->lock);
  277	/*
  278	 * Reload the task pointer, it might have been changed by
  279	 * a concurrent perf_event_context_sched_out().
  280	 */
  281	task = ctx->task;
  282	if (task == TASK_TOMBSTONE) {
  283		raw_spin_unlock_irq(&ctx->lock);
  284		return;
  285	}
  286	if (ctx->is_active) {
  287		raw_spin_unlock_irq(&ctx->lock);
  288		goto again;
  289	}
  290	func(event, NULL, ctx, data);
  291	raw_spin_unlock_irq(&ctx->lock);
  292}
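
/*
 * Illustrative sketch (editorial addition): callbacks passed to
 * event_function_call() run on the context's CPU with ctx->lock held (and
 * cpuctx->ctx.lock as well when an IPI was needed). __perf_event_disable()
 * later in this file is a real user; the callback and wrapper below are
 * hypothetical.
 */
static void example_event_func(struct perf_event *event,
			       struct perf_cpu_context *cpuctx,
			       struct perf_event_context *ctx,
			       void *data)
{
	lockdep_assert_held(&ctx->lock);
	/*
	 * Inspect or modify @event here. @cpuctx may be NULL on the
	 * inactive-context fallback path of event_function_call().
	 */
}

static void __maybe_unused example_event_call(struct perf_event *event)
{
	/* A non-child event requires event->ctx->mutex, see above. */
	event_function_call(event, example_event_func, NULL);
}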
  293
  294/*
  295 * Similar to event_function_call() + event_function(), but hard assumes IRQs
  296 * are already disabled and we're on the right CPU.
  297 */
  298static void event_function_local(struct perf_event *event, event_f func, void *data)
  299{
  300	struct perf_event_context *ctx = event->ctx;
  301	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  302	struct task_struct *task = READ_ONCE(ctx->task);
  303	struct perf_event_context *task_ctx = NULL;
  304
  305	WARN_ON_ONCE(!irqs_disabled());
  306
  307	if (task) {
  308		if (task == TASK_TOMBSTONE)
  309			return;
  310
  311		task_ctx = ctx;
  312	}
  313
  314	perf_ctx_lock(cpuctx, task_ctx);
  315
  316	task = ctx->task;
  317	if (task == TASK_TOMBSTONE)
  318		goto unlock;
  319
  320	if (task) {
  321		/*
  322		 * We must be either inactive or active and the right task,
  323		 * otherwise we're screwed, since we cannot IPI to somewhere
  324		 * else.
  325		 */
  326		if (ctx->is_active) {
  327			if (WARN_ON_ONCE(task != current))
  328				goto unlock;
  329
  330			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
  331				goto unlock;
  332		}
  333	} else {
  334		WARN_ON_ONCE(&cpuctx->ctx != ctx);
  335	}
  336
  337	func(event, cpuctx, ctx, data);
  338unlock:
  339	perf_ctx_unlock(cpuctx, task_ctx);
  340}
  341
  342#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
  343		       PERF_FLAG_FD_OUTPUT  |\
  344		       PERF_FLAG_PID_CGROUP |\
  345		       PERF_FLAG_FD_CLOEXEC)
  346
  347/*
  348 * branch priv levels that need permission checks
  349 */
  350#define PERF_SAMPLE_BRANCH_PERM_PLM \
  351	(PERF_SAMPLE_BRANCH_KERNEL |\
  352	 PERF_SAMPLE_BRANCH_HV)
  353
  354enum event_type_t {
  355	EVENT_FLEXIBLE = 0x1,
  356	EVENT_PINNED = 0x2,
  357	EVENT_TIME = 0x4,
  358	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
  359};
  360
  361/*
  362 * perf_sched_events : >0 events exist
  363 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  364 */
  365
  366static void perf_sched_delayed(struct work_struct *work);
  367DEFINE_STATIC_KEY_FALSE(perf_sched_events);
  368static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
  369static DEFINE_MUTEX(perf_sched_mutex);
  370static atomic_t perf_sched_count;
  371
  372static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
  373static DEFINE_PER_CPU(int, perf_sched_cb_usages);
  374static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
  375
  376static atomic_t nr_mmap_events __read_mostly;
  377static atomic_t nr_comm_events __read_mostly;
  378static atomic_t nr_task_events __read_mostly;
  379static atomic_t nr_freq_events __read_mostly;
  380static atomic_t nr_switch_events __read_mostly;
  381
  382static LIST_HEAD(pmus);
  383static DEFINE_MUTEX(pmus_lock);
  384static struct srcu_struct pmus_srcu;
  385
  386/*
  387 * perf event paranoia level:
  388 *  -1 - not paranoid at all
  389 *   0 - disallow raw tracepoint access for unpriv
  390 *   1 - disallow cpu events for unpriv
  391 *   2 - disallow kernel profiling for unpriv
  392 */
  393int sysctl_perf_event_paranoid __read_mostly = 2;
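
/*
 * Usage note (editorial): the level is runtime-tunable, e.g.
 *
 *	# echo -1 > /proc/sys/kernel/perf_event_paranoid
 *
 * lets unprivileged users open raw tracepoint, CPU and kernel-profiling
 * events, while the default of 2 restricts them to user-space measurements
 * of their own tasks.
 */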
  394
  395/* Minimum for 512 kiB + 1 user control page */
  396int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
  397
  398/*
  399 * max perf event sample rate
  400 */
  401#define DEFAULT_MAX_SAMPLE_RATE		100000
  402#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
  403#define DEFAULT_CPU_TIME_MAX_PERCENT	25
  404
  405int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
  406
  407static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
  408static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
  409
  410static int perf_sample_allowed_ns __read_mostly =
  411	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
  412
  413static void update_perf_cpu_limits(void)
  414{
  415	u64 tmp = perf_sample_period_ns;
  416
  417	tmp *= sysctl_perf_cpu_time_max_percent;
  418	tmp = div_u64(tmp, 100);
  419	if (!tmp)
  420		tmp = 1;
  421
  422	WRITE_ONCE(perf_sample_allowed_ns, tmp);
  423}
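
/*
 * Worked example (editorial): with the defaults above, perf_sample_period_ns
 * is 10,000ns and sysctl_perf_cpu_time_max_percent is 25, so
 * perf_sample_allowed_ns = 10,000 * 25 / 100 = 2,500ns of sampling time is
 * allowed per sample before the rate is lowered.
 */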
  424
  425static int perf_rotate_context(struct perf_cpu_context *cpuctx);
  426
  427int perf_proc_update_handler(struct ctl_table *table, int write,
  428		void __user *buffer, size_t *lenp,
  429		loff_t *ppos)
  430{
  431	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  432
  433	if (ret || !write)
  434		return ret;
  435
  436	/*
  437	 * If throttling is disabled don't allow the write:
  438	 */
  439	if (sysctl_perf_cpu_time_max_percent == 100 ||
  440	    sysctl_perf_cpu_time_max_percent == 0)
  441		return -EINVAL;
  442
  443	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
  444	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
  445	update_perf_cpu_limits();
  446
  447	return 0;
  448}
  449
  450int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
  451
  452int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  453				void __user *buffer, size_t *lenp,
  454				loff_t *ppos)
  455{
  456	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
  457
  458	if (ret || !write)
  459		return ret;
  460
  461	if (sysctl_perf_cpu_time_max_percent == 100 ||
  462	    sysctl_perf_cpu_time_max_percent == 0) {
  463		printk(KERN_WARNING
  464		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
  465		WRITE_ONCE(perf_sample_allowed_ns, 0);
  466	} else {
  467		update_perf_cpu_limits();
  468	}
  469
  470	return 0;
  471}
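
/*
 * Usage note (editorial): both knobs are exposed under /proc/sys/kernel,
 * e.g.
 *
 *	# echo 50 > /proc/sys/kernel/perf_cpu_time_max_percent
 *	# echo 50000 > /proc/sys/kernel/perf_event_max_sample_rate
 *
 * Writing 0 or 100 to perf_cpu_time_max_percent disables the dynamic
 * throttling (perf_sample_allowed_ns is cleared above), after which
 * perf_proc_update_handler() rejects writes to perf_event_max_sample_rate
 * with -EINVAL.
 */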
  472
  473/*
  474 * perf samples are done in some very critical code paths (NMIs).
  475 * If they take too much CPU time, the system can lock up and not
  476 * get any real work done.  This will drop the sample rate when
  477 * we detect that events are taking too long.
  478 */
  479#define NR_ACCUMULATED_SAMPLES 128
  480static DEFINE_PER_CPU(u64, running_sample_length);
  481
  482static u64 __report_avg;
  483static u64 __report_allowed;
  484
  485static void perf_duration_warn(struct irq_work *w)
  486{
  487	printk_ratelimited(KERN_INFO
  488		"perf: interrupt took too long (%lld > %lld), lowering "
  489		"kernel.perf_event_max_sample_rate to %d\n",
  490		__report_avg, __report_allowed,
  491		sysctl_perf_event_sample_rate);
  492}
  493
  494static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
  495
  496void perf_sample_event_took(u64 sample_len_ns)
  497{
  498	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
  499	u64 running_len;
  500	u64 avg_len;
  501	u32 max;
  502
  503	if (max_len == 0)
  504		return;
  505
  506	/* Decay the counter by 1 average sample. */
  507	running_len = __this_cpu_read(running_sample_length);
  508	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
  509	running_len += sample_len_ns;
  510	__this_cpu_write(running_sample_length, running_len);
  511
  512	/*
   513	 * Note: this will be biased artificially low until we have
  514	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
  515	 * from having to maintain a count.
  516	 */
  517	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
  518	if (avg_len <= max_len)
  519		return;
  520
  521	__report_avg = avg_len;
  522	__report_allowed = max_len;
  523
  524	/*
  525	 * Compute a throttle threshold 25% below the current duration.
  526	 */
  527	avg_len += avg_len / 4;
  528	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
  529	if (avg_len < max)
  530		max /= (u32)avg_len;
  531	else
  532		max = 1;
  533
  534	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
  535	WRITE_ONCE(max_samples_per_tick, max);
  536
  537	sysctl_perf_event_sample_rate = max * HZ;
  538	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
  539
  540	if (!irq_work_queue(&perf_duration_work)) {
  541		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
  542			     "kernel.perf_event_max_sample_rate to %d\n",
  543			     __report_avg, __report_allowed,
  544			     sysctl_perf_event_sample_rate);
  545	}
  546}
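
/*
 * Worked example (editorial): assume HZ=1000 (TICK_NSEC = 1,000,000ns) and
 * the default 25% budget. If the decayed average sample cost climbs to
 * 5,000ns (above the 2,500ns allowance), the code above raises the
 * allowance to 5,000 + 5,000/4 = 6,250ns and recomputes
 * max_samples_per_tick = (1,000,000/100) * 25 / 6,250 = 40, so
 * kernel.perf_event_max_sample_rate drops to 40 * HZ = 40,000.
 */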
  547
  548static atomic64_t perf_event_id;
  549
  550static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
  551			      enum event_type_t event_type);
  552
  553static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
  554			     enum event_type_t event_type,
  555			     struct task_struct *task);
  556
  557static void update_context_time(struct perf_event_context *ctx);
  558static u64 perf_event_time(struct perf_event *event);
  559
  560void __weak perf_event_print_debug(void)	{ }
  561
  562extern __weak const char *perf_pmu_name(void)
  563{
  564	return "pmu";
  565}
  566
  567static inline u64 perf_clock(void)
  568{
  569	return local_clock();
  570}
  571
  572static inline u64 perf_event_clock(struct perf_event *event)
  573{
  574	return event->clock();
  575}
  576
  577#ifdef CONFIG_CGROUP_PERF
  578
  579static inline bool
  580perf_cgroup_match(struct perf_event *event)
  581{
  582	struct perf_event_context *ctx = event->ctx;
  583	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  584
  585	/* @event doesn't care about cgroup */
  586	if (!event->cgrp)
  587		return true;
  588
  589	/* wants specific cgroup scope but @cpuctx isn't associated with any */
  590	if (!cpuctx->cgrp)
  591		return false;
  592
  593	/*
  594	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
  595	 * also enabled for all its descendant cgroups.  If @cpuctx's
  596	 * cgroup is a descendant of @event's (the test covers identity
  597	 * case), it's a match.
  598	 */
  599	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
  600				    event->cgrp->css.cgroup);
  601}
  602
  603static inline void perf_detach_cgroup(struct perf_event *event)
  604{
  605	css_put(&event->cgrp->css);
  606	event->cgrp = NULL;
  607}
  608
  609static inline int is_cgroup_event(struct perf_event *event)
  610{
  611	return event->cgrp != NULL;
  612}
  613
  614static inline u64 perf_cgroup_event_time(struct perf_event *event)
  615{
  616	struct perf_cgroup_info *t;
  617
  618	t = per_cpu_ptr(event->cgrp->info, event->cpu);
  619	return t->time;
  620}
  621
  622static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
  623{
  624	struct perf_cgroup_info *info;
  625	u64 now;
  626
  627	now = perf_clock();
  628
  629	info = this_cpu_ptr(cgrp->info);
  630
  631	info->time += now - info->timestamp;
  632	info->timestamp = now;
  633}
  634
  635static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
  636{
  637	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
  638	if (cgrp_out)
  639		__update_cgrp_time(cgrp_out);
  640}
  641
  642static inline void update_cgrp_time_from_event(struct perf_event *event)
  643{
  644	struct perf_cgroup *cgrp;
  645
  646	/*
  647	 * ensure we access cgroup data only when needed and
  648	 * when we know the cgroup is pinned (css_get)
  649	 */
  650	if (!is_cgroup_event(event))
  651		return;
  652
  653	cgrp = perf_cgroup_from_task(current, event->ctx);
  654	/*
  655	 * Do not update time when cgroup is not active
  656	 */
  657	if (cgrp == event->cgrp)
  658		__update_cgrp_time(event->cgrp);
  659}
  660
  661static inline void
  662perf_cgroup_set_timestamp(struct task_struct *task,
  663			  struct perf_event_context *ctx)
  664{
  665	struct perf_cgroup *cgrp;
  666	struct perf_cgroup_info *info;
  667
  668	/*
  669	 * ctx->lock held by caller
  670	 * ensure we do not access cgroup data
  671	 * unless we have the cgroup pinned (css_get)
  672	 */
  673	if (!task || !ctx->nr_cgroups)
  674		return;
  675
  676	cgrp = perf_cgroup_from_task(task, ctx);
  677	info = this_cpu_ptr(cgrp->info);
  678	info->timestamp = ctx->timestamp;
  679}
  680
  681#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
  682#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
  683
  684/*
  685 * reschedule events based on the cgroup constraint of task.
  686 *
  687 * mode SWOUT : schedule out everything
  688 * mode SWIN : schedule in based on cgroup for next
  689 */
  690static void perf_cgroup_switch(struct task_struct *task, int mode)
  691{
  692	struct perf_cpu_context *cpuctx;
  693	struct pmu *pmu;
  694	unsigned long flags;
  695
  696	/*
   697	 * disable interrupts to avoid getting nr_cgroup
  698	 * changes via __perf_event_disable(). Also
  699	 * avoids preemption.
  700	 */
  701	local_irq_save(flags);
  702
  703	/*
  704	 * we reschedule only in the presence of cgroup
  705	 * constrained events.
  706	 */
  707
  708	list_for_each_entry_rcu(pmu, &pmus, entry) {
  709		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
  710		if (cpuctx->unique_pmu != pmu)
  711			continue; /* ensure we process each cpuctx once */
  712
  713		/*
  714		 * perf_cgroup_events says at least one
  715		 * context on this CPU has cgroup events.
  716		 *
  717		 * ctx->nr_cgroups reports the number of cgroup
  718		 * events for a context.
  719		 */
  720		if (cpuctx->ctx.nr_cgroups > 0) {
  721			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
  722			perf_pmu_disable(cpuctx->ctx.pmu);
  723
  724			if (mode & PERF_CGROUP_SWOUT) {
  725				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
  726				/*
  727				 * must not be done before ctxswout due
  728				 * to event_filter_match() in event_sched_out()
  729				 */
  730				cpuctx->cgrp = NULL;
  731			}
  732
  733			if (mode & PERF_CGROUP_SWIN) {
  734				WARN_ON_ONCE(cpuctx->cgrp);
  735				/*
  736				 * set cgrp before ctxsw in to allow
  737				 * event_filter_match() to not have to pass
  738				 * task around
  739				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
   740				 * because cgroup events are only per-cpu
  741				 */
  742				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
  743				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
  744			}
  745			perf_pmu_enable(cpuctx->ctx.pmu);
  746			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  747		}
  748	}
  749
  750	local_irq_restore(flags);
  751}
  752
  753static inline void perf_cgroup_sched_out(struct task_struct *task,
  754					 struct task_struct *next)
  755{
  756	struct perf_cgroup *cgrp1;
  757	struct perf_cgroup *cgrp2 = NULL;
  758
  759	rcu_read_lock();
  760	/*
  761	 * we come here when we know perf_cgroup_events > 0
  762	 * we do not need to pass the ctx here because we know
  763	 * we are holding the rcu lock
  764	 */
  765	cgrp1 = perf_cgroup_from_task(task, NULL);
  766	cgrp2 = perf_cgroup_from_task(next, NULL);
  767
  768	/*
  769	 * only schedule out current cgroup events if we know
  770	 * that we are switching to a different cgroup. Otherwise,
   771	 * do not touch the cgroup events.
  772	 */
  773	if (cgrp1 != cgrp2)
  774		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
  775
  776	rcu_read_unlock();
  777}
  778
  779static inline void perf_cgroup_sched_in(struct task_struct *prev,
  780					struct task_struct *task)
  781{
  782	struct perf_cgroup *cgrp1;
  783	struct perf_cgroup *cgrp2 = NULL;
  784
  785	rcu_read_lock();
  786	/*
  787	 * we come here when we know perf_cgroup_events > 0
  788	 * we do not need to pass the ctx here because we know
  789	 * we are holding the rcu lock
  790	 */
  791	cgrp1 = perf_cgroup_from_task(task, NULL);
  792	cgrp2 = perf_cgroup_from_task(prev, NULL);
  793
  794	/*
  795	 * only need to schedule in cgroup events if we are changing
  796	 * cgroup during ctxsw. Cgroup events were not scheduled
   797	 * out during ctxsw if that was not the case.
  798	 */
  799	if (cgrp1 != cgrp2)
  800		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
  801
  802	rcu_read_unlock();
  803}
  804
  805static inline int perf_cgroup_connect(int fd, struct perf_event *event,
  806				      struct perf_event_attr *attr,
  807				      struct perf_event *group_leader)
  808{
  809	struct perf_cgroup *cgrp;
  810	struct cgroup_subsys_state *css;
  811	struct fd f = fdget(fd);
  812	int ret = 0;
  813
  814	if (!f.file)
  815		return -EBADF;
  816
  817	css = css_tryget_online_from_dir(f.file->f_path.dentry,
  818					 &perf_event_cgrp_subsys);
  819	if (IS_ERR(css)) {
  820		ret = PTR_ERR(css);
  821		goto out;
  822	}
  823
  824	cgrp = container_of(css, struct perf_cgroup, css);
  825	event->cgrp = cgrp;
  826
  827	/*
  828	 * all events in a group must monitor
  829	 * the same cgroup because a task belongs
  830	 * to only one perf cgroup at a time
  831	 */
  832	if (group_leader && group_leader->cgrp != cgrp) {
  833		perf_detach_cgroup(event);
  834		ret = -EINVAL;
  835	}
  836out:
  837	fdput(f);
  838	return ret;
  839}
  840
  841static inline void
  842perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
  843{
  844	struct perf_cgroup_info *t;
  845	t = per_cpu_ptr(event->cgrp->info, event->cpu);
  846	event->shadow_ctx_time = now - t->timestamp;
  847}
  848
  849static inline void
  850perf_cgroup_defer_enabled(struct perf_event *event)
  851{
  852	/*
  853	 * when the current task's perf cgroup does not match
  854	 * the event's, we need to remember to call the
  855	 * perf_mark_enable() function the first time a task with
  856	 * a matching perf cgroup is scheduled in.
  857	 */
  858	if (is_cgroup_event(event) && !perf_cgroup_match(event))
  859		event->cgrp_defer_enabled = 1;
  860}
  861
  862static inline void
  863perf_cgroup_mark_enabled(struct perf_event *event,
  864			 struct perf_event_context *ctx)
  865{
  866	struct perf_event *sub;
  867	u64 tstamp = perf_event_time(event);
  868
  869	if (!event->cgrp_defer_enabled)
  870		return;
  871
  872	event->cgrp_defer_enabled = 0;
  873
  874	event->tstamp_enabled = tstamp - event->total_time_enabled;
  875	list_for_each_entry(sub, &event->sibling_list, group_entry) {
  876		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
  877			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
  878			sub->cgrp_defer_enabled = 0;
  879		}
  880	}
  881}
  882
  883/*
  884 * Update cpuctx->cgrp so that it is set when first cgroup event is added and
  885 * cleared when last cgroup event is removed.
  886 */
  887static inline void
  888list_update_cgroup_event(struct perf_event *event,
  889			 struct perf_event_context *ctx, bool add)
  890{
  891	struct perf_cpu_context *cpuctx;
  892
  893	if (!is_cgroup_event(event))
  894		return;
  895
  896	if (add && ctx->nr_cgroups++)
  897		return;
  898	else if (!add && --ctx->nr_cgroups)
  899		return;
  900	/*
  901	 * Because cgroup events are always per-cpu events,
  902	 * this will always be called from the right CPU.
  903	 */
  904	cpuctx = __get_cpu_context(ctx);
  905
  906	/*
  907	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
   908	 * ctx->nr_cgroups == 0.
  909	 */
  910	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
  911		cpuctx->cgrp = event->cgrp;
  912	else if (!add)
  913		cpuctx->cgrp = NULL;
  914}
  915
  916#else /* !CONFIG_CGROUP_PERF */
  917
  918static inline bool
  919perf_cgroup_match(struct perf_event *event)
  920{
  921	return true;
  922}
  923
  924static inline void perf_detach_cgroup(struct perf_event *event)
  925{}
  926
  927static inline int is_cgroup_event(struct perf_event *event)
  928{
  929	return 0;
  930}
  931
  932static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
  933{
  934	return 0;
  935}
  936
  937static inline void update_cgrp_time_from_event(struct perf_event *event)
  938{
  939}
  940
  941static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
  942{
  943}
  944
  945static inline void perf_cgroup_sched_out(struct task_struct *task,
  946					 struct task_struct *next)
  947{
  948}
  949
  950static inline void perf_cgroup_sched_in(struct task_struct *prev,
  951					struct task_struct *task)
  952{
  953}
  954
  955static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
  956				      struct perf_event_attr *attr,
  957				      struct perf_event *group_leader)
  958{
  959	return -EINVAL;
  960}
  961
  962static inline void
  963perf_cgroup_set_timestamp(struct task_struct *task,
  964			  struct perf_event_context *ctx)
  965{
  966}
  967
  968void
  969perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
  970{
  971}
  972
  973static inline void
  974perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
  975{
  976}
  977
  978static inline u64 perf_cgroup_event_time(struct perf_event *event)
  979{
  980	return 0;
  981}
  982
  983static inline void
  984perf_cgroup_defer_enabled(struct perf_event *event)
  985{
  986}
  987
  988static inline void
  989perf_cgroup_mark_enabled(struct perf_event *event,
  990			 struct perf_event_context *ctx)
  991{
  992}
  993
  994static inline void
  995list_update_cgroup_event(struct perf_event *event,
  996			 struct perf_event_context *ctx, bool add)
  997{
  998}
  999
 1000#endif
 1001
 1002/*
 1003 * set default to be dependent on timer tick just
 1004 * like original code
 1005 */
 1006#define PERF_CPU_HRTIMER (1000 / HZ)
 1007/*
  1008 * function must be called with interrupts disabled
 1009 */
 1010static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 1011{
 1012	struct perf_cpu_context *cpuctx;
 1013	int rotations = 0;
 1014
 1015	WARN_ON(!irqs_disabled());
 1016
 1017	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 1018	rotations = perf_rotate_context(cpuctx);
 1019
 1020	raw_spin_lock(&cpuctx->hrtimer_lock);
 1021	if (rotations)
 1022		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
 1023	else
 1024		cpuctx->hrtimer_active = 0;
 1025	raw_spin_unlock(&cpuctx->hrtimer_lock);
 1026
 1027	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 1028}
 1029
 1030static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 1031{
 1032	struct hrtimer *timer = &cpuctx->hrtimer;
 1033	struct pmu *pmu = cpuctx->ctx.pmu;
 1034	u64 interval;
 1035
 1036	/* no multiplexing needed for SW PMU */
 1037	if (pmu->task_ctx_nr == perf_sw_context)
 1038		return;
 1039
 1040	/*
 1041	 * check default is sane, if not set then force to
 1042	 * default interval (1/tick)
 1043	 */
 1044	interval = pmu->hrtimer_interval_ms;
 1045	if (interval < 1)
 1046		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 1047
 1048	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 1049
 1050	raw_spin_lock_init(&cpuctx->hrtimer_lock);
 1051	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 1052	timer->function = perf_mux_hrtimer_handler;
 1053}
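
/*
 * Worked example (editorial): a PMU that leaves hrtimer_interval_ms at zero
 * gets PERF_CPU_HRTIMER = 1000/HZ ms, i.e. one tick of multiplexing
 * granularity: 4ms with HZ=250, 1ms with HZ=1000. A driver may request a
 * longer rotation period by setting pmu->hrtimer_interval_ms before the PMU
 * is registered.
 */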
 1054
 1055static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 1056{
 1057	struct hrtimer *timer = &cpuctx->hrtimer;
 1058	struct pmu *pmu = cpuctx->ctx.pmu;
 1059	unsigned long flags;
 1060
 1061	/* not for SW PMU */
 1062	if (pmu->task_ctx_nr == perf_sw_context)
 1063		return 0;
 1064
 1065	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
 1066	if (!cpuctx->hrtimer_active) {
 1067		cpuctx->hrtimer_active = 1;
 1068		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
 1069		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 1070	}
 1071	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 1072
 1073	return 0;
 1074}
 1075
 1076void perf_pmu_disable(struct pmu *pmu)
 1077{
 1078	int *count = this_cpu_ptr(pmu->pmu_disable_count);
 1079	if (!(*count)++)
 1080		pmu->pmu_disable(pmu);
 1081}
 1082
 1083void perf_pmu_enable(struct pmu *pmu)
 1084{
 1085	int *count = this_cpu_ptr(pmu->pmu_disable_count);
 1086	if (!--(*count))
 1087		pmu->pmu_enable(pmu);
 1088}
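
/*
 * Illustrative sketch (editorial addition): the two calls above nest via a
 * per-cpu count, so callers simply bracket a batch of event reprogramming,
 * as group_sched_out() below does. The helper is hypothetical.
 */
static void __maybe_unused example_pmu_bracket(struct perf_event_context *ctx)
{
	perf_pmu_disable(ctx->pmu);
	/* ... schedule several events in or out without intermediate PMU writes ... */
	perf_pmu_enable(ctx->pmu);
}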
 1089
 1090static DEFINE_PER_CPU(struct list_head, active_ctx_list);
 1091
 1092/*
 1093 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 1094 * perf_event_task_tick() are fully serialized because they're strictly cpu
 1095 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 1096 * disabled, while perf_event_task_tick is called from IRQ context.
 1097 */
 1098static void perf_event_ctx_activate(struct perf_event_context *ctx)
 1099{
 1100	struct list_head *head = this_cpu_ptr(&active_ctx_list);
 1101
 1102	WARN_ON(!irqs_disabled());
 1103
 1104	WARN_ON(!list_empty(&ctx->active_ctx_list));
 1105
 1106	list_add(&ctx->active_ctx_list, head);
 1107}
 1108
 1109static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
 1110{
 1111	WARN_ON(!irqs_disabled());
 1112
 1113	WARN_ON(list_empty(&ctx->active_ctx_list));
 1114
 1115	list_del_init(&ctx->active_ctx_list);
 1116}
 1117
 1118static void get_ctx(struct perf_event_context *ctx)
 1119{
 1120	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 1121}
 1122
 1123static void free_ctx(struct rcu_head *head)
 1124{
 1125	struct perf_event_context *ctx;
 1126
 1127	ctx = container_of(head, struct perf_event_context, rcu_head);
 1128	kfree(ctx->task_ctx_data);
 1129	kfree(ctx);
 1130}
 1131
 1132static void put_ctx(struct perf_event_context *ctx)
 1133{
 1134	if (atomic_dec_and_test(&ctx->refcount)) {
 1135		if (ctx->parent_ctx)
 1136			put_ctx(ctx->parent_ctx);
 1137		if (ctx->task && ctx->task != TASK_TOMBSTONE)
 1138			put_task_struct(ctx->task);
 1139		call_rcu(&ctx->rcu_head, free_ctx);
 1140	}
 1141}
 1142
 1143/*
 1144 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 1145 * perf_pmu_migrate_context() we need some magic.
 1146 *
 1147 * Those places that change perf_event::ctx will hold both
 1148 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 1149 *
 1150 * Lock ordering is by mutex address. There are two other sites where
 1151 * perf_event_context::mutex nests and those are:
 1152 *
 1153 *  - perf_event_exit_task_context()	[ child , 0 ]
 1154 *      perf_event_exit_event()
 1155 *        put_event()			[ parent, 1 ]
 1156 *
 1157 *  - perf_event_init_context()		[ parent, 0 ]
 1158 *      inherit_task_group()
 1159 *        inherit_group()
 1160 *          inherit_event()
 1161 *            perf_event_alloc()
 1162 *              perf_init_event()
 1163 *                perf_try_init_event()	[ child , 1 ]
 1164 *
 1165 * While it appears there is an obvious deadlock here -- the parent and child
 1166 * nesting levels are inverted between the two. This is in fact safe because
  1167 * life-time rules separate them. That is, an exiting task cannot fork, and a
 1168 * spawning task cannot (yet) exit.
 1169 *
  1170 * But remember that these are parent<->child context relations, and
 1171 * migration does not affect children, therefore these two orderings should not
 1172 * interact.
 1173 *
 1174 * The change in perf_event::ctx does not affect children (as claimed above)
 1175 * because the sys_perf_event_open() case will install a new event and break
 1176 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 1177 * concerned with cpuctx and that doesn't have children.
 1178 *
 1179 * The places that change perf_event::ctx will issue:
 1180 *
 1181 *   perf_remove_from_context();
 1182 *   synchronize_rcu();
 1183 *   perf_install_in_context();
 1184 *
  1185 * to effect the change. The remove_from_context() + synchronize_rcu() should
 1186 * quiesce the event, after which we can install it in the new location. This
 1187 * means that only external vectors (perf_fops, prctl) can perturb the event
 1188 * while in transit. Therefore all such accessors should also acquire
 1189 * perf_event_context::mutex to serialize against this.
 1190 *
  1191 * However, because event->ctx can change while we're waiting to acquire
 1192 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 1193 * function.
 1194 *
 1195 * Lock order:
 1196 *    cred_guard_mutex
 1197 *	task_struct::perf_event_mutex
 1198 *	  perf_event_context::mutex
 1199 *	    perf_event::child_mutex;
 1200 *	      perf_event_context::lock
 1201 *	    perf_event::mmap_mutex
 1202 *	    mmap_sem
 1203 */
 1204static struct perf_event_context *
 1205perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
 1206{
 1207	struct perf_event_context *ctx;
 1208
 1209again:
 1210	rcu_read_lock();
 1211	ctx = ACCESS_ONCE(event->ctx);
 1212	if (!atomic_inc_not_zero(&ctx->refcount)) {
 1213		rcu_read_unlock();
 1214		goto again;
 1215	}
 1216	rcu_read_unlock();
 1217
 1218	mutex_lock_nested(&ctx->mutex, nesting);
 1219	if (event->ctx != ctx) {
 1220		mutex_unlock(&ctx->mutex);
 1221		put_ctx(ctx);
 1222		goto again;
 1223	}
 1224
 1225	return ctx;
 1226}
 1227
 1228static inline struct perf_event_context *
 1229perf_event_ctx_lock(struct perf_event *event)
 1230{
 1231	return perf_event_ctx_lock_nested(event, 0);
 1232}
 1233
 1234static void perf_event_ctx_unlock(struct perf_event *event,
 1235				  struct perf_event_context *ctx)
 1236{
 1237	mutex_unlock(&ctx->mutex);
 1238	put_ctx(ctx);
 1239}
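
/*
 * Illustrative sketch (editorial addition): the pairing the lock-order
 * comment above asks external accessors (perf_fops, prctl) to use. The
 * wrapper is hypothetical.
 */
static void __maybe_unused example_locked_access(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);
	/* event->ctx can no longer change underneath us; ctx->mutex is held. */
	perf_event_ctx_unlock(event, ctx);
}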
 1240
 1241/*
 1242 * This must be done under the ctx->lock, such as to serialize against
 1243 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 1244 * calling scheduler related locks and ctx->lock nests inside those.
 1245 */
 1246static __must_check struct perf_event_context *
 1247unclone_ctx(struct perf_event_context *ctx)
 1248{
 1249	struct perf_event_context *parent_ctx = ctx->parent_ctx;
 1250
 1251	lockdep_assert_held(&ctx->lock);
 1252
 1253	if (parent_ctx)
 1254		ctx->parent_ctx = NULL;
 1255	ctx->generation++;
 1256
 1257	return parent_ctx;
 1258}
 1259
 1260static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 1261{
 1262	/*
 1263	 * only top level events have the pid namespace they were created in
 1264	 */
 1265	if (event->parent)
 1266		event = event->parent;
 1267
 1268	return task_tgid_nr_ns(p, event->ns);
 1269}
 1270
 1271static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 1272{
 1273	/*
 1274	 * only top level events have the pid namespace they were created in
 1275	 */
 1276	if (event->parent)
 1277		event = event->parent;
 1278
 1279	return task_pid_nr_ns(p, event->ns);
 1280}
 1281
 1282/*
 1283 * If we inherit events we want to return the parent event id
 1284 * to userspace.
 1285 */
 1286static u64 primary_event_id(struct perf_event *event)
 1287{
 1288	u64 id = event->id;
 1289
 1290	if (event->parent)
 1291		id = event->parent->id;
 1292
 1293	return id;
 1294}
 1295
 1296/*
 1297 * Get the perf_event_context for a task and lock it.
 1298 *
  1299 * This has to cope with the fact that until it is locked,
 1300 * the context could get moved to another task.
 1301 */
 1302static struct perf_event_context *
 1303perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 1304{
 1305	struct perf_event_context *ctx;
 1306
 1307retry:
 1308	/*
 1309	 * One of the few rules of preemptible RCU is that one cannot do
 1310	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
 1311	 * part of the read side critical section was irqs-enabled -- see
 1312	 * rcu_read_unlock_special().
 1313	 *
 1314	 * Since ctx->lock nests under rq->lock we must ensure the entire read
 1315	 * side critical section has interrupts disabled.
 1316	 */
 1317	local_irq_save(*flags);
 1318	rcu_read_lock();
 1319	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 1320	if (ctx) {
 1321		/*
 1322		 * If this context is a clone of another, it might
 1323		 * get swapped for another underneath us by
 1324		 * perf_event_task_sched_out, though the
 1325		 * rcu_read_lock() protects us from any context
 1326		 * getting freed.  Lock the context and check if it
 1327		 * got swapped before we could get the lock, and retry
 1328		 * if so.  If we locked the right context, then it
 1329		 * can't get swapped on us any more.
 1330		 */
 1331		raw_spin_lock(&ctx->lock);
 1332		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 1333			raw_spin_unlock(&ctx->lock);
 1334			rcu_read_unlock();
 1335			local_irq_restore(*flags);
 1336			goto retry;
 1337		}
 1338
 1339		if (ctx->task == TASK_TOMBSTONE ||
 1340		    !atomic_inc_not_zero(&ctx->refcount)) {
 1341			raw_spin_unlock(&ctx->lock);
 1342			ctx = NULL;
 1343		} else {
 1344			WARN_ON_ONCE(ctx->task != task);
 1345		}
 1346	}
 1347	rcu_read_unlock();
 1348	if (!ctx)
 1349		local_irq_restore(*flags);
 1350	return ctx;
 1351}
 1352
 1353/*
 1354 * Get the context for a task and increment its pin_count so it
 1355 * can't get swapped to another task.  This also increments its
 1356 * reference count so that the context can't get freed.
 1357 */
 1358static struct perf_event_context *
 1359perf_pin_task_context(struct task_struct *task, int ctxn)
 1360{
 1361	struct perf_event_context *ctx;
 1362	unsigned long flags;
 1363
 1364	ctx = perf_lock_task_context(task, ctxn, &flags);
 1365	if (ctx) {
 1366		++ctx->pin_count;
 1367		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 1368	}
 1369	return ctx;
 1370}
 1371
 1372static void perf_unpin_context(struct perf_event_context *ctx)
 1373{
 1374	unsigned long flags;
 1375
 1376	raw_spin_lock_irqsave(&ctx->lock, flags);
 1377	--ctx->pin_count;
 1378	raw_spin_unlock_irqrestore(&ctx->lock, flags);
 1379}
 1380
 1381/*
 1382 * Update the record of the current time in a context.
 1383 */
 1384static void update_context_time(struct perf_event_context *ctx)
 1385{
 1386	u64 now = perf_clock();
 1387
 1388	ctx->time += now - ctx->timestamp;
 1389	ctx->timestamp = now;
 1390}
 1391
 1392static u64 perf_event_time(struct perf_event *event)
 1393{
 1394	struct perf_event_context *ctx = event->ctx;
 1395
 1396	if (is_cgroup_event(event))
 1397		return perf_cgroup_event_time(event);
 1398
 1399	return ctx ? ctx->time : 0;
 1400}
 1401
 1402/*
  1403 * Update the total_time_enabled and total_time_running fields for an event.
 1404 */
 1405static void update_event_times(struct perf_event *event)
 1406{
 1407	struct perf_event_context *ctx = event->ctx;
 1408	u64 run_end;
 1409
 1410	lockdep_assert_held(&ctx->lock);
 1411
 1412	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 1413	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 1414		return;
 1415
 1416	/*
 1417	 * in cgroup mode, time_enabled represents
 1418	 * the time the event was enabled AND active
 1419	 * tasks were in the monitored cgroup. This is
 1420	 * independent of the activity of the context as
 1421	 * there may be a mix of cgroup and non-cgroup events.
 1422	 *
 1423	 * That is why we treat cgroup events differently
 1424	 * here.
 1425	 */
 1426	if (is_cgroup_event(event))
 1427		run_end = perf_cgroup_event_time(event);
 1428	else if (ctx->is_active)
 1429		run_end = ctx->time;
 1430	else
 1431		run_end = event->tstamp_stopped;
 1432
 1433	event->total_time_enabled = run_end - event->tstamp_enabled;
 1434
 1435	if (event->state == PERF_EVENT_STATE_INACTIVE)
 1436		run_end = event->tstamp_stopped;
 1437	else
 1438		run_end = perf_event_time(event);
 1439
 1440	event->total_time_running = run_end - event->tstamp_running;
 1441
 1442}
 1443
 1444/*
 1445 * Update total_time_enabled and total_time_running for all events in a group.
 1446 */
 1447static void update_group_times(struct perf_event *leader)
 1448{
 1449	struct perf_event *event;
 1450
 1451	update_event_times(leader);
 1452	list_for_each_entry(event, &leader->sibling_list, group_entry)
 1453		update_event_times(event);
 1454}
 1455
 1456static struct list_head *
 1457ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 1458{
 1459	if (event->attr.pinned)
 1460		return &ctx->pinned_groups;
 1461	else
 1462		return &ctx->flexible_groups;
 1463}
 1464
 1465/*
  1466 * Add an event to the lists for its context.
 1467 * Must be called with ctx->mutex and ctx->lock held.
 1468 */
 1469static void
 1470list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 1471{
 1472	lockdep_assert_held(&ctx->lock);
 1473
 1474	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
 1475	event->attach_state |= PERF_ATTACH_CONTEXT;
 1476
 1477	/*
 1478	 * If we're a stand alone event or group leader, we go to the context
 1479	 * list, group events are kept attached to the group so that
 1480	 * perf_group_detach can, at all times, locate all siblings.
 1481	 */
 1482	if (event->group_leader == event) {
 1483		struct list_head *list;
 1484
 1485		event->group_caps = event->event_caps;
 1486
 1487		list = ctx_group_list(event, ctx);
 1488		list_add_tail(&event->group_entry, list);
 1489	}
 1490
 1491	list_update_cgroup_event(event, ctx, true);
 1492
 1493	list_add_rcu(&event->event_entry, &ctx->event_list);
 1494	ctx->nr_events++;
 1495	if (event->attr.inherit_stat)
 1496		ctx->nr_stat++;
 1497
 1498	ctx->generation++;
 1499}
 1500
 1501/*
 1502 * Initialize event state based on the perf_event_attr::disabled.
 1503 */
 1504static inline void perf_event__state_init(struct perf_event *event)
 1505{
 1506	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
 1507					      PERF_EVENT_STATE_INACTIVE;
 1508}
 1509
 1510static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
 1511{
 1512	int entry = sizeof(u64); /* value */
 1513	int size = 0;
 1514	int nr = 1;
 1515
 1516	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 1517		size += sizeof(u64);
 1518
 1519	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 1520		size += sizeof(u64);
 1521
 1522	if (event->attr.read_format & PERF_FORMAT_ID)
 1523		entry += sizeof(u64);
 1524
 1525	if (event->attr.read_format & PERF_FORMAT_GROUP) {
 1526		nr += nr_siblings;
 1527		size += sizeof(u64);
 1528	}
 1529
 1530	size += entry * nr;
 1531	event->read_size = size;
 1532}
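
/*
 * Worked example (editorial): for a group leader with two siblings and
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP |
 * PERF_FORMAT_ID, each entry is 8 (value) + 8 (id) = 16 bytes, nr = 3, and
 * the header contributes 8 (nr) + 8 (time_enabled) bytes, so
 * read_size = 16 + 16 * 3 = 64 bytes.
 */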
 1533
 1534static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 1535{
 1536	struct perf_sample_data *data;
 1537	u16 size = 0;
 1538
 1539	if (sample_type & PERF_SAMPLE_IP)
 1540		size += sizeof(data->ip);
 1541
 1542	if (sample_type & PERF_SAMPLE_ADDR)
 1543		size += sizeof(data->addr);
 1544
 1545	if (sample_type & PERF_SAMPLE_PERIOD)
 1546		size += sizeof(data->period);
 1547
 1548	if (sample_type & PERF_SAMPLE_WEIGHT)
 1549		size += sizeof(data->weight);
 1550
 1551	if (sample_type & PERF_SAMPLE_READ)
 1552		size += event->read_size;
 1553
 1554	if (sample_type & PERF_SAMPLE_DATA_SRC)
 1555		size += sizeof(data->data_src.val);
 1556
 1557	if (sample_type & PERF_SAMPLE_TRANSACTION)
 1558		size += sizeof(data->txn);
 1559
 1560	event->header_size = size;
 1561}
 1562
 1563/*
 1564 * Called at perf_event creation and when events are attached/detached from a
 1565 * group.
 1566 */
 1567static void perf_event__header_size(struct perf_event *event)
 1568{
 1569	__perf_event_read_size(event,
 1570			       event->group_leader->nr_siblings);
 1571	__perf_event_header_size(event, event->attr.sample_type);
 1572}
 1573
 1574static void perf_event__id_header_size(struct perf_event *event)
 1575{
 1576	struct perf_sample_data *data;
 1577	u64 sample_type = event->attr.sample_type;
 1578	u16 size = 0;
 1579
 1580	if (sample_type & PERF_SAMPLE_TID)
 1581		size += sizeof(data->tid_entry);
 1582
 1583	if (sample_type & PERF_SAMPLE_TIME)
 1584		size += sizeof(data->time);
 1585
 1586	if (sample_type & PERF_SAMPLE_IDENTIFIER)
 1587		size += sizeof(data->id);
 1588
 1589	if (sample_type & PERF_SAMPLE_ID)
 1590		size += sizeof(data->id);
 1591
 1592	if (sample_type & PERF_SAMPLE_STREAM_ID)
 1593		size += sizeof(data->stream_id);
 1594
 1595	if (sample_type & PERF_SAMPLE_CPU)
 1596		size += sizeof(data->cpu_entry);
 1597
 1598	event->id_header_size = size;
 1599}
 1600
 1601static bool perf_event_validate_size(struct perf_event *event)
 1602{
 1603	/*
 1604	 * The values computed here will be over-written when we actually
 1605	 * attach the event.
 1606	 */
 1607	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
 1608	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
 1609	perf_event__id_header_size(event);
 1610
 1611	/*
 1612	 * Sum the lot; should not exceed the 64k limit we have on records.
 1613	 * Conservative limit to allow for callchains and other variable fields.
 1614	 */
 1615	if (event->read_size + event->header_size +
 1616	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
 1617		return false;
 1618
 1619	return true;
 1620}
 1621
 1622static void perf_group_attach(struct perf_event *event)
 1623{
 1624	struct perf_event *group_leader = event->group_leader, *pos;
 1625
 1626	lockdep_assert_held(&event->ctx->lock);
 1627
 1628	/*
 1629	 * We can have double attach due to group movement in perf_event_open.
 1630	 */
 1631	if (event->attach_state & PERF_ATTACH_GROUP)
 1632		return;
 1633
 1634	event->attach_state |= PERF_ATTACH_GROUP;
 1635
 1636	if (group_leader == event)
 1637		return;
 1638
 1639	WARN_ON_ONCE(group_leader->ctx != event->ctx);
 1640
 1641	group_leader->group_caps &= event->event_caps;
 1642
 1643	list_add_tail(&event->group_entry, &group_leader->sibling_list);
 1644	group_leader->nr_siblings++;
 1645
 1646	perf_event__header_size(group_leader);
 1647
 1648	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
 1649		perf_event__header_size(pos);
 1650}
 1651
 1652/*
  1653 * Remove an event from the lists for its context.
 1654 * Must be called with ctx->mutex and ctx->lock held.
 1655 */
 1656static void
 1657list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 1658{
 1659	WARN_ON_ONCE(event->ctx != ctx);
 1660	lockdep_assert_held(&ctx->lock);
 1661
 1662	/*
 1663	 * We can have double detach due to exit/hot-unplug + close.
 1664	 */
 1665	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
 1666		return;
 1667
 1668	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 1669
 1670	list_update_cgroup_event(event, ctx, false);
 1671
 1672	ctx->nr_events--;
 1673	if (event->attr.inherit_stat)
 1674		ctx->nr_stat--;
 1675
 1676	list_del_rcu(&event->event_entry);
 1677
 1678	if (event->group_leader == event)
 1679		list_del_init(&event->group_entry);
 1680
 1681	update_group_times(event);
 1682
 1683	/*
 1684	 * If event was in error state, then keep it
 1685	 * that way, otherwise bogus counts will be
 1686	 * returned on read(). The only way to get out
 1687	 * of error state is by explicit re-enabling
 1688	 * of the event
 1689	 */
 1690	if (event->state > PERF_EVENT_STATE_OFF)
 1691		event->state = PERF_EVENT_STATE_OFF;
 1692
 1693	ctx->generation++;
 1694}
 1695
 1696static void perf_group_detach(struct perf_event *event)
 1697{
 1698	struct perf_event *sibling, *tmp;
 1699	struct list_head *list = NULL;
 1700
 1701	lockdep_assert_held(&event->ctx->lock);
 1702
 1703	/*
 1704	 * We can have double detach due to exit/hot-unplug + close.
 1705	 */
 1706	if (!(event->attach_state & PERF_ATTACH_GROUP))
 1707		return;
 1708
 1709	event->attach_state &= ~PERF_ATTACH_GROUP;
 1710
 1711	/*
 1712	 * If this is a sibling, remove it from its group.
 1713	 */
 1714	if (event->group_leader != event) {
 1715		list_del_init(&event->group_entry);
 1716		event->group_leader->nr_siblings--;
 1717		goto out;
 1718	}
 1719
 1720	if (!list_empty(&event->group_entry))
 1721		list = &event->group_entry;
 1722
 1723	/*
 1724	 * If this was a group event with sibling events then
 1725	 * upgrade the siblings to singleton events by adding them
 1726	 * to whatever list we are on.
 1727	 */
 1728	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 1729		if (list)
 1730			list_move_tail(&sibling->group_entry, list);
 1731		sibling->group_leader = sibling;
 1732
 1733		/* Inherit group flags from the previous leader */
 1734		sibling->group_caps = event->group_caps;
 1735
 1736		WARN_ON_ONCE(sibling->ctx != event->ctx);
 1737	}
 1738
 1739out:
 1740	perf_event__header_size(event->group_leader);
 1741
 1742	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
 1743		perf_event__header_size(tmp);
 1744}
 1745
 1746static bool is_orphaned_event(struct perf_event *event)
 1747{
 1748	return event->state == PERF_EVENT_STATE_DEAD;
 1749}
 1750
 1751static inline int __pmu_filter_match(struct perf_event *event)
 1752{
 1753	struct pmu *pmu = event->pmu;
 1754	return pmu->filter_match ? pmu->filter_match(event) : 1;
 1755}
 1756
 1757/*
 1758 * Check whether we should attempt to schedule an event group based on
 1759 * PMU-specific filtering. An event group can consist of HW and SW events,
 1760 * potentially with a SW leader, so we must check all the filters, to
 1761 * determine whether a group is schedulable:
 1762 */
 1763static inline int pmu_filter_match(struct perf_event *event)
 1764{
 1765	struct perf_event *child;
 1766
 1767	if (!__pmu_filter_match(event))
 1768		return 0;
 1769
 1770	list_for_each_entry(child, &event->sibling_list, group_entry) {
 1771		if (!__pmu_filter_match(child))
 1772			return 0;
 1773	}
 1774
 1775	return 1;
 1776}
 1777
 1778static inline int
 1779event_filter_match(struct perf_event *event)
 1780{
 1781	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
 1782	       perf_cgroup_match(event) && pmu_filter_match(event);
 1783}
 1784
 1785static void
 1786event_sched_out(struct perf_event *event,
 1787		  struct perf_cpu_context *cpuctx,
 1788		  struct perf_event_context *ctx)
 1789{
 1790	u64 tstamp = perf_event_time(event);
 1791	u64 delta;
 1792
 1793	WARN_ON_ONCE(event->ctx != ctx);
 1794	lockdep_assert_held(&ctx->lock);
 1795
 1796	/*
 1797	 * An event which could not be activated because of
 1798	 * filter mismatch still needs to have its timings
  1799	 * maintained, otherwise bogus information is returned
 1800	 * via read() for time_enabled, time_running:
 1801	 */
 1802	if (event->state == PERF_EVENT_STATE_INACTIVE &&
 1803	    !event_filter_match(event)) {
 1804		delta = tstamp - event->tstamp_stopped;
 1805		event->tstamp_running += delta;
 1806		event->tstamp_stopped = tstamp;
 1807	}
 1808
 1809	if (event->state != PERF_EVENT_STATE_ACTIVE)
 1810		return;
 1811
 1812	perf_pmu_disable(event->pmu);
 1813
 1814	event->tstamp_stopped = tstamp;
 1815	event->pmu->del(event, 0);
 1816	event->oncpu = -1;
 1817	event->state = PERF_EVENT_STATE_INACTIVE;
 1818	if (event->pending_disable) {
 1819		event->pending_disable = 0;
 1820		event->state = PERF_EVENT_STATE_OFF;
 1821	}
 1822
 1823	if (!is_software_event(event))
 1824		cpuctx->active_oncpu--;
 1825	if (!--ctx->nr_active)
 1826		perf_event_ctx_deactivate(ctx);
 1827	if (event->attr.freq && event->attr.sample_freq)
 1828		ctx->nr_freq--;
 1829	if (event->attr.exclusive || !cpuctx->active_oncpu)
 1830		cpuctx->exclusive = 0;
 1831
 1832	perf_pmu_enable(event->pmu);
 1833}
 1834
 1835static void
 1836group_sched_out(struct perf_event *group_event,
 1837		struct perf_cpu_context *cpuctx,
 1838		struct perf_event_context *ctx)
 1839{
 1840	struct perf_event *event;
 1841	int state = group_event->state;
 1842
 1843	perf_pmu_disable(ctx->pmu);
 1844
 1845	event_sched_out(group_event, cpuctx, ctx);
 1846
 1847	/*
 1848	 * Schedule out siblings (if any):
 1849	 */
 1850	list_for_each_entry(event, &group_event->sibling_list, group_entry)
 1851		event_sched_out(event, cpuctx, ctx);
 1852
 1853	perf_pmu_enable(ctx->pmu);
 1854
 1855	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
 1856		cpuctx->exclusive = 0;
 1857}
 1858
 1859#define DETACH_GROUP	0x01UL
 1860
 1861/*
 1862 * Cross CPU call to remove a performance event
 1863 *
 1864 * We disable the event on the hardware level first. After that we
 1865 * remove it from the context list.
 1866 */
 1867static void
 1868__perf_remove_from_context(struct perf_event *event,
 1869			   struct perf_cpu_context *cpuctx,
 1870			   struct perf_event_context *ctx,
 1871			   void *info)
 1872{
 1873	unsigned long flags = (unsigned long)info;
 1874
 1875	event_sched_out(event, cpuctx, ctx);
 1876	if (flags & DETACH_GROUP)
 1877		perf_group_detach(event);
 1878	list_del_event(event, ctx);
 1879
 1880	if (!ctx->nr_events && ctx->is_active) {
 1881		ctx->is_active = 0;
 1882		if (ctx->task) {
 1883			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 1884			cpuctx->task_ctx = NULL;
 1885		}
 1886	}
 1887}
 1888
 1889/*
 1890 * Remove the event from a task's (or a CPU's) list of events.
 1891 *
 1892 * If event->ctx is a cloned context, callers must make sure that
 1893 * every task struct that event->ctx->task could possibly point to
 1894 * remains valid.  This is OK when called from perf_release since
 1895 * that only calls us on the top-level context, which can't be a clone.
 1896 * When called from perf_event_exit_task, it's OK because the
 1897 * context has been detached from its task.
 1898 */
 1899static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
 1900{
 1901	struct perf_event_context *ctx = event->ctx;
 1902
 1903	lockdep_assert_held(&ctx->mutex);
 1904
 1905	event_function_call(event, __perf_remove_from_context, (void *)flags);
 1906
 1907	/*
 1908	 * The above event_function_call() can NO-OP when it hits
 1909	 * TASK_TOMBSTONE. In that case we must already have been detached
 1910	 * from the context (by perf_event_exit_event()) but the grouping
 1911	 * might still be intact.
 1912	 */
 1913	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
 1914	if ((flags & DETACH_GROUP) &&
 1915	    (event->attach_state & PERF_ATTACH_GROUP)) {
 1916		/*
 1917		 * Since in that case we cannot possibly be scheduled, simply
 1918		 * detach now.
 1919		 */
 1920		raw_spin_lock_irq(&ctx->lock);
 1921		perf_group_detach(event);
 1922		raw_spin_unlock_irq(&ctx->lock);
 1923	}
 1924}
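
/*
 * Illustrative sketch, not part of the original file: the typical calling
 * pattern for perf_remove_from_context(). Callers hold ctx->mutex, as the
 * lockdep assertion above requires, and pass DETACH_GROUP when the event
 * should also be unlinked from its sibling group. The helper name below is
 * made up for the example.
 */
static void example_remove_event(struct perf_event *event)
{
	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);	/* takes ctx->mutex */
	perf_remove_from_context(event, DETACH_GROUP);
	perf_event_ctx_unlock(event, ctx);
}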
 1925
 1926/*
 1927 * Cross CPU call to disable a performance event
 1928 */
 1929static void __perf_event_disable(struct perf_event *event,
 1930				 struct perf_cpu_context *cpuctx,
 1931				 struct perf_event_context *ctx,
 1932				 void *info)
 1933{
 1934	if (event->state < PERF_EVENT_STATE_INACTIVE)
 1935		return;
 1936
 1937	update_context_time(ctx);
 1938	update_cgrp_time_from_event(event);
 1939	update_group_times(event);
 1940	if (event == event->group_leader)
 1941		group_sched_out(event, cpuctx, ctx);
 1942	else
 1943		event_sched_out(event, cpuctx, ctx);
 1944	event->state = PERF_EVENT_STATE_OFF;
 1945}
 1946
 1947/*
 1948 * Disable an event.
 1949 *
 1950 * If event->ctx is a cloned context, callers must make sure that
 1951 * every task struct that event->ctx->task could possibly point to
 1952 * remains valid.  This condition is satisfied when called through
 1953 * perf_event_for_each_child or perf_event_for_each because they
 1954 * hold the top-level event's child_mutex, so any descendant that
 1955 * goes to exit will block in perf_event_exit_event().
 1956 *
 1957 * When called from perf_pending_event it's OK because event->ctx
 1958 * is the current context on this CPU and preemption is disabled,
 1959 * hence we can't get into perf_event_task_sched_out for this context.
 1960 */
 1961static void _perf_event_disable(struct perf_event *event)
 1962{
 1963	struct perf_event_context *ctx = event->ctx;
 1964
 1965	raw_spin_lock_irq(&ctx->lock);
 1966	if (event->state <= PERF_EVENT_STATE_OFF) {
 1967		raw_spin_unlock_irq(&ctx->lock);
 1968		return;
 1969	}
 1970	raw_spin_unlock_irq(&ctx->lock);
 1971
 1972	event_function_call(event, __perf_event_disable, NULL);
 1973}
 1974
 1975void perf_event_disable_local(struct perf_event *event)
 1976{
 1977	event_function_local(event, __perf_event_disable, NULL);
 1978}
 1979
 1980/*
 1981 * Strictly speaking kernel users cannot create groups and therefore this
 1982 * interface does not need the perf_event_ctx_lock() magic.
 1983 */
 1984void perf_event_disable(struct perf_event *event)
 1985{
 1986	struct perf_event_context *ctx;
 1987
 1988	ctx = perf_event_ctx_lock(event);
 1989	_perf_event_disable(event);
 1990	perf_event_ctx_unlock(event, ctx);
 1991}
 1992EXPORT_SYMBOL_GPL(perf_event_disable);
 1993
 1994void perf_event_disable_inatomic(struct perf_event *event)
 1995{
 1996	event->pending_disable = 1;
 1997	irq_work_queue(&event->pending);
 1998}
 1999
 2000static void perf_set_shadow_time(struct perf_event *event,
 2001				 struct perf_event_context *ctx,
 2002				 u64 tstamp)
 2003{
 2004	/*
 2005	 * use the correct time source for the time snapshot
 2006	 *
 2007	 * We could get by without this by leveraging the
 2008	 * fact that to get to this function, the caller
 2009	 * has most likely already called update_context_time()
 2010	 * and update_cgrp_time_xx() and thus both timestamps
 2011	 * are identical (or very close). Given that tstamp is
 2012	 * already adjusted for cgroup, we could say that:
 2013	 *    tstamp - ctx->timestamp
 2014	 * is equivalent to
 2015	 *    tstamp - cgrp->timestamp.
 2016	 *
 2017	 * Then, in perf_output_read(), the calculation would
 2018	 * work with no changes because:
 2019	 * - event is guaranteed scheduled in
 2020	 * - no scheduled out in between
 2021	 * - thus the timestamp would be the same
 2022	 *
 2023	 * But this is a bit hairy.
 2024	 *
 2025	 * So instead, we have an explicit cgroup call to remain
 2026	 * within the same time source all along. We believe it
 2027	 * is cleaner and simpler to understand.
 2028	 */
 2029	if (is_cgroup_event(event))
 2030		perf_cgroup_set_shadow_time(event, tstamp);
 2031	else
 2032		event->shadow_ctx_time = tstamp - ctx->timestamp;
 2033}
 2034
 2035#define MAX_INTERRUPTS (~0ULL)
 2036
 2037static void perf_log_throttle(struct perf_event *event, int enable);
 2038static void perf_log_itrace_start(struct perf_event *event);
 2039
 2040static int
 2041event_sched_in(struct perf_event *event,
 2042		 struct perf_cpu_context *cpuctx,
 2043		 struct perf_event_context *ctx)
 2044{
 2045	u64 tstamp = perf_event_time(event);
 2046	int ret = 0;
 2047
 2048	lockdep_assert_held(&ctx->lock);
 2049
 2050	if (event->state <= PERF_EVENT_STATE_OFF)
 2051		return 0;
 2052
 2053	WRITE_ONCE(event->oncpu, smp_processor_id());
 2054	/*
 2055	 * Order event::oncpu write to happen before the ACTIVE state
 2056	 * is visible.
 2057	 */
 2058	smp_wmb();
 2059	WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
 2060
 2061	/*
 2062	 * Unthrottle events, since we scheduled we might have missed several
 2063	 * ticks already, also for a heavily scheduling task there is little
 2064	 * guarantee it'll get a tick in a timely manner.
 2065	 */
 2066	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
 2067		perf_log_throttle(event, 1);
 2068		event->hw.interrupts = 0;
 2069	}
 2070
 2071	/*
 2072	 * The new state must be visible before we turn it on in the hardware:
 2073	 */
 2074	smp_wmb();
 2075
 2076	perf_pmu_disable(event->pmu);
 2077
 2078	perf_set_shadow_time(event, ctx, tstamp);
 2079
 2080	perf_log_itrace_start(event);
 2081
 2082	if (event->pmu->add(event, PERF_EF_START)) {
 2083		event->state = PERF_EVENT_STATE_INACTIVE;
 2084		event->oncpu = -1;
 2085		ret = -EAGAIN;
 2086		goto out;
 2087	}
 2088
 2089	event->tstamp_running += tstamp - event->tstamp_stopped;
 2090
 2091	if (!is_software_event(event))
 2092		cpuctx->active_oncpu++;
 2093	if (!ctx->nr_active++)
 2094		perf_event_ctx_activate(ctx);
 2095	if (event->attr.freq && event->attr.sample_freq)
 2096		ctx->nr_freq++;
 2097
 2098	if (event->attr.exclusive)
 2099		cpuctx->exclusive = 1;
 2100
 2101out:
 2102	perf_pmu_enable(event->pmu);
 2103
 2104	return ret;
 2105}
 2106
 2107static int
 2108group_sched_in(struct perf_event *group_event,
 2109	       struct perf_cpu_context *cpuctx,
 2110	       struct perf_event_context *ctx)
 2111{
 2112	struct perf_event *event, *partial_group = NULL;
 2113	struct pmu *pmu = ctx->pmu;
 2114	u64 now = ctx->time;
 2115	bool simulate = false;
 2116
 2117	if (group_event->state == PERF_EVENT_STATE_OFF)
 2118		return 0;
 2119
 2120	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 2121
 2122	if (event_sched_in(group_event, cpuctx, ctx)) {
 2123		pmu->cancel_txn(pmu);
 2124		perf_mux_hrtimer_restart(cpuctx);
 2125		return -EAGAIN;
 2126	}
 2127
 2128	/*
 2129	 * Schedule in siblings as one group (if any):
 2130	 */
 2131	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 2132		if (event_sched_in(event, cpuctx, ctx)) {
 2133			partial_group = event;
 2134			goto group_error;
 2135		}
 2136	}
 2137
 2138	if (!pmu->commit_txn(pmu))
 2139		return 0;
 2140
 2141group_error:
 2142	/*
 2143	 * Groups can be scheduled in as one unit only, so undo any
 2144	 * partial group before returning:
 2145	 * The events up to the failed event are scheduled out normally,
 2146	 * tstamp_stopped will be updated.
 2147	 *
 2148	 * The failed events and the remaining siblings need to have
 2149	 * their timings updated as if they had gone through event_sched_in()
 2150	 * and event_sched_out(). This is required to get consistent timings
 2151	 * across the group. This also takes care of the case where the group
 2152	 * could never be scheduled by ensuring tstamp_stopped is set to mark
 2153	 * the time the event was actually stopped, such that time delta
 2154	 * calculation in update_event_times() is correct.
 2155	 */
 2156	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 2157		if (event == partial_group)
 2158			simulate = true;
 2159
 2160		if (simulate) {
 2161			event->tstamp_running += now - event->tstamp_stopped;
 2162			event->tstamp_stopped = now;
 2163		} else {
 2164			event_sched_out(event, cpuctx, ctx);
 2165		}
 2166	}
 2167	event_sched_out(group_event, cpuctx, ctx);
 2168
 2169	pmu->cancel_txn(pmu);
 2170
 2171	perf_mux_hrtimer_restart(cpuctx);
 2172
 2173	return -EAGAIN;
 2174}
 2175
 2176/*
 2177 * Work out whether we can put this event group on the CPU now.
 2178 */
 2179static int group_can_go_on(struct perf_event *event,
 2180			   struct perf_cpu_context *cpuctx,
 2181			   int can_add_hw)
 2182{
 2183	/*
 2184	 * Groups consisting entirely of software events can always go on.
 2185	 */
 2186	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
 2187		return 1;
 2188	/*
 2189	 * If an exclusive group is already on, no other hardware
 2190	 * events can go on.
 2191	 */
 2192	if (cpuctx->exclusive)
 2193		return 0;
 2194	/*
 2195	 * If this group is exclusive and there are already
 2196	 * events on the CPU, it can't go on.
 2197	 */
 2198	if (event->attr.exclusive && cpuctx->active_oncpu)
 2199		return 0;
 2200	/*
 2201	 * Otherwise, try to add it if all previous groups were able
 2202	 * to go on.
 2203	 */
 2204	return can_add_hw;
 2205}
 2206
 2207static void add_event_to_ctx(struct perf_event *event,
 2208			       struct perf_event_context *ctx)
 2209{
 2210	u64 tstamp = perf_event_time(event);
 2211
 2212	list_add_event(event, ctx);
 2213	perf_group_attach(event);
 2214	event->tstamp_enabled = tstamp;
 2215	event->tstamp_running = tstamp;
 2216	event->tstamp_stopped = tstamp;
 2217}
 2218
 2219static void ctx_sched_out(struct perf_event_context *ctx,
 2220			  struct perf_cpu_context *cpuctx,
 2221			  enum event_type_t event_type);
 2222static void
 2223ctx_sched_in(struct perf_event_context *ctx,
 2224	     struct perf_cpu_context *cpuctx,
 2225	     enum event_type_t event_type,
 2226	     struct task_struct *task);
 2227
 2228static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 2229			       struct perf_event_context *ctx)
 2230{
 2231	if (!cpuctx->task_ctx)
 2232		return;
 2233
 2234	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 2235		return;
 2236
 2237	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 2238}
 2239
 2240static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 2241				struct perf_event_context *ctx,
 2242				struct task_struct *task)
 2243{
 2244	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
 2245	if (ctx)
 2246		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
 2247	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
 2248	if (ctx)
 2249		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 2250}
 2251
 2252static void ctx_resched(struct perf_cpu_context *cpuctx,
 2253			struct perf_event_context *task_ctx)
 2254{
 2255	perf_pmu_disable(cpuctx->ctx.pmu);
 2256	if (task_ctx)
 2257		task_ctx_sched_out(cpuctx, task_ctx);
 2258	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 2259	perf_event_sched_in(cpuctx, task_ctx, current);
 2260	perf_pmu_enable(cpuctx->ctx.pmu);
 2261}
 2262
 2263/*
 2264 * Cross CPU call to install and enable a performance event
 2265 *
 2266 * Very similar to remote_function() + event_function() but cannot assume that
 2267 * things like ctx->is_active and cpuctx->task_ctx are set.
 2268 */
 2269static int  __perf_install_in_context(void *info)
 2270{
 2271	struct perf_event *event = info;
 2272	struct perf_event_context *ctx = event->ctx;
 2273	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 2274	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 2275	bool reprogram = true;
 2276	int ret = 0;
 2277
 2278	raw_spin_lock(&cpuctx->ctx.lock);
 2279	if (ctx->task) {
 2280		raw_spin_lock(&ctx->lock);
 2281		task_ctx = ctx;
 2282
 2283		reprogram = (ctx->task == current);
 2284
 2285		/*
 2286		 * If the task is running, it must be running on this CPU,
 2287		 * otherwise we cannot reprogram things.
 2288		 *
 2289		 * If it's not running, we don't care, ctx->lock will
 2290		 * serialize against it becoming runnable.
 2291		 */
 2292		if (task_curr(ctx->task) && !reprogram) {
 2293			ret = -ESRCH;
 2294			goto unlock;
 2295		}
 2296
 2297		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
 2298	} else if (task_ctx) {
 2299		raw_spin_lock(&task_ctx->lock);
 2300	}
 2301
 2302	if (reprogram) {
 2303		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 2304		add_event_to_ctx(event, ctx);
 2305		ctx_resched(cpuctx, task_ctx);
 2306	} else {
 2307		add_event_to_ctx(event, ctx);
 2308	}
 2309
 2310unlock:
 2311	perf_ctx_unlock(cpuctx, task_ctx);
 2312
 2313	return ret;
 2314}
 2315
 2316/*
 2317 * Attach a performance event to a context.
 2318 *
 2319 * Very similar to event_function_call, see comment there.
 2320 */
 2321static void
 2322perf_install_in_context(struct perf_event_context *ctx,
 2323			struct perf_event *event,
 2324			int cpu)
 2325{
 2326	struct task_struct *task = READ_ONCE(ctx->task);
 2327
 2328	lockdep_assert_held(&ctx->mutex);
 2329
 2330	if (event->cpu != -1)
 2331		event->cpu = cpu;
 2332
 2333	/*
 2334	 * Ensures that if we can observe event->ctx, both the event and ctx
 2335	 * will be 'complete'. See perf_iterate_sb_cpu().
 2336	 */
 2337	smp_store_release(&event->ctx, ctx);
 2338
 2339	if (!task) {
 2340		cpu_function_call(cpu, __perf_install_in_context, event);
 2341		return;
 2342	}
 2343
 2344	/*
 2345	 * Should not happen, we validate the ctx is still alive before calling.
 2346	 */
 2347	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
 2348		return;
 2349
 2350	/*
 2351	 * Installing events is tricky because we cannot rely on ctx->is_active
 2352	 * to be set in case this is the nr_events 0 -> 1 transition.
 2353	 *
 2354	 * Instead we use task_curr(), which tells us if the task is running.
 2355	 * However, since we use task_curr() outside of rq::lock, we can race
 2356	 * against the actual state. This means the result can be wrong.
 2357	 *
 2358	 * If we get a false positive, we retry, this is harmless.
 2359	 *
 2360	 * If we get a false negative, things are complicated. If we are after
 2361	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
 2362	 * value must be correct. If we're before, it doesn't matter since
 2363	 * perf_event_context_sched_in() will program the counter.
 2364	 *
 2365	 * However, this hinges on the remote context switch having observed
 2366	 * our task->perf_event_ctxp[] store, such that it will in fact take
 2367	 * ctx::lock in perf_event_context_sched_in().
 2368	 *
 2369	 * We do this by task_function_call(), if the IPI fails to hit the task
 2370	 * we know any future context switch of task must see the
 2371	 * perf_event_ctxp[] store.
 2372	 */
 2373
 2374	/*
 2375	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
 2376	 * task_cpu() load, such that if the IPI then does not find the task
 2377	 * running, a future context switch of that task must observe the
 2378	 * store.
 2379	 */
 2380	smp_mb();
 2381again:
 2382	if (!task_function_call(task, __perf_install_in_context, event))
 2383		return;
 2384
 2385	raw_spin_lock_irq(&ctx->lock);
 2386	task = ctx->task;
 2387	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
 2388		/*
 2389		 * Cannot happen because we already checked above (which also
 2390		 * cannot happen), and we hold ctx->mutex, which serializes us
 2391		 * against perf_event_exit_task_context().
 2392		 */
 2393		raw_spin_unlock_irq(&ctx->lock);
 2394		return;
 2395	}
 2396	/*
 2397	 * If the task is not running, ctx->lock will avoid it becoming so,
 2398	 * thus we can safely install the event.
 2399	 */
 2400	if (task_curr(task)) {
 2401		raw_spin_unlock_irq(&ctx->lock);
 2402		goto again;
 2403	}
 2404	add_event_to_ctx(event, ctx);
 2405	raw_spin_unlock_irq(&ctx->lock);
 2406}
 2407
 2408/*
 2409 * Put an event into inactive state and update time fields.
 2410 * Enabling the leader of a group effectively enables all
 2411 * the group members that aren't explicitly disabled, so we
 2412 * have to update their ->tstamp_enabled also.
 2413 * Note: this works for group members as well as group leaders
 2414 * since the non-leader members' sibling_lists will be empty.
 2415 */
 2416static void __perf_event_mark_enabled(struct perf_event *event)
 2417{
 2418	struct perf_event *sub;
 2419	u64 tstamp = perf_event_time(event);
 2420
 2421	event->state = PERF_EVENT_STATE_INACTIVE;
 2422	event->tstamp_enabled = tstamp - event->total_time_enabled;
 2423	list_for_each_entry(sub, &event->sibling_list, group_entry) {
 2424		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 2425			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 2426	}
 2427}
 2428
 2429/*
 2430 * Cross CPU call to enable a performance event
 2431 */
 2432static void __perf_event_enable(struct perf_event *event,
 2433				struct perf_cpu_context *cpuctx,
 2434				struct perf_event_context *ctx,
 2435				void *info)
 2436{
 2437	struct perf_event *leader = event->group_leader;
 2438	struct perf_event_context *task_ctx;
 2439
 2440	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
 2441	    event->state <= PERF_EVENT_STATE_ERROR)
 2442		return;
 2443
 2444	if (ctx->is_active)
 2445		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 2446
 2447	__perf_event_mark_enabled(event);
 2448
 2449	if (!ctx->is_active)
 2450		return;
 2451
 2452	if (!event_filter_match(event)) {
 2453		if (is_cgroup_event(event))
 2454			perf_cgroup_defer_enabled(event);
 2455		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 2456		return;
 2457	}
 2458
 2459	/*
 2460	 * If the event is in a group and isn't the group leader,
 2461	 * then don't put it on unless the group is on.
 2462	 */
 2463	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
 2464		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 2465		return;
 2466	}
 2467
 2468	task_ctx = cpuctx->task_ctx;
 2469	if (ctx->task)
 2470		WARN_ON_ONCE(task_ctx != ctx);
 2471
 2472	ctx_resched(cpuctx, task_ctx);
 2473}
 2474
 2475/*
 2476 * Enable an event.
 2477 *
 2478 * If event->ctx is a cloned context, callers must make sure that
 2479 * every task struct that event->ctx->task could possibly point to
 2480 * remains valid.  This condition is satisfied when called through
 2481 * perf_event_for_each_child or perf_event_for_each as described
 2482 * for perf_event_disable.
 2483 */
 2484static void _perf_event_enable(struct perf_event *event)
 2485{
 2486	struct perf_event_context *ctx = event->ctx;
 2487
 2488	raw_spin_lock_irq(&ctx->lock);
 2489	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
 2490	    event->state <  PERF_EVENT_STATE_ERROR) {
 2491		raw_spin_unlock_irq(&ctx->lock);
 2492		return;
 2493	}
 2494
 2495	/*
 2496	 * If the event is in error state, clear that first.
 2497	 *
 2498	 * That way, if we see the event in error state below, we know that it
 2499	 * has gone back into error state, as distinct from the task having
 2500	 * been scheduled away before the cross-call arrived.
 2501	 */
 2502	if (event->state == PERF_EVENT_STATE_ERROR)
 2503		event->state = PERF_EVENT_STATE_OFF;
 2504	raw_spin_unlock_irq(&ctx->lock);
 2505
 2506	event_function_call(event, __perf_event_enable, NULL);
 2507}
 2508
 2509/*
 2510 * See perf_event_disable();
 2511 */
 2512void perf_event_enable(struct perf_event *event)
 2513{
 2514	struct perf_event_context *ctx;
 2515
 2516	ctx = perf_event_ctx_lock(event);
 2517	_perf_event_enable(event);
 2518	perf_event_ctx_unlock(event, ctx);
 2519}
 2520EXPORT_SYMBOL_GPL(perf_event_enable);
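
/*
 * Illustrative sketch, not part of the original file: an in-kernel user
 * pairing perf_event_enable()/perf_event_disable() around a region of
 * interest. The attr values and all example_* names are assumptions made
 * up for this example.
 */
static struct perf_event *example_event;

static int example_cycles_start(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.disabled	= 1,	/* create in OFF state, enable explicitly */
	};

	example_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
							  NULL, NULL);
	if (IS_ERR(example_event))
		return PTR_ERR(example_event);

	perf_event_enable(example_event);	/* ends up in _perf_event_enable() */
	return 0;
}

static void example_cycles_stop(void)
{
	u64 enabled, running, count;

	perf_event_disable(example_event);	/* ends up in _perf_event_disable() */
	count = perf_event_read_value(example_event, &enabled, &running);
	pr_info("cycles: %llu\n", (unsigned long long)count);
	perf_event_release_kernel(example_event);
}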
 2521
 2522struct stop_event_data {
 2523	struct perf_event	*event;
 2524	unsigned int		restart;
 2525};
 2526
 2527static int __perf_event_stop(void *info)
 2528{
 2529	struct stop_event_data *sd = info;
 2530	struct perf_event *event = sd->event;
 2531
 2532	/* if it's already INACTIVE, do nothing */
 2533	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
 2534		return 0;
 2535
 2536	/* matches smp_wmb() in event_sched_in() */
 2537	smp_rmb();
 2538
 2539	/*
 2540	 * There is a window with interrupts enabled before we get here,
 2541	 * so we need to check again lest we try to stop another CPU's event.
 2542	 */
 2543	if (READ_ONCE(event->oncpu) != smp_processor_id())
 2544		return -EAGAIN;
 2545
 2546	event->pmu->stop(event, PERF_EF_UPDATE);
 2547
 2548	/*
 2549	 * May race with the actual stop (through perf_pmu_output_stop()),
 2550	 * but it is only used for events with AUX ring buffer, and such
 2551	 * events will refuse to restart because of rb::aux_mmap_count==0,
 2552	 * see comments in perf_aux_output_begin().
 2553	 *
 2554	 * Since this is happening on an event-local CPU, no trace is lost
 2555	 * while restarting.
 2556	 */
 2557	if (sd->restart)
 2558		event->pmu->start(event, 0);
 2559
 2560	return 0;
 2561}
 2562
 2563static int perf_event_stop(struct perf_event *event, int restart)
 2564{
 2565	struct stop_event_data sd = {
 2566		.event		= event,
 2567		.restart	= restart,
 2568	};
 2569	int ret = 0;
 2570
 2571	do {
 2572		if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
 2573			return 0;
 2574
 2575		/* matches smp_wmb() in event_sched_in() */
 2576		smp_rmb();
 2577
 2578		/*
 2579		 * We only want to restart ACTIVE events, so if the event goes
 2580		 * inactive here (event->oncpu==-1), there's nothing more to do;
 2581		 * fall through with ret==-ENXIO.
 2582		 */
 2583		ret = cpu_function_call(READ_ONCE(event->oncpu),
 2584					__perf_event_stop, &sd);
 2585	} while (ret == -EAGAIN);
 2586
 2587	return ret;
 2588}
 2589
 2590/*
 2591 * In order to contain the amount of raciness and trickiness in the address
 2592 * filter configuration management, it is a two-part process:
 2593 *
 2594 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 2595 *      we update the addresses of corresponding vmas in
 2596 *	event::addr_filters_offs array and bump the event::addr_filters_gen;
 2597 * (p2) when an event is scheduled in (pmu::add), it calls
 2598 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 2599 *      if the generation has changed since the previous call.
 2600 *
 2601 * If (p1) happens while the event is active, we restart it to force (p2).
 2602 *
 2603 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 2604 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
 2605 *     ioctl;
 2606 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 2607 *     registered mapping, called for every new mmap(), with mm::mmap_sem down
 2608 *     for reading;
 2609 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
 2610 *     of exec.
 2611 */
 2612void perf_event_addr_filters_sync(struct perf_event *event)
 2613{
 2614	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
 2615
 2616	if (!has_addr_filter(event))
 2617		return;
 2618
 2619	raw_spin_lock(&ifh->lock);
 2620	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
 2621		event->pmu->addr_filters_sync(event);
 2622		event->hw.addr_filters_gen = event->addr_filters_gen;
 2623	}
 2624	raw_spin_unlock(&ifh->lock);
 2625}
 2626EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
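
/*
 * Illustrative userspace sketch, not part of the original file: address
 * filters enter the kernel as text via the PERF_EVENT_IOC_SET_FILTER ioctl
 * (step (1) above). The address range and binary path are made up.
 */
#if 0	/* userspace example, for illustration only */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int example_set_addr_filter(int perf_fd)
{
	/* trace only a 0x100-byte range inside the named object */
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER,
		     "filter 0x1000/0x100@/usr/bin/example");
}
#endif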
 2627
 2628static int _perf_event_refresh(struct perf_event *event, int refresh)
 2629{
 2630	/*
 2631	 * not supported on inherited events
 2632	 */
 2633	if (event->attr.inherit || !is_sampling_event(event))
 2634		return -EINVAL;
 2635
 2636	atomic_add(refresh, &event->event_limit);
 2637	_perf_event_enable(event);
 2638
 2639	return 0;
 2640}
 2641
 2642/*
 2643 * See perf_event_disable()
 2644 */
 2645int perf_event_refresh(struct perf_event *event, int refresh)
 2646{
 2647	struct perf_event_context *ctx;
 2648	int ret;
 2649
 2650	ctx = perf_event_ctx_lock(event);
 2651	ret = _perf_event_refresh(event, refresh);
 2652	perf_event_ctx_unlock(event, ctx);
 2653
 2654	return ret;
 2655}
 2656EXPORT_SYMBOL_GPL(perf_event_refresh);
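
/*
 * Illustrative userspace sketch, not part of the original file:
 * _perf_event_refresh() backs the PERF_EVENT_IOC_REFRESH ioctl. Each
 * overflow of the sampling event consumes one unit of the budget added
 * here; once it runs out the event is disabled again.
 */
#if 0	/* userspace example, for illustration only */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static void example_arm_one_more_sample(int perf_fd)
{
	/* perf_fd from perf_event_open() with sample_period/sample_freq set */
	ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1);
}
#endif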
 2657
 2658static void ctx_sched_out(struct perf_event_context *ctx,
 2659			  struct perf_cpu_context *cpuctx,
 2660			  enum event_type_t event_type)
 2661{
 2662	int is_active = ctx->is_active;
 2663	struct perf_event *event;
 2664
 2665	lockdep_assert_held(&ctx->lock);
 2666
 2667	if (likely(!ctx->nr_events)) {
 2668		/*
 2669		 * See __perf_remove_from_context().
 2670		 */
 2671		WARN_ON_ONCE(ctx->is_active);
 2672		if (ctx->task)
 2673			WARN_ON_ONCE(cpuctx->task_ctx);
 2674		return;
 2675	}
 2676
 2677	ctx->is_active &= ~event_type;
 2678	if (!(ctx->is_active & EVENT_ALL))
 2679		ctx->is_active = 0;
 2680
 2681	if (ctx->task) {
 2682		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 2683		if (!ctx->is_active)
 2684			cpuctx->task_ctx = NULL;
 2685	}
 2686
 2687	/*
 2688	 * Always update time if it was set; not only when it changes.
 2689	 * Otherwise we can 'forget' to update time for any but the last
 2690	 * context we sched out. For example:
 2691	 *
 2692	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
 2693	 *   ctx_sched_out(.event_type = EVENT_PINNED)
 2694	 *
 2695	 * would only update time for the pinned events.
 2696	 */
 2697	if (is_active & EVENT_TIME) {
 2698		/* update (and stop) ctx time */
 2699		update_context_time(ctx);
 2700		update_cgrp_time_from_cpuctx(cpuctx);
 2701	}
 2702
 2703	is_active ^= ctx->is_active; /* changed bits */
 2704
 2705	if (!ctx->nr_active || !(is_active & EVENT_ALL))
 2706		return;
 2707
 2708	perf_pmu_disable(ctx->pmu);
 2709	if (is_active & EVENT_PINNED) {
 2710		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 2711			group_sched_out(event, cpuctx, ctx);
 2712	}
 2713
 2714	if (is_active & EVENT_FLEXIBLE) {
 2715		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 2716			group_sched_out(event, cpuctx, ctx);
 2717	}
 2718	perf_pmu_enable(ctx->pmu);
 2719}
 2720
 2721/*
 2722 * Test whether two contexts are equivalent, i.e. whether they have both been
 2723 * cloned from the same version of the same context.
 2724 *
 2725 * Equivalence is measured using a generation number in the context that is
 2726 * incremented on each modification to it; see unclone_ctx(), list_add_event()
 2727 * and list_del_event().
 2728 */
 2729static int context_equiv(struct perf_event_context *ctx1,
 2730			 struct perf_event_context *ctx2)
 2731{
 2732	lockdep_assert_held(&ctx1->lock);
 2733	lockdep_assert_held(&ctx2->lock);
 2734
 2735	/* Pinning disables the swap optimization */
 2736	if (ctx1->pin_count || ctx2->pin_count)
 2737		return 0;
 2738
 2739	/* If ctx1 is the parent of ctx2 */
 2740	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
 2741		return 1;
 2742
 2743	/* If ctx2 is the parent of ctx1 */
 2744	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
 2745		return 1;
 2746
 2747	/*
 2748	 * If ctx1 and ctx2 have the same parent, we flatten the parent
 2749	 * hierarchy, see perf_event_init_context().
 2750	 */
 2751	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
 2752			ctx1->parent_gen == ctx2->parent_gen)
 2753		return 1;
 2754
 2755	/* Unmatched */
 2756	return 0;
 2757}
 2758
 2759static void __perf_event_sync_stat(struct perf_event *event,
 2760				     struct perf_event *next_event)
 2761{
 2762	u64 value;
 2763
 2764	if (!event->attr.inherit_stat)
 2765		return;
 2766
 2767	/*
 2768	 * Update the event value, we cannot use perf_event_read()
 2769	 * because we're in the middle of a context switch and have IRQs
 2770	 * disabled, which upsets smp_call_function_single(), however
 2771	 * we know the event must be on the current CPU, therefore we
 2772	 * don't need to use it.
 2773	 */
 2774	switch (event->state) {
 2775	case PERF_EVENT_STATE_ACTIVE:
 2776		event->pmu->read(event);
 2777		/* fall-through */
 2778
 2779	case PERF_EVENT_STATE_INACTIVE:
 2780		update_event_times(event);
 2781		break;
 2782
 2783	default:
 2784		break;
 2785	}
 2786
 2787	/*
 2788	 * In order to keep per-task stats reliable we need to flip the event
 2789	 * values when we flip the contexts.
 2790	 */
 2791	value = local64_read(&next_event->count);
 2792	value = local64_xchg(&event->count, value);
 2793	local64_set(&next_event->count, value);
 2794
 2795	swap(event->total_time_enabled, next_event->total_time_enabled);
 2796	swap(event->total_time_running, next_event->total_time_running);
 2797
 2798	/*
 2799	 * Since we swizzled the values, update the user visible data too.
 2800	 */
 2801	perf_event_update_userpage(event);
 2802	perf_event_update_userpage(next_event);
 2803}
 2804
 2805static void perf_event_sync_stat(struct perf_event_context *ctx,
 2806				   struct perf_event_context *next_ctx)
 2807{
 2808	struct perf_event *event, *next_event;
 2809
 2810	if (!ctx->nr_stat)
 2811		return;
 2812
 2813	update_context_time(ctx);
 2814
 2815	event = list_first_entry(&ctx->event_list,
 2816				   struct perf_event, event_entry);
 2817
 2818	next_event = list_first_entry(&next_ctx->event_list,
 2819					struct perf_event, event_entry);
 2820
 2821	while (&event->event_entry != &ctx->event_list &&
 2822	       &next_event->event_entry != &next_ctx->event_list) {
 2823
 2824		__perf_event_sync_stat(event, next_event);
 2825
 2826		event = list_next_entry(event, event_entry);
 2827		next_event = list_next_entry(next_event, event_entry);
 2828	}
 2829}
 2830
 2831static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 2832					 struct task_struct *next)
 2833{
 2834	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 2835	struct perf_event_context *next_ctx;
 2836	struct perf_event_context *parent, *next_parent;
 2837	struct perf_cpu_context *cpuctx;
 2838	int do_switch = 1;
 2839
 2840	if (likely(!ctx))
 2841		return;
 2842
 2843	cpuctx = __get_cpu_context(ctx);
 2844	if (!cpuctx->task_ctx)
 2845		return;
 2846
 2847	rcu_read_lock();
 2848	next_ctx = next->perf_event_ctxp[ctxn];
 2849	if (!next_ctx)
 2850		goto unlock;
 2851
 2852	parent = rcu_dereference(ctx->parent_ctx);
 2853	next_parent = rcu_dereference(next_ctx->parent_ctx);
 2854
 2855	/* If neither context has a parent context, they cannot be clones. */
 2856	if (!parent && !next_parent)
 2857		goto unlock;
 2858
 2859	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
 2860		/*
 2861		 * Looks like the two contexts are clones, so we might be
 2862		 * able to optimize the context switch.  We lock both
 2863		 * contexts and check that they are clones under the
 2864		 * lock (including re-checking that neither has been
 2865		 * uncloned in the meantime).  It doesn't matter which
 2866		 * order we take the locks because no other cpu could
 2867		 * be trying to lock both of these tasks.
 2868		 */
 2869		raw_spin_lock(&ctx->lock);
 2870		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 2871		if (context_equiv(ctx, next_ctx)) {
 2872			WRITE_ONCE(ctx->task, next);
 2873			WRITE_ONCE(next_ctx->task, task);
 2874
 2875			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 2876
 2877			/*
 2878			 * RCU_INIT_POINTER here is safe because we've not
 2879			 * modified the ctx and the above modification of
 2880			 * ctx->task and ctx->task_ctx_data are immaterial
 2881			 * since those values are always verified under
 2882			 * ctx->lock which we're now holding.
 2883			 */
 2884			RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
 2885			RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
 2886
 2887			do_switch = 0;
 2888
 2889			perf_event_sync_stat(ctx, next_ctx);
 2890		}
 2891		raw_spin_unlock(&next_ctx->lock);
 2892		raw_spin_unlock(&ctx->lock);
 2893	}
 2894unlock:
 2895	rcu_read_unlock();
 2896
 2897	if (do_switch) {
 2898		raw_spin_lock(&ctx->lock);
 2899		task_ctx_sched_out(cpuctx, ctx);
 2900		raw_spin_unlock(&ctx->lock);
 2901	}
 2902}
 2903
 2904static DEFINE_PER_CPU(struct list_head, sched_cb_list);
 2905
 2906void perf_sched_cb_dec(struct pmu *pmu)
 2907{
 2908	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 2909
 2910	this_cpu_dec(perf_sched_cb_usages);
 2911
 2912	if (!--cpuctx->sched_cb_usage)
 2913		list_del(&cpuctx->sched_cb_entry);
 2914}
 2915
 2916
 2917void perf_sched_cb_inc(struct pmu *pmu)
 2918{
 2919	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 2920
 2921	if (!cpuctx->sched_cb_usage++)
 2922		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
 2923
 2924	this_cpu_inc(perf_sched_cb_usages);
 2925}
 2926
 2927/*
 2928 * This function provides the context switch callback to the lower code
 2929 * layer. It is invoked ONLY when the context switch callback is enabled.
 2930 *
 2931 * This callback is relevant even to per-cpu events; for example multi event
 2932 * PEBS requires this to provide PID/TID information. This requires we flush
 2933 * all queued PEBS records before we context switch to a new task.
 2934 */
 2935static void perf_pmu_sched_task(struct task_struct *prev,
 2936				struct task_struct *next,
 2937				bool sched_in)
 2938{
 2939	struct perf_cpu_context *cpuctx;
 2940	struct pmu *pmu;
 2941
 2942	if (prev == next)
 2943		return;
 2944
 2945	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
 2946		pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
 2947
 2948		if (WARN_ON_ONCE(!pmu->sched_task))
 2949			continue;
 2950
 2951		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 2952		perf_pmu_disable(pmu);
 2953
 2954		pmu->sched_task(cpuctx->task_ctx, sched_in);
 2955
 2956		perf_pmu_enable(pmu);
 2957		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 2958	}
 2959}
 2960
 2961static void perf_event_switch(struct task_struct *task,
 2962			      struct task_struct *next_prev, bool sched_in);
 2963
 2964#define for_each_task_context_nr(ctxn)					\
 2965	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
 2966
 2967/*
 2968 * Called from scheduler to remove the events of the current task,
 2969 * with interrupts disabled.
 2970 *
 2971 * We stop each event and update the event value in event->count.
 2972 *
 2973 * This does not protect us against NMI, but disable()
 2974 * sets the disabled bit in the control field of event _before_
 2975 * accessing the event control register. If an NMI hits, then it will
 2976 * not restart the event.
 2977 */
 2978void __perf_event_task_sched_out(struct task_struct *task,
 2979				 struct task_struct *next)
 2980{
 2981	int ctxn;
 2982
 2983	if (__this_cpu_read(perf_sched_cb_usages))
 2984		perf_pmu_sched_task(task, next, false);
 2985
 2986	if (atomic_read(&nr_switch_events))
 2987		perf_event_switch(task, next, false);
 2988
 2989	for_each_task_context_nr(ctxn)
 2990		perf_event_context_sched_out(task, ctxn, next);
 2991
 2992	/*
 2993	 * If cgroup events exist on this CPU, then we need
 2994	 * to check if we have to switch out PMU state.
 2995	 * Cgroup events are system-wide mode only.
 2996	 */
 2997	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
 2998		perf_cgroup_sched_out(task, next);
 2999}
 3000
 3001/*
 3002 * Called with IRQs disabled
 3003 */
 3004static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 3005			      enum event_type_t event_type)
 3006{
 3007	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 3008}
 3009
 3010static void
 3011ctx_pinned_sched_in(struct perf_event_context *ctx,
 3012		    struct perf_cpu_context *cpuctx)
 3013{
 3014	struct perf_event *event;
 3015
 3016	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 3017		if (event->state <= PERF_EVENT_STATE_OFF)
 3018			continue;
 3019		if (!event_filter_match(event))
 3020			continue;
 3021
 3022		/* may need to reset tstamp_enabled */
 3023		if (is_cgroup_event(event))
 3024			perf_cgroup_mark_enabled(event, ctx);
 3025
 3026		if (group_can_go_on(event, cpuctx, 1))
 3027			group_sched_in(event, cpuctx, ctx);
 3028
 3029		/*
 3030		 * If this pinned group hasn't been scheduled,
 3031		 * put it in error state.
 3032		 */
 3033		if (event->state == PERF_EVENT_STATE_INACTIVE) {
 3034			update_group_times(event);
 3035			event->state = PERF_EVENT_STATE_ERROR;
 3036		}
 3037	}
 3038}
 3039
 3040static void
 3041ctx_flexible_sched_in(struct perf_event_context *ctx,
 3042		      struct perf_cpu_context *cpuctx)
 3043{
 3044	struct perf_event *event;
 3045	int can_add_hw = 1;
 3046
 3047	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
 3048		/* Ignore events in OFF or ERROR state */
 3049		if (event->state <= PERF_EVENT_STATE_OFF)
 3050			continue;
 3051		/*
 3052		 * Listen to the 'cpu' scheduling filter constraint
 3053		 * of events:
 3054		 */
 3055		if (!event_filter_match(event))
 3056			continue;
 3057
 3058		/* may need to reset tstamp_enabled */
 3059		if (is_cgroup_event(event))
 3060			perf_cgroup_mark_enabled(event, ctx);
 3061
 3062		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 3063			if (group_sched_in(event, cpuctx, ctx))
 3064				can_add_hw = 0;
 3065		}
 3066	}
 3067}
 3068
 3069static void
 3070ctx_sched_in(struct perf_event_context *ctx,
 3071	     struct perf_cpu_context *cpuctx,
 3072	     enum event_type_t event_type,
 3073	     struct task_struct *task)
 3074{
 3075	int is_active = ctx->is_active;
 3076	u64 now;
 3077
 3078	lockdep_assert_held(&ctx->lock);
 3079
 3080	if (likely(!ctx->nr_events))
 3081		return;
 3082
 3083	ctx->is_active |= (event_type | EVENT_TIME);
 3084	if (ctx->task) {
 3085		if (!is_active)
 3086			cpuctx->task_ctx = ctx;
 3087		else
 3088			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 3089	}
 3090
 3091	is_active ^= ctx->is_active; /* changed bits */
 3092
 3093	if (is_active & EVENT_TIME) {
 3094		/* start ctx time */
 3095		now = perf_clock();
 3096		ctx->timestamp = now;
 3097		perf_cgroup_set_timestamp(task, ctx);
 3098	}
 3099
 3100	/*
 3101	 * First go through the list and put on any pinned groups
 3102	 * in order to give them the best chance of going on.
 3103	 */
 3104	if (is_active & EVENT_PINNED)
 3105		ctx_pinned_sched_in(ctx, cpuctx);
 3106
 3107	/* Then walk through the lower prio flexible groups */
 3108	if (is_active & EVENT_FLEXIBLE)
 3109		ctx_flexible_sched_in(ctx, cpuctx);
 3110}
 3111
 3112static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 3113			     enum event_type_t event_type,
 3114			     struct task_struct *task)
 3115{
 3116	struct perf_event_context *ctx = &cpuctx->ctx;
 3117
 3118	ctx_sched_in(ctx, cpuctx, event_type, task);
 3119}
 3120
 3121static void perf_event_context_sched_in(struct perf_event_context *ctx,
 3122					struct task_struct *task)
 3123{
 3124	struct perf_cpu_context *cpuctx;
 3125
 3126	cpuctx = __get_cpu_context(ctx);
 3127	if (cpuctx->task_ctx == ctx)
 3128		return;
 3129
 3130	perf_ctx_lock(cpuctx, ctx);
 3131	perf_pmu_disable(ctx->pmu);
 3132	/*
 3133	 * We want to keep the following priority order:
 3134	 * cpu pinned (that don't need to move), task pinned,
 3135	 * cpu flexible, task flexible.
 3136	 */
 3137	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 3138	perf_event_sched_in(cpuctx, ctx, task);
 3139	perf_pmu_enable(ctx->pmu);
 3140	perf_ctx_unlock(cpuctx, ctx);
 3141}
 3142
 3143/*
 3144 * Called from scheduler to add the events of the current task
 3145 * with interrupts disabled.
 3146 *
 3147 * We restore the event value and then enable it.
 3148 *
 3149 * This does not protect us against NMI, but enable()
 3150 * sets the enabled bit in the control field of event _before_
 3151 * accessing the event control register. If an NMI hits, then it will
 3152 * keep the event running.
 3153 */
 3154void __perf_event_task_sched_in(struct task_struct *prev,
 3155				struct task_struct *task)
 3156{
 3157	struct perf_event_context *ctx;
 3158	int ctxn;
 3159
 3160	/*
 3161	 * If cgroup events exist on this CPU, then we need to check if we have
 3162	 * to switch in PMU state; cgroup event are system-wide mode only.
 3163	 *
 3164	 * Since cgroup events are CPU events, we must schedule these in before
 3165	 * we schedule in the task events.
 3166	 */
 3167	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
 3168		perf_cgroup_sched_in(prev, task);
 3169
 3170	for_each_task_context_nr(ctxn) {
 3171		ctx = task->perf_event_ctxp[ctxn];
 3172		if (likely(!ctx))
 3173			continue;
 3174
 3175		perf_event_context_sched_in(ctx, task);
 3176	}
 3177
 3178	if (atomic_read(&nr_switch_events))
 3179		perf_event_switch(task, prev, true);
 3180
 3181	if (__this_cpu_read(perf_sched_cb_usages))
 3182		perf_pmu_sched_task(prev, task, true);
 3183}
 3184
 3185static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 3186{
 3187	u64 frequency = event->attr.sample_freq;
 3188	u64 sec = NSEC_PER_SEC;
 3189	u64 divisor, dividend;
 3190
 3191	int count_fls, nsec_fls, frequency_fls, sec_fls;
 3192
 3193	count_fls = fls64(count);
 3194	nsec_fls = fls64(nsec);
 3195	frequency_fls = fls64(frequency);
 3196	sec_fls = 30;
 3197
 3198	/*
 3199	 * We got @count in @nsec, with a target of sample_freq HZ
 3200	 * the target period becomes:
 3201	 *
 3202	 *             @count * 10^9
 3203	 * period = -------------------
 3204	 *          @nsec * sample_freq
 3205	 *
 3206	 */
 3207
 3208	/*
 3209	 * Reduce accuracy by one bit such that @a and @b converge
 3210	 * to a similar magnitude.
 3211	 */
 3212#define REDUCE_FLS(a, b)		\
 3213do {					\
 3214	if (a##_fls > b##_fls) {	\
 3215		a >>= 1;		\
 3216		a##_fls--;		\
 3217	} else {			\
 3218		b >>= 1;		\
 3219		b##_fls--;		\
 3220	}				\
 3221} while (0)
 3222
 3223	/*
 3224	 * Reduce accuracy until either term fits in a u64, then proceed with
 3225	 * the other, so that finally we can do a u64/u64 division.
 3226	 */
 3227	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
 3228		REDUCE_FLS(nsec, frequency);
 3229		REDUCE_FLS(sec, count);
 3230	}
 3231
 3232	if (count_fls + sec_fls > 64) {
 3233		divisor = nsec * frequency;
 3234
 3235		while (count_fls + sec_fls > 64) {
 3236			REDUCE_FLS(count, sec);
 3237			divisor >>= 1;
 3238		}
 3239
 3240		dividend = count * sec;
 3241	} else {
 3242		dividend = count * sec;
 3243
 3244		while (nsec_fls + frequency_fls > 64) {
 3245			REDUCE_FLS(nsec, frequency);
 3246			dividend >>= 1;
 3247		}
 3248
 3249		divisor = nsec * frequency;
 3250	}
 3251
 3252	if (!divisor)
 3253		return dividend;
 3254
 3255	return div64_u64(dividend, divisor);
 3256}
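
/*
 * Worked example (illustration only; the numbers are made up): with
 * sample_freq = 1000 Hz and 2,000,000 events counted over a 4 ms window,
 *
 *	period = (count * 10^9) / (nsec * sample_freq)
 *	       = (2000000 * 10^9) / (4000000 * 1000) = 500000
 *
 * i.e. one sample roughly every 500,000 raw events hits the 1 kHz target.
 * The REDUCE_FLS() shifting above only matters when these intermediate
 * products would overflow 64 bits.
 */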
 3257
 3258static DEFINE_PER_CPU(int, perf_throttled_count);
 3259static DEFINE_PER_CPU(u64, perf_throttled_seq);
 3260
 3261static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
 3262{
 3263	struct hw_perf_event *hwc = &event->hw;
 3264	s64 period, sample_period;
 3265	s64 delta;
 3266
 3267	period = perf_calculate_period(event, nsec, count);
 3268
 3269	delta = (s64)(period - hwc->sample_period);
 3270	delta = (delta + 7) / 8; /* low pass filter */
 3271
 3272	sample_period = hwc->sample_period + delta;
 3273
 3274	if (!sample_period)
 3275		sample_period = 1;
 3276
 3277	hwc->sample_period = sample_period;
 3278
 3279	if (local64_read(&hwc->period_left) > 8*sample_period) {
 3280		if (disable)
 3281			event->pmu->stop(event, PERF_EF_UPDATE);
 3282
 3283		local64_set(&hwc->period_left, 0);
 3284
 3285		if (disable)
 3286			event->pmu->start(event, PERF_EF_RELOAD);
 3287	}
 3288}
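
/*
 * Worked example (illustration only, continuing the numbers above): if
 * hwc->sample_period was 400000 and the freshly computed period is 500000,
 *
 *	delta          = 500000 - 400000   = 100000
 *	delta / 8      = (100000 + 7) / 8  = 12500	(low pass filter)
 *	sample_period  = 400000 + 12500    = 412500
 *
 * so the period converges on the target over several ticks rather than
 * jumping there in one step.
 */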
 3289
 3290/*
 3291 * combine freq adjustment with unthrottling to avoid two passes over the
 3292 * events. At the same time, make sure, having freq events does not change
 3293 * the rate of unthrottling as that would introduce bias.
 3294 */
 3295static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 3296					   int needs_unthr)
 3297{
 3298	struct perf_event *event;
 3299	struct hw_perf_event *hwc;
 3300	u64 now, period = TICK_NSEC;
 3301	s64 delta;
 3302
 3303	/*
 3304	 * only need to iterate over all events iff:
 3305	 * - context have events in frequency mode (needs freq adjust)
 3306	 * - there are events to unthrottle on this cpu
 3307	 */
 3308	if (!(ctx->nr_freq || needs_unthr))
 3309		return;
 3310
 3311	raw_spin_lock(&ctx->lock);
 3312	perf_pmu_disable(ctx->pmu);
 3313
 3314	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 3315		if (event->state != PERF_EVENT_STATE_ACTIVE)
 3316			continue;
 3317
 3318		if (!event_filter_match(event))
 3319			continue;
 3320
 3321		perf_pmu_disable(event->pmu);
 3322
 3323		hwc = &event->hw;
 3324
 3325		if (hwc->interrupts == MAX_INTERRUPTS) {
 3326			hwc->interrupts = 0;
 3327			perf_log_throttle(event, 1);
 3328			event->pmu->start(event, 0);
 3329		}
 3330
 3331		if (!event->attr.freq || !event->attr.sample_freq)
 3332			goto next;
 3333
 3334		/*
 3335		 * stop the event and update event->count
 3336		 */
 3337		event->pmu->stop(event, PERF_EF_UPDATE);
 3338
 3339		now = local64_read(&event->count);
 3340		delta = now - hwc->freq_count_stamp;
 3341		hwc->freq_count_stamp = now;
 3342
 3343		/*
 3344		 * restart the event
 3345		 * reload only if value has changed
 3346		 * we have stopped the event so tell that
 3347		 * to perf_adjust_period() to avoid stopping it
 3348		 * twice.
 3349		 */
 3350		if (delta > 0)
 3351			perf_adjust_period(event, period, delta, false);
 3352
 3353		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
 3354	next:
 3355		perf_pmu_enable(event->pmu);
 3356	}
 3357
 3358	perf_pmu_enable(ctx->pmu);
 3359	raw_spin_unlock(&ctx->lock);
 3360}
 3361
 3362/*
 3363 * Round-robin a context's events:
 3364 */
 3365static void rotate_ctx(struct perf_event_context *ctx)
 3366{
 3367	/*
 3368	 * Rotate the first entry last of non-pinned groups. Rotation might be
 3369	 * disabled by the inheritance code.
 3370	 */
 3371	if (!ctx->rotate_disable)
 3372		list_rotate_left(&ctx->flexible_groups);
 3373}
 3374
 3375static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 3376{
 3377	struct perf_event_context *ctx = NULL;
 3378	int rotate = 0;
 3379
 3380	if (cpuctx->ctx.nr_events) {
 3381		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
 3382			rotate = 1;
 3383	}
 3384
 3385	ctx = cpuctx->task_ctx;
 3386	if (ctx && ctx->nr_events) {
 3387		if (ctx->nr_events != ctx->nr_active)
 3388			rotate = 1;
 3389	}
 3390
 3391	if (!rotate)
 3392		goto done;
 3393
 3394	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 3395	perf_pmu_disable(cpuctx->ctx.pmu);
 3396
 3397	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 3398	if (ctx)
 3399		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 3400
 3401	rotate_ctx(&cpuctx->ctx);
 3402	if (ctx)
 3403		rotate_ctx(ctx);
 3404
 3405	perf_event_sched_in(cpuctx, ctx, current);
 3406
 3407	perf_pmu_enable(cpuctx->ctx.pmu);
 3408	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 3409done:
 3410
 3411	return rotate;
 3412}
 3413
 3414void perf_event_task_tick(void)
 3415{
 3416	struct list_head *head = this_cpu_ptr(&active_ctx_list);
 3417	struct perf_event_context *ctx, *tmp;
 3418	int throttled;
 3419
 3420	WARN_ON(!irqs_disabled());
 3421
 3422	__this_cpu_inc(perf_throttled_seq);
 3423	throttled = __this_cpu_xchg(perf_throttled_count, 0);
 3424	tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
 3425
 3426	list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
 3427		perf_adjust_freq_unthr_context(ctx, throttled);
 3428}
 3429
 3430static int event_enable_on_exec(struct perf_event *event,
 3431				struct perf_event_context *ctx)
 3432{
 3433	if (!event->attr.enable_on_exec)
 3434		return 0;
 3435
 3436	event->attr.enable_on_exec = 0;
 3437	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 3438		return 0;
 3439
 3440	__perf_event_mark_enabled(event);
 3441
 3442	return 1;
 3443}
 3444
 3445/*
 3446 * Enable all of a task's events that have been marked enable-on-exec.
 3447 * This expects task == current.
 3448 */
 3449static void perf_event_enable_on_exec(int ctxn)
 3450{
 3451	struct perf_event_context *ctx, *clone_ctx = NULL;
 3452	struct perf_cpu_context *cpuctx;
 3453	struct perf_event *event;
 3454	unsigned long flags;
 3455	int enabled = 0;
 3456
 3457	local_irq_save(flags);
 3458	ctx = current->perf_event_ctxp[ctxn];
 3459	if (!ctx || !ctx->nr_events)
 3460		goto out;
 3461
 3462	cpuctx = __get_cpu_context(ctx);
 3463	perf_ctx_lock(cpuctx, ctx);
 3464	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 3465	list_for_each_entry(event, &ctx->event_list, event_entry)
 3466		enabled |= event_enable_on_exec(event, ctx);
 3467
 3468	/*
 3469	 * Unclone and reschedule this context if we enabled any event.
 3470	 */
 3471	if (enabled) {
 3472		clone_ctx = unclone_ctx(ctx);
 3473		ctx_resched(cpuctx, ctx);
 3474	}
 3475	perf_ctx_unlock(cpuctx, ctx);
 3476
 3477out:
 3478	local_irq_restore(flags);
 3479
 3480	if (clone_ctx)
 3481		put_ctx(clone_ctx);
 3482}
 3483
 3484struct perf_read_data {
 3485	struct perf_event *event;
 3486	bool group;
 3487	int ret;
 3488};
 3489
 3490static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
 3491{
 3492	u16 local_pkg, event_pkg;
 3493
 3494	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
 3495		int local_cpu = smp_processor_id();
 3496
 3497		event_pkg = topology_physical_package_id(event_cpu);
 3498		local_pkg = topology_physical_package_id(local_cpu);
 3499
 3500		if (event_pkg == local_pkg)
 3501			return local_cpu;
 3502	}
 3503
 3504	return event_cpu;
 3505}
 3506
 3507/*
 3508 * Cross CPU call to read the hardware event
 3509 */
 3510static void __perf_event_read(void *info)
 3511{
 3512	struct perf_read_data *data = info;
 3513	struct perf_event *sub, *event = data->event;
 3514	struct perf_event_context *ctx = event->ctx;
 3515	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 3516	struct pmu *pmu = event->pmu;
 3517
 3518	/*
 3519	 * If this is a task context, we need to check whether it is
 3520	 * the current task context of this cpu.  If not, it has been
 3521	 * scheduled out before the smp call arrived.  In that case
 3522	 * event->count would have been updated to a recent sample
 3523	 * when the event was scheduled out.
 3524	 */
 3525	if (ctx->task && cpuctx->task_ctx != ctx)
 3526		return;
 3527
 3528	raw_spin_lock(&ctx->lock);
 3529	if (ctx->is_active) {
 3530		update_context_time(ctx);
 3531		update_cgrp_time_from_event(event);
 3532	}
 3533
 3534	update_event_times(event);
 3535	if (event->state != PERF_EVENT_STATE_ACTIVE)
 3536		goto unlock;
 3537
 3538	if (!data->group) {
 3539		pmu->read(event);
 3540		data->ret = 0;
 3541		goto unlock;
 3542	}
 3543
 3544	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 3545
 3546	pmu->read(event);
 3547
 3548	list_for_each_entry(sub, &event->sibling_list, group_entry) {
 3549		update_event_times(sub);
 3550		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 3551			/*
 3552			 * Use sibling's PMU rather than @event's since
 3553			 * sibling could be on different (eg: software) PMU.
 3554			 */
 3555			sub->pmu->read(sub);
 3556		}
 3557	}
 3558
 3559	data->ret = pmu->commit_txn(pmu);
 3560
 3561unlock:
 3562	raw_spin_unlock(&ctx->lock);
 3563}
 3564
 3565static inline u64 perf_event_count(struct perf_event *event)
 3566{
 3567	if (event->pmu->count)
 3568		return event->pmu->count(event);
 3569
 3570	return __perf_event_count(event);
 3571}
 3572
 3573/*
 3574 * NMI-safe method to read a local event, that is an event that
 3575 * is:
 3576 *   - either for the current task, or for this CPU
 3577 *   - does not have inherit set, for inherited task events
 3578 *     will not be local and we cannot read them atomically
 3579 *   - must not have a pmu::count method
 3580 */
 3581u64 perf_event_read_local(struct perf_event *event)
 3582{
 3583	unsigned long flags;
 3584	u64 val;
 3585
 3586	/*
 3587	 * Disabling interrupts avoids all counter scheduling (context
 3588	 * switches, timer based rotation and IPIs).
 3589	 */
 3590	local_irq_save(flags);
 3591
 3592	/* If this is a per-task event, it must be for current */
 3593	WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
 3594		     event->hw.target != current);
 3595
 3596	/* If this is a per-CPU event, it must be for this CPU */
 3597	WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
 3598		     event->cpu != smp_processor_id());
 3599
 3600	/*
 3601	 * It must not be an event with inherit set, we cannot read
 3602	 * all child counters from atomic context.
 3603	 */
 3604	WARN_ON_ONCE(event->attr.inherit);
 3605
 3606	/*
 3607	 * It must not have a pmu::count method, those are not
 3608	 * NMI safe.
 3609	 */
 3610	WARN_ON_ONCE(event->pmu->count);
 3611
 3612	/*
 3613	 * If the event is currently on this CPU, its either a per-task event,
 3614	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 3615	 * oncpu == -1).
 3616	 */
 3617	if (event->oncpu == smp_processor_id())
 3618		event->pmu->read(event);
 3619
 3620	val = local64_read(&event->count);
 3621	local_irq_restore(flags);
 3622
 3623	return val;
 3624}
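
/*
 * Illustrative sketch, not part of the original file: perf_event_read_local()
 * is meant for NMI/IRQ-safe callers (the BPF perf-event-read helper is one
 * such user) that read a counter bound to the current task or this CPU
 * without sending IPIs. The wrapper name is made up.
 */
static u64 example_read_self(struct perf_event *event)
{
	/*
	 * The caller guarantees what the WARN_ON_ONCE()s above check:
	 * the event is local, has no inherit and no pmu->count method.
	 */
	return perf_event_read_local(event);
}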
 3625
 3626static int perf_event_read(struct perf_event *event, bool group)
 3627{
 3628	int event_cpu, ret = 0;
 3629
 3630	/*
 3631	 * If event is enabled and currently active on a CPU, update the
 3632	 * value in the event structure:
 3633	 */
 3634	if (event->state == PERF_EVENT_STATE_ACTIVE) {
 3635		struct perf_read_data data = {
 3636			.event = event,
 3637			.group = group,
 3638			.ret = 0,
 3639		};
 3640
 3641		event_cpu = READ_ONCE(event->oncpu);
 3642		if ((unsigned)event_cpu >= nr_cpu_ids)
 3643			return 0;
 3644
 3645		preempt_disable();
 3646		event_cpu = __perf_event_read_cpu(event, event_cpu);
 3647
 3648		/*
 3649		 * Purposely ignore the smp_call_function_single() return
 3650		 * value.
 3651		 *
 3652		 * If event_cpu isn't a valid CPU it means the event got
 3653		 * scheduled out and that will have updated the event count.
 3654		 *
 3655		 * Therefore, either way, we'll have an up-to-date event count
 3656		 * after this.
 3657		 */
 3658		(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
 3659		preempt_enable();
 3660		ret = data.ret;
 3661	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 3662		struct perf_event_context *ctx = event->ctx;
 3663		unsigned long flags;
 3664
 3665		raw_spin_lock_irqsave(&ctx->lock, flags);
 3666		/*
 3667		 * may read while context is not active
 3668		 * (e.g., thread is blocked), in that case
 3669		 * we cannot update context time
 3670		 */
 3671		if (ctx->is_active) {
 3672			update_context_time(ctx);
 3673			update_cgrp_time_from_event(event);
 3674		}
 3675		if (group)
 3676			update_group_times(event);
 3677		else
 3678			update_event_times(event);
 3679		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 3680	}
 3681
 3682	return ret;
 3683}
 3684
 3685/*
 3686 * Initialize the perf_event context in a task_struct:
 3687 */
 3688static void __perf_event_init_context(struct perf_event_context *ctx)
 3689{
 3690	raw_spin_lock_init(&ctx->lock);
 3691	mutex_init(&ctx->mutex);
 3692	INIT_LIST_HEAD(&ctx->active_ctx_list);
 3693	INIT_LIST_HEAD(&ctx->pinned_groups);
 3694	INIT_LIST_HEAD(&ctx->flexible_groups);
 3695	INIT_LIST_HEAD(&ctx->event_list);
 3696	atomic_set(&ctx->refcount, 1);
 3697}
 3698
 3699static struct perf_event_context *
 3700alloc_perf_context(struct pmu *pmu, struct task_struct *task)
 3701{
 3702	struct perf_event_context *ctx;
 3703
 3704	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
 3705	if (!ctx)
 3706		return NULL;
 3707
 3708	__perf_event_init_context(ctx);
 3709	if (task) {
 3710		ctx->task = task;
 3711		get_task_struct(task);
 3712	}
 3713	ctx->pmu = pmu;
 3714
 3715	return ctx;
 3716}
 3717
 3718static struct task_struct *
 3719find_lively_task_by_vpid(pid_t vpid)
 3720{
 3721	struct task_struct *task;
 3722
 3723	rcu_read_lock();
 3724	if (!vpid)
 3725		task = current;
 3726	else
 3727		task = find_task_by_vpid(vpid);
 3728	if (task)
 3729		get_task_struct(task);
 3730	rcu_read_unlock();
 3731
 3732	if (!task)
 3733		return ERR_PTR(-ESRCH);
 3734
 3735	return task;
 3736}
 3737
 3738/*
 3739 * Returns a matching context with refcount and pincount.
 3740 */
 3741static struct perf_event_context *
 3742find_get_context(struct pmu *pmu, struct task_struct *task,
 3743		struct perf_event *event)
 3744{
 3745	struct perf_event_context *ctx, *clone_ctx = NULL;
 3746	struct perf_cpu_context *cpuctx;
 3747	void *task_ctx_data = NULL;
 3748	unsigned long flags;
 3749	int ctxn, err;
 3750	int cpu = event->cpu;
 3751
 3752	if (!task) {
 3753		/* Must be root to operate on a CPU event: */
 3754		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 3755			return ERR_PTR(-EACCES);
 3756
 3757		/*
  3758		 * We could be clever and allow attaching an event to an
 3759		 * offline CPU and activate it when the CPU comes up, but
 3760		 * that's for later.
 3761		 */
 3762		if (!cpu_online(cpu))
 3763			return ERR_PTR(-ENODEV);
 3764
 3765		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 3766		ctx = &cpuctx->ctx;
 3767		get_ctx(ctx);
 3768		++ctx->pin_count;
 3769
 3770		return ctx;
 3771	}
 3772
 3773	err = -EINVAL;
 3774	ctxn = pmu->task_ctx_nr;
 3775	if (ctxn < 0)
 3776		goto errout;
 3777
 3778	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
 3779		task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
 3780		if (!task_ctx_data) {
 3781			err = -ENOMEM;
 3782			goto errout;
 3783		}
 3784	}
 3785
 3786retry:
 3787	ctx = perf_lock_task_context(task, ctxn, &flags);
 3788	if (ctx) {
 3789		clone_ctx = unclone_ctx(ctx);
 3790		++ctx->pin_count;
 3791
 3792		if (task_ctx_data && !ctx->task_ctx_data) {
 3793			ctx->task_ctx_data = task_ctx_data;
 3794			task_ctx_data = NULL;
 3795		}
 3796		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 3797
 3798		if (clone_ctx)
 3799			put_ctx(clone_ctx);
 3800	} else {
 3801		ctx = alloc_perf_context(pmu, task);
 3802		err = -ENOMEM;
 3803		if (!ctx)
 3804			goto errout;
 3805
 3806		if (task_ctx_data) {
 3807			ctx->task_ctx_data = task_ctx_data;
 3808			task_ctx_data = NULL;
 3809		}
 3810
 3811		err = 0;
 3812		mutex_lock(&task->perf_event_mutex);
 3813		/*
  3814		 * If it has already passed perf_event_exit_task(),
  3815		 * we must see PF_EXITING; it takes this mutex too.
 3816		 */
 3817		if (task->flags & PF_EXITING)
 3818			err = -ESRCH;
 3819		else if (task->perf_event_ctxp[ctxn])
 3820			err = -EAGAIN;
 3821		else {
 3822			get_ctx(ctx);
 3823			++ctx->pin_count;
 3824			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
 3825		}
 3826		mutex_unlock(&task->perf_event_mutex);
 3827
 3828		if (unlikely(err)) {
 3829			put_ctx(ctx);
 3830
 3831			if (err == -EAGAIN)
 3832				goto retry;
 3833			goto errout;
 3834		}
 3835	}
 3836
 3837	kfree(task_ctx_data);
 3838	return ctx;
 3839
 3840errout:
 3841	kfree(task_ctx_data);
 3842	return ERR_PTR(err);
 3843}
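/*
 * The pin and the reference taken above are both owned by the caller: the
 * pin is meant to be dropped with perf_unpin_context() once the event has
 * been installed (or on an error path), and the reference with put_ctx()
 * when the context is no longer needed. A rough sketch of the expected
 * calling pattern (error handling abbreviated):
 *
 *	ctx = find_get_context(pmu, task, event);
 *	if (IS_ERR(ctx))
 *		return PTR_ERR(ctx);
 *	...
 *	perf_unpin_context(ctx);
 */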
 3844
 3845static void perf_event_free_filter(struct perf_event *event);
 3846static void perf_event_free_bpf_prog(struct perf_event *event);
 3847
 3848static void free_event_rcu(struct rcu_head *head)
 3849{
 3850	struct perf_event *event;
 3851
 3852	event = container_of(head, struct perf_event, rcu_head);
 3853	if (event->ns)
 3854		put_pid_ns(event->ns);
 3855	perf_event_free_filter(event);
 3856	kfree(event);
 3857}
 3858
 3859static void ring_buffer_attach(struct perf_event *event,
 3860			       struct ring_buffer *rb);
 3861
 3862static void detach_sb_event(struct perf_event *event)
 3863{
 3864	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
 3865
 3866	raw_spin_lock(&pel->lock);
 3867	list_del_rcu(&event->sb_list);
 3868	raw_spin_unlock(&pel->lock);
 3869}
 3870
 3871static bool is_sb_event(struct perf_event *event)
 3872{
 3873	struct perf_event_attr *attr = &event->attr;
 3874
 3875	if (event->parent)
 3876		return false;
 3877
 3878	if (event->attach_state & PERF_ATTACH_TASK)
 3879		return false;
 3880
 3881	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 3882	    attr->comm || attr->comm_exec ||
 3883	    attr->task ||
 3884	    attr->context_switch)
 3885		return true;
 3886	return false;
 3887}
 3888
 3889static void unaccount_pmu_sb_event(struct perf_event *event)
 3890{
 3891	if (is_sb_event(event))
 3892		detach_sb_event(event);
 3893}
 3894
 3895static void unaccount_event_cpu(struct perf_event *event, int cpu)
 3896{
 3897	if (event->parent)
 3898		return;
 3899
 3900	if (is_cgroup_event(event))
 3901		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
 3902}
 3903
 3904#ifdef CONFIG_NO_HZ_FULL
 3905static DEFINE_SPINLOCK(nr_freq_lock);
 3906#endif
 3907
 3908static void unaccount_freq_event_nohz(void)
 3909{
 3910#ifdef CONFIG_NO_HZ_FULL
 3911	spin_lock(&nr_freq_lock);
 3912	if (atomic_dec_and_test(&nr_freq_events))
 3913		tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
 3914	spin_unlock(&nr_freq_lock);
 3915#endif
 3916}
 3917
 3918static void unaccount_freq_event(void)
 3919{
 3920	if (tick_nohz_full_enabled())
 3921		unaccount_freq_event_nohz();
 3922	else
 3923		atomic_dec(&nr_freq_events);
 3924}
 3925
 3926static void unaccount_event(struct perf_event *event)
 3927{
 3928	bool dec = false;
 3929
 3930	if (event->parent)
 3931		return;
 3932
 3933	if (event->attach_state & PERF_ATTACH_TASK)
 3934		dec = true;
 3935	if (event->attr.mmap || event->attr.mmap_data)
 3936		atomic_dec(&nr_mmap_events);
 3937	if (event->attr.comm)
 3938		atomic_dec(&nr_comm_events);
 3939	if (event->attr.task)
 3940		atomic_dec(&nr_task_events);
 3941	if (event->attr.freq)
 3942		unaccount_freq_event();
 3943	if (event->attr.context_switch) {
 3944		dec = true;
 3945		atomic_dec(&nr_switch_events);
 3946	}
 3947	if (is_cgroup_event(event))
 3948		dec = true;
 3949	if (has_branch_stack(event))
 3950		dec = true;
 3951
 3952	if (dec) {
 3953		if (!atomic_add_unless(&perf_sched_count, -1, 1))
 3954			schedule_delayed_work(&perf_sched_work, HZ);
 3955	}
 3956
 3957	unaccount_event_cpu(event, event->cpu);
 3958
 3959	unaccount_pmu_sb_event(event);
 3960}
 3961
 3962static void perf_sched_delayed(struct work_struct *work)
 3963{
 3964	mutex_lock(&perf_sched_mutex);
 3965	if (atomic_dec_and_test(&perf_sched_count))
 3966		static_branch_disable(&perf_sched_events);
 3967	mutex_unlock(&perf_sched_mutex);
 3968}
 3969
 3970/*
 3971 * The following implement mutual exclusion of events on "exclusive" pmus
 3972 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 3973 * at a time, so we disallow creating events that might conflict, namely:
 3974 *
 3975 *  1) cpu-wide events in the presence of per-task events,
 3976 *  2) per-task events in the presence of cpu-wide events,
 3977 *  3) two matching events on the same context.
 3978 *
 3979 * The former two cases are handled in the allocation path (perf_event_alloc(),
 3980 * _free_event()), the latter -- before the first perf_install_in_context().
 3981 */
 3982static int exclusive_event_init(struct perf_event *event)
 3983{
 3984	struct pmu *pmu = event->pmu;
 3985
 3986	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
 3987		return 0;
 3988
 3989	/*
 3990	 * Prevent co-existence of per-task and cpu-wide events on the
 3991	 * same exclusive pmu.
 3992	 *
 3993	 * Negative pmu::exclusive_cnt means there are cpu-wide
 3994	 * events on this "exclusive" pmu, positive means there are
 3995	 * per-task events.
 3996	 *
  3997	 * Since this is called in the perf_event_alloc() path, event::ctx
 3998	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
 3999	 * to mean "per-task event", because unlike other attach states it
 4000	 * never gets cleared.
 4001	 */
 4002	if (event->attach_state & PERF_ATTACH_TASK) {
 4003		if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
 4004			return -EBUSY;
 4005	} else {
 4006		if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
 4007			return -EBUSY;
 4008	}
 4009
 4010	return 0;
 4011}
 4012
 4013static void exclusive_event_destroy(struct perf_event *event)
 4014{
 4015	struct pmu *pmu = event->pmu;
 4016
 4017	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
 4018		return;
 4019
 4020	/* see comment in exclusive_event_init() */
 4021	if (event->attach_state & PERF_ATTACH_TASK)
 4022		atomic_dec(&pmu->exclusive_cnt);
 4023	else
 4024		atomic_inc(&pmu->exclusive_cnt);
 4025}
 4026
 4027static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
 4028{
 4029	if ((e1->pmu == e2->pmu) &&
 4030	    (e1->cpu == e2->cpu ||
 4031	     e1->cpu == -1 ||
 4032	     e2->cpu == -1))
 4033		return true;
 4034	return false;
 4035}
 4036
 4037/* Called under the same ctx::mutex as perf_install_in_context() */
 4038static bool exclusive_event_installable(struct perf_event *event,
 4039					struct perf_event_context *ctx)
 4040{
 4041	struct perf_event *iter_event;
 4042	struct pmu *pmu = event->pmu;
 4043
 4044	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
 4045		return true;
 4046
 4047	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
 4048		if (exclusive_event_match(iter_event, event))
 4049			return false;
 4050	}
 4051
 4052	return true;
 4053}
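/*
 * A minimal sketch of how the three helpers above are meant to be paired by
 * an event-creation path; the caller names here are assumptions, but the
 * ordering follows from the comment preceding exclusive_event_init():
 *
 *	if (exclusive_event_init(event))		// allocation time
 *		goto err_free;
 *	...
 *	mutex_lock(&ctx->mutex);
 *	if (!exclusive_event_installable(event, ctx))	// before first install
 *		goto err_locked;	// error path ends up in exclusive_event_destroy()
 *	perf_install_in_context(ctx, event, event->cpu);
 *	mutex_unlock(&ctx->mutex);
 */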
 4054
 4055static void perf_addr_filters_splice(struct perf_event *event,
 4056				       struct list_head *head);
 4057
 4058static void _free_event(struct perf_event *event)
 4059{
 4060	irq_work_sync(&event->pending);
 4061
 4062	unaccount_event(event);
 4063
 4064	if (event->rb) {
 4065		/*
 4066		 * Can happen when we close an event with re-directed output.
 4067		 *
 4068		 * Since we have a 0 refcount, perf_mmap_close() will skip
  4069		 * over us, possibly making our ring_buffer_put() the last.
 4070		 */
 4071		mutex_lock(&event->mmap_mutex);
 4072		ring_buffer_attach(event, NULL);
 4073		mutex_unlock(&event->mmap_mutex);
 4074	}
 4075
 4076	if (is_cgroup_event(event))
 4077		perf_detach_cgroup(event);
 4078
 4079	if (!event->parent) {
 4080		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
 4081			put_callchain_buffers();
 4082	}
 4083
 4084	perf_event_free_bpf_prog(event);
 4085	perf_addr_filters_splice(event, NULL);
 4086	kfree(event->addr_filters_offs);
 4087
 4088	if (event->destroy)
 4089		event->destroy(event);
 4090
 4091	if (event->ctx)
 4092		put_ctx(event->ctx);
 4093
 4094	exclusive_event_destroy(event);
 4095	module_put(event->pmu->module);
 4096
 4097	call_rcu(&event->rcu_head, free_event_rcu);
 4098}
 4099
 4100/*
 4101 * Used to free events which have a known refcount of 1, such as in error paths
  4102	 * where the event isn't exposed yet, or for inherited events.
 4103 */
 4104static void free_event(struct perf_event *event)
 4105{
 4106	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
 4107				"unexpected event refcount: %ld; ptr=%p\n",
 4108				atomic_long_read(&event->refcount), event)) {
 4109		/* leak to avoid use-after-free */
 4110		return;
 4111	}
 4112
 4113	_free_event(event);
 4114}
 4115
 4116/*
 4117 * Remove user event from the owner task.
 4118 */
 4119static void perf_remove_from_owner(struct perf_event *event)
 4120{
 4121	struct task_struct *owner;
 4122
 4123	rcu_read_lock();
 4124	/*
 4125	 * Matches the smp_store_release() in perf_event_exit_task(). If we
  4126	 * observe !owner, it means the list deletion is complete and we can
  4127	 * indeed free this event; otherwise we need to serialize on
 4128	 * owner->perf_event_mutex.
 4129	 */
 4130	owner = lockless_dereference(event->owner);
 4131	if (owner) {
 4132		/*
 4133		 * Since delayed_put_task_struct() also drops the last
 4134		 * task reference we can safely take a new reference
 4135		 * while holding the rcu_read_lock().
 4136		 */
 4137		get_task_struct(owner);
 4138	}
 4139	rcu_read_unlock();
 4140
 4141	if (owner) {
 4142		/*
 4143		 * If we're here through perf_event_exit_task() we're already
 4144		 * holding ctx->mutex which would be an inversion wrt. the
 4145		 * normal lock order.
 4146		 *
  4147		 * However, we can safely take this lock because it's the child
 4148		 * ctx->mutex.
 4149		 */
 4150		mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
 4151
 4152		/*
  4153		 * We have to re-check the event->owner field: if it is cleared,
  4154		 * we raced with perf_event_exit_task(); acquiring the mutex
 4155		 * ensured they're done, and we can proceed with freeing the
 4156		 * event.
 4157		 */
 4158		if (event->owner) {
 4159			list_del_init(&event->owner_entry);
 4160			smp_store_release(&event->owner, NULL);
 4161		}
 4162		mutex_unlock(&owner->perf_event_mutex);
 4163		put_task_struct(owner);
 4164	}
 4165}
 4166
 4167static void put_event(struct perf_event *event)
 4168{
 4169	if (!atomic_long_dec_and_test(&event->refcount))
 4170		return;
 4171
 4172	_free_event(event);
 4173}
 4174
 4175/*
  4176	 * Kill an event dead; while event::refcount will preserve the event
 4177 * object, it will not preserve its functionality. Once the last 'user'
 4178 * gives up the object, we'll destroy the thing.
 4179 */
 4180int perf_event_release_kernel(struct perf_event *event)
 4181{
 4182	struct perf_event_context *ctx = event->ctx;
 4183	struct perf_event *child, *tmp;
 4184
 4185	/*
 4186	 * If we got here through err_file: fput(event_file); we will not have
 4187	 * attached to a context yet.
 4188	 */
 4189	if (!ctx) {
 4190		WARN_ON_ONCE(event->attach_state &
 4191				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
 4192		goto no_ctx;
 4193	}
 4194
 4195	if (!is_kernel_event(event))
 4196		perf_remove_from_owner(event);
 4197
 4198	ctx = perf_event_ctx_lock(event);
 4199	WARN_ON_ONCE(ctx->parent_ctx);
 4200	perf_remove_from_context(event, DETACH_GROUP);
 4201
 4202	raw_spin_lock_irq(&ctx->lock);
 4203	/*
  4204	 * Mark this event as STATE_DEAD; there is no external reference to it
 4205	 * anymore.
 4206	 *
 4207	 * Anybody acquiring event->child_mutex after the below loop _must_
 4208	 * also see this, most importantly inherit_event() which will avoid
 4209	 * placing more children on the list.
 4210	 *
 4211	 * Thus this guarantees that we will in fact observe and kill _ALL_
 4212	 * child events.
 4213	 */
 4214	event->state = PERF_EVENT_STATE_DEAD;
 4215	raw_spin_unlock_irq(&ctx->lock);
 4216
 4217	perf_event_ctx_unlock(event, ctx);
 4218
 4219again:
 4220	mutex_lock(&event->child_mutex);
 4221	list_for_each_entry(child, &event->child_list, child_list) {
 4222
 4223		/*
 4224		 * Cannot change, child events are not migrated, see the
 4225		 * comment with perf_event_ctx_lock_nested().
 4226		 */
 4227		ctx = lockless_dereference(child->ctx);
 4228		/*
 4229		 * Since child_mutex nests inside ctx::mutex, we must jump
 4230		 * through hoops. We start by grabbing a reference on the ctx.
 4231		 *
 4232		 * Since the event cannot get freed while we hold the
 4233		 * child_mutex, the context must also exist and have a !0
 4234		 * reference count.
 4235		 */
 4236		get_ctx(ctx);
 4237
 4238		/*
 4239		 * Now that we have a ctx ref, we can drop child_mutex, and
 4240		 * acquire ctx::mutex without fear of it going away. Then we
 4241		 * can re-acquire child_mutex.
 4242		 */
 4243		mutex_unlock(&event->child_mutex);
 4244		mutex_lock(&ctx->mutex);
 4245		mutex_lock(&event->child_mutex);
 4246
 4247		/*
 4248		 * Now that we hold ctx::mutex and child_mutex, revalidate our
  4249		 * state: if child is still the first entry, it didn't get freed
  4250		 * and we can continue.
 4251		 */
 4252		tmp = list_first_entry_or_null(&event->child_list,
 4253					       struct perf_event, child_list);
 4254		if (tmp == child) {
 4255			perf_remove_from_context(child, DETACH_GROUP);
 4256			list_del(&child->child_list);
 4257			free_event(child);
 4258			/*
 4259			 * This matches the refcount bump in inherit_event();
 4260			 * this can't be the last reference.
 4261			 */
 4262			put_event(event);
 4263		}
 4264
 4265		mutex_unlock(&event->child_mutex);
 4266		mutex_unlock(&ctx->mutex);
 4267		put_ctx(ctx);
 4268		goto again;
 4269	}
 4270	mutex_unlock(&event->child_mutex);
 4271
 4272no_ctx:
 4273	put_event(event); /* Must be the 'last' reference */
 4274	return 0;
 4275}
 4276EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 4277
 4278/*
 4279 * Called when the last reference to the file is gone.
 4280 */
 4281static int perf_release(struct inode *inode, struct file *file)
 4282{
 4283	perf_event_release_kernel(file->private_data);
 4284	return 0;
 4285}
 4286
 4287u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 4288{
 4289	struct perf_event *child;
 4290	u64 total = 0;
 4291
 4292	*enabled = 0;
 4293	*running = 0;
 4294
 4295	mutex_lock(&event->child_mutex);
 4296
 4297	(void)perf_event_read(event, false);
 4298	total += perf_event_count(event);
 4299
 4300	*enabled += event->total_time_enabled +
 4301			atomic64_read(&event->child_total_time_enabled);
 4302	*running += event->total_time_running +
 4303			atomic64_read(&event->child_total_time_running);
 4304
 4305	list_for_each_entry(child, &event->child_list, child_list) {
 4306		(void)perf_event_read(child, false);
 4307		total += perf_event_count(child);
 4308		*enabled += child->total_time_enabled;
 4309		*running += child->total_time_running;
 4310	}
 4311	mutex_unlock(&event->child_mutex);
 4312
 4313	return total;
 4314}
 4315EXPORT_SYMBOL_GPL(perf_event_read_value);
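/*
 * Sketch of a kernel-side user of the export above: a module that set up a
 * counter with perf_event_create_kernel_counter() can read the summed value
 * like this. The helper below is purely illustrative, hence the
 * __maybe_unused annotation.
 */
static u64 __maybe_unused perf_example_read_total(struct perf_event *event)
{
	u64 enabled, running;

	/* Sums the event itself plus all of its inherited children. */
	return perf_event_read_value(event, &enabled, &running);
}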
 4316
 4317static int __perf_read_group_add(struct perf_event *leader,
 4318					u64 read_format, u64 *values)
 4319{
 4320	struct perf_event *sub;
 4321	int n = 1; /* skip @nr */
 4322	int ret;
 4323
 4324	ret = perf_event_read(leader, true);
 4325	if (ret)
 4326		return ret;
 4327
 4328	/*
 4329	 * Since we co-schedule groups, {enabled,running} times of siblings
 4330	 * will be identical to those of the leader, so we only publish one
 4331	 * set.
 4332	 */
 4333	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 4334		values[n++] += leader->total_time_enabled +
 4335			atomic64_read(&leader->child_total_time_enabled);
 4336	}
 4337
 4338	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
 4339		values[n++] += leader->total_time_running +
 4340			atomic64_read(&leader->child_total_time_running);
 4341	}
 4342
 4343	/*
 4344	 * Write {count,id} tuples for every sibling.
 4345	 */
 4346	values[n++] += perf_event_count(leader);
 4347	if (read_format & PERF_FORMAT_ID)
 4348		values[n++] = primary_event_id(leader);
 4349
 4350	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 4351		values[n++] += perf_event_count(sub);
 4352		if (read_format & PERF_FORMAT_ID)
 4353			values[n++] = primary_event_id(sub);
 4354	}
 4355
 4356	return 0;
 4357}
 4358
 4359static int perf_read_group(struct perf_event *event,
 4360				   u64 read_format, char __user *buf)
 4361{
 4362	struct perf_event *leader = event->group_leader, *child;
 4363	struct perf_event_context *ctx = leader->ctx;
 4364	int ret;
 4365	u64 *values;
 4366
 4367	lockdep_assert_held(&ctx->mutex);
 4368
 4369	values = kzalloc(event->read_size, GFP_KERNEL);
 4370	if (!values)
 4371		return -ENOMEM;
 4372
 4373	values[0] = 1 + leader->nr_siblings;
 4374
 4375	/*
  4376	 * By locking the child_mutex of the leader, we effectively
  4377	 * lock the child list of all siblings. XXX explain how.
 4378	 */
 4379	mutex_lock(&leader->child_mutex);
 4380
 4381	ret = __perf_read_group_add(leader, read_format, values);
 4382	if (ret)
 4383		goto unlock;
 4384
 4385	list_for_each_entry(child, &leader->child_list, child_list) {
 4386		ret = __perf_read_group_add(child, read_format, values);
 4387		if (ret)
 4388			goto unlock;
 4389	}
 4390
 4391	mutex_unlock(&leader->child_mutex);
 4392
 4393	ret = event->read_size;
 4394	if (copy_to_user(buf, values, event->read_size))
 4395		ret = -EFAULT;
 4396	goto out;
 4397
 4398unlock:
 4399	mutex_unlock(&leader->child_mutex);
 4400out:
 4401	kfree(values);
 4402	return ret;
 4403}
 4404
 4405static int perf_read_one(struct perf_event *event,
 4406				 u64 read_format, char __user *buf)
 4407{
 4408	u64 enabled, running;
 4409	u64 values[4];
 4410	int n = 0;
 4411
 4412	values[n++] = perf_event_read_value(event, &enabled, &running);
 4413	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 4414		values[n++] = enabled;
 4415	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 4416		values[n++] = running;
 4417	if (read_format & PERF_FORMAT_ID)
 4418		values[n++] = primary_event_id(event);
 4419
 4420	if (copy_to_user(buf, values, n * sizeof(u64)))
 4421		return -EFAULT;
 4422
 4423	return n * sizeof(u64);
 4424}
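/*
 * For reference, the layout written above is what user-space sees on read(2)
 * of a non-group event when all three optional read_format bits are set
 * (a sketch; rf and perf_fd are hypothetical caller-side names):
 *
 *	struct {
 *		__u64 value;
 *		__u64 time_enabled;	// PERF_FORMAT_TOTAL_TIME_ENABLED
 *		__u64 time_running;	// PERF_FORMAT_TOTAL_TIME_RUNNING
 *		__u64 id;		// PERF_FORMAT_ID
 *	} rf;
 *
 *	read(perf_fd, &rf, sizeof(rf));
 */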
 4425
 4426static bool is_event_hup(struct perf_event *event)
 4427{
 4428	bool no_children;
 4429
 4430	if (event->state > PERF_EVENT_STATE_EXIT)
 4431		return false;
 4432
 4433	mutex_lock(&event->child_mutex);
 4434	no_children = list_empty(&event->child_list);
 4435	mutex_unlock(&event->child_mutex);
 4436	return no_children;
 4437}
 4438
 4439/*
  4440	 * Read the performance event - simple non-blocking version for now
 4441 */
 4442static ssize_t
 4443__perf_read(struct perf_event *event, char __user *buf, size_t count)
 4444{
 4445	u64 read_format = event->attr.read_format;
 4446	int ret;
 4447
 4448	/*
  4449	 * Return end-of-file for a read on an event that is in
  4450	 * error state (i.e. because it was pinned but it couldn't be
  4451	 * scheduled onto the CPU at some point).
 4452	 */
 4453	if (event->state == PERF_EVENT_STATE_ERROR)
 4454		return 0;
 4455
 4456	if (count < event->read_size)
 4457		return -ENOSPC;
 4458
 4459	WARN_ON_ONCE(event->ctx->parent_ctx);
 4460	if (read_format & PERF_FORMAT_GROUP)
 4461		ret = perf_read_group(event, read_format, buf);
 4462	else
 4463		ret = perf_read_one(event, read_format, buf);
 4464
 4465	return ret;
 4466}
 4467
 4468static ssize_t
 4469perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 4470{
 4471	struct perf_event *event = file->private_data;
 4472	struct perf_event_context *ctx;
 4473	int ret;
 4474
 4475	ctx = perf_event_ctx_lock(event);
 4476	ret = __perf_read(event, buf, count);
 4477	perf_event_ctx_unlock(event, ctx);
 4478
 4479	return ret;
 4480}
 4481
 4482static unsigned int perf_poll(struct file *file, poll_table *wait)
 4483{
 4484	struct perf_event *event = file->private_data;
 4485	struct ring_buffer *rb;
 4486	unsigned int events = POLLHUP;
 4487
 4488	poll_wait(file, &event->waitq, wait);
 4489
 4490	if (is_event_hup(event))
 4491		return events;
 4492
 4493	/*
 4494	 * Pin the event->rb by taking event->mmap_mutex; otherwise
 4495	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 4496	 */
 4497	mutex_lock(&event->mmap_mutex);
 4498	rb = event->rb;
 4499	if (rb)
 4500		events = atomic_xchg(&rb->poll, 0);
 4501	mutex_unlock(&event->mmap_mutex);
 4502	return events;
 4503}
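/*
 * From user-space this behaves like any pollable fd: a sampling loop waits
 * for the ring buffer to signal readiness and treats POLLHUP (is_event_hup()
 * above) as the monitored task being gone with no children left. A sketch,
 * where drain_ring_buffer() stands in for the caller's buffer handling:
 *
 *	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) >= 0 && !(pfd.revents & POLLHUP))
 *		drain_ring_buffer();
 */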
 4504
 4505static void _perf_event_reset(struct perf_event *event)
 4506{
 4507	(void)perf_event_read(event, false);
 4508	local64_set(&event->count, 0);
 4509	perf_event_update_userpage(event);
 4510}
 4511
 4512/*
 4513 * Holding the top-level event's child_mutex means that any
 4514 * descendant process that has inherited this event will block
 4515 * in perf_event_exit_event() if it goes to exit, thus satisfying the
 4516 * task existence requirements of perf_event_enable/disable.
 4517 */
 4518static void perf_event_for_each_child(struct perf_event *event,
 4519					void (*func)(struct perf_event *))
 4520{
 4521	struct perf_event *child;
 4522
 4523	WARN_ON_ONCE(event->ctx->parent_ctx);
 4524
 4525	mutex_lock(&event->child_mutex);
 4526	func(event);
 4527	list_for_each_entry(child, &event->child_list, child_list)
 4528		func(child);
 4529	mutex_unlock(&event->child_mutex);
 4530}
 4531
 4532static void perf_event_for_each(struct perf_event *event,
 4533				  void (*func)(struct perf_event *))
 4534{
 4535	struct perf_event_context *ctx = event->ctx;
 4536	struct perf_event *sibling;
 4537
 4538	lockdep_assert_held(&ctx->mutex);
 4539
 4540	event = event->group_leader;
 4541
 4542	perf_event_for_each_child(event, func);
 4543	list_for_each_entry(sibling, &event->sibling_list, group_entry)
 4544		perf_event_for_each_child(sibling, func);
 4545}
 4546
 4547static void __perf_event_period(struct perf_event *event,
 4548				struct perf_cpu_context *cpuctx,
 4549				struct perf_event_context *ctx,
 4550				void *info)
 4551{
 4552	u64 value = *((u64 *)info);
 4553	bool active;
 4554
 4555	if (event->attr.freq) {
 4556		event->attr.sample_freq = value;
 4557	} else {
 4558		event->attr.sample_period = value;
 4559		event->hw.sample_period = value;
 4560	}
 4561
 4562	active = (event->state == PERF_EVENT_STATE_ACTIVE);
 4563	if (active) {
 4564		perf_pmu_disable(ctx->pmu);
 4565		/*
 4566		 * We could be throttled; unthrottle now to avoid the tick
 4567		 * trying to unthrottle while we already re-started the event.
 4568		 */
 4569		if (event->hw.interrupts == MAX_INTERRUPTS) {
 4570			event->hw.interrupts = 0;
 4571			perf_log_throttle(event, 1);
 4572		}
 4573		event->pmu->stop(event, PERF_EF_UPDATE);
 4574	}
 4575
 4576	local64_set(&event->hw.period_left, 0);
 4577
 4578	if (active) {
 4579		event->pmu->start(event, PERF_EF_RELOAD);
 4580		perf_pmu_enable(ctx->pmu);
 4581	}
 4582}
 4583
 4584static int perf_event_period(struct perf_event *event, u64 __user *arg)
 4585{
 4586	u64 value;
 4587
 4588	if (!is_sampling_event(event))
 4589		return -EINVAL;
 4590
 4591	if (copy_from_user(&value, arg, sizeof(value)))
 4592		return -EFAULT;
 4593
 4594	if (!value)
 4595		return -EINVAL;
 4596
 4597	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
 4598		return -EINVAL;
 4599
 4600	event_function_call(event, __perf_event_period, &value);
 4601
 4602	return 0;
 4603}
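/*
 * User-space side of the above, as a rough sketch (perf_fd is assumed to be
 * a perf_event_open() fd for a sampling event):
 *
 *	__u64 period = 100000;
 *
 *	if (ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &period))
 *		err(1, "PERF_EVENT_IOC_PERIOD");
 */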
 4604
 4605static const struct file_operations perf_fops;
 4606
 4607static inline int perf_fget_light(int fd, struct fd *p)
 4608{
 4609	struct fd f = fdget(fd);
 4610	if (!f.file)
 4611		return -EBADF;
 4612
 4613	if (f.file->f_op != &perf_fops) {
 4614		fdput(f);
 4615		return -EBADF;
 4616	}
 4617	*p = f;
 4618	return 0;
 4619}
 4620
 4621static int perf_event_set_output(struct perf_event *event,
 4622				 struct perf_event *output_event);
 4623static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 4624static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 4625
 4626static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 4627{
 4628	void (*func)(struct perf_event *);
 4629	u32 flags = arg;
 4630
 4631	switch (cmd) {
 4632	case PERF_EVENT_IOC_ENABLE:
 4633		func = _perf_event_enable;
 4634		break;
 4635	case PERF_EVENT_IOC_DISABLE:
 4636		func = _perf_event_disable;
 4637		break;
 4638	case PERF_EVENT_IOC_RESET:
 4639		func = _perf_event_reset;
 4640		break;
 4641
 4642	case PERF_EVENT_IOC_REFRESH:
 4643		return _perf_event_refresh(event, arg);
 4644
 4645	case PERF_EVENT_IOC_PERIOD:
 4646		return perf_event_period(event, (u64 __user *)arg);
 4647
 4648	case PERF_EVENT_IOC_ID:
 4649	{
 4650		u64 id = primary_event_id(event);
 4651
 4652		if (copy_to_user((void __user *)arg, &id, sizeof(id)))
 4653			return -EFAULT;
 4654		return 0;
 4655	}
 4656
 4657	case PERF_EVENT_IOC_SET_OUTPUT:
 4658	{
 4659		int ret;
 4660		if (arg != -1) {
 4661			struct perf_event *output_event;
 4662			struct fd output;
 4663			ret = perf_fget_light(arg, &output);
 4664			if (ret)
 4665				return ret;
 4666			output_event = output.file->private_data;
 4667			ret = perf_event_set_output(event, output_event);
 4668			fdput(output);
 4669		} else {
 4670			ret = perf_event_set_output(event, NULL);
 4671		}
 4672		return ret;
 4673	}
 4674
 4675	case PERF_EVENT_IOC_SET_FILTER:
 4676		return perf_event_set_filter(event, (void __user *)arg);
 4677
 4678	case PERF_EVENT_IOC_SET_BPF:
 4679		return perf_event_set_bpf_prog(event, arg);
 4680
 4681	case PERF_EVENT_IOC_PAUSE_OUTPUT: {
 4682		struct ring_buffer *rb;
 4683
 4684		rcu_read_lock();
 4685		rb = rcu_dereference(event->rb);
 4686		if (!rb || !rb->nr_pages) {
 4687			rcu_read_unlock();
 4688			return -EINVAL;
 4689		}
 4690		rb_toggle_paused(rb, !!arg);
 4691		rcu_read_unlock();
 4692		return 0;
 4693	}
 4694	default:
 4695		return -ENOTTY;
 4696	}
 4697
 4698	if (flags & PERF_IOC_FLAG_GROUP)
 4699		perf_event_for_each(event, func);
 4700	else
 4701		perf_event_for_each_child(event, func);
 4702
 4703	return 0;
 4704}
 4705
 4706static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 4707{
 4708	struct perf_event *event = file->private_data;
 4709	struct perf_event_context *ctx;
 4710	long ret;
 4711
 4712	ctx = perf_event_ctx_lock(event);
 4713	ret = _perf_ioctl(event, cmd, arg);
 4714	perf_event_ctx_unlock(event, ctx);
 4715
 4716	return ret;
 4717}
 4718
 4719#ifdef CONFIG_COMPAT
 4720static long perf_compat_ioctl(struct file *file, unsigned int cmd,
 4721				unsigned long arg)
 4722{
 4723	switch (_IOC_NR(cmd)) {
 4724	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
 4725	case _IOC_NR(PERF_EVENT_IOC_ID):
  4726		/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
 4727		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
 4728			cmd &= ~IOCSIZE_MASK;
 4729			cmd |= sizeof(void *) << IOCSIZE_SHIFT;
 4730		}
 4731		break;
 4732	}
 4733	return perf_ioctl(file, cmd, arg);
 4734}
 4735#else
 4736# define perf_compat_ioctl NULL
 4737#endif
 4738
 4739int perf_event_task_enable(void)
 4740{
 4741	struct perf_event_context *ctx;
 4742	struct perf_event *event;
 4743
 4744	mutex_lock(&current->perf_event_mutex);
 4745	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
 4746		ctx = perf_event_ctx_lock(event);
 4747		perf_event_for_each_child(event, _perf_event_enable);
 4748		perf_event_ctx_unlock(event, ctx);
 4749	}
 4750	mutex_unlock(&current->perf_event_mutex);
 4751
 4752	return 0;
 4753}
 4754
 4755int perf_event_task_disable(void)
 4756{
 4757	struct perf_event_context *ctx;
 4758	struct perf_event *event;
 4759
 4760	mutex_lock(&current->perf_event_mutex);
 4761	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
 4762		ctx = perf_event_ctx_lock(event);
 4763		perf_event_for_each_child(event, _perf_event_disable);
 4764		perf_event_ctx_unlock(event, ctx);
 4765	}
 4766	mutex_unlock(&current->perf_event_mutex);
 4767
 4768	return 0;
 4769}
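/*
 * Both of the above are reached via prctl(): a task can switch every event
 * on its owner list off and back on around a region it does not want to
 * measure, roughly:
 *
 *	prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
 *	run_unmeasured_section();		// hypothetical
 *	prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
 */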
 4770
 4771static int perf_event_index(struct perf_event *event)
 4772{
 4773	if (event->hw.state & PERF_HES_STOPPED)
 4774		return 0;
 4775
 4776	if (event->state != PERF_EVENT_STATE_ACTIVE)
 4777		return 0;
 4778
 4779	return event->pmu->event_idx(event);
 4780}
 4781
 4782static void calc_timer_values(struct perf_event *event,
 4783				u64 *now,
 4784				u64 *enabled,
 4785				u64 *running)
 4786{
 4787	u64 ctx_time;
 4788
 4789	*now = perf_clock();
 4790	ctx_time = event->shadow_ctx_time + *now;
 4791	*enabled = ctx_time - event->tstamp_enabled;
 4792	*running = ctx_time - event->tstamp_running;
 4793}
 4794
 4795static void perf_event_init_userpage(struct perf_event *event)
 4796{
 4797	struct perf_event_mmap_page *userpg;
 4798	struct ring_buffer *rb;
 4799
 4800	rcu_read_lock();
 4801	rb = rcu_dereference(event->rb);
 4802	if (!rb)
 4803		goto unlock;
 4804
 4805	userpg = rb->user_page;
 4806
 4807	/* Allow new userspace to detect that bit 0 is deprecated */
 4808	userpg->cap_bit0_is_deprecated = 1;
 4809	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
 4810	userpg->data_offset = PAGE_SIZE;
 4811	userpg->data_size = perf_data_size(rb);
 4812
 4813unlock:
 4814	rcu_read_unlock();
 4815}
 4816
 4817void __weak arch_perf_update_userpage(
 4818	struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
 4819{
 4820}
 4821
 4822/*
  4823 * Callers need to ensure there can be no nesting of this function; otherwise
  4824 * the seqlock logic goes bad. We cannot serialize this because the arch
 4825 * code calls this from NMI context.
 4826 */
 4827void perf_event_update_userpage(struct perf_event *event)
 4828{
 4829	struct perf_event_mmap_page *userpg;
 4830	struct ring_buffer *rb;
 4831	u64 enabled, running, now;
 4832
 4833	rcu_read_lock();
 4834	rb = rcu_dereference(event->rb);
 4835	if (!rb)
 4836		goto unlock;
 4837
 4838	/*
 4839	 * compute total_time_enabled, total_time_running
 4840	 * based on snapshot values taken when the event
 4841	 * was last scheduled in.
 4842	 *
  4843	 * We cannot simply call update_context_time()
  4844	 * because of a locking issue, as we can be called in
  4845	 * NMI context.
 4846	 */
 4847	calc_timer_values(event, &now, &enabled, &running);
 4848
 4849	userpg = rb->user_page;
 4850	/*
 4851	 * Disable preemption so as to not let the corresponding user-space
 4852	 * spin too long if we get preempted.
 4853	 */
 4854	preempt_disable();
 4855	++userpg->lock;
 4856	barrier();
 4857	userpg->index = perf_event_index(event);
 4858	userpg->offset = perf_event_count(event);
 4859	if (userpg->index)
 4860		userpg->offset -= local64_read(&event->hw.prev_count);
 4861
 4862	userpg->time_enabled = enabled +
 4863			atomic64_read(&event->child_total_time_enabled);
 4864
 4865	userpg->time_running = running +
 4866			atomic64_read(&event->child_total_time_running);
 4867
 4868	arch_perf_update_userpage(event, userpg, now);
 4869
 4870	barrier();
 4871	++userpg->lock;
 4872	preempt_enable();
 4873unlock:
 4874	rcu_read_unlock();
 4875}
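/*
 * The ++userpg->lock / barrier() pairs above form a seqcount: a user-space
 * reader of the mmap'ed control page is expected to retry its reads, roughly
 * (sketch; pc points at the mapped struct perf_event_mmap_page):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx = pc->index;
 *		off = pc->offset;
 *		barrier();
 *	} while (pc->lock != seq);
 */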
 4876
 4877static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 4878{
 4879	struct perf_event *event = vma->vm_file->private_data;
 4880	struct ring_buffer *rb;
 4881	int ret = VM_FAULT_SIGBUS;
 4882
 4883	if (vmf->flags & FAULT_FLAG_MKWRITE) {
 4884		if (vmf->pgoff == 0)
 4885			ret = 0;
 4886		return ret;
 4887	}
 4888
 4889	rcu_read_lock();
 4890	rb = rcu_dereference(event->rb);
 4891	if (!rb)
 4892		goto unlock;
 4893
 4894	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
 4895		goto unlock;
 4896
 4897	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
 4898	if (!vmf->page)
 4899		goto unlock;
 4900
 4901	get_page(vmf->page);
 4902	vmf->page->mapping = vma->vm_file->f_mapping;
 4903	vmf->page->index   = vmf->pgoff;
 4904
 4905	ret = 0;
 4906unlock:
 4907	rcu_read_unlock();
 4908
 4909	return ret;
 4910}
 4911
 4912static void ring_buffer_attach(struct perf_event *event,
 4913			       struct ring_buffer *rb)
 4914{
 4915	struct ring_buffer *old_rb = NULL;
 4916	unsigned long flags;
 4917
 4918	if (event->rb) {
 4919		/*
  4920		 * Should be impossible; we set this when removing
 4921		 * event->rb_entry and wait/clear when adding event->rb_entry.
 4922		 */
 4923		WARN_ON_ONCE(event->rcu_pending);
 4924
 4925		old_rb = event->rb;
 4926		spin_lock_irqsave(&old_rb->event_lock, flags);
 4927		list_del_rcu(&event->rb_entry);
 4928		spin_unlock_irqrestore(&old_rb->event_lock, flags);
 4929
 4930		event->rcu_batches = get_state_synchronize_rcu();
 4931		event->rcu_pending = 1;
 4932	}
 4933
 4934	if (rb) {
 4935		if (event->rcu_pending) {
 4936			cond_synchronize_rcu(event->rcu_batches);
 4937			event->rcu_pending = 0;
 4938		}
 4939
 4940		spin_lock_irqsave(&rb->event_lock, flags);
 4941		list_add_rcu(&event->rb_entry, &rb->event_list);
 4942		spin_unlock_irqrestore(&rb->event_lock, flags);
 4943	}
 4944
 4945	/*
 4946	 * Avoid racing with perf_mmap_close(AUX): stop the event
 4947	 * before swizzling the event::rb pointer; if it's getting
 4948	 * unmapped, its aux_mmap_count will be 0 and it won't
 4949	 * restart. See the comment in __perf_pmu_output_stop().
 4950	 *
 4951	 * Data will inevitably be lost when set_output is done in
 4952	 * mid-air, but then again, whoever does it like this is
 4953	 * not in for the data anyway.
 4954	 */
 4955	if (has_aux(event))
 4956		perf_event_stop(event, 0);
 4957
 4958	rcu_assign_pointer(event->rb, rb);
 4959
 4960	if (old_rb) {
 4961		ring_buffer_put(old_rb);
 4962		/*
 4963		 * Since we detached before setting the new rb, so that we
 4964		 * could attach the new rb, we could have missed a wakeup.
 4965		 * Provide it now.
 4966		 */
 4967		wake_up_all(&event->waitq);
 4968	}
 4969}
 4970
 4971static void ring_buffer_wakeup(struct perf_event *event)
 4972{
 4973	struct ring_buffer *rb;
 4974
 4975	rcu_read_lock();
 4976	rb = rcu_dereference(event->rb);
 4977	if (rb) {
 4978		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
 4979			wake_up_all(&event->waitq);
 4980	}
 4981	rcu_read_unlock();
 4982}
 4983
 4984struct ring_buffer *ring_buffer_get(struct perf_event *event)
 4985{
 4986	struct ring_buffer *rb;
 4987
 4988	rcu_read_lock();
 4989	rb = rcu_dereference(event->rb);
 4990	if (rb) {
 4991		if (!atomic_inc_not_zero(&rb->refcount))
 4992			rb = NULL;
 4993	}
 4994	rcu_read_unlock();
 4995
 4996	return rb;
 4997}
 4998
 4999void ring_buffer_put(struct ring_buffer *rb)
 5000{
 5001	if (!atomic_dec_and_test(&rb->refcount))
 5002		return;
 5003
 5004	WARN_ON_ONCE(!list_empty(&rb->event_list));
 5005
 5006	call_rcu(&rb->rcu_head, rb_free_rcu);
 5007}
 5008
 5009static void perf_mmap_open(struct vm_area_struct *vma)
 5010{
 5011	struct perf_event *event = vma->vm_file->private_data;
 5012
 5013	atomic_inc(&event->mmap_count);
 5014	atomic_inc(&event->rb->mmap_count);
 5015
 5016	if (vma->vm_pgoff)
 5017		atomic_inc(&event->rb->aux_mmap_count);
 5018
 5019	if (event->pmu->event_mapped)
 5020		event->pmu->event_mapped(event);
 5021}
 5022
 5023static void perf_pmu_output_stop(struct perf_event *event);
 5024
 5025/*
  5026 * A buffer can be mmap()ed multiple times, either directly through the same
 5027 * event, or through other events by use of perf_event_set_output().
 5028 *
 5029 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 5030 * the buffer here, where we still have a VM context. This means we need
 5031 * to detach all events redirecting to us.
 5032 */
 5033static void perf_mmap_close(struct vm_area_struct *vma)
 5034{
 5035	struct perf_event *event = vma->vm_file->private_data;
 5036
 5037	struct ring_buffer *rb = ring_buffer_get(event);
 5038	struct user_struct *mmap_user = rb->mmap_user;
 5039	int mmap_locked = rb->mmap_locked;
 5040	unsigned long size = perf_data_size(rb);
 5041
 5042	if (event->pmu->event_unmapped)
 5043		event->pmu->event_unmapped(event);
 5044
 5045	/*
 5046	 * rb->aux_mmap_count will always drop before rb->mmap_count and
 5047	 * event->mmap_count, so it is ok to use event->mmap_mutex to
 5048	 * serialize with perf_mmap here.
 5049	 */
 5050	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
 5051	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
 5052		/*
 5053		 * Stop all AUX events that are writing to this buffer,
 5054		 * so that we can free its AUX pages and corresponding PMU
 5055		 * data. Note that after rb::aux_mmap_count dropped to zero,
 5056		 * they won't start any more (see perf_aux_output_begin()).
 5057		 */
 5058		perf_pmu_output_stop(event);
 5059
 5060		/* now it's safe to free the pages */
 5061		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
 5062		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
 5063
 5064		/* this has to be the last one */
 5065		rb_free_aux(rb);
 5066		WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
 5067
 5068		mutex_unlock(&event->mmap_mutex);
 5069	}
 5070
 5071	atomic_dec(&rb->mmap_count);
 5072
 5073	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
 5074		goto out_put;
 5075
 5076	ring_buffer_attach(event, NULL);
 5077	mutex_unlock(&event->mmap_mutex);
 5078
  5079	/* If there are still other mmap()s of this buffer, we're done. */
 5080	if (atomic_read(&rb->mmap_count))
 5081		goto out_put;
 5082
 5083	/*
  5084	 * No other mmap()s; detach from all other events that might redirect
 5085	 * into the now unreachable buffer. Somewhat complicated by the
 5086	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
 5087	 */
 5088again:
 5089	rcu_read_lock();
 5090	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
 5091		if (!atomic_long_inc_not_zero(&event->refcount)) {
 5092			/*
 5093			 * This event is en-route to free_event() which will
 5094			 * detach it and remove it from the list.
 5095			 */
 5096			continue;
 5097		}
 5098		rcu_read_unlock();
 5099
 5100		mutex_lock(&event->mmap_mutex);
 5101		/*
 5102		 * Check we didn't race with perf_event_set_output() which can
 5103		 * swizzle the rb from under us while we were waiting to
 5104		 * acquire mmap_mutex.
 5105		 *
  5106		 * If we find a different rb, ignore this event; the next
  5107		 * iteration will no longer find it on the list. We still have
  5108		 * to restart the iteration to make sure we're not now
 5109		 * iterating the wrong list.
 5110		 */
 5111		if (event->rb == rb)
 5112			ring_buffer_attach(event, NULL);
 5113
 5114		mutex_unlock(&event->mmap_mutex);
 5115		put_event(event);
 5116
 5117		/*
  5118		 * Restart the iteration; either we're on the wrong list or we
 5119		 * destroyed its integrity by doing a deletion.
 5120		 */
 5121		goto again;
 5122	}
 5123	rcu_read_unlock();
 5124
 5125	/*
  5126	 * It could be that there are still a few 0-ref events on the list; they'll
  5127	 * get cleaned up by free_event() -- they'll also still have their
  5128	 * ref on the rb and will free it whenever they are done with it.
  5129	 *
  5130	 * Aside from that, this buffer is 'fully' detached and unmapped;
  5131	 * undo the VM accounting.
 5132	 */
 5133
 5134	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
 5135	vma->vm_mm->pinned_vm -= mmap_locked;
 5136	free_uid(mmap_user);
 5137
 5138out_put:
 5139	ring_buffer_put(rb); /* could be last */
 5140}
 5141
 5142static const struct vm_operations_struct perf_mmap_vmops = {
 5143	.open		= perf_mmap_open,
  5144	.close		= perf_mmap_close, /* non-mergeable */
 5145	.fault		= perf_mmap_fault,
 5146	.page_mkwrite	= perf_mmap_fault,
 5147};
 5148
 5149static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 5150{
 5151	struct perf_event *event = file->private_data;
 5152	unsigned long user_locked, user_lock_limit;
 5153	struct user_struct *user = current_user();
 5154	unsigned long locked, lock_limit;
 5155	struct ring_buffer *rb = NULL;
 5156	unsigned long vma_size;
 5157	unsigned long nr_pages;
 5158	long user_extra = 0, extra = 0;
 5159	int ret = 0, flags = 0;
 5160
 5161	/*
 5162	 * Don't allow mmap() of inherited per-task counters. This would
 5163	 * create a performance issue due to all children writing to the
 5164	 * same rb.
 5165	 */
 5166	if (event->cpu == -1 && event->attr.inherit)
 5167		return -EINVAL;
 5168
 5169	if (!(vma->vm_flags & VM_SHARED))
 5170		return -EINVAL;
 5171
 5172	vma_size = vma->vm_end - vma->vm_start;
 5173
 5174	if (vma->vm_pgoff == 0) {
 5175		nr_pages = (vma_size / PAGE_SIZE) - 1;
 5176	} else {
 5177		/*
 5178		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
  5179		 * mapped; all subsequent mappings should have the same size
 5180		 * and offset. Must be above the normal perf buffer.
 5181		 */
 5182		u64 aux_offset, aux_size;
 5183
 5184		if (!event->rb)
 5185			return -EINVAL;
 5186
 5187		nr_pages = vma_size / PAGE_SIZE;
 5188
 5189		mutex_lock(&event->mmap_mutex);
 5190		ret = -EINVAL;
 5191
 5192		rb = event->rb;
 5193		if (!rb)
 5194			goto aux_unlock;
 5195
 5196		aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
 5197		aux_size = ACCESS_ONCE(rb->user_page->aux_size);
 5198
 5199		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
 5200			goto aux_unlock;
 5201
 5202		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
 5203			goto aux_unlock;
 5204
 5205		/* already mapped with a different offset */
 5206		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
 5207			goto aux_unlock;
 5208
 5209		if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
 5210			goto aux_unlock;
 5211
 5212		/* already mapped with a different size */
 5213		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
 5214			goto aux_unlock;
 5215
 5216		if (!is_power_of_2(nr_pages))
 5217			goto aux_unlock;
 5218
 5219		if (!atomic_inc_not_zero(&rb->mmap_count))
 5220			goto aux_unlock;
 5221
 5222		if (rb_has_aux(rb)) {
 5223			atomic_inc(&rb->aux_mmap_count);
 5224			ret = 0;
 5225			goto unlock;
 5226		}
 5227
 5228		atomic_set(&rb->aux_mmap_count, 1);
 5229		user_extra = nr_pages;
 5230
 5231		goto accounting;
 5232	}
 5233
 5234	/*
 5235	 * If we have rb pages ensure they're a power-of-two number, so we
 5236	 * can do bitmasks instead of modulo.
 5237	 */
 5238	if (nr_pages != 0 && !is_power_of_2(nr_pages))
 5239		return -EINVAL;
 5240
 5241	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 5242		return -EINVAL;
 5243
 5244	WARN_ON_ONCE(event->ctx->parent_ctx);
 5245again:
 5246	mutex_lock(&event->mmap_mutex);
 5247	if (event->rb) {
 5248		if (event->rb->nr_pages != nr_pages) {
 5249			ret = -EINVAL;
 5250			goto unlock;
 5251		}
 5252
 5253		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
 5254			/*
 5255			 * Raced against perf_mmap_close() through
 5256			 * perf_event_set_output(). Try again, hope for better
 5257			 * luck.
 5258			 */
 5259			mutex_unlock(&event->mmap_mutex);
 5260			goto again;
 5261		}
 5262
 5263		goto unlock;
 5264	}
 5265
 5266	user_extra = nr_pages + 1;
 5267
 5268accounting:
 5269	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
 5270
 5271	/*
 5272	 * Increase the limit linearly with more CPUs:
 5273	 */
 5274	user_lock_limit *= num_online_cpus();
 5275
 5276	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 5277
 5278	if (user_locked > user_lock_limit)
 5279		extra = user_locked - user_lock_limit;
 5280
 5281	lock_limit = rlimit(RLIMIT_MEMLOCK);
 5282	lock_limit >>= PAGE_SHIFT;
 5283	locked = vma->vm_mm->pinned_vm + extra;
 5284
 5285	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
 5286		!capable(CAP_IPC_LOCK)) {
 5287		ret = -EPERM;
 5288		goto unlock;
 5289	}
 5290
 5291	WARN_ON(!rb && event->rb);
 5292
 5293	if (vma->vm_flags & VM_WRITE)
 5294		flags |= RING_BUFFER_WRITABLE;
 5295
 5296	if (!rb) {
 5297		rb = rb_alloc(nr_pages,
 5298			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
 5299			      event->cpu, flags);
 5300
 5301		if (!rb) {
 5302			ret = -ENOMEM;
 5303			goto unlock;
 5304		}
 5305
 5306		atomic_set(&rb->mmap_count, 1);
 5307		rb->mmap_user = get_current_user();
 5308		rb->mmap_locked = extra;
 5309
 5310		ring_buffer_attach(event, rb);
 5311
 5312		perf_event_init_userpage(event);
 5313		perf_event_update_userpage(event);
 5314	} else {
 5315		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
 5316				   event->attr.aux_watermark, flags);
 5317		if (!ret)
 5318			rb->aux_mmap_locked = extra;
 5319	}
 5320
 5321unlock:
 5322	if (!ret) {
 5323		atomic_long_add(user_extra, &user->locked_vm);
 5324		vma->vm_mm->pinned_vm += extra;
 5325
 5326		atomic_inc(&event->mmap_count);
 5327	} else if (rb) {
 5328		atomic_dec(&rb->mmap_count);
 5329	}
 5330aux_unlock:
 5331	mutex_unlock(&event->mmap_mutex);
 5332
 5333	/*
  5334	 * Since pinned accounting is per-vm, we cannot allow fork() to copy our
 5335	 * vma.
 5336	 */
 5337	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 5338	vma->vm_ops = &perf_mmap_vmops;
 5339
 5340	if (event->pmu->event_mapped)
 5341		event->pmu->event_mapped(event);
 5342
 5343	return ret;
 5344}
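/*
 * The vm_pgoff == 0 rules above translate to a user-space mapping of one
 * metadata page plus a power-of-two number of data pages, e.g. (sketch):
 *
 *	size_t len = (1 + 8) * page_size;		// 8 data pages
 *	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, perf_fd, 0);	// MAP_SHARED is required
 */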
 5345
 5346static int perf_fasync(int fd, struct file *filp, int on)
 5347{
 5348	struct inode *inode = file_inode(filp);
 5349	struct perf_event *event = filp->private_data;
 5350	int retval;
 5351
 5352	inode_lock(inode);
 5353	retval = fasync_helper(fd, filp, on, &event->fasync);
 5354	inode_unlock(inode);
 5355
 5356	if (retval < 0)
 5357		return retval;
 5358
 5359	return 0;
 5360}
 5361
 5362static const struct file_operations perf_fops = {
 5363	.llseek			= no_llseek,
 5364	.release		= perf_release,
 5365	.read			= perf_read,
 5366	.poll			= perf_poll,
 5367	.unlocked_ioctl		= perf_ioctl,
 5368	.compat_ioctl		= perf_compat_ioctl,
 5369	.mmap			= perf_mmap,
 5370	.fasync			= perf_fasync,
 5371};
 5372
 5373/*
 5374 * Perf event wakeup
 5375 *
 5376 * If there's data, ensure we set the poll() state and publish everything
 5377 * to user-space before waking everybody up.
 5378 */
 5379
 5380static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
 5381{
 5382	/* only the parent has fasync state */
 5383	if (event->parent)
 5384		event = event->parent;
 5385	return &event->fasync;
 5386}
 5387
 5388void perf_event_wakeup(struct perf_event *event)
 5389{
 5390	ring_buffer_wakeup(event);
 5391
 5392	if (event->pending_kill) {
 5393		kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
 5394		event->pending_kill = 0;
 5395	}
 5396}
 5397
 5398static void perf_pending_event(struct irq_work *entry)
 5399{
 5400	struct perf_event *event = container_of(entry,
 5401			struct perf_event, pending);
 5402	int rctx;
 5403
 5404	rctx = perf_swevent_get_recursion_context();
 5405	/*
 5406	 * If we 'fail' here, that's OK, it means recursion is already disabled
 5407	 * and we won't recurse 'further'.
 5408	 */
 5409
 5410	if (event->pending_disable) {
 5411		event->pending_disable = 0;
 5412		perf_event_disable_local(event);
 5413	}
 5414
 5415	if (event->pending_wakeup) {
 5416		event->pending_wakeup = 0;
 5417		perf_event_wakeup(event);
 5418	}
 5419
 5420	if (rctx >= 0)
 5421		perf_swevent_put_recursion_context(rctx);
 5422}
 5423
 5424/*
  5425 * We assume that only KVM supports these callbacks.
  5426 * Later on, we might change this to a list if another
  5427 * virtualization implementation also needs the callbacks.
 5428 */
 5429struct perf_guest_info_callbacks *perf_guest_cbs;
 5430
 5431int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 5432{
 5433	perf_guest_cbs = cbs;
 5434	return 0;
 5435}
 5436EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
 5437
 5438int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 5439{
 5440	perf_guest_cbs = NULL;
 5441	return 0;
 5442}
 5443EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 5444
 5445static void
 5446perf_output_sample_regs(struct perf_output_handle *handle,
 5447			struct pt_regs *regs, u64 mask)
 5448{
 5449	int bit;
 5450	DECLARE_BITMAP(_mask, 64);
 5451
 5452	bitmap_from_u64(_mask, mask);
 5453	for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
 5454		u64 val;
 5455
 5456		val = perf_reg_value(regs, bit);
 5457		perf_output_put(handle, val);
 5458	}
 5459}
 5460
 5461static void perf_sample_regs_user(struct perf_regs *regs_user,
 5462				  struct pt_regs *regs,
 5463				  struct pt_regs *regs_user_copy)
 5464{
 5465	if (user_mode(regs)) {
 5466		regs_user->abi = perf_reg_abi(current);
 5467		regs_user->regs = regs;
 5468	} else if (current->mm) {
 5469		perf_get_regs_user(regs_user, regs, regs_user_copy);
 5470	} else {
 5471		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
 5472		regs_user->regs = NULL;
 5473	}
 5474}
 5475
 5476static void perf_sample_regs_intr(struct perf_regs *regs_intr,
 5477				  struct pt_regs *regs)
 5478{
 5479	regs_intr->regs = regs;
 5480	regs_intr->abi  = perf_reg_abi(current);
 5481}
 5482
 5483
 5484/*
 5485 * Get remaining task size from user stack pointer.
 5486 *
  5487 * It'd be better to take the stack VMA map and limit this more
  5488 * precisely, but there's no way to get it safely under interrupt,
  5489 * so we use TASK_SIZE as the limit.
 5490 */
 5491static u64 perf_ustack_task_size(struct pt_regs *regs)
 5492{
 5493	unsigned long addr = perf_user_stack_pointer(regs);
 5494
 5495	if (!addr || addr >= TASK_SIZE)
 5496		return 0;
 5497
 5498	return TASK_SIZE - addr;
 5499}
 5500
 5501static u16
 5502perf_sample_ustack_size(u16 stack_size, u16 header_size,
 5503			struct pt_regs *regs)
 5504{
 5505	u64 task_size;
 5506
 5507	/* No regs, no stack pointer, no dump. */
 5508	if (!regs)
 5509		return 0;
 5510
 5511	/*
  5512	 * Check whether the requested stack size fits into:
  5513	 * - TASK_SIZE
  5514	 *   If it doesn't, we limit the size to TASK_SIZE.
  5515	 *
  5516	 * - the remaining sample size
  5517	 *   If it doesn't, we shrink the stack size to
  5518	 *   fit into the remaining sample size.
 5519	 */
 5520
 5521	task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
 5522	stack_size = min(stack_size, (u16) task_size);
 5523
 5524	/* Current header size plus static size and dynamic size. */
 5525	header_size += 2 * sizeof(u64);
 5526
 5527	/* Do we fit in with the current stack dump size? */
 5528	if ((u16) (header_size + stack_size) < header_size) {
 5529		/*
 5530		 * If we overflow the maximum size for the sample,
 5531		 * we customize the stack dump size to fit in.
 5532		 */
 5533		stack_size = USHRT_MAX - header_size - sizeof(u64);
 5534		stack_size = round_up(stack_size, sizeof(u64));
 5535	}
 5536
 5537	return stack_size;
 5538}
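/*
 * Worked example for the clamping above: with 64 bytes of header already
 * accounted for (including the two u64 size fields) and a requested dump of
 * 60000 bytes, a task that has only 4096 bytes of stack left below TASK_SIZE
 * gets its dump clamped to 4096; the u16 wrap-around branch only kicks in
 * when header plus request would exceed USHRT_MAX.
 */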
 5539
 5540static void
 5541perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 5542			  struct pt_regs *regs)
 5543{
 5544	/* Case of a kernel thread, nothing to dump */
 5545	if (!regs) {
 5546		u64 size = 0;
 5547		perf_output_put(handle, size);
 5548	} else {
 5549		unsigned long sp;
 5550		unsigned int rem;
 5551		u64 dyn_size;
 5552
 5553		/*
 5554		 * We dump:
 5555		 * static size
  5556		 *   - the size requested by the user or the best one we can fit
  5557		 *     into the sample max size
 5558		 * data
 5559		 *   - user stack dump data
 5560		 * dynamic size
 5561		 *   - the actual dumped size
 5562		 */
 5563
 5564		/* Static size. */
 5565		perf_output_put(handle, dump_size);
 5566
 5567		/* Data. */
 5568		sp = perf_user_stack_pointer(regs);
 5569		rem = __output_copy_user(handle, (void *) sp, dump_size);
 5570		dyn_size = dump_size - rem;
 5571
 5572		perf_output_skip(handle, rem);
 5573
 5574		/* Dynamic size. */
 5575		perf_output_put(handle, dyn_size);
 5576	}
 5577}
 5578
 5579static void __perf_event_header__init_id(struct perf_event_header *header,
 5580					 struct perf_sample_data *data,
 5581					 struct perf_event *event)
 5582{
 5583	u64 sample_type = event->attr.sample_type;
 5584
 5585	data->type = sample_type;
 5586	header->size += event->id_header_size;
 5587
 5588	if (sample_type & PERF_SAMPLE_TID) {
 5589		/* namespace issues */
 5590		data->tid_entry.pid = perf_event_pid(event, current);
 5591		data->tid_entry.tid = perf_event_tid(event, current);
 5592	}
 5593
 5594	if (sample_type & PERF_SAMPLE_TIME)
 5595		data->time = perf_event_clock(event);
 5596
 5597	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
 5598		data->id = primary_event_id(event);
 5599
 5600	if (sample_type & PERF_SAMPLE_STREAM_ID)
 5601		data->stream_id = event->id;
 5602
 5603	if (sample_type & PERF_SAMPLE_CPU) {
 5604		data->cpu_entry.cpu	 = raw_smp_processor_id();
 5605		data->cpu_entry.reserved = 0;
 5606	}
 5607}
 5608
 5609void perf_event_header__init_id(struct perf_event_header *header,
 5610				struct perf_sample_data *data,
 5611				struct perf_event *event)
 5612{
 5613	if (event->attr.sample_id_all)
 5614		__perf_event_header__init_id(header, data, event);
 5615}
 5616
 5617static void __perf_event__output_id_sample(struct perf_output_handle *handle,
 5618					   struct perf_sample_data *data)
 5619{
 5620	u64 sample_type = data->type;
 5621
 5622	if (sample_type & PERF_SAMPLE_TID)
 5623		perf_output_put(handle, data->tid_entry);
 5624
 5625	if (sample_type & PERF_SAMPLE_TIME)
 5626		perf_output_put(handle, data->time);
 5627
 5628	if (sample_type & PERF_SAMPLE_ID)
 5629		perf_output_put(handle, data->id);
 5630
 5631	if (sample_type & PERF_SAMPLE_STREAM_ID)
 5632		perf_output_put(handle, data->stream_id);
 5633
 5634	if (sample_type & PERF_SAMPLE_CPU)
 5635		perf_output_put(handle, data->cpu_entry);
 5636
 5637	if (sample_type & PERF_SAMPLE_IDENTIFIER)
 5638		perf_output_put(handle, data->id);
 5639}
 5640
 5641void perf_event__output_id_sample(struct perf_event *event,
 5642				  struct perf_output_handle *handle,
 5643				  struct perf_sample_data *sample)
 5644{
 5645	if (event->attr.sample_id_all)
 5646		__perf_event__output_id_sample(handle, sample);
 5647}
 5648
 5649static void perf_output_read_one(struct perf_output_handle *handle,
 5650				 struct perf_event *event,
 5651				 u64 enabled, u64 running)
 5652{
 5653	u64 read_format = event->attr.read_format;
 5654	u64 values[4];
 5655	int n = 0;
 5656
 5657	values[n++] = perf_event_count(event);
 5658	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 5659		values[n++] = enabled +
 5660			atomic64_read(&event->child_total_time_enabled);
 5661	}
 5662	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
 5663		values[n++] = running +
 5664			atomic64_read(&event->child_total_time_running);
 5665	}
 5666	if (read_format & PERF_FORMAT_ID)
 5667		values[n++] = primary_event_id(event);
 5668
 5669	__output_copy(handle, values, n * sizeof(u64));
 5670}
 5671
 5672/*
 5673 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
 5674 */
 5675static void perf_output_read_group(struct perf_output_handle *handle,
 5676			    struct perf_event *event,
 5677			    u64 enabled, u64 running)
 5678{
 5679	struct perf_event *leader = event->group_leader, *sub;
 5680	u64 read_format = event->attr.read_format;
 5681	u64 values[5];
 5682	int n = 0;
 5683
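	/* First value of a group read: the number of events, leader plus siblings. */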
 5684	values[n++] = 1 + leader->nr_siblings;
 5685
 5686	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 5687		values[n++] = enabled;
 5688
 5689	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 5690		values[n++] = running;
 5691
 5692	if (leader != event)
 5693		leader->pmu->read(leader);
 5694
 5695	values[n++] = perf_event_count(leader);
 5696	if (read_format & PERF_FORMAT_ID)
 5697		values[n++] = primary_event_id(leader);
 5698
 5699	__output_copy(handle, values, n * sizeof(u64));
 5700
 5701	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 5702		n = 0;
 5703
 5704		if ((sub != event) &&
 5705		    (sub->state == PERF_EVENT_STATE_ACTIVE))
 5706			sub->pmu->read(sub);
 5707
 5708		values[n++] = perf_event_count(sub);
 5709		if (read_format & PERF_FORMAT_ID)
 5710			values[n++] = primary_event_id(sub);
 5711
 5712		__output_copy(handle, values, n * sizeof(u64));
 5713	}
 5714}
 5715
 5716#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
 5717				 PERF_FORMAT_TOTAL_TIME_RUNNING)
 5718
 5719static void perf_output_read(struct perf_output_handle *handle,
 5720			     struct perf_event *event)
 5721{
 5722	u64 enabled = 0, running = 0, now;
 5723	u64 read_format = event->attr.read_format;
 5724
 5725	/*
 5726	 * compute total_time_enabled, total_time_running
 5727	 * based on snapshot values taken when the event
 5728	 * was last scheduled in.
 5729	 *
  5730	 * we cannot simply call update_context_time()
  5731	 * because of locking issues, as we are called in
  5732	 * NMI context
 5733	 */
 5734	if (read_format & PERF_FORMAT_TOTAL_TIMES)
 5735		calc_timer_values(event, &now, &enabled, &running);
 5736
 5737	if (event->attr.read_format & PERF_FORMAT_GROUP)
 5738		perf_output_read_group(handle, event, enabled, running);
 5739	else
 5740		perf_output_read_one(handle, event, enabled, running);
 5741}
 5742
 5743void perf_output_sample(struct perf_output_handle *handle,
 5744			struct perf_event_header *header,
 5745			struct perf_sample_data *data,
 5746			struct perf_event *event)
 5747{
 5748	u64 sample_type = data->type;
 5749
 5750	perf_output_put(handle, *header);
 5751
 5752	if (sample_type & PERF_SAMPLE_IDENTIFIER)
 5753		perf_output_put(handle, data->id);
 5754
 5755	if (sample_type & PERF_SAMPLE_IP)
 5756		perf_output_put(handle, data->ip);
 5757
 5758	if (sample_type & PERF_SAMPLE_TID)
 5759		perf_output_put(handle, data->tid_entry);
 5760
 5761	if (sample_type & PERF_SAMPLE_TIME)
 5762		perf_output_put(handle, data->time);
 5763
 5764	if (sample_type & PERF_SAMPLE_ADDR)
 5765		perf_output_put(handle, data->addr);
 5766
 5767	if (sample_type & PERF_SAMPLE_ID)
 5768		perf_output_put(handle, data->id);
 5769
 5770	if (sample_type & PERF_SAMPLE_STREAM_ID)
 5771		perf_output_put(handle, data->stream_id);
 5772
 5773	if (sample_type & PERF_SAMPLE_CPU)
 5774		perf_output_put(handle, data->cpu_entry);
 5775
 5776	if (sample_type & PERF_SAMPLE_PERIOD)
 5777		perf_output_put(handle, data->period);
 5778
 5779	if (sample_type & PERF_SAMPLE_READ)
 5780		perf_output_read(handle, event);
 5781
 5782	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 5783		if (data->callchain) {
 5784			int size = 1;
 5785
 5786			if (data->callchain)
 5787				size += data->callchain->nr;
 5788
 5789			size *= sizeof(u64);
 5790
 5791			__output_copy(handle, data->callchain, size);
 5792		} else {
 5793			u64 nr = 0;
 5794			perf_output_put(handle, nr);
 5795		}
 5796	}
 5797
 5798	if (sample_type & PERF_SAMPLE_RAW) {
 5799		struct perf_raw_record *raw = data->raw;
 5800
 5801		if (raw) {
 5802			struct perf_raw_frag *frag = &raw->frag;
 5803
 5804			perf_output_put(handle, raw->size);
 5805			do {
 5806				if (frag->copy) {
 5807					__output_custom(handle, frag->copy,
 5808							frag->data, frag->size);
 5809				} else {
 5810					__output_copy(handle, frag->data,
 5811						      frag->size);
 5812				}
 5813				if (perf_raw_frag_last(frag))
 5814					break;
 5815				frag = frag->next;
 5816			} while (1);
 5817			if (frag->pad)
 5818				__output_skip(handle, NULL, frag->pad);
 5819		} else {
 5820			struct {
 5821				u32	size;
 5822				u32	data;
 5823			} raw = {
 5824				.size = sizeof(u32),
 5825				.data = 0,
 5826			};
 5827			perf_output_put(handle, raw);
 5828		}
 5829	}
 5830
 5831	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
 5832		if (data->br_stack) {
 5833			size_t size;
 5834
 5835			size = data->br_stack->nr
 5836			     * sizeof(struct perf_branch_entry);
 5837
 5838			perf_output_put(handle, data->br_stack->nr);
 5839			perf_output_copy(handle, data->br_stack->entries, size);
 5840		} else {
 5841			/*
 5842			 * we always store at least the value of nr
 5843			 */
 5844			u64 nr = 0;
 5845			perf_output_put(handle, nr);
 5846		}
 5847	}
 5848
 5849	if (sample_type & PERF_SAMPLE_REGS_USER) {
 5850		u64 abi = data->regs_user.abi;
 5851
 5852		/*
 5853		 * If there are no regs to dump, notice it through
 5854		 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
 5855		 */
 5856		perf_output_put(handle, abi);
 5857
 5858		if (abi) {
 5859			u64 mask = event->attr.sample_regs_user;
 5860			perf_output_sample_regs(handle,
 5861						data->regs_user.regs,
 5862						mask);
 5863		}
 5864	}
 5865
 5866	if (sample_type & PERF_SAMPLE_STACK_USER) {
 5867		perf_output_sample_ustack(handle,
 5868					  data->stack_user_size,
 5869					  data->regs_user.regs);
 5870	}
 5871
 5872	if (sample_type & PERF_SAMPLE_WEIGHT)
 5873		perf_output_put(handle, data->weight);
 5874
 5875	if (sample_type & PERF_SAMPLE_DATA_SRC)
 5876		perf_output_put(handle, data->data_src.val);
 5877
 5878	if (sample_type & PERF_SAMPLE_TRANSACTION)
 5879		perf_output_put(handle, data->txn);
 5880
 5881	if (sample_type & PERF_SAMPLE_REGS_INTR) {
 5882		u64 abi = data->regs_intr.abi;
 5883		/*
 5884		 * If there are no regs to dump, notice it through
 5885		 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
 5886		 */
 5887		perf_output_put(handle, abi);
 5888
 5889		if (abi) {
 5890			u64 mask = event->attr.sample_regs_intr;
 5891
 5892			perf_output_sample_regs(handle,
 5893						data->regs_intr.regs,
 5894						mask);
 5895		}
 5896	}
 5897
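	/*
	 * With attr.wakeup_events (rather than a byte watermark) in use,
	 * charge this record to rb->events and, once the threshold is hit,
	 * bump rb->wakeup so the subsequent perf_output_end() wakes up
	 * waiters.
	 */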
 5898	if (!event->attr.watermark) {
 5899		int wakeup_events = event->attr.wakeup_events;
 5900
 5901		if (wakeup_events) {
 5902			struct ring_buffer *rb = handle->rb;
 5903			int events = local_inc_return(&rb->events);
 5904
 5905			if (events >= wakeup_events) {
 5906				local_sub(wakeup_events, &rb->events);
 5907				local_inc(&rb->wakeup);
 5908			}
 5909		}
 5910	}
 5911}
 5912
 5913void perf_prepare_sample(struct perf_event_header *header,
 5914			 struct perf_sample_data *data,
 5915			 struct perf_event *event,
 5916			 struct pt_regs *regs)
 5917{
 5918	u64 sample_type = event->attr.sample_type;
 5919
 5920	header->type = PERF_RECORD_SAMPLE;
 5921	header->size = sizeof(*header) + event->header_size;
 5922
 5923	header->misc = 0;
 5924	header->misc |= perf_misc_flags(regs);
 5925
 5926	__perf_event_header__init_id(header, data, event);
 5927
 5928	if (sample_type & PERF_SAMPLE_IP)
 5929		data->ip = perf_instruction_pointer(regs);
 5930
 5931	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 5932		int size = 1;
 5933
 5934		data->callchain = perf_callchain(event, regs);
 5935
 5936		if (data->callchain)
 5937			size += data->callchain->nr;
 5938
 5939		header->size += size * sizeof(u64);
 5940	}
 5941
 5942	if (sample_type & PERF_SAMPLE_RAW) {
 5943		struct perf_raw_record *raw = data->raw;
 5944		int size;
 5945
 5946		if (raw) {
 5947			struct perf_raw_frag *frag = &raw->frag;
 5948			u32 sum = 0;
 5949
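			/*
			 * Walk the fragment chain to get the total payload
			 * size, then round the record (payload plus the
			 * leading u32 size field) up to a u64 boundary; the
			 * leftover becomes trailing padding on the last
			 * fragment.
			 */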
 5950			do {
 5951				sum += frag->size;
 5952				if (perf_raw_frag_last(frag))
 5953					break;
 5954				frag = frag->next;
 5955			} while (1);
 5956
 5957			size = round_up(sum + sizeof(u32), sizeof(u64));
 5958			raw->size = size - sizeof(u32);
 5959			frag->pad = raw->size - sum;
 5960		} else {
 5961			size = sizeof(u64);
 5962		}
 5963
 5964		header->size += size;
 5965	}
 5966
 5967	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
 5968		int size = sizeof(u64); /* nr */
 5969		if (data->br_stack) {
 5970			size += data->br_stack->nr
 5971			      * sizeof(struct perf_branch_entry);
 5972		}
 5973		header->size += size;
 5974	}
 5975
 5976	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
 5977		perf_sample_regs_user(&data->regs_user, regs,
 5978				      &data->regs_user_copy);
 5979
 5980	if (sample_type & PERF_SAMPLE_REGS_USER) {
 5981		/* regs dump ABI info */
 5982		int size = sizeof(u64);
 5983
 5984		if (data->regs_user.regs) {
 5985			u64 mask = event->attr.sample_regs_user;
 5986			size += hweight64(mask) * sizeof(u64);
 5987		}
 5988
 5989		header->size += size;
 5990	}
 5991
 5992	if (sample_type & PERF_SAMPLE_STACK_USER) {
 5993		/*
  5994		 * Either the PERF_SAMPLE_STACK_USER bit needs to always be
  5995		 * processed last, or an additional check has to be added in
  5996		 * case a new sample type is introduced, because we could eat
  5997		 * up the rest of the sample size.
 5998		 */
 5999		u16 stack_size = event->attr.sample_stack_user;
 6000		u16 size = sizeof(u64);
 6001
 6002		stack_size = perf_sample_ustack_size(stack_size, header->size,
 6003						     data->regs_user.regs);
 6004
 6005		/*
 6006		 * If there is something to dump, add space for the dump
 6007		 * itself and for the field that tells the dynamic size,
 6008		 * which is how many have been actually dumped.
 6009		 */
 6010		if (stack_size)
 6011			size += sizeof(u64) + stack_size;
 6012
 6013		data->stack_user_size = stack_size;
 6014		header->size += size;
 6015	}
 6016
 6017	if (sample_type & PERF_SAMPLE_REGS_INTR) {
 6018		/* regs dump ABI info */
 6019		int size = sizeof(u64);
 6020
 6021		perf_sample_regs_intr(&data->regs_intr, regs);
 6022
 6023		if (data->regs_intr.regs) {
 6024			u64 mask = event->attr.sample_regs_intr;
 6025
 6026			size += hweight64(mask) * sizeof(u64);
 6027		}
 6028
 6029		header->size += size;
 6030	}
 6031}
 6032
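/*
 * Common sample-emission path: size the record, reserve space in the
 * ring buffer via @output_begin (forward, backward or the default
 * direction) and write the sample out. The RCU read section keeps the
 * callchain buffers alive while they are being copied.
 */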
  6033static __always_inline void
 6034__perf_event_output(struct perf_event *event,
 6035		    struct perf_sample_data *data,
 6036		    struct pt_regs *regs,
 6037		    int (*output_begin)(struct perf_output_handle *,
 6038					struct perf_event *,
 6039					unsigned int))
 6040{
 6041	struct perf_output_handle handle;
 6042	struct perf_event_header header;
 6043
 6044	/* protect the callchain buffers */
 6045	rcu_read_lock();
 6046
 6047	perf_prepare_sample(&header, data, event, regs);
 6048
 6049	if (output_begin(&handle, event, header.size))
 6050		goto exit;
 6051
 6052	perf_output_sample(&handle, &header, data, event);
 6053
 6054	perf_output_end(&handle);
 6055
 6056exit:
 6057	rcu_read_unlock();
 6058}
 6059
 6060void
 6061perf_event_output_forward(struct perf_event *event,
 6062			 struct perf_sample_data *data,
 6063			 struct pt_regs *regs)
 6064{
 6065	__perf_event_output(event, data, regs, perf_output_begin_forward);
 6066}
 6067
 6068void
 6069perf_event_output_backward(struct perf_event *event,
 6070			   struct perf_sample_data *data,
 6071			   struct pt_regs *regs)
 6072{
 6073	__perf_event_output(event, data, regs, perf_output_begin_backward);
 6074}
 6075
 6076void
 6077perf_event_output(struct perf_event *event,
 6078		  struct perf_sample_data *data,
 6079		  struct pt_regs *regs)
 6080{
 6081	__perf_event_output(event, data, regs, perf_output_begin);
 6082}
 6083
 6084/*
 6085 * read event_id
 6086 */
 6087
 6088struct perf_read_event {
 6089	struct perf_event_header	header;
 6090
 6091	u32				pid;
 6092	u32				tid;
 6093};
 6094
 6095static void
 6096perf_event_read_event(struct perf_event *event,
 6097			struct task_struct *task)
 6098{
 6099	struct perf_output_handle handle;
 6100	struct perf_sample_data sample;
 6101	struct perf_read_event read_event = {
 6102		.header = {
 6103			.type = PERF_RECORD_READ,
 6104			.misc = 0,
 6105			.size = sizeof(read_event) + event->read_size,
 6106		},
 6107		.pid = perf_event_pid(event, task),
 6108		.tid = perf_event_tid(event, task),
 6109	};
 6110	int ret;
 6111
 6112	perf_event_header__init_id(&read_event.header, &sample, event);
 6113	ret = perf_output_begin(&handle, event, read_event.header.size);
 6114	if (ret)
 6115		return;
 6116
 6117	perf_output_put(&handle, read_event);
 6118	perf_output_read(&handle, event);
 6119	perf_event__output_id_sample(event, &handle, &sample);
 6120
 6121	perf_output_end(&handle);
 6122}
 6123
 6124typedef void (perf_iterate_f)(struct perf_event *event, void *data);
 6125
 6126static void
 6127perf_iterate_ctx(struct perf_event_context *ctx,
 6128		   perf_iterate_f output,
 6129		   void *data, bool all)
 6130{
 6131	struct perf_event *event;
 6132
 6133	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 6134		if (!all) {
 6135			if (event->state < PERF_EVENT_STATE_INACTIVE)
 6136				continue;
 6137			if (!event_filter_match(event))
 6138				continue;
 6139		}
 6140
 6141		output(event, data);
 6142	}
 6143}
 6144
 6145static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 6146{
 6147	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
 6148	struct perf_event *event;
 6149
 6150	list_for_each_entry_rcu(event, &pel->list, sb_list) {
 6151		/*
 6152		 * Skip events that are not fully formed yet; ensure that
 6153		 * if we observe event->ctx, both event and ctx will be
 6154		 * complete enough. See perf_install_in_context().
 6155		 */
 6156		if (!smp_load_acquire(&event->ctx))
 6157			continue;
 6158
 6159		if (event->state < PERF_EVENT_STATE_INACTIVE)
 6160			continue;
 6161		if (!event_filter_match(event))
 6162			continue;
 6163		output(event, data);
 6164	}
 6165}
 6166
 6167/*
 6168 * Iterate all events that need to receive side-band events.
 6169 *
  6170 * For new callers: ensure that account_pmu_sb_event() includes
 6171 * your event, otherwise it might not get delivered.
 6172 */
 6173static void
 6174perf_iterate_sb(perf_iterate_f output, void *data,
 6175	       struct perf_event_context *task_ctx)
 6176{
 6177	struct perf_event_context *ctx;
 6178	int ctxn;
 6179
 6180	rcu_read_lock();
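	/* Stay on this CPU so this_cpu_ptr() in perf_iterate_sb_cpu() is stable. */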
 6181	preempt_disable();
 6182
 6183	/*
 6184	 * If we have task_ctx != NULL we only notify the task context itself.
 6185	 * The task_ctx is set only for EXIT events before releasing task
 6186	 * context.
 6187	 */
 6188	if (task_ctx) {
 6189		perf_iterate_ctx(task_ctx, output, data, false);
 6190		goto done;
 6191	}
 6192
 6193	perf_iterate_sb_cpu(output, data);
 6194
 6195	for_each_task_context_nr(ctxn) {
 6196		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 6197		if (ctx)
 6198			perf_iterate_ctx(ctx, output, data, false);
 6199	}
 6200done:
 6201	preempt_enable();
 6202	rcu_read_unlock();
 6203}
 6204
 6205/*
 6206 * Clear all file-based filters at exec, they'll have to be
 6207 * re-instated when/if these objects are mmapped again.
 6208 */
 6209static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
 6210{
 6211	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
 6212	struct perf_addr_filter *filter;
 6213	unsigned int restart = 0, count = 0;
 6214	unsigned long flags;
 6215
 6216	if (!has_addr_filter(event))
 6217		return;
 6218
 6219	raw_spin_lock_irqsave(&ifh->lock, flags);
 6220	list_for_each_entry(filter, &ifh->list, entry) {
 6221		if (filter->inode) {
 6222			event->addr_filters_offs[count] = 0;
 6223			restart++;
 6224		}
 6225
 6226		count++;
 6227	}
 6228
 6229	if (restart)
 6230		event->addr_filters_gen++;
 6231	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 6232
 6233	if (restart)
 6234		perf_event_stop(event, 1);
 6235}
 6236
 6237void perf_event_exec(void)
 6238{
 6239	struct perf_event_context *ctx;
 6240	int ctxn;
 6241
 6242	rcu_read_lock();
 6243	for_each_task_context_nr(ctxn) {
 6244		ctx = current->perf_event_ctxp[ctxn];
 6245		if (!ctx)
 6246			continue;
 6247
 6248		perf_event_enable_on_exec(ctxn);
 6249
 6250		perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
 6251				   true);
 6252	}
 6253	rcu_read_unlock();
 6254}
 6255
 6256struct remote_output {
 6257	struct ring_buffer	*rb;
 6258	int			err;
 6259};
 6260
 6261static void __perf_event_output_stop(struct perf_event *event, void *data)
 6262{
 6263	struct perf_event *parent = event->parent;
 6264	struct remote_output *ro = data;
 6265	struct ring_buffer *rb = ro->rb;
 6266	struct stop_event_data sd = {
 6267		.event	= event,
 6268	};
 6269
 6270	if (!has_aux(event))
 6271		return;
 6272
 6273	if (!parent)
 6274		parent = event;
 6275
 6276	/*
 6277	 * In case of inheritance, it will be the parent that links to the
 6278	 * ring-buffer, but it will be the child that's actually using it.
 6279	 *
 6280	 * We are using event::rb to determine if the event should be stopped,
 6281	 * however this may race with ring_buffer_attach() (through set_output),
 6282	 * which will make us skip the event that actually needs to be stopped.
 6283	 * So ring_buffer_attach() has to stop an aux event before re-assigning
 6284	 * its rb pointer.
 6285	 */
 6286	if (rcu_dereference(parent->rb) == rb)
 6287		ro->err = __perf_event_stop(&sd);
 6288}
 6289
 6290static int __perf_pmu_output_stop(void *info)
 6291{
 6292	struct perf_event *event = info;
 6293	struct pmu *pmu = event->pmu;
 6294	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 6295	struct remote_output ro = {
 6296		.rb	= event->rb,
 6297	};
 6298
 6299	rcu_read_lock();
 6300	perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
 6301	if (cpuctx->task_ctx)
 6302		perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
 6303				   &ro, false);
 6304	rcu_read_unlock();
 6305
 6306	return ro.err;
 6307}
 6308
 6309static void perf_pmu_output_stop(struct perf_event *event)
 6310{
 6311	struct perf_event *iter;
 6312	int err, cpu;
 6313
 6314restart:
 6315	rcu_read_lock();
 6316	list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
 6317		/*
 6318		 * For per-CPU events, we need to make sure that neither they
 6319		 * nor their children are running; for cpu==-1 events it's
 6320		 * sufficient to stop the event itself if it's active, since
 6321		 * it can't have children.
 6322		 */
 6323		cpu = iter->cpu;
 6324		if (cpu == -1)
 6325			cpu = READ_ONCE(iter->oncpu);
 6326
 6327		if (cpu == -1)
 6328			continue;
 6329
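		/*
		 * Stop any writer on that CPU; -EAGAIN from __perf_event_stop()
		 * means the event moved under us, so restart the whole walk.
		 */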
 6330		err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
 6331		if (err == -EAGAIN) {
 6332			rcu_read_unlock();
 6333			goto restart;
 6334		}
 6335	}
 6336	rcu_read_unlock();
 6337}
 6338
 6339/*
 6340 * task tracking -- fork/exit
 6341 *
 6342 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 6343 */
 6344
 6345struct perf_task_event {
 6346	struct task_struct		*task;
 6347	struct perf_event_context	*task_ctx;
 6348
 6349	struct {
 6350		struct perf_event_header	header;
 6351
 6352		u32				pid;
 6353		u32				ppid;
 6354		u32				tid;
 6355		u32				ptid;
 6356		u64				time;
 6357	} event_id;
 6358};
 6359
 6360static int perf_event_task_match(struct perf_event *event)
 6361{
 6362	return event->attr.comm  || event->attr.mmap ||
 6363	       event->attr.mmap2 || event->attr.mmap_data ||
 6364	       event->attr.task;
 6365}
 6366
 6367static void perf_event_task_output(struct perf_event *event,
 6368				   void *data)
 6369{
 6370	struct perf_task_event *task_event = data;
 6371	struct perf_output_handle handle;
 6372	struct perf_sample_data	sample;
 6373	struct task_struct *task = task_event->task;
 6374	int ret, size = task_event->event_id.header.size;
 6375
 6376	if (!perf_event_task_match(event))
 6377		return;
 6378
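	/*
	 * perf_event_header__init_id() grows header.size per event, so the
	 * original size saved above is restored at 'out' before the shared
	 * event_id is reused for the next event in the iteration.
	 */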
 6379	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 6380
 6381	ret = perf_output_begin(&handle, event,
 6382				task_event->event_id.header.size);
 6383	if (ret)
 6384		goto out;
 6385
 6386	task_event->event_id.pid = perf_event_pid(event, task);
 6387	task_event->event_id.ppid = perf_event_pid(event, current);
 6388
 6389	task_event->event_id.tid = perf_event_tid(event, task);
 6390	task_event->event_id.ptid = perf_event_tid(event, current);
 6391
 6392	task_event->event_id.time = perf_event_clock(event);
 6393
 6394	perf_output_put(&handle, task_event->event_id);
 6395
 6396	perf_event__output_id_sample(event, &handle, &sample);
 6397
 6398	perf_output_end(&handle);
 6399out:
 6400	task_event->event_id.header.size = size;
 6401}
 6402
 6403static void perf_event_task(struct task_struct *task,
 6404			      struct perf_event_context *task_ctx,
 6405			      int new)
 6406{
 6407	struct perf_task_event task_event;
 6408
 6409	if (!atomic_read(&nr_comm_events) &&
 6410	    !atomic_read(&nr_mmap_events) &&
 6411	    !atomic_read(&nr_task_events))
 6412		return;
 6413
 6414	task_event = (struct perf_task_event){
 6415		.task	  = task,
 6416		.task_ctx = task_ctx,
 6417		.event_id    = {
 6418			.header = {
 6419				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
 6420				.misc = 0,
 6421				.size = sizeof(task_event.event_id),
 6422			},
 6423			/* .pid  */
 6424			/* .ppid */
 6425			/* .tid  */
 6426			/* .ptid */
 6427			/* .time */
 6428		},
 6429	};
 6430
 6431	perf_iterate_sb(perf_event_task_output,
 6432		       &task_event,
 6433		       task_ctx);
 6434}
 6435
 6436void perf_event_fork(struct task_struct *task)
 6437{
 6438	perf_event_task(task, NULL, 1);
 6439}
 6440
 6441/*
 6442 * comm tracking
 6443 */
 6444
 6445struct perf_comm_event {
 6446	struct task_struct	*task;
 6447	char			*comm;
 6448	int			comm_size;
 6449
 6450	struct {
 6451		struct perf_event_header	header;
 6452
 6453		u32				pid;
 6454		u32				tid;
 6455	} event_id;
 6456};
 6457
 6458static int perf_event_comm_match(struct perf_event *event)
 6459{
 6460	return event->attr.comm;
 6461}
 6462
 6463static void perf_event_comm_output(struct perf_event *event,
 6464				   void *data)
 6465{
 6466	struct perf_comm_event *comm_event = data;
 6467	struct perf_output_handle handle;
 6468	struct perf_sample_data sample;
 6469	int size = comm_event->event_id.header.size;
 6470	int ret;
 6471
 6472	if (!perf_event_comm_match(event))
 6473		return;
 6474
 6475	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
 6476	ret = perf_output_begin(&handle, event,
 6477				comm_event->event_id.header.size);
 6478
 6479	if (ret)
 6480		goto out;
 6481
 6482	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
 6483	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
 6484
 6485	perf_output_put(&handle, comm_event->event_id);
 6486	__output_copy(&handle, comm_event->comm,
 6487				   comm_event->comm_size);
 6488
 6489	perf_event__output_id_sample(event, &handle, &sample);
 6490
 6491	perf_output_end(&handle);
 6492out:
 6493	comm_event->event_id.header.size = size;
 6494}
 6495
 6496static void perf_event_comm_event(struct perf_comm_event *comm_event)
 6497{
 6498	char comm[TASK_COMM_LEN];
 6499	unsigned int size;
 6500
 6501	memset(comm, 0, sizeof(comm));
 6502	strlcpy(comm, comm_event->task->comm, sizeof(comm));
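	/* Include the terminating NUL and pad the comm out to a u64 multiple. */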
 6503	size = ALIGN(strlen(comm)+1, sizeof(u64));
 6504
 6505	comm_event->comm = comm;
 6506	comm_event->comm_size = size;
 6507
 6508	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 6509
 6510	perf_iterate_sb(perf_event_comm_output,
 6511		       comm_event,
 6512		       NULL);
 6513}
 6514
 6515void perf_event_comm(struct task_struct *task, bool exec)
 6516{
 6517	struct perf_comm_event comm_event;
 6518
 6519	if (!atomic_read(&nr_comm_events))
 6520		return;
 6521
 6522	comm_event = (struct perf_comm_event){
 6523		.task	= task,
 6524		/* .comm      */
 6525		/* .comm_size */
 6526		.event_id  = {
 6527			.header = {
 6528				.type = PERF_RECORD_COMM,
 6529				.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
 6530				/* .size */
 6531			},
 6532			/* .pid */
 6533			/* .tid */
 6534		},
 6535	};
 6536
 6537	perf_event_comm_event(&comm_event);
 6538}
 6539
 6540/*
 6541 * mmap tracking
 6542 */
 6543
 6544struct perf_mmap_event {
 6545	struct vm_area_struct	*vma;
 6546
 6547	const char		*file_name;
 6548	int			file_size;
 6549	int			maj, min;
 6550	u64			ino;
 6551	u64			ino_generation;
 6552	u32			prot, flags;
 6553
 6554	struct {
 6555		struct perf_event_header	header;
 6556
 6557		u32				pid;
 6558		u32				tid;
 6559		u64				start;
 6560		u64				len;
 6561		u64				pgoff;
 6562	} event_id;
 6563};
 6564
 6565static int perf_event_mmap_match(struct perf_event *event,
 6566				 void *data)
 6567{
 6568	struct perf_mmap_event *mmap_event = data;
 6569	struct vm_area_struct *vma = mmap_event->vma;
 6570	int executable = vma->vm_flags & VM_EXEC;
 6571
 6572	return (!executable && event->attr.mmap_data) ||
 6573	       (executable && (event->attr.mmap || event->attr.mmap2));
 6574}
 6575
 6576static void perf_event_mmap_output(struct perf_event *event,
 6577				   void *data)
 6578{
 6579	struct perf_mmap_event *mmap_event = data;
 6580	struct perf_output_handle handle;
 6581	struct perf_sample_data sample;
 6582	int size = mmap_event->event_id.header.size;
 6583	int ret;
 6584
 6585	if (!perf_event_mmap_match(event, data))
 6586		return;
 6587
 6588	if (event->attr.mmap2) {
 6589		mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
 6590		mmap_event->event_id.header.size += sizeof(mmap_event->maj);
 6591		mmap_event->event_id.header.size += sizeof(mmap_event->min);
 6592		mmap_event->event_id.header.size += sizeof(mmap_event->ino);
 6593		mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
 6594		mmap_event->event_id.header.size += sizeof(mmap_event->prot);
 6595		mmap_event->event_id.header.size += sizeof(mmap_event->flags);
 6596	}
 6597
 6598	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
 6599	ret = perf_output_begin(&handle, event,
 6600				mmap_event->event_id.header.size);
 6601	if (ret)
 6602		goto out;
 6603
 6604	mmap_event->event_id.pid = perf_event_pid(event, current);
 6605	mmap_event->event_id.tid = perf_event_tid(event, current);
 6606
 6607	perf_output_put(&handle, mmap_event->event_id);
 6608
 6609	if (event->attr.mmap2) {
 6610		perf_output_put(&handle, mmap_event->maj);
 6611		perf_output_put(&handle, mmap_event->min);
 6612		perf_output_put(&handle, mmap_event->ino);
 6613		perf_output_put(&handle, mmap_event->ino_generation);
 6614		perf_output_put(&handle, mmap_event->prot);
 6615		perf_output_put(&handle, mmap_event->flags);
 6616	}
 6617
 6618	__output_copy(&handle, mmap_event->file_name,
 6619				   mmap_event->file_size);
 6620
 6621	perf_event__output_id_sample(event, &handle, &sample);
 6622
 6623	perf_output_end(&handle);
 6624out:
 6625	mmap_event->event_id.header.size = size;
 6626}
 6627
 6628static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 6629{
 6630	struct vm_area_struct *vma = mmap_event->vma;
 6631	struct file *file = vma->vm_file;
 6632	int maj = 0, min = 0;
 6633	u64 ino = 0, gen = 0;
 6634	u32 prot = 0, flags = 0;
 6635	unsigned int size;
 6636	char tmp[16];
 6637	char *buf = NULL;
 6638	char *name;
 6639
 6640	if (vma->vm_flags & VM_READ)
 6641		prot |= PROT_READ;
 6642	if (vma->vm_flags & VM_WRITE)
 6643		prot |= PROT_WRITE;
 6644	if (vma->vm_flags & VM_EXEC)
 6645		prot |= PROT_EXEC;
 6646
 6647	if (vma->vm_flags & VM_MAYSHARE)
 6648		flags = MAP_SHARED;
 6649	else
 6650		flags = MAP_PRIVATE;
 6651
 6652	if (vma->vm_flags & VM_DENYWRITE)
 6653		flags |= MAP_DENYWRITE;
 6654	if (vma->vm_flags & VM_MAYEXEC)
 6655		flags |= MAP_EXECUTABLE;
 6656	if (vma->vm_flags & VM_LOCKED)
 6657		flags |= MAP_LOCKED;
 6658	if (vma->vm_flags & VM_HUGETLB)
 6659		flags |= MAP_HUGETLB;
 6660
 6661	if (file) {
 6662		struct inode *inode;
 6663		dev_t dev;
 6664
 6665		buf = kmalloc(PATH_MAX, GFP_KERNEL);
 6666		if (!buf) {
 6667			name = "//enomem";
 6668			goto cpy_name;
 6669		}
 6670		/*
  6671		 * d_path() works from the end of the buffer backwards, so we
  6672		 * need to add enough zero bytes after the string to handle
  6673		 * the 64-bit alignment we do later.
 6674		 */
 6675		name = file_path(file, buf, PATH_MAX - sizeof(u64));
 6676		if (IS_ERR(name)) {
 6677			name = "//toolong";
 6678			goto cpy_name;
 6679		}
 6680		inode = file_inode(vma->vm_file);
 6681		dev = inode->i_sb->s_dev;
 6682		ino = inode->i_ino;
 6683		gen = inode->i_generation;
 6684		maj = MAJOR(dev);
 6685		min = MINOR(dev);
 6686
 6687		goto got_name;
 6688	} else {
 6689		if (vma->vm_ops && vma->vm_ops->name) {
 6690			name = (char *) vma->vm_ops->name(vma);
 6691			if (name)
 6692				goto cpy_name;
 6693		}
 6694
 6695		name = (char *)arch_vma_name(vma);
 6696		if (name)
 6697			goto cpy_name;
 6698
 6699		if (vma->vm_start <= vma->vm_mm->start_brk &&
 6700				vma->vm_end >= vma->vm_mm->brk) {
 6701			name = "[heap]";
 6702			goto cpy_name;
 6703		}
 6704		if (vma->vm_start <= vma->vm_mm->start_stack &&
 6705				vma->vm_end >= vma->vm_mm->start_stack) {
 6706			name = "[stack]";
 6707			goto cpy_name;
 6708		}
 6709
 6710		name = "//anon";
 6711		goto cpy_name;
 6712	}
 6713
 6714cpy_name:
 6715	strlcpy(tmp, name, sizeof(tmp));
 6716	name = tmp;
 6717got_name:
 6718	/*
  6719	 * Since our buffer works in 8-byte units we need to align our string
  6720	 * size to a multiple of 8. However, we must guarantee the tail end is
  6721	 * zeroed out to avoid leaking random bits to userspace.
 6722	 */
 6723	size = strlen(name)+1;
 6724	while (!IS_ALIGNED(size, sizeof(u64)))
 6725		name[size++] = '\0';
 6726
 6727	mmap_event->file_name = name;
 6728	mmap_event->file_size = size;
 6729	mmap_event->maj = maj;
 6730	mmap_event->min = min;
 6731	mmap_event->ino = ino;
 6732	mmap_event->ino_generation = gen;
 6733	mmap_event->prot = prot;
 6734	mmap_event->flags = flags;
 6735
 6736	if (!(vma->vm_flags & VM_EXEC))
 6737		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
 6738
 6739	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 6740
 6741	perf_iterate_sb(perf_event_mmap_output,
 6742		       mmap_event,
 6743		       NULL);
 6744
 6745	kfree(buf);
 6746}
 6747
 6748/*
 6749 * Check whether inode and address range match filter criteria.
 6750 */
 6751static bool perf_addr_filter_match(struct perf_addr_filter *filter,
 6752				     struct file *file, unsigned long offset,
 6753				     unsigned long size)
 6754{
 6755	if (filter->inode != file_inode(file))
 6756		return false;
 6757
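	/* The remaining checks reject filters that do not overlap [offset, offset + size]. */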
 6758	if (filter->offset > offset + size)
 6759		return false;
 6760
 6761	if (filter->offset + filter->size < offset)
 6762		return false;
 6763
 6764	return true;
 6765}
 6766
 6767static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
 6768{
 6769	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
 6770	struct vm_area_struct *vma = data;
 6771	unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
 6772	struct file *file = vma->vm_file;
 6773	struct perf_addr_filter *filter;
 6774	unsigned int restart = 0, count = 0;
 6775
 6776	if (!has_addr_filter(event))
 6777		return;
 6778
 6779	if (!file)
 6780		return;
 6781
 6782	raw_spin_lock_irqsave(&ifh->lock, flags);
 6783	list_for_each_entry(filter, &ifh->list, entry) {
 6784		if (perf_addr_filter_match(filter, file, off,
 6785					     vma->vm_end - vma->vm_start)) {
 6786			event->addr_filters_offs[count] = vma->vm_start;
 6787			restart++;
 6788		}
 6789
 6790		count++;
 6791	}
 6792
 6793	if (restart)
 6794		event->addr_filters_gen++;
 6795	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 6796
 6797	if (restart)
 6798		perf_event_stop(event, 1);
 6799}
 6800
 6801/*
 6802 * Adjust all task's events' filters to the new vma
 6803 */
 6804static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 6805{
 6806	struct perf_event_context *ctx;
 6807	int ctxn;
 6808
 6809	/*
 6810	 * Data tracing isn't supported yet and as such there is no need
 6811	 * to keep track of anything that isn't related to executable code:
 6812	 */
 6813	if (!(vma->vm_flags & VM_EXEC))
 6814		return;
 6815
 6816	rcu_read_lock();
 6817	for_each_task_context_nr(ctxn) {
 6818		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 6819		if (!ctx)
 6820			continue;
 6821
 6822		perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
 6823	}
 6824	rcu_read_unlock();
 6825}
 6826
 6827void perf_event_mmap(struct vm_area_struct *vma)
 6828{
 6829	struct perf_mmap_event mmap_event;
 6830
 6831	if (!atomic_read(&nr_mmap_events))
 6832		return;
 6833
 6834	mmap_event = (struct perf_mmap_event){
 6835		.vma	= vma,
 6836		/* .file_name */
 6837		/* .file_size */
 6838		.event_id  = {
 6839			.header = {
 6840				.type = PERF_RECORD_MMAP,
 6841				.misc = PERF_RECORD_MISC_USER,
 6842				/* .size */
 6843			},
 6844			/* .pid */
 6845			/* .tid */
 6846			.start  = vma->vm_start,
 6847			.len    = vma->vm_end - vma->vm_start,
 6848			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
 6849		},
 6850		/* .maj (attr_mmap2 only) */
 6851		/* .min (attr_mmap2 only) */
 6852		/* .ino (attr_mmap2 only) */
 6853		/* .ino_generation (attr_mmap2 only) */
 6854		/* .prot (attr_mmap2 only) */
 6855		/* .flags (attr_mmap2 only) */
 6856	};
 6857
 6858	perf_addr_filters_adjust(vma);
 6859	perf_event_mmap_event(&mmap_event);
 6860}
 6861
 6862void perf_event_aux_event(struct perf_event *event, unsigned long head,
 6863			  unsigned long size, u64 flags)
 6864{
 6865	struct perf_output_handle handle;
 6866	struct perf_sample_data sample;
 6867	struct perf_aux_event {
 6868		struct perf_event_header	header;
 6869		u64				offset;
 6870		u64				size;
 6871		u64				flags;
 6872	} rec = {
 6873		.header = {
 6874			.type = PERF_RECORD_AUX,
 6875			.misc = 0,
 6876			.size = sizeof(rec),
 6877		},
 6878		.offset		= head,
 6879		.size		= size,
 6880		.flags		= flags,
 6881	};
 6882	int ret;
 6883
 6884	perf_event_header__init_id(&rec.header, &sample, event);
 6885	ret = perf_output_begin(&handle, event, rec.header.size);
 6886
 6887	if (ret)
 6888		return;
 6889
 6890	perf_output_put(&handle, rec);
 6891	perf_event__output_id_sample(event, &handle, &sample);
 6892
 6893	perf_output_end(&handle);
 6894}
 6895
 6896/*
 6897 * Lost/dropped samples logging
 6898 */
 6899void perf_log_lost_samples(struct perf_event *event, u64 lost)
 6900{
 6901	struct perf_output_handle handle;
 6902	struct perf_sample_data sample;
 6903	int ret;
 6904
 6905	struct {
 6906		struct perf_event_header	header;
 6907		u64				lost;
 6908	} lost_samples_event = {
 6909		.header = {
 6910			.type = PERF_RECORD_LOST_SAMPLES,
 6911			.misc = 0,
 6912			.size = sizeof(lost_samples_event),
 6913		},
 6914		.lost		= lost,
 6915	};
 6916
 6917	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
 6918
 6919	ret = perf_output_begin(&handle, event,
 6920				lost_samples_event.header.size);
 6921	if (ret)
 6922		return;
 6923
 6924	perf_output_put(&handle, lost_samples_event);
 6925	perf_event__output_id_sample(event, &handle, &sample);
 6926	perf_output_end(&handle);
 6927}
 6928
 6929/*
 6930 * context_switch tracking
 6931 */
 6932
 6933struct perf_switch_event {
 6934	struct task_struct	*task;
 6935	struct task_struct	*next_prev;
 6936
 6937	struct {
 6938		struct perf_event_header	header;
 6939		u32				next_prev_pid;
 6940		u32				next_prev_tid;
 6941	} event_id;
 6942};
 6943
 6944static int perf_event_switch_match(struct perf_event *event)
 6945{
 6946	return event->attr.context_switch;
 6947}
 6948
 6949static void perf_event_switch_output(struct perf_event *event, void *data)
 6950{
 6951	struct perf_switch_event *se = data;
 6952	struct perf_output_handle handle;
 6953	struct perf_sample_data sample;
 6954	int ret;
 6955
 6956	if (!perf_event_switch_match(event))
 6957		return;
 6958
 6959	/* Only CPU-wide events are allowed to see next/prev pid/tid */
 6960	if (event->ctx->task) {
 6961		se->event_id.header.type = PERF_RECORD_SWITCH;
 6962		se->event_id.header.size = sizeof(se->event_id.header);
 6963	} else {
 6964		se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
 6965		se->event_id.header.size = sizeof(se->event_id);
 6966		se->event_id.next_prev_pid =
 6967					perf_event_pid(event, se->next_prev);
 6968		se->event_id.next_prev_tid =
 6969					perf_event_tid(event, se->next_prev);
 6970	}
 6971
 6972	perf_event_header__init_id(&se->event_id.header, &sample, event);
 6973
 6974	ret = perf_output_begin(&handle, event, se->event_id.header.size);
 6975	if (ret)
 6976		return;
 6977
 6978	if (event->ctx->task)
 6979		perf_output_put(&handle, se->event_id.header);
 6980	else
 6981		perf_output_put(&handle, se->event_id);
 6982
 6983	perf_event__output_id_sample(event, &handle, &sample);
 6984
 6985	perf_output_end(&handle);
 6986}
 6987
 6988static void perf_event_switch(struct task_struct *task,
 6989			      struct task_struct *next_prev, bool sched_in)
 6990{
 6991	struct perf_switch_event switch_event;
 6992
 6993	/* N.B. caller checks nr_switch_events != 0 */
 6994
 6995	switch_event = (struct perf_switch_event){
 6996		.task		= task,
 6997		.next_prev	= next_prev,
 6998		.event_id	= {
 6999			.header = {
 7000				/* .type */
 7001				.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
 7002				/* .size */
 7003			},
 7004			/* .next_prev_pid */
 7005			/* .next_prev_tid */
 7006		},
 7007	};
 7008
 7009	perf_iterate_sb(perf_event_switch_output,
 7010		       &switch_event,
 7011		       NULL);
 7012}
 7013
 7014/*
 7015 * IRQ throttle logging
 7016 */
 7017
 7018static void perf_log_throttle(struct perf_event *event, int enable)
 7019{
 7020	struct perf_output_handle handle;
 7021	struct perf_sample_data sample;
 7022	int ret;
 7023
 7024	struct {
 7025		struct perf_event_header	header;
 7026		u64				time;
 7027		u64				id;
 7028		u64				stream_id;
 7029	} throttle_event = {
 7030		.header = {
 7031			.type = PERF_RECORD_THROTTLE,
 7032			.misc = 0,
 7033			.size = sizeof(throttle_event),
 7034		},
 7035		.time		= perf_event_clock(event),
 7036		.id		= primary_event_id(event),
 7037		.stream_id	= event->id,
 7038	};
 7039
 7040	if (enable)
 7041		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
 7042
 7043	perf_event_header__init_id(&throttle_event.header, &sample, event);
 7044
 7045	ret = perf_output_begin(&handle, event,
 7046				throttle_event.header.size);
 7047	if (ret)
 7048		return;
 7049
 7050	perf_output_put(&handle, throttle_event);
 7051	perf_event__output_id_sample(event, &handle, &sample);
 7052	perf_output_end(&handle);
 7053}
 7054
 7055static void perf_log_itrace_start(struct perf_event *event)
 7056{
 7057	struct perf_output_handle handle;
 7058	struct perf_sample_data sample;
 7059	struct perf_aux_event {
 7060		struct perf_event_header        header;
 7061		u32				pid;
 7062		u32				tid;
 7063	} rec;
 7064	int ret;
 7065
 7066	if (event->parent)
 7067		event = event->parent;
 7068
 7069	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
 7070	    event->hw.itrace_started)
 7071		return;
 7072
 7073	rec.header.type	= PERF_RECORD_ITRACE_START;
 7074	rec.header.misc	= 0;
 7075	rec.header.size	= sizeof(rec);
 7076	rec.pid	= perf_event_pid(event, current);
 7077	rec.tid	= perf_event_tid(event, current);
 7078
 7079	perf_event_header__init_id(&rec.header, &sample, event);
 7080	ret = perf_output_begin(&handle, event, rec.header.size);
 7081
 7082	if (ret)
 7083		return;
 7084
 7085	perf_output_put(&handle, rec);
 7086	perf_event__output_id_sample(event, &handle, &sample);
 7087
 7088	perf_output_end(&handle);
 7089}
 7090
 7091static int
 7092__perf_event_account_interrupt(struct perf_event *event, int throttle)
 7093{
 7094	struct hw_perf_event *hwc = &event->hw;
 7095	int ret = 0;
 7096	u64 seq;
 7097
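	/*
	 * perf_throttled_seq advances once per timer tick, so hwc->interrupts
	 * counts interrupts within the current tick; exceeding
	 * max_samples_per_tick throttles the event until the tick path
	 * unthrottles it.
	 */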
 7098	seq = __this_cpu_read(perf_throttled_seq);
 7099	if (seq != hwc->interrupts_seq) {
 7100		hwc->interrupts_seq = seq;
 7101		hwc->interrupts = 1;
 7102	} else {
 7103		hwc->interrupts++;
 7104		if (unlikely(throttle
 7105			     && hwc->interrupts >= max_samples_per_tick)) {
 7106			__this_cpu_inc(perf_throttled_count);
 7107			tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
 7108			hwc->interrupts = MAX_INTERRUPTS;
 7109			perf_log_throttle(event, 0);
 7110			ret = 1;
 7111		}
 7112	}
 7113
 7114	if (event->attr.freq) {
 7115		u64 now = perf_clock();
 7116		s64 delta = now - hwc->freq_time_stamp;
 7117
 7118		hwc->freq_time_stamp = now;
 7119
 7120		if (delta > 0 && delta < 2*TICK_NSEC)
 7121			perf_adjust_period(event, delta, hwc->last_period, true);
 7122	}
 7123
 7124	return ret;
 7125}
 7126
 7127int perf_event_account_interrupt(struct perf_event *event)
 7128{
 7129	return __perf_event_account_interrupt(event, 1);
 7130}
 7131
 7132/*
 7133 * Generic event overflow handling, sampling.
 7134 */
 7135
 7136static int __perf_event_overflow(struct perf_event *event,
 7137				   int throttle, struct perf_sample_data *data,
 7138				   struct pt_regs *regs)
 7139{
 7140	int events = atomic_read(&event->event_limit);
 7141	int ret = 0;
 7142
 7143	/*
 7144	 * Non-sampling counters might still use the PMI to fold short
 7145	 * hardware counters, ignore those.
 7146	 */
 7147	if (unlikely(!is_sampling_event(event)))
 7148		return 0;
 7149
 7150	ret = __perf_event_account_interrupt(event, throttle);
 7151
 7152	/*
 7153	 * XXX event_limit might not quite work as expected on inherited
 7154	 * events
 7155	 */
 7156
 7157	event->pending_kill = POLL_IN;
 7158	if (events && atomic_dec_and_test(&event->event_limit)) {
 7159		ret = 1;
 7160		event->pending_kill = POLL_HUP;
 7161
 7162		perf_event_disable_inatomic(event);
 7163	}
 7164
 7165	READ_ONCE(event->overflow_handler)(event, data, regs);
 7166
 7167	if (*perf_event_fasync(event) && event->pending_kill) {
 7168		event->pending_wakeup = 1;
 7169		irq_work_queue(&event->pending);
 7170	}
 7171
 7172	return ret;
 7173}
 7174
 7175int perf_event_overflow(struct perf_event *event,
 7176			  struct perf_sample_data *data,
 7177			  struct pt_regs *regs)
 7178{
 7179	return __perf_event_overflow(event, 1, data, regs);
 7180}
 7181
 7182/*
 7183 * Generic software event infrastructure
 7184 */
 7185
 7186struct swevent_htable {
 7187	struct swevent_hlist		*swevent_hlist;
 7188	struct mutex			hlist_mutex;
 7189	int				hlist_refcount;
 7190
  7191	/* Recursion avoidance in each context */
 7192	int				recursion[PERF_NR_CONTEXTS];
 7193};
 7194
 7195static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
 7196
 7197/*
 7198 * We directly increment event->count and keep a second value in
  7199 * event->hw.period_left to count intervals. This period counter
  7200 * is kept in the range [-sample_period, 0] so that we can use its
  7201 * sign as the trigger.
 7202 */
 7203
 7204u64 perf_swevent_set_period(struct perf_event *event)
 7205{
 7206	struct hw_perf_event *hwc = &event->hw;
 7207	u64 period = hwc->last_period;
 7208	u64 nr, offset;
 7209	s64 old, val;
 7210
 7211	hwc->last_period = hwc->sample_period;
 7212
 7213again:
 7214	old = val = local64_read(&hwc->period_left);
 7215	if (val < 0)
 7216		return 0;
 7217
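	/* Fold all fully elapsed periods into nr; the remainder stays in period_left. */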
 7218	nr = div64_u64(period + val, period);
 7219	offset = nr * period;
 7220	val -= offset;
 7221	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
 7222		goto again;
 7223
 7224	return nr;
 7225}
 7226
 7227static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 7228				    struct perf_sample_data *data,
 7229				    struct pt_regs *regs)
 7230{
 7231	struct hw_perf_event *hwc = &event->hw;
 7232	int throttle = 0;
 7233
 7234	if (!overflow)
 7235		overflow = perf_swevent_set_period(event);
 7236
 7237	if (hwc->interrupts == MAX_INTERRUPTS)
 7238		return;
 7239
 7240	for (; overflow; overflow--) {
 7241		if (__perf_event_overflow(event, throttle,
 7242					    data, regs)) {
 7243			/*
 7244			 * We inhibit the overflow from happening when
 7245			 * hwc->interrupts == MAX_INTERRUPTS.
 7246			 */
 7247			break;
 7248		}
 7249		throttle = 1;
 7250	}
 7251}
 7252
 7253static void perf_swevent_event(struct perf_event *event, u64 nr,
 7254			       struct perf_sample_data *data,
 7255			       struct pt_regs *regs)
 7256{
 7257	struct hw_perf_event *hwc = &event->hw;
 7258
 7259	local64_add(nr, &event->count);
 7260
 7261	if (!regs)
 7262		return;
 7263
 7264	if (!is_sampling_event(event))
 7265		return;
 7266
 7267	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
 7268		data->period = nr;
 7269		return perf_swevent_overflow(event, 1, data, regs);
 7270	} else
 7271		data->period = event->hw.last_period;
 7272
 7273	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
 7274		return perf_swevent_overflow(event, 1, data, regs);
 7275
 7276	if (local64_add_negative(nr, &hwc->period_left))
 7277		return;
 7278
 7279	perf_swevent_overflow(event, 0, data, regs);
 7280}
 7281
 7282static int perf_exclude_event(struct perf_event *event,
 7283			      struct pt_regs *regs)
 7284{
 7285	if (event->hw.state & PERF_HES_STOPPED)
 7286		return 1;
 7287
 7288	if (regs) {
 7289		if (event->attr.exclude_user && user_mode(regs))
 7290			return 1;
 7291
 7292		if (event->attr.exclude_kernel && !user_mode(regs))
 7293			return 1;
 7294	}
 7295
 7296	return 0;
 7297}
 7298
 7299static int perf_swevent_match(struct perf_event *event,
 7300				enum perf_type_id type,
 7301				u32 event_id,
 7302				struct perf_sample_data *data,
 7303				struct pt_regs *regs)
 7304{
 7305	if (event->attr.type != type)
 7306		return 0;
 7307
 7308	if (event->attr.config != event_id)
 7309		return 0;
 7310
 7311	if (perf_exclude_event(event, regs))
 7312		return 0;
 7313
 7314	return 1;
 7315}
 7316
 7317static inline u64 swevent_hash(u64 type, u32 event_id)
 7318{
 7319	u64 val = event_id | (type << 32);
 7320
 7321	return hash_64(val, SWEVENT_HLIST_BITS);
 7322}
 7323
 7324static inline struct hlist_head *
 7325__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 7326{
 7327	u64 hash = swevent_hash(type, event_id);
 7328
 7329	return &hlist->heads[hash];
 7330}
 7331
 7332/* For the read side: events when they trigger */
 7333static inline struct hlist_head *
 7334find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
 7335{
 7336	struct swevent_hlist *hlist;
 7337
 7338	hlist = rcu_dereference(swhash->swevent_hlist);
 7339	if (!hlist)
 7340		return NULL;
 7341
 7342	return __find_swevent_head(hlist, type, event_id);
 7343}
 7344
 7345/* For the event head insertion and removal in the hlist */
 7346static inline struct hlist_head *
 7347find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
 7348{
 7349	struct swevent_hlist *hlist;
 7350	u32 event_id = event->attr.config;
 7351	u64 type = event->attr.type;
 7352
 7353	/*
 7354	 * Event scheduling is always serialized against hlist allocation
  7355	 * and release, which makes the protected version suitable here;
  7356	 * the context lock guarantees that.
 7357	 */
 7358	hlist = rcu_dereference_protected(swhash->swevent_hlist,
 7359					  lockdep_is_held(&event->ctx->lock));
 7360	if (!hlist)
 7361		return NULL;
 7362
 7363	return __find_swevent_head(hlist, type, event_id);
 7364}
 7365
 7366static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 7367				    u64 nr,
 7368				    struct perf_sample_data *data,
 7369				    struct pt_regs *regs)
 7370{
 7371	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 7372	struct perf_event *event;
 7373	struct hlist_head *head;
 7374
 7375	rcu_read_lock();
 7376	head = find_swevent_head_rcu(swhash, type, event_id);
 7377	if (!head)
 7378		goto end;
 7379
 7380	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 7381		if (perf_swevent_match(event, type, event_id, data, regs))
 7382			perf_swevent_event(event, nr, data, regs);
 7383	}
 7384end:
 7385	rcu_read_unlock();
 7386}
 7387
 7388DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
 7389
 7390int perf_swevent_get_recursion_context(void)
 7391{
 7392	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 7393
 7394	return get_recursion_context(swhash->recursion);
 7395}
 7396EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 7397
 7398void perf_swevent_put_recursion_context(int rctx)
 7399{
 7400	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 7401
 7402	put_recursion_context(swhash->recursion, rctx);
 7403}
 7404
 7405void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 7406{
 7407	struct perf_sample_data data;
 7408
 7409	if (WARN_ON_ONCE(!regs))
 7410		return;
 7411
 7412	perf_sample_data_init(&data, addr, 0);
 7413	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
 7414}
 7415
 7416void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 7417{
 7418	int rctx;
 7419
 7420	preempt_disable_notrace();
 7421	rctx = perf_swevent_get_recursion_context();
 7422	if (unlikely(rctx < 0))
 7423		goto fail;
 7424
 7425	___perf_sw_event(event_id, nr, regs, addr);
 7426
 7427	perf_swevent_put_recursion_context(rctx);
 7428fail:
 7429	preempt_enable_notrace();
 7430}
 7431
 7432static void perf_swevent_read(struct perf_event *event)
 7433{
 7434}
 7435
 7436static int perf_swevent_add(struct perf_event *event, int flags)
 7437{
 7438	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 7439	struct hw_perf_event *hwc = &event->hw;
 7440	struct hlist_head *head;
 7441
 7442	if (is_sampling_event(event)) {
 7443		hwc->last_period = hwc->sample_period;
 7444		perf_swevent_set_period(event);
 7445	}
 7446
 7447	hwc->state = !(flags & PERF_EF_START);
 7448
 7449	head = find_swevent_head(swhash, event);
 7450	if (WARN_ON_ONCE(!head))
 7451		return -EINVAL;
 7452
 7453	hlist_add_head_rcu(&event->hlist_entry, head);
 7454	perf_event_update_userpage(event);
 7455
 7456	return 0;
 7457}
 7458
 7459static void perf_swevent_del(struct perf_event *event, int flags)
 7460{
 7461	hlist_del_rcu(&event->hlist_entry);
 7462}
 7463
 7464static void perf_swevent_start(struct perf_event *event, int flags)
 7465{
 7466	event->hw.state = 0;
 7467}
 7468
 7469static void perf_swevent_stop(struct perf_event *event, int flags)
 7470{
 7471	event->hw.state = PERF_HES_STOPPED;
 7472}
 7473
 7474/* Deref the hlist from the update side */
 7475static inline struct swevent_hlist *
 7476swevent_hlist_deref(struct swevent_htable *swhash)
 7477{
 7478	return rcu_dereference_protected(swhash->swevent_hlist,
 7479					 lockdep_is_held(&swhash->hlist_mutex));
 7480}
 7481
 7482static void swevent_hlist_release(struct swevent_htable *swhash)
 7483{
 7484	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
 7485
 7486	if (!hlist)
 7487		return;
 7488
 7489	RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
 7490	kfree_rcu(hlist, rcu_head);
 7491}
 7492
 7493static void swevent_hlist_put_cpu(int cpu)
 7494{
 7495	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 7496
 7497	mutex_lock(&swhash->hlist_mutex);
 7498
 7499	if (!--swhash->hlist_refcount)
 7500		swevent_hlist_release(swhash);
 7501
 7502	mutex_unlock(&swhash->hlist_mutex);
 7503}
 7504
 7505static void swevent_hlist_put(void)
 7506{
 7507	int cpu;
 7508
 7509	for_each_possible_cpu(cpu)
 7510		swevent_hlist_put_cpu(cpu);
 7511}
 7512
 7513static int swevent_hlist_get_cpu(int cpu)
 7514{
 7515	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 7516	int err = 0;
 7517
 7518	mutex_lock(&swhash->hlist_mutex);
 7519	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
 7520		struct swevent_hlist *hlist;
 7521
 7522		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
 7523		if (!hlist) {
 7524			err = -ENOMEM;
 7525			goto exit;
 7526		}
 7527		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 7528	}
 7529	swhash->hlist_refcount++;
 7530exit:
 7531	mutex_unlock(&swhash->hlist_mutex);
 7532
 7533	return err;
 7534}
 7535
 7536static int swevent_hlist_get(void)
 7537{
 7538	int err, cpu, failed_cpu;
 7539
 7540	get_online_cpus();
 7541	for_each_possible_cpu(cpu) {
 7542		err = swevent_hlist_get_cpu(cpu);
 7543		if (err) {
 7544			failed_cpu = cpu;
 7545			goto fail;
 7546		}
 7547	}
 7548	put_online_cpus();
 7549
 7550	return 0;
 7551fail:
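	/* Drop the references taken on the CPUs that succeeded before the failure. */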
 7552	for_each_possible_cpu(cpu) {
 7553		if (cpu == failed_cpu)
 7554			break;
 7555		swevent_hlist_put_cpu(cpu);
 7556	}
 7557
 7558	put_online_cpus();
 7559	return err;
 7560}
 7561
 7562struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 7563
 7564static void sw_perf_event_destroy(struct perf_event *event)
 7565{
 7566	u64 event_id = event->attr.config;
 7567
 7568	WARN_ON(event->parent);
 7569
 7570	static_key_slow_dec(&perf_swevent_enabled[event_id]);
 7571	swevent_hlist_put();
 7572}
 7573
 7574static int perf_swevent_init(struct perf_event *event)
 7575{
 7576	u64 event_id = event->attr.config;
 7577
 7578	if (event->attr.type != PERF_TYPE_SOFTWARE)
 7579		return -ENOENT;
 7580
 7581	/*
 7582	 * no branch sampling for software events
 7583	 */
 7584	if (has_branch_stack(event))
 7585		return -EOPNOTSUPP;
 7586
 7587	switch (event_id) {
 7588	case PERF_COUNT_SW_CPU_CLOCK:
 7589	case PERF_COUNT_SW_TASK_CLOCK:
 7590		return -ENOENT;
 7591
 7592	default:
 7593		break;
 7594	}
 7595
 7596	if (event_id >= PERF_COUNT_SW_MAX)
 7597		return -ENOENT;
 7598
 7599	if (!event->parent) {
 7600		int err;
 7601
 7602		err = swevent_hlist_get();
 7603		if (err)
 7604			return err;
 7605
 7606		static_key_slow_inc(&perf_swevent_enabled[event_id]);
 7607		event->destroy = sw_perf_event_destroy;
 7608	}
 7609
 7610	return 0;
 7611}
 7612
 7613static struct pmu perf_swevent = {
 7614	.task_ctx_nr	= perf_sw_context,
 7615
 7616	.capabilities	= PERF_PMU_CAP_NO_NMI,
 7617
 7618	.event_init	= perf_swevent_init,
 7619	.add		= perf_swevent_add,
 7620	.del		= perf_swevent_del,
 7621	.start		= perf_swevent_start,
 7622	.stop		= perf_swevent_stop,
 7623	.read		= perf_swevent_read,
 7624};
 7625
 7626#ifdef CONFIG_EVENT_TRACING
 7627
 7628static int perf_tp_filter_match(struct perf_event *event,
 7629				struct perf_sample_data *data)
 7630{
 7631	void *record = data->raw->frag.data;
 7632
 7633	/* only top level events have filters set */
 7634	if (event->parent)
 7635		event = event->parent;
 7636
 7637	if (likely(!event->filter) || filter_match_preds(event->filter, record))
 7638		return 1;
 7639	return 0;
 7640}
 7641
 7642static int perf_tp_event_match(struct perf_event *event,
 7643				struct perf_sample_data *data,
 7644				struct pt_regs *regs)
 7645{
 7646	if (event->hw.state & PERF_HES_STOPPED)
 7647		return 0;
 7648	/*
 7649	 * All tracepoints are from kernel-space.
 7650	 */
 7651	if (event->attr.exclude_kernel)
 7652		return 0;
 7653
 7654	if (!perf_tp_filter_match(event, data))
 7655		return 0;
 7656
 7657	return 1;
 7658}
 7659
 7660void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 7661			       struct trace_event_call *call, u64 count,
 7662			       struct pt_regs *regs, struct hlist_head *head,
 7663			       struct task_struct *task)
 7664{
 7665	struct bpf_prog *prog = call->prog;
 7666
 7667	if (prog) {
 7668		*(struct pt_regs **)raw_data = regs;
 7669		if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
 7670			perf_swevent_put_recursion_context(rctx);
 7671			return;
 7672		}
 7673	}
 7674	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
 7675		      rctx, task);
 7676}
 7677EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 7678
 7679void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 7680		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 7681		   struct task_struct *task)
 7682{
 7683	struct perf_sample_data data;
 7684	struct perf_event *event;
 7685
 7686	struct perf_raw_record raw = {
 7687		.frag = {
 7688			.size = entry_size,
 7689			.data = record,
 7690		},
 7691	};
 7692
 7693	perf_sample_data_init(&data, 0, 0);
 7694	data.raw = &raw;
 7695
 7696	perf_trace_buf_update(record, event_type);
 7697
 7698	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 7699		if (perf_tp_event_match(event, &data, regs))
 7700			perf_swevent_event(event, count, &data, regs);
 7701	}
 7702
 7703	/*
 7704	 * If we got specified a target task, also iterate its context and
 7705	 * deliver this event there too.
 7706	 */
 7707	if (task && task != current) {
 7708		struct perf_event_context *ctx;
 7709		struct trace_entry *entry = record;
 7710
 7711		rcu_read_lock();
 7712		ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
 7713		if (!ctx)
 7714			goto unlock;
 7715
 7716		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 7717			if (event->attr.type != PERF_TYPE_TRACEPOINT)
 7718				continue;
 7719			if (event->attr.config != entry->type)
 7720				continue;
 7721			if (perf_tp_event_match(event, &data, regs))
 7722				perf_swevent_event(event, count, &data, regs);
 7723		}
 7724unlock:
 7725		rcu_read_unlock();
 7726	}
 7727
 7728	perf_swevent_put_recursion_context(rctx);
 7729}
 7730EXPORT_SYMBOL_GPL(perf_tp_event);
 7731
 7732static void tp_perf_event_destroy(struct perf_event *event)
 7733{
 7734	perf_trace_destroy(event);
 7735}
 7736
 7737static int perf_tp_event_init(struct perf_event *event)
 7738{
 7739	int err;
 7740
 7741	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 7742		return -ENOENT;
 7743
 7744	/*
 7745	 * no branch sampling for tracepoint events
 7746	 */
 7747	if (has_branch_stack(event))
 7748		return -EOPNOTSUPP;
 7749
 7750	err = perf_trace_init(event);
 7751	if (err)
 7752		return err;
 7753
 7754	event->destroy = tp_perf_event_destroy;
 7755
 7756	return 0;
 7757}
 7758
 7759static struct pmu perf_tracepoint = {
 7760	.task_ctx_nr	= perf_sw_context,
 7761
 7762	.event_init	= perf_tp_event_init,
 7763	.add		= perf_trace_add,
 7764	.del		= perf_trace_del,
 7765	.start		= perf_swevent_start,
 7766	.stop		= perf_swevent_stop,
 7767	.read		= perf_swevent_read,
 7768};
 7769
 7770static inline void perf_tp_register(void)
 7771{
 7772	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 7773}
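
/*
 * Usage sketch (illustrative): a tracepoint event is opened with
 * attr.type = PERF_TYPE_TRACEPOINT and attr.config set to the numeric
 * tracepoint id, which userspace typically reads from tracefs (for
 * example from .../tracing/events/sched/sched_switch/id):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_TRACEPOINT,
 *		.size		= sizeof(attr),
 *		.config		= tracepoint_id,
 *		.sample_period	= 1,
 *		.sample_type	= PERF_SAMPLE_RAW,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 * perf_tp_event_init() then resolves attr.config back to the
 * trace_event_call via perf_trace_init().
 */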
 7774
 7775static void perf_event_free_filter(struct perf_event *event)
 7776{
 7777	ftrace_profile_free_filter(event);
 7778}
 7779
 7780#ifdef CONFIG_BPF_SYSCALL
 7781static void bpf_overflow_handler(struct perf_event *event,
 7782				 struct perf_sample_data *data,
 7783				 struct pt_regs *regs)
 7784{
 7785	struct bpf_perf_event_data_kern ctx = {
 7786		.data = data,
 7787		.regs = regs,
 7788	};
 7789	int ret = 0;
 7790
 7791	preempt_disable();
 7792	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
 7793		goto out;
 7794	rcu_read_lock();
 7795	ret = BPF_PROG_RUN(event->prog, &ctx);
 7796	rcu_read_unlock();
 7797out:
 7798	__this_cpu_dec(bpf_prog_active);
 7799	preempt_enable();
 7800	if (!ret)
 7801		return;
 7802
 7803	event->orig_overflow_handler(event, data, regs);
 7804}
 7805
 7806static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
 7807{
 7808	struct bpf_prog *prog;
 7809
 7810	if (event->overflow_handler_context)
 7811		/* hw breakpoint or kernel counter */
 7812		return -EINVAL;
 7813
 7814	if (event->prog)
 7815		return -EEXIST;
 7816
 7817	prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
 7818	if (IS_ERR(prog))
 7819		return PTR_ERR(prog);
 7820
 7821	event->prog = prog;
 7822	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
 7823	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
 7824	return 0;
 7825}
 7826
 7827static void perf_event_free_bpf_handler(struct perf_event *event)
 7828{
 7829	struct bpf_prog *prog = event->prog;
 7830
 7831	if (!prog)
 7832		return;
 7833
 7834	WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
 7835	event->prog = NULL;
 7836	bpf_prog_put(prog);
 7837}
 7838#else
 7839static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
 7840{
 7841	return -EOPNOTSUPP;
 7842}
 7843static void perf_event_free_bpf_handler(struct perf_event *event)
 7844{
 7845}
 7846#endif
 7847
 7848static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 7849{
 7850	bool is_kprobe, is_tracepoint;
 7851	struct bpf_prog *prog;
 7852
 7853	if (event->attr.type == PERF_TYPE_HARDWARE ||
 7854	    event->attr.type == PERF_TYPE_SOFTWARE)
 7855		return perf_event_set_bpf_handler(event, prog_fd);
 7856
 7857	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 7858		return -EINVAL;
 7859
 7860	if (event->tp_event->prog)
 7861		return -EEXIST;
 7862
 7863	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 7864	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 7865	if (!is_kprobe && !is_tracepoint)
 7866		/* bpf programs can only be attached to u/kprobe or tracepoint */
 7867		return -EINVAL;
 7868
 7869	prog = bpf_prog_get(prog_fd);
 7870	if (IS_ERR(prog))
 7871		return PTR_ERR(prog);
 7872
 7873	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
 7874	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 7875		/* valid fd, but invalid bpf program type */
 7876		bpf_prog_put(prog);
 7877		return -EINVAL;
 7878	}
 7879
 7880	if (is_tracepoint) {
 7881		int off = trace_event_get_offsets(event->tp_event);
 7882
 7883		if (prog->aux->max_ctx_offset > off) {
 7884			bpf_prog_put(prog);
 7885			return -EACCES;
 7886		}
 7887	}
 7888	event->tp_event->prog = prog;
 7889
 7890	return 0;
 7891}
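
/*
 * Usage sketch (illustrative): userspace reaches this function through
 * the PERF_EVENT_IOC_SET_BPF ioctl, passing the fd of a program loaded
 * with the bpf(BPF_PROG_LOAD, ...) syscall and of a type matching the
 * event (BPF_PROG_TYPE_KPROBE, _TRACEPOINT or _PERF_EVENT):
 *
 *	ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
 */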
 7892
 7893static void perf_event_free_bpf_prog(struct perf_event *event)
 7894{
 7895	struct bpf_prog *prog;
 7896
 7897	perf_event_free_bpf_handler(event);
 7898
 7899	if (!event->tp_event)
 7900		return;
 7901
 7902	prog = event->tp_event->prog;
 7903	if (prog) {
 7904		event->tp_event->prog = NULL;
 7905		bpf_prog_put(prog);
 7906	}
 7907}
 7908
 7909#else
 7910
 7911static inline void perf_tp_register(void)
 7912{
 7913}
 7914
 7915static void perf_event_free_filter(struct perf_event *event)
 7916{
 7917}
 7918
 7919static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 7920{
 7921	return -ENOENT;
 7922}
 7923
 7924static void perf_event_free_bpf_prog(struct perf_event *event)
 7925{
 7926}
 7927#endif /* CONFIG_EVENT_TRACING */
 7928
 7929#ifdef CONFIG_HAVE_HW_BREAKPOINT
 7930void perf_bp_event(struct perf_event *bp, void *data)
 7931{
 7932	struct perf_sample_data sample;
 7933	struct pt_regs *regs = data;
 7934
 7935	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
 7936
 7937	if (!bp->hw.state && !perf_exclude_event(bp, regs))
 7938		perf_swevent_event(bp, 1, &sample, regs);
 7939}
 7940#endif
 7941
 7942/*
 7943 * Allocate a new address filter
 7944 */
 7945static struct perf_addr_filter *
 7946perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
 7947{
 7948	int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
 7949	struct perf_addr_filter *filter;
 7950
 7951	filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
 7952	if (!filter)
 7953		return NULL;
 7954
 7955	INIT_LIST_HEAD(&filter->entry);
 7956	list_add_tail(&filter->entry, filters);
 7957
 7958	return filter;
 7959}
 7960
 7961static void free_filters_list(struct list_head *filters)
 7962{
 7963	struct perf_addr_filter *filter, *iter;
 7964
 7965	list_for_each_entry_safe(filter, iter, filters, entry) {
 7966		if (filter->inode)
 7967			iput(filter->inode);
 7968		list_del(&filter->entry);
 7969		kfree(filter);
 7970	}
 7971}
 7972
 7973/*
 7974 * Free existing address filters and optionally install new ones
 7975 */
 7976static void perf_addr_filters_splice(struct perf_event *event,
 7977				     struct list_head *head)
 7978{
 7979	unsigned long flags;
 7980	LIST_HEAD(list);
 7981
 7982	if (!has_addr_filter(event))
 7983		return;
 7984
 7985	/* don't bother with children, they don't have their own filters */
 7986	if (event->parent)
 7987		return;
 7988
 7989	raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
 7990
 7991	list_splice_init(&event->addr_filters.list, &list);
 7992	if (head)
 7993		list_splice(head, &event->addr_filters.list);
 7994
 7995	raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
 7996
 7997	free_filters_list(&list);
 7998}
 7999
 8000/*
 8001 * Scan through mm's vmas and see if one of them matches the
 8002 * @filter; if so, adjust filter's address range.
 8003 * Called with mm::mmap_sem down for reading.
 8004 */
 8005static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
 8006					    struct mm_struct *mm)
 8007{
 8008	struct vm_area_struct *vma;
 8009
 8010	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 8011		struct file *file = vma->vm_file;
 8012		unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
 8013		unsigned long vma_size = vma->vm_end - vma->vm_start;
 8014
 8015		if (!file)
 8016			continue;
 8017
 8018		if (!perf_addr_filter_match(filter, file, off, vma_size))
 8019			continue;
 8020
 8021		return vma->vm_start;
 8022	}
 8023
 8024	return 0;
 8025}
 8026
 8027/*
 8028 * Update event's address range filters based on the
 8029 * task's existing mappings, if any.
 8030 */
 8031static void perf_event_addr_filters_apply(struct perf_event *event)
 8032{
 8033	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
 8034	struct task_struct *task = READ_ONCE(event->ctx->task);
 8035	struct perf_addr_filter *filter;
 8036	struct mm_struct *mm = NULL;
 8037	unsigned int count = 0;
 8038	unsigned long flags;
 8039
 8040	/*
 8041	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
 8042	 * will stop on the parent's child_mutex that our caller is also holding
 8043	 */
 8044	if (task == TASK_TOMBSTONE)
 8045		return;
 8046
 8047	mm = get_task_mm(event->ctx->task);
 8048	if (!mm)
 8049		goto restart;
 8050
 8051	down_read(&mm->mmap_sem);
 8052
 8053	raw_spin_lock_irqsave(&ifh->lock, flags);
 8054	list_for_each_entry(filter, &ifh->list, entry) {
 8055		event->addr_filters_offs[count] = 0;
 8056
 8057		/*
 8058		 * Adjust base offset if the filter is associated to a binary
 8059		 * that needs to be mapped:
 8060		 */
 8061		if (filter->inode)
 8062			event->addr_filters_offs[count] =
 8063				perf_addr_filter_apply(filter, mm);
 8064
 8065		count++;
 8066	}
 8067
 8068	event->addr_filters_gen++;
 8069	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 8070
 8071	up_read(&mm->mmap_sem);
 8072
 8073	mmput(mm);
 8074
 8075restart:
 8076	perf_event_stop(event, 1);
 8077}
 8078
 8079/*
 8080 * Address range filtering: limiting the data to certain
 8081 * instruction address ranges. Filters are ioctl()ed to us from
 8082 * userspace as ascii strings.
 8083 *
 8084 * Filter string format:
 8085 *
 8086 * ACTION RANGE_SPEC
 8087 * where ACTION is one of the following:
 8088 *  * "filter": limit the trace to this region
 8089 *  * "start": start tracing from this address
 8090 *  * "stop": stop tracing at this address/region;
 8091 * RANGE_SPEC is
 8092 *  * for kernel addresses: <start address>[/<size>]
 8093 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 8094 *
 8095 * if <size> is not specified, the range is treated as a single address.
 8096 */
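/*
 * For example (illustrative strings, passed via the
 * PERF_EVENT_IOC_SET_FILTER ioctl; several filters may be separated by
 * spaces, commas or newlines):
 *
 *	filter 0x1000/0x2000@/usr/bin/app
 *	start 0xffffffff81000000/0x4000
 *	stop 0x400500@/usr/bin/app
 */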
 8097enum {
 8098	IF_ACT_NONE = -1,
 8099	IF_ACT_FILTER,
 8100	IF_ACT_START,
 8101	IF_ACT_STOP,
 8102	IF_SRC_FILE,
 8103	IF_SRC_KERNEL,
 8104	IF_SRC_FILEADDR,
 8105	IF_SRC_KERNELADDR,
 8106};
 8107
 8108enum {
 8109	IF_STATE_ACTION = 0,
 8110	IF_STATE_SOURCE,
 8111	IF_STATE_END,
 8112};
 8113
 8114static const match_table_t if_tokens = {
 8115	{ IF_ACT_FILTER,	"filter" },
 8116	{ IF_ACT_START,		"start" },
 8117	{ IF_ACT_STOP,		"stop" },
 8118	{ IF_SRC_FILE,		"%u/%u@%s" },
 8119	{ IF_SRC_KERNEL,	"%u/%u" },
 8120	{ IF_SRC_FILEADDR,	"%u@%s" },
 8121	{ IF_SRC_KERNELADDR,	"%u" },
 8122	{ IF_ACT_NONE,		NULL },
 8123};
 8124
 8125/*
 8126 * Address filter string parser
 8127 */
 8128static int
 8129perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 8130			     struct list_head *filters)
 8131{
 8132	struct perf_addr_filter *filter = NULL;
 8133	char *start, *orig, *filename = NULL;
 8134	struct path path;
 8135	substring_t args[MAX_OPT_ARGS];
 8136	int state = IF_STATE_ACTION, token;
 8137	unsigned int kernel = 0;
 8138	int ret = -EINVAL;
 8139
 8140	orig = fstr = kstrdup(fstr, GFP_KERNEL);
 8141	if (!fstr)
 8142		return -ENOMEM;
 8143
 8144	while ((start = strsep(&fstr, " ,\n")) != NULL) {
 8145		ret = -EINVAL;
 8146
 8147		if (!*start)
 8148			continue;
 8149
 8150		/* filter definition begins */
 8151		if (state == IF_STATE_ACTION) {
 8152			filter = perf_addr_filter_new(event, filters);
 8153			if (!filter)
 8154				goto fail;
 8155		}
 8156
 8157		token = match_token(start, if_tokens, args);
 8158		switch (token) {
 8159		case IF_ACT_FILTER:
 8160		case IF_ACT_START:
 8161			filter->filter = 1;
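			/* fall through */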
 8162
 8163		case IF_ACT_STOP:
 8164			if (state != IF_STATE_ACTION)
 8165				goto fail;
 8166
 8167			state = IF_STATE_SOURCE;
 8168			break;
 8169
 8170		case IF_SRC_KERNELADDR:
 8171		case IF_SRC_KERNEL:
 8172			kernel = 1;
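			/* fall through */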
 8173
 8174		case IF_SRC_FILEADDR:
 8175		case IF_SRC_FILE:
 8176			if (state != IF_STATE_SOURCE)
 8177				goto fail;
 8178
 8179			if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
 8180				filter->range = 1;
 8181
 8182			*args[0].to = 0;
 8183			ret = kstrtoul(args[0].from, 0, &filter->offset);
 8184			if (ret)
 8185				goto fail;
 8186
 8187			if (filter->range) {
 8188				*args[1].to = 0;
 8189				ret = kstrtoul(args[1].from, 0, &filter->size);
 8190				if (ret)
 8191					goto fail;
 8192			}
 8193
 8194			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
 8195				int fpos = filter->range ? 2 : 1;
 8196
 8197				filename = match_strdup(&args[fpos]);
 8198				if (!filename) {
 8199					ret = -ENOMEM;
 8200					goto fail;
 8201				}
 8202			}
 8203
 8204			state = IF_STATE_END;
 8205			break;
 8206
 8207		default:
 8208			goto fail;
 8209		}
 8210
 8211		/*
 8212		 * Filter definition is fully parsed, validate and install it.
 8213		 * Make sure that it doesn't contradict itself or the event's
 8214		 * attribute.
 8215		 */
 8216		if (state == IF_STATE_END) {
 8217			if (kernel && event->attr.exclude_kernel)
 8218				goto fail;
 8219
 8220			if (!kernel) {
 8221				if (!filename)
 8222					goto fail;
 8223
 8224				/* look up the path and grab its inode */
 8225				ret = kern_path(filename, LOOKUP_FOLLOW, &path);
 8226				if (ret)
 8227					goto fail_free_name;
 8228
 8229				filter->inode = igrab(d_inode(path.dentry));
 8230				path_put(&path);
 8231				kfree(filename);
 8232				filename = NULL;
 8233
 8234				ret = -EINVAL;
 8235				if (!filter->inode ||
 8236				    !S_ISREG(filter->inode->i_mode))
 8237					/* free_filters_list() will iput() */
 8238					goto fail;
 8239			}
 8240
 8241			/* ready to consume more filters */
 8242			state = IF_STATE_ACTION;
 8243			filter = NULL;
 8244		}
 8245	}
 8246
 8247	if (state != IF_STATE_ACTION)
 8248		goto fail;
 8249
 8250	kfree(orig);
 8251
 8252	return 0;
 8253
 8254fail_free_name:
 8255	kfree(filename);
 8256fail:
 8257	free_filters_list(filters);
 8258	kfree(orig);
 8259
 8260	return ret;
 8261}
 8262
 8263static int
 8264perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
 8265{
 8266	LIST_HEAD(filters);
 8267	int ret;
 8268
 8269	/*
 8270	 * Since this is called in perf_ioctl() path, we're already holding
 8271	 * ctx::mutex.
 8272	 */
 8273	lockdep_assert_held(&event->ctx->mutex);
 8274
 8275	if (WARN_ON_ONCE(event->parent))
 8276		return -EINVAL;
 8277
 8278	/*
 8279	 * For now, we only support filtering in per-task events; doing so
 8280	 * for CPU-wide events requires additional context switching trickery,
 8281	 * since the same object code will be mapped at different virtual
 8282	 * addresses in different processes.
 8283	 */
 8284	if (!event->ctx->task)
 8285		return -EOPNOTSUPP;
 8286
 8287	ret = perf_event_parse_addr_filter(event, filter_str, &filters);
 8288	if (ret)
 8289		return ret;
 8290
 8291	ret = event->pmu->addr_filters_validate(&filters);
 8292	if (ret) {
 8293		free_filters_list(&filters);
 8294		return ret;
 8295	}
 8296
 8297	/* remove existing filters, if any */
 8298	perf_addr_filters_splice(event, &filters);
 8299
 8300	/* install new filters */
 8301	perf_event_for_each_child(event, perf_event_addr_filters_apply);
 8302
 8303	return ret;
 8304}
 8305
 8306static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 8307{
 8308	char *filter_str;
 8309	int ret = -EINVAL;
 8310
 8311	if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
 8312	    !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
 8313	    !has_addr_filter(event))
 8314		return -EINVAL;
 8315
 8316	filter_str = strndup_user(arg, PAGE_SIZE);
 8317	if (IS_ERR(filter_str))
 8318		return PTR_ERR(filter_str);
 8319
 8320	if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
 8321	    event->attr.type == PERF_TYPE_TRACEPOINT)
 8322		ret = ftrace_profile_set_filter(event, event->attr.config,
 8323						filter_str);
 8324	else if (has_addr_filter(event))
 8325		ret = perf_event_set_addr_filter(event, filter_str);
 8326
 8327	kfree(filter_str);
 8328	return ret;
 8329}
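
/*
 * Usage sketch (illustrative): both filter flavours are installed with
 * the same ioctl.  For a tracepoint event the string is an ftrace filter
 * predicate, e.g.:
 *
 *	ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "common_pid != 0");
 *
 * while for an event on a PMU with address filters it uses the ACTION
 * RANGE_SPEC format documented above perf_event_parse_addr_filter().
 */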
 8330
 8331/*
 8332 * hrtimer based swevent callback
 8333 */
 8334
 8335static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 8336{
 8337	enum hrtimer_restart ret = HRTIMER_RESTART;
 8338	struct perf_sample_data data;
 8339	struct pt_regs *regs;
 8340	struct perf_event *event;
 8341	u64 period;
 8342
 8343	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
 8344
 8345	if (event->state != PERF_EVENT_STATE_ACTIVE)
 8346		return HRTIMER_NORESTART;
 8347
 8348	event->pmu->read(event);
 8349
 8350	perf_sample_data_init(&data, 0, event->hw.last_period);
 8351	regs = get_irq_regs();
 8352
 8353	if (regs && !perf_exclude_event(event, regs)) {
 8354		if (!(event->attr.exclude_idle && is_idle_task(current)))
 8355			if (__perf_event_overflow(event, 1, &data, regs))
 8356				ret = HRTIMER_NORESTART;
 8357	}
 8358
 8359	period = max_t(u64, 10000, event->hw.sample_period);
 8360	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 8361
 8362	return ret;
 8363}
 8364
 8365static void perf_swevent_start_hrtimer(struct perf_event *event)
 8366{
 8367	struct hw_perf_event *hwc = &event->hw;
 8368	s64 period;
 8369
 8370	if (!is_sampling_event(event))
 8371		return;
 8372
 8373	period = local64_read(&hwc->period_left);
 8374	if (period) {
 8375		if (period < 0)
 8376			period = 10000;
 8377
 8378		local64_set(&hwc->period_left, 0);
 8379	} else {
 8380		period = max_t(u64, 10000, hwc->sample_period);
 8381	}
 8382	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
 8383		      HRTIMER_MODE_REL_PINNED);
 8384}
 8385
 8386static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 8387{
 8388	struct hw_perf_event *hwc = &event->hw;
 8389
 8390	if (is_sampling_event(event)) {
 8391		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
 8392		local64_set(&hwc->period_left, ktime_to_ns(remaining));
 8393
 8394		hrtimer_cancel(&hwc->hrtimer);
 8395	}
 8396}
 8397
 8398static void perf_swevent_init_hrtimer(struct perf_event *event)
 8399{
 8400	struct hw_perf_event *hwc = &event->hw;
 8401
 8402	if (!is_sampling_event(event))
 8403		return;
 8404
 8405	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 8406	hwc->hrtimer.function = perf_swevent_hrtimer;
 8407
 8408	/*
 8409	 * Since hrtimers have a fixed rate, we can do a static freq->period
 8410	 * mapping and avoid the whole period adjust feedback stuff.
 8411	 */
 8412	if (event->attr.freq) {
 8413		long freq = event->attr.sample_freq;
 8414
 8415		event->attr.sample_period = NSEC_PER_SEC / freq;
 8416		hwc->sample_period = event->attr.sample_period;
 8417		local64_set(&hwc->period_left, hwc->sample_period);
 8418		hwc->last_period = hwc->sample_period;
 8419		event->attr.freq = 0;
 8420	}
 8421}
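
/*
 * Worked example of the static freq->period mapping above: with
 * attr.freq = 1 and attr.sample_freq = 4000, the period becomes
 * NSEC_PER_SEC / 4000 = 250000 ns, i.e. the hrtimer fires every 250us
 * without any runtime period adjustment.
 */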
 8422
 8423/*
 8424 * Software event: cpu wall time clock
 8425 */
 8426
 8427static void cpu_clock_event_update(struct perf_event *event)
 8428{
 8429	s64 prev;
 8430	u64 now;
 8431
 8432	now = local_clock();
 8433	prev = local64_xchg(&event->hw.prev_count, now);
 8434	local64_add(now - prev, &event->count);
 8435}
 8436
 8437static void cpu_clock_event_start(struct perf_event *event, int flags)
 8438{
 8439	local64_set(&event->hw.prev_count, local_clock());
 8440	perf_swevent_start_hrtimer(event);
 8441}
 8442
 8443static void cpu_clock_event_stop(struct perf_event *event, int flags)
 8444{
 8445	perf_swevent_cancel_hrtimer(event);
 8446	cpu_clock_event_update(event);
 8447}
 8448
 8449static int cpu_clock_event_add(struct perf_event *event, int flags)
 8450{
 8451	if (flags & PERF_EF_START)
 8452		cpu_clock_event_start(event, flags);
 8453	perf_event_update_userpage(event);
 8454
 8455	return 0;
 8456}
 8457
 8458static void cpu_clock_event_del(struct perf_event *event, int flags)
 8459{
 8460	cpu_clock_event_stop(event, flags);
 8461}
 8462
 8463static void cpu_clock_event_read(struct perf_event *event)
 8464{
 8465	cpu_clock_event_update(event);
 8466}
 8467
 8468static int cpu_clock_event_init(struct perf_event *event)
 8469{
 8470	if (event->attr.type != PERF_TYPE_SOFTWARE)
 8471		return -ENOENT;
 8472
 8473	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
 8474		return -ENOENT;
 8475
 8476	/*
 8477	 * no branch sampling for software events
 8478	 */
 8479	if (has_branch_stack(event))
 8480		return -EOPNOTSUPP;
 8481
 8482	perf_swevent_init_hrtimer(event);
 8483
 8484	return 0;
 8485}
 8486
 8487static struct pmu perf_cpu_clock = {
 8488	.task_ctx_nr	= perf_sw_context,
 8489
 8490	.capabilities	= PERF_PMU_CAP_NO_NMI,
 8491
 8492	.event_init	= cpu_clock_event_init,
 8493	.add		= cpu_clock_event_add,
 8494	.del		= cpu_clock_event_del,
 8495	.start		= cpu_clock_event_start,
 8496	.stop		= cpu_clock_event_stop,
 8497	.read		= cpu_clock_event_read,
 8498};
 8499
 8500/*
 8501 * Software event: task time clock
 8502 */
 8503
 8504static void task_clock_event_update(struct perf_event *event, u64 now)
 8505{
 8506	u64 prev;
 8507	s64 delta;
 8508
 8509	prev = local64_xchg(&event->hw.prev_count, now);
 8510	delta = now - prev;
 8511	local64_add(delta, &event->count);
 8512}
 8513
 8514static void task_clock_event_start(struct perf_event *event, int flags)
 8515{
 8516	local64_set(&event->hw.prev_count, event->ctx->time);
 8517	perf_swevent_start_hrtimer(event);
 8518}
 8519
 8520static void task_clock_event_stop(struct perf_event *event, int flags)
 8521{
 8522	perf_swevent_cancel_hrtimer(event);
 8523	task_clock_event_update(event, event->ctx->time);
 8524}
 8525
 8526static int task_clock_event_add(struct perf_event *event, int flags)
 8527{
 8528	if (flags & PERF_EF_START)
 8529		task_clock_event_start(event, flags);
 8530	perf_event_update_userpage(event);
 8531
 8532	return 0;
 8533}
 8534
 8535static void task_clock_event_del(struct perf_event *event, int flags)
 8536{
 8537	task_clock_event_stop(event, PERF_EF_UPDATE);
 8538}
 8539
 8540static void task_clock_event_read(struct perf_event *event)
 8541{
 8542	u64 now = perf_clock();
 8543	u64 delta = now - event->ctx->timestamp;
 8544	u64 time = event->ctx->time + delta;
 8545
 8546	task_clock_event_update(event, time);
 8547}
 8548
 8549static int task_clock_event_init(struct perf_event *event)
 8550{
 8551	if (event->attr.type != PERF_TYPE_SOFTWARE)
 8552		return -ENOENT;
 8553
 8554	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
 8555		return -ENOENT;
 8556
 8557	/*
 8558	 * no branch sampling for software events
 8559	 */
 8560	if (has_branch_stack(event))
 8561		return -EOPNOTSUPP;
 8562
 8563	perf_swevent_init_hrtimer(event);
 8564
 8565	return 0;
 8566}
 8567
 8568static struct pmu perf_task_clock = {
 8569	.task_ctx_nr	= perf_sw_context,
 8570
 8571	.capabilities	= PERF_PMU_CAP_NO_NMI,
 8572
 8573	.event_init	= task_clock_event_init,
 8574	.add		= task_clock_event_add,
 8575	.del		= task_clock_event_del,
 8576	.start		= task_clock_event_start,
 8577	.stop		= task_clock_event_stop,
 8578	.read		= task_clock_event_read,
 8579};
 8580
 8581static void perf_pmu_nop_void(struct pmu *pmu)
 8582{
 8583}
 8584
 8585static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
 8586{
 8587}
 8588
 8589static int perf_pmu_nop_int(struct pmu *pmu)
 8590{
 8591	return 0;
 8592}
 8593
 8594static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
 8595
 8596static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
 8597{
 8598	__this_cpu_write(nop_txn_flags, flags);
 8599
 8600	if (flags & ~PERF_PMU_TXN_ADD)
 8601		return;
 8602
 8603	perf_pmu_disable(pmu);
 8604}
 8605
 8606static int perf_pmu_commit_txn(struct pmu *pmu)
 8607{
 8608	unsigned int flags = __this_cpu_read(nop_txn_flags);
 8609
 8610	__this_cpu_write(nop_txn_flags, 0);
 8611
 8612	if (flags & ~PERF_PMU_TXN_ADD)
 8613		return 0;
 8614
 8615	perf_pmu_enable(pmu);
 8616	return 0;
 8617}
 8618
 8619static void perf_pmu_cancel_txn(struct pmu *pmu)
 8620{
 8621	unsigned int flags =  __this_cpu_read(nop_txn_flags);
 8622
 8623	__this_cpu_write(nop_txn_flags, 0);
 8624
 8625	if (flags & ~PERF_PMU_TXN_ADD)
 8626		return;
 8627
 8628	perf_pmu_enable(pmu);
 8629}
 8630
 8631static int perf_event_idx_default(struct perf_event *event)
 8632{
 8633	return 0;
 8634}
 8635
 8636/*
 8637 * Ensures all contexts with the same task_ctx_nr have the same
 8638 * pmu_cpu_context too.
 8639 */
 8640static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 8641{
 8642	struct pmu *pmu;
 8643
 8644	if (ctxn < 0)
 8645		return NULL;
 8646
 8647	list_for_each_entry(pmu, &pmus, entry) {
 8648		if (pmu->task_ctx_nr == ctxn)
 8649			return pmu->pmu_cpu_context;
 8650	}
 8651
 8652	return NULL;
 8653}
 8654
 8655static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
 8656{
 8657	int cpu;
 8658
 8659	for_each_possible_cpu(cpu) {
 8660		struct perf_cpu_context *cpuctx;
 8661
 8662		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 8663
 8664		if (cpuctx->unique_pmu == old_pmu)
 8665			cpuctx->unique_pmu = pmu;
 8666	}
 8667}
 8668
 8669static void free_pmu_context(struct pmu *pmu)
 8670{
 8671	struct pmu *i;
 8672
 8673	mutex_lock(&pmus_lock);
 8674	/*
 8675	 * Like a real lame refcount.
 8676	 */
 8677	list_for_each_entry(i, &pmus, entry) {
 8678		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
 8679			update_pmu_context(i, pmu);
 8680			goto out;
 8681		}
 8682	}
 8683
 8684	free_percpu(pmu->pmu_cpu_context);
 8685out:
 8686	mutex_unlock(&pmus_lock);
 8687}
 8688
 8689/*
 8690 * Let userspace know that this PMU supports address range filtering:
 8691 */
 8692static ssize_t nr_addr_filters_show(struct device *dev,
 8693				    struct device_attribute *attr,
 8694				    char *page)
 8695{
 8696	struct pmu *pmu = dev_get_drvdata(dev);
 8697
 8698	return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
 8699}
 8700DEVICE_ATTR_RO(nr_addr_filters);
 8701
 8702static struct idr pmu_idr;
 8703
 8704static ssize_t
 8705type_show(struct device *dev, struct device_attribute *attr, char *page)
 8706{
 8707	struct pmu *pmu = dev_get_drvdata(dev);
 8708
 8709	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 8710}
 8711static DEVICE_ATTR_RO(type);
 8712
 8713static ssize_t
 8714perf_event_mux_interval_ms_show(struct device *dev,
 8715				struct device_attribute *attr,
 8716				char *page)
 8717{
 8718	struct pmu *pmu = dev_get_drvdata(dev);
 8719
 8720	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
 8721}
 8722
 8723static DEFINE_MUTEX(mux_interval_mutex);
 8724
 8725static ssize_t
 8726perf_event_mux_interval_ms_store(struct device *dev,
 8727				 struct device_attribute *attr,
 8728				 const char *buf, size_t count)
 8729{
 8730	struct pmu *pmu = dev_get_drvdata(dev);
 8731	int timer, cpu, ret;
 8732
 8733	ret = kstrtoint(buf, 0, &timer);
 8734	if (ret)
 8735		return ret;
 8736
 8737	if (timer < 1)
 8738		return -EINVAL;
 8739
 8740	/* same value, nothing to do */
 8741	if (timer == pmu->hrtimer_interval_ms)
 8742		return count;
 8743
 8744	mutex_lock(&mux_interval_mutex);
 8745	pmu->hrtimer_interval_ms = timer;
 8746
 8747	/* update all cpuctx for this PMU */
 8748	get_online_cpus();
 8749	for_each_online_cpu(cpu) {
 8750		struct perf_cpu_context *cpuctx;
 8751		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 8752		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 8753
 8754		cpu_function_call(cpu,
 8755			(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
 8756	}
 8757	put_online_cpus();
 8758	mutex_unlock(&mux_interval_mutex);
 8759
 8760	return count;
 8761}
 8762static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
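
/*
 * These attributes show up under the event_source bus, e.g. (illustrative
 * path) /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms.
 * Writing a value in milliseconds reprograms the multiplexing hrtimer on
 * every online CPU via the store routine above; a minimal userspace
 * sketch:
 *
 *	int fd = open("/sys/bus/event_source/devices/cpu/"
 *		      "perf_event_mux_interval_ms", O_WRONLY);
 *	write(fd, "2", 1);
 *	close(fd);
 */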
 8763
 8764static struct attribute *pmu_dev_attrs[] = {
 8765	&dev_attr_type.attr,
 8766	&dev_attr_perf_event_mux_interval_ms.attr,
 8767	NULL,
 8768};
 8769ATTRIBUTE_GROUPS(pmu_dev);
 8770
 8771static int pmu_bus_running;
 8772static struct bus_type pmu_bus = {
 8773	.name		= "event_source",
 8774	.dev_groups	= pmu_dev_groups,
 8775};
 8776
 8777static void pmu_dev_release(struct device *dev)
 8778{
 8779	kfree(dev);
 8780}
 8781
 8782static int pmu_dev_alloc(struct pmu *pmu)
 8783{
 8784	int ret = -ENOMEM;
 8785
 8786	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
 8787	if (!pmu->dev)
 8788		goto out;
 8789
 8790	pmu->dev->groups = pmu->attr_groups;
 8791	device_initialize(pmu->dev);
 8792	ret = dev_set_name(pmu->dev, "%s", pmu->name);
 8793	if (ret)
 8794		goto free_dev;
 8795
 8796	dev_set_drvdata(pmu->dev, pmu);
 8797	pmu->dev->bus = &pmu_bus;
 8798	pmu->dev->release = pmu_dev_release;
 8799	ret = device_add(pmu->dev);
 8800	if (ret)
 8801		goto free_dev;
 8802
 8803	/* For PMUs with address filters, throw in an extra attribute: */
 8804	if (pmu->nr_addr_filters)
 8805		ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
 8806
 8807	if (ret)
 8808		goto del_dev;
 8809
 8810out:
 8811	return ret;
 8812
 8813del_dev:
 8814	device_del(pmu->dev);
 8815
 8816free_dev:
 8817	put_device(pmu->dev);
 8818	goto out;
 8819}
 8820
 8821static struct lock_class_key cpuctx_mutex;
 8822static struct lock_class_key cpuctx_lock;
 8823
 8824int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 8825{
 8826	int cpu, ret;
 8827
 8828	mutex_lock(&pmus_lock);
 8829	ret = -ENOMEM;
 8830	pmu->pmu_disable_count = alloc_percpu(int);
 8831	if (!pmu->pmu_disable_count)
 8832		goto unlock;
 8833
 8834	pmu->type = -1;
 8835	if (!name)
 8836		goto skip_type;
 8837	pmu->name = name;
 8838
 8839	if (type < 0) {
 8840		type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
 8841		if (type < 0) {
 8842			ret = type;
 8843			goto free_pdc;
 8844		}
 8845	}
 8846	pmu->type = type;
 8847
 8848	if (pmu_bus_running) {
 8849		ret = pmu_dev_alloc(pmu);
 8850		if (ret)
 8851			goto free_idr;
 8852	}
 8853
 8854skip_type:
 8855	if (pmu->task_ctx_nr == perf_hw_context) {
 8856		static int hw_context_taken = 0;
 8857
 8858		/*
 8859		 * Other than systems with heterogeneous CPUs, it never makes
 8860		 * sense for two PMUs to share perf_hw_context. PMUs which are
 8861		 * uncore must use perf_invalid_context.
 8862		 */
 8863		if (WARN_ON_ONCE(hw_context_taken &&
 8864		    !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
 8865			pmu->task_ctx_nr = perf_invalid_context;
 8866
 8867		hw_context_taken = 1;
 8868	}
 8869
 8870	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
 8871	if (pmu->pmu_cpu_context)
 8872		goto got_cpu_context;
 8873
 8874	ret = -ENOMEM;
 8875	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 8876	if (!pmu->pmu_cpu_context)
 8877		goto free_dev;
 8878
 8879	for_each_possible_cpu(cpu) {
 8880		struct perf_cpu_context *cpuctx;
 8881
 8882		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 8883		__perf_event_init_context(&cpuctx->ctx);
 8884		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
 8885		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 8886		cpuctx->ctx.pmu = pmu;
 8887
 8888		__perf_mux_hrtimer_init(cpuctx, cpu);
 8889
 8890		cpuctx->unique_pmu = pmu;
 8891	}
 8892
 8893got_cpu_context:
 8894	if (!pmu->start_txn) {
 8895		if (pmu->pmu_enable) {
 8896			/*
 8897			 * If we have pmu_enable/pmu_disable calls, install
 8898			 * transaction stubs that use that to try and batch
 8899			 * hardware accesses.
 8900			 */
 8901			pmu->start_txn  = perf_pmu_start_txn;
 8902			pmu->commit_txn = perf_pmu_commit_txn;
 8903			pmu->cancel_txn = perf_pmu_cancel_txn;
 8904		} else {
 8905			pmu->start_txn  = perf_pmu_nop_txn;
 8906			pmu->commit_txn = perf_pmu_nop_int;
 8907			pmu->cancel_txn = perf_pmu_nop_void;
 8908		}
 8909	}
 8910
 8911	if (!pmu->pmu_enable) {
 8912		pmu->pmu_enable  = perf_pmu_nop_void;
 8913		pmu->pmu_disable = perf_pmu_nop_void;
 8914	}
 8915
 8916	if (!pmu->event_idx)
 8917		pmu->event_idx = perf_event_idx_default;
 8918
 8919	list_add_rcu(&pmu->entry, &pmus);
 8920	atomic_set(&pmu->exclusive_cnt, 0);
 8921	ret = 0;
 8922unlock:
 8923	mutex_unlock(&pmus_lock);
 8924
 8925	return ret;
 8926
 8927free_dev:
 8928	device_del(pmu->dev);
 8929	put_device(pmu->dev);
 8930
 8931free_idr:
 8932	if (pmu->type >= PERF_TYPE_MAX)
 8933		idr_remove(&pmu_idr, pmu->type);
 8934
 8935free_pdc:
 8936	free_percpu(pmu->pmu_disable_count);
 8937	goto unlock;
 8938}
 8939EXPORT_SYMBOL_GPL(perf_pmu_register);
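
/*
 * Minimal registration sketch (illustrative; the my_* names are made up):
 * a driver fills in the mandatory callbacks and registers either with a
 * fixed PERF_TYPE_* id or with -1 to have one allocated from pmu_idr:
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_invalid_context,
 *		.event_init	= my_event_init,
 *		.add		= my_add,
 *		.del		= my_del,
 *		.start		= my_start,
 *		.stop		= my_stop,
 *		.read		= my_read,
 *	};
 *
 *	ret = perf_pmu_register(&my_pmu, "my_pmu", -1);
 */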
 8940
 8941void perf_pmu_unregister(struct pmu *pmu)
 8942{
 8943	int remove_device;
 8944
 8945	mutex_lock(&pmus_lock);
 8946	remove_device = pmu_bus_running;
 8947	list_del_rcu(&pmu->entry);
 8948	mutex_unlock(&pmus_lock);
 8949
 8950	/*
 8951	 * We dereference the pmu list under both SRCU and regular RCU, so
 8952	 * synchronize against both of those.
 8953	 */
 8954	synchronize_srcu(&pmus_srcu);
 8955	synchronize_rcu();
 8956
 8957	free_percpu(pmu->pmu_disable_count);
 8958	if (pmu->type >= PERF_TYPE_MAX)
 8959		idr_remove(&pmu_idr, pmu->type);
 8960	if (remove_device) {
 8961		if (pmu->nr_addr_filters)
 8962			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
 8963		device_del(pmu->dev);
 8964		put_device(pmu->dev);
 8965	}
 8966	free_pmu_context(pmu);
 8967}
 8968EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 8969
 8970static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 8971{
 8972	struct perf_event_context *ctx = NULL;
 8973	int ret;
 8974
 8975	if (!try_module_get(pmu->module))
 8976		return -ENODEV;
 8977
 8978	if (event->group_leader != event) {
 8979		/*
 8980		 * This ctx->mutex can nest when we're called through
 8981		 * inheritance. See the perf_event_ctx_lock_nested() comment.
 8982		 */
 8983		ctx = perf_event_ctx_lock_nested(event->group_leader,
 8984						 SINGLE_DEPTH_NESTING);
 8985		BUG_ON(!ctx);
 8986	}
 8987
 8988	event->pmu = pmu;
 8989	ret = pmu->event_init(event);
 8990
 8991	if (ctx)
 8992		perf_event_ctx_unlock(event->group_leader, ctx);
 8993
 8994	if (ret)
 8995		module_put(pmu->module);
 8996
 8997	return ret;
 8998}
 8999
 9000static struct pmu *perf_init_event(struct perf_event *event)
 9001{
 9002	struct pmu *pmu = NULL;
 9003	int idx;
 9004	int ret;
 9005
 9006	idx = srcu_read_lock(&pmus_srcu);
 9007
 9008	rcu_read_lock();
 9009	pmu = idr_find(&pmu_idr, event->attr.type);
 9010	rcu_read_unlock();
 9011	if (pmu) {
 9012		ret = perf_try_init_event(pmu, event);
 9013		if (ret)
 9014			pmu = ERR_PTR(ret);
 9015		goto unlock;
 9016	}
 9017
 9018	list_for_each_entry_rcu(pmu, &pmus, entry) {
 9019		ret = perf_try_init_event(pmu, event);
 9020		if (!ret)
 9021			goto unlock;
 9022
 9023		if (ret != -ENOENT) {
 9024			pmu = ERR_PTR(ret);
 9025			goto unlock;
 9026		}
 9027	}
 9028	pmu = ERR_PTR(-ENOENT);
 9029unlock:
 9030	srcu_read_unlock(&pmus_srcu, idx);
 9031
 9032	return pmu;
 9033}
 9034
 9035static void attach_sb_event(struct perf_event *event)
 9036{
 9037	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
 9038
 9039	raw_spin_lock(&pel->lock);
 9040	list_add_rcu(&event->sb_list, &pel->list);
 9041	raw_spin_unlock(&pel->lock);
 9042}
 9043
 9044/*
 9045 * We keep a list of all !task (and therefore per-cpu) events
 9046 * that need to receive side-band records.
 9047 *
 9048 * This avoids having to scan all the various PMU per-cpu contexts
 9049 * looking for them.
 9050 */
 9051static void account_pmu_sb_event(struct perf_event *event)
 9052{
 9053	if (is_sb_event(event))
 9054		attach_sb_event(event);
 9055}
 9056
 9057static void account_event_cpu(struct perf_event *event, int cpu)
 9058{
 9059	if (event->parent)
 9060		return;
 9061
 9062	if (is_cgroup_event(event))
 9063		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
 9064}
 9065
 9066/* Freq events need the tick to stay alive (see perf_event_task_tick). */
 9067static void account_freq_event_nohz(void)
 9068{
 9069#ifdef CONFIG_NO_HZ_FULL
 9070	/* Lock so we don't race with concurrent unaccount */
 9071	spin_lock(&nr_freq_lock);
 9072	if (atomic_inc_return(&nr_freq_events) == 1)
 9073		tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
 9074	spin_unlock(&nr_freq_lock);
 9075#endif
 9076}
 9077
 9078static void account_freq_event(void)
 9079{
 9080	if (tick_nohz_full_enabled())
 9081		account_freq_event_nohz();
 9082	else
 9083		atomic_inc(&nr_freq_events);
 9084}
 9085
 9086
 9087static void account_event(struct perf_event *event)
 9088{
 9089	bool inc = false;
 9090
 9091	if (event->parent)
 9092		return;
 9093
 9094	if (event->attach_state & PERF_ATTACH_TASK)
 9095		inc = true;
 9096	if (event->attr.mmap || event->attr.mmap_data)
 9097		atomic_inc(&nr_mmap_events);
 9098	if (event->attr.comm)
 9099		atomic_inc(&nr_comm_events);
 9100	if (event->attr.task)
 9101		atomic_inc(&nr_task_events);
 9102	if (event->attr.freq)
 9103		account_freq_event();
 9104	if (event->attr.context_switch) {
 9105		atomic_inc(&nr_switch_events);
 9106		inc = true;
 9107	}
 9108	if (has_branch_stack(event))
 9109		inc = true;
 9110	if (is_cgroup_event(event))
 9111		inc = true;
 9112
 9113	if (inc) {
 9114		if (atomic_inc_not_zero(&perf_sched_count))
 9115			goto enabled;
 9116
 9117		mutex_lock(&perf_sched_mutex);
 9118		if (!atomic_read(&perf_sched_count)) {
 9119			static_branch_enable(&perf_sched_events);
 9120			/*
 9121			 * Guarantee that all CPUs observe the key change and
 9122			 * call the perf scheduling hooks before proceeding to
 9123			 * install events that need them.
 9124			 */
 9125			synchronize_sched();
 9126		}
 9127		/*
 9128		 * Now that we have waited for the sync_sched(), allow further
 9129		 * increments to by-pass the mutex.
 9130		 */
 9131		atomic_inc(&perf_sched_count);
 9132		mutex_unlock(&perf_sched_mutex);
 9133	}
 9134enabled:
 9135
 9136	account_event_cpu(event, event->cpu);
 9137
 9138	account_pmu_sb_event(event);
 9139}
 9140
 9141/*
 9142 * Allocate and initialize an event structure
 9143 */
 9144static struct perf_event *
 9145perf_event_alloc(struct perf_event_attr *attr, int cpu,
 9146		 struct task_struct *task,
 9147		 struct perf_event *group_leader,
 9148		 struct perf_event *parent_event,
 9149		 perf_overflow_handler_t overflow_handler,
 9150		 void *context, int cgroup_fd)
 9151{
 9152	struct pmu *pmu;
 9153	struct perf_event *event;
 9154	struct hw_perf_event *hwc;
 9155	long err = -EINVAL;
 9156
 9157	if ((unsigned)cpu >= nr_cpu_ids) {
 9158		if (!task || cpu != -1)
 9159			return ERR_PTR(-EINVAL);
 9160	}
 9161
 9162	event = kzalloc(sizeof(*event), GFP_KERNEL);
 9163	if (!event)
 9164		return ERR_PTR(-ENOMEM);
 9165
 9166	/*
 9167	 * Single events are their own group leaders, with an
 9168	 * empty sibling list:
 9169	 */
 9170	if (!group_leader)
 9171		group_leader = event;
 9172
 9173	mutex_init(&event->child_mutex);
 9174	INIT_LIST_HEAD(&event->child_list);
 9175
 9176	INIT_LIST_HEAD(&event->group_entry);
 9177	INIT_LIST_HEAD(&event->event_entry);
 9178	INIT_LIST_HEAD(&event->sibling_list);
 9179	INIT_LIST_HEAD(&event->rb_entry);
 9180	INIT_LIST_HEAD(&event->active_entry);
 9181	INIT_LIST_HEAD(&event->addr_filters.list);
 9182	INIT_HLIST_NODE(&event->hlist_entry);
 9183
 9184
 9185	init_waitqueue_head(&event->waitq);
 9186	init_irq_work(&event->pending, perf_pending_event);
 9187
 9188	mutex_init(&event->mmap_mutex);
 9189	raw_spin_lock_init(&event->addr_filters.lock);
 9190
 9191	atomic_long_set(&event->refcount, 1);
 9192	event->cpu		= cpu;
 9193	event->attr		= *attr;
 9194	event->group_leader	= group_leader;
 9195	event->pmu		= NULL;
 9196	event->oncpu		= -1;
 9197
 9198	event->parent		= parent_event;
 9199
 9200	event->ns		= get_pid_ns(task_active_pid_ns(current));
 9201	event->id		= atomic64_inc_return(&perf_event_id);
 9202
 9203	event->state		= PERF_EVENT_STATE_INACTIVE;
 9204
 9205	if (task) {
 9206		event->attach_state = PERF_ATTACH_TASK;
 9207		/*
 9208		 * XXX pmu::event_init needs to know what task to account to
 9209		 * and we cannot use the ctx information because we need the
 9210		 * pmu before we get a ctx.
 9211		 */
 9212		event->hw.target = task;
 9213	}
 9214
 9215	event->clock = &local_clock;
 9216	if (parent_event)
 9217		event->clock = parent_event->clock;
 9218
 9219	if (!overflow_handler && parent_event) {
 9220		overflow_handler = parent_event->overflow_handler;
 9221		context = parent_event->overflow_handler_context;
 9222#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
 9223		if (overflow_handler == bpf_overflow_handler) {
 9224			struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
 9225
 9226			if (IS_ERR(prog)) {
 9227				err = PTR_ERR(prog);
 9228				goto err_ns;
 9229			}
 9230			event->prog = prog;
 9231			event->orig_overflow_handler =
 9232				parent_event->orig_overflow_handler;
 9233		}
 9234#endif
 9235	}
 9236
 9237	if (overflow_handler) {
 9238		event->overflow_handler	= overflow_handler;
 9239		event->overflow_handler_context = context;
 9240	} else if (is_write_backward(event)){
 9241		event->overflow_handler = perf_event_output_backward;
 9242		event->overflow_handler_context = NULL;
 9243	} else {
 9244		event->overflow_handler = perf_event_output_forward;
 9245		event->overflow_handler_context = NULL;
 9246	}
 9247
 9248	perf_event__state_init(event);
 9249
 9250	pmu = NULL;
 9251
 9252	hwc = &event->hw;
 9253	hwc->sample_period = attr->sample_period;
 9254	if (attr->freq && attr->sample_freq)
 9255		hwc->sample_period = 1;
 9256	hwc->last_period = hwc->sample_period;
 9257
 9258	local64_set(&hwc->period_left, hwc->sample_period);
 9259
 9260	/*
 9261	 * we currently do not support PERF_FORMAT_GROUP on inherited events
 9262	 */
 9263	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 9264		goto err_ns;
 9265
 9266	if (!has_branch_stack(event))
 9267		event->attr.branch_sample_type = 0;
 9268
 9269	if (cgroup_fd != -1) {
 9270		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
 9271		if (err)
 9272			goto err_ns;
 9273	}
 9274
 9275	pmu = perf_init_event(event);
 9276	if (!pmu)
 9277		goto err_ns;
 9278	else if (IS_ERR(pmu)) {
 9279		err = PTR_ERR(pmu);
 9280		goto err_ns;
 9281	}
 9282
 9283	err = exclusive_event_init(event);
 9284	if (err)
 9285		goto err_pmu;
 9286
 9287	if (has_addr_filter(event)) {
 9288		event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
 9289						   sizeof(unsigned long),
 9290						   GFP_KERNEL);
 9291		if (!event->addr_filters_offs)
 9292			goto err_per_task;
 9293
 9294		/* force hw sync on the address filters */
 9295		event->addr_filters_gen = 1;
 9296	}
 9297
 9298	if (!event->parent) {
 9299		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
 9300			err = get_callchain_buffers(attr->sample_max_stack);
 9301			if (err)
 9302				goto err_addr_filters;
 9303		}
 9304	}
 9305
 9306	/* symmetric to unaccount_event() in _free_event() */
 9307	account_event(event);
 9308
 9309	return event;
 9310
 9311err_addr_filters:
 9312	kfree(event->addr_filters_offs);
 9313
 9314err_per_task:
 9315	exclusive_event_destroy(event);
 9316
 9317err_pmu:
 9318	if (event->destroy)
 9319		event->destroy(event);
 9320	module_put(pmu->module);
 9321err_ns:
 9322	if (is_cgroup_event(event))
 9323		perf_detach_cgroup(event);
 9324	if (event->ns)
 9325		put_pid_ns(event->ns);
 9326	kfree(event);
 9327
 9328	return ERR_PTR(err);
 9329}
 9330
 9331static int perf_copy_attr(struct perf_event_attr __user *uattr,
 9332			  struct perf_event_attr *attr)
 9333{
 9334	u32 size;
 9335	int ret;
 9336
 9337	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
 9338		return -EFAULT;
 9339
 9340	/*
 9341	 * zero the full structure, so that a short copy will be nice.
 9342	 */
 9343	memset(attr, 0, sizeof(*attr));
 9344
 9345	ret = get_user(size, &uattr->size);
 9346	if (ret)
 9347		return ret;
 9348
 9349	if (size > PAGE_SIZE)	/* silly large */
 9350		goto err_size;
 9351
 9352	if (!size)		/* abi compat */
 9353		size = PERF_ATTR_SIZE_VER0;
 9354
 9355	if (size < PERF_ATTR_SIZE_VER0)
 9356		goto err_size;
 9357
 9358	/*
 9359	 * If we're handed a bigger struct than we know of,
 9360	 * ensure all the unknown bits are 0 - i.e. new
 9361	 * user-space does not rely on any kernel feature
 9362	 * extensions we don't know about yet.
 9363	 */
 9364	if (size > sizeof(*attr)) {
 9365		unsigned char __user *addr;
 9366		unsigned char __user *end;
 9367		unsigned char val;
 9368
 9369		addr = (void __user *)uattr + sizeof(*attr);
 9370		end  = (void __user *)uattr + size;
 9371
 9372		for (; addr < end; addr++) {
 9373			ret = get_user(val, addr);
 9374			if (ret)
 9375				return ret;
 9376			if (val)
 9377				goto err_size;
 9378		}
 9379		size = sizeof(*attr);
 9380	}
 9381
 9382	ret = copy_from_user(attr, uattr, size);
 9383	if (ret)
 9384		return -EFAULT;
 9385
 9386	if (attr->__reserved_1)
 9387		return -EINVAL;
 9388
 9389	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
 9390		return -EINVAL;
 9391
 9392	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
 9393		return -EINVAL;
 9394
 9395	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
 9396		u64 mask = attr->branch_sample_type;
 9397
 9398		/* only using defined bits */
 9399		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
 9400			return -EINVAL;
 9401
 9402		/* at least one branch bit must be set */
 9403		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
 9404			return -EINVAL;
 9405
 9406		/* propagate priv level, when not set for branch */
 9407		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
 9408
 9409			/* exclude_kernel checked on syscall entry */
 9410			if (!attr->exclude_kernel)
 9411				mask |= PERF_SAMPLE_BRANCH_KERNEL;
 9412
 9413			if (!attr->exclude_user)
 9414				mask |= PERF_SAMPLE_BRANCH_USER;
 9415
 9416			if (!attr->exclude_hv)
 9417				mask |= PERF_SAMPLE_BRANCH_HV;
 9418			/*
 9419			 * adjust user setting (for HW filter setup)
 9420			 */
 9421			attr->branch_sample_type = mask;
 9422		}
 9423		/* privileged levels capture (kernel, hv): check permissions */
 9424		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
 9425		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
 9426			return -EACCES;
 9427	}
 9428
 9429	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
 9430		ret = perf_reg_validate(attr->sample_regs_user);
 9431		if (ret)
 9432			return ret;
 9433	}
 9434
 9435	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
 9436		if (!arch_perf_have_user_stack_dump())
 9437			return -ENOSYS;
 9438
 9439		/*
 9440		 * We have __u32 type for the size, but so far
 9441		 * we can only use __u16 as maximum due to the
 9442		 * __u16 sample size limit.
 9443		 */
 9444		if (attr->sample_stack_user >= USHRT_MAX)
 9445			ret = -EINVAL;
 9446		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
 9447			ret = -EINVAL;
 9448	}
 9449
 9450	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
 9451		ret = perf_reg_validate(attr->sample_regs_intr);
 9452out:
 9453	return ret;
 9454
 9455err_size:
 9456	put_user(sizeof(*attr), &uattr->size);
 9457	ret = -E2BIG;
 9458	goto out;
 9459}
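
/*
 * Userspace side of the versioned-attr ABI enforced above (illustrative
 * sketch): the caller zeroes the structure and records the size it was
 * built against, so an older kernel copies only what it knows about and
 * a newer kernel verifies that the trailing, unknown bytes are zero:
 *
 *	struct perf_event_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);
 *	attr.type = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 */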
 9460
 9461static int
 9462perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 9463{
 9464	struct ring_buffer *rb = NULL;
 9465	int ret = -EINVAL;
 9466
 9467	if (!output_event)
 9468		goto set;
 9469
 9470	/* don't allow circular references */
 9471	if (event == output_event)
 9472		goto out;
 9473
 9474	/*
 9475	 * Don't allow cross-cpu buffers
 9476	 */
 9477	if (output_event->cpu != event->cpu)
 9478		goto out;
 9479
 9480	/*
 9481	 * If its not a per-cpu rb, it must be the same task.
 9482	 */
 9483	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 9484		goto out;
 9485
 9486	/*
 9487	 * Mixing clocks in the same buffer is trouble you don't need.
 9488	 */
 9489	if (output_event->clock != event->clock)
 9490		goto out;
 9491
 9492	/*
 9493	 * Either writing ring buffer from beginning or from end.
 9494	 * Mixing is not allowed.
 9495	 */
 9496	if (is_write_backward(output_event) != is_write_backward(event))
 9497		goto out;
 9498
 9499	/*
 9500	 * If both events generate aux data, they must be on the same PMU
 9501	 */
 9502	if (has_aux(event) && has_aux(output_event) &&
 9503	    event->pmu != output_event->pmu)
 9504		goto out;
 9505
 9506set:
 9507	mutex_lock(&event->mmap_mutex);
 9508	/* Can't redirect output if we've got an active mmap() */
 9509	if (atomic_read(&event->mmap_count))
 9510		goto unlock;
 9511
 9512	if (output_event) {
 9513		/* get the rb we want to redirect to */
 9514		rb = ring_buffer_get(output_event);
 9515		if (!rb)
 9516			goto unlock;
 9517	}
 9518
 9519	ring_buffer_attach(event, rb);
 9520
 9521	ret = 0;
 9522unlock:
 9523	mutex_unlock(&event->mmap_mutex);
 9524
 9525out:
 9526	return ret;
 9527}
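
/*
 * Usage sketch (illustrative): userspace redirects one event's output
 * into another event's ring buffer with
 *
 *	ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
 *
 * subject to the same-CPU / same-task / same-clock / same-direction
 * restrictions checked above.
 */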
 9528
 9529static void mutex_lock_double(struct mutex *a, struct mutex *b)
 9530{
 9531	if (b < a)
 9532		swap(a, b);
 9533
 9534	mutex_lock(a);
 9535	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
 9536}
 9537
 9538static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 9539{
 9540	bool nmi_safe = false;
 9541
 9542	switch (clk_id) {
 9543	case CLOCK_MONOTONIC:
 9544		event->clock = &ktime_get_mono_fast_ns;
 9545		nmi_safe = true;
 9546		break;
 9547
 9548	case CLOCK_MONOTONIC_RAW:
 9549		event->clock = &ktime_get_raw_fast_ns;
 9550		nmi_safe = true;
 9551		break;
 9552
 9553	case CLOCK_REALTIME:
 9554		event->clock = &ktime_get_real_ns;
 9555		break;
 9556
 9557	case CLOCK_BOOTTIME:
 9558		event->clock = &ktime_get_boot_ns;
 9559		break;
 9560
 9561	case CLOCK_TAI:
 9562		event->clock = &ktime_get_tai_ns;
 9563		break;
 9564
 9565	default:
 9566		return -EINVAL;
 9567	}
 9568
 9569	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
 9570		return -EINVAL;
 9571
 9572	return 0;
 9573}
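
/*
 * Usage sketch (illustrative): userspace opts into one of the clocks
 * above when opening the event:
 *
 *	attr.use_clockid = 1;
 *	attr.clockid = CLOCK_MONOTONIC_RAW;
 *
 * The clocks that are not NMI-safe (CLOCK_REALTIME, CLOCK_BOOTTIME,
 * CLOCK_TAI) are only accepted for PMUs advertising PERF_PMU_CAP_NO_NMI.
 */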
 9574
 9575/*
 9576 * Variation on perf_event_ctx_lock_nested(), except we take two context
 9577 * mutexes.
 9578 */
 9579static struct perf_event_context *
 9580__perf_event_ctx_lock_double(struct perf_event *group_leader,
 9581			     struct perf_event_context *ctx)
 9582{
 9583	struct perf_event_context *gctx;
 9584
 9585again:
 9586	rcu_read_lock();
 9587	gctx = READ_ONCE(group_leader->ctx);
 9588	if (!atomic_inc_not_zero(&gctx->refcount)) {
 9589		rcu_read_unlock();
 9590		goto again;
 9591	}
 9592	rcu_read_unlock();
 9593
 9594	mutex_lock_double(&gctx->mutex, &ctx->mutex);
 9595
 9596	if (group_leader->ctx != gctx) {
 9597		mutex_unlock(&ctx->mutex);
 9598		mutex_unlock(&gctx->mutex);
 9599		put_ctx(gctx);
 9600		goto again;
 9601	}
 9602
 9603	return gctx;
 9604}
 9605
 9606/**
 9607 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 9608 *
 9609 * @attr_uptr:	event_id type attributes for monitoring/sampling
 9610 * @pid:		target pid
 9611 * @cpu:		target cpu
 9612 * @group_fd:		group leader event fd
 9613 */
 9614SYSCALL_DEFINE5(perf_event_open,
 9615		struct perf_event_attr __user *, attr_uptr,
 9616		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 9617{
 9618	struct perf_event *group_leader = NULL, *output_event = NULL;
 9619	struct perf_event *event, *sibling;
 9620	struct perf_event_attr attr;
 9621	struct perf_event_context *ctx, *uninitialized_var(gctx);
 9622	struct file *event_file = NULL;
 9623	struct fd group = {NULL, 0};
 9624	struct task_struct *task = NULL;
 9625	struct pmu *pmu;
 9626	int event_fd;
 9627	int move_group = 0;
 9628	int err;
 9629	int f_flags = O_RDWR;
 9630	int cgroup_fd = -1;
 9631
 9632	/* for future expandability... */
 9633	if (flags & ~PERF_FLAG_ALL)
 9634		return -EINVAL;
 9635
 9636	err = perf_copy_attr(attr_uptr, &attr);
 9637	if (err)
 9638		return err;
 9639
 9640	if (!attr.exclude_kernel) {
 9641		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
 9642			return -EACCES;
 9643	}
 9644
 9645	if (attr.freq) {
 9646		if (attr.sample_freq > sysctl_perf_event_sample_rate)
 9647			return -EINVAL;
 9648	} else {
 9649		if (attr.sample_period & (1ULL << 63))
 9650			return -EINVAL;
 9651	}
 9652
 9653	if (!attr.sample_max_stack)
 9654		attr.sample_max_stack = sysctl_perf_event_max_stack;
 9655
 9656	/*
 9657	 * In cgroup mode, the pid argument is used to pass the fd
 9658	 * opened to the cgroup directory in cgroupfs. The cpu argument
 9659	 * designates the cpu on which to monitor threads from that
 9660	 * cgroup.
 9661	 */
 9662	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
 9663		return -EINVAL;
 9664
 9665	if (flags & PERF_FLAG_FD_CLOEXEC)
 9666		f_flags |= O_CLOEXEC;
 9667
 9668	event_fd = get_unused_fd_flags(f_flags);
 9669	if (event_fd < 0)
 9670		return event_fd;
 9671
 9672	if (group_fd != -1) {
 9673		err = perf_fget_light(group_fd, &group);
 9674		if (err)
 9675			goto err_fd;
 9676		group_leader = group.file->private_data;
 9677		if (flags & PERF_FLAG_FD_OUTPUT)
 9678			output_event = group_leader;
 9679		if (flags & PERF_FLAG_FD_NO_GROUP)
 9680			group_leader = NULL;
 9681	}
 9682
 9683	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
 9684		task = find_lively_task_by_vpid(pid);
 9685		if (IS_ERR(task)) {
 9686			err = PTR_ERR(task);
 9687			goto err_group_fd;
 9688		}
 9689	}
 9690
 9691	if (task && group_leader &&
 9692	    group_leader->attr.inherit != attr.inherit) {
 9693		err = -EINVAL;
 9694		goto err_task;
 9695	}
 9696
 9697	get_online_cpus();
 9698
 9699	if (task) {
 9700		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
 9701		if (err)
 9702			goto err_cpus;
 9703
 9704		/*
 9705		 * Reuse ptrace permission checks for now.
 9706		 *
 9707		 * We must hold cred_guard_mutex across this and any potential
 9708		 * perf_install_in_context() call for this new event to
 9709		 * serialize against exec() altering our credentials (and the
 9710		 * perf_event_exit_task() that could imply).
 9711		 */
 9712		err = -EACCES;
 9713		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
 9714			goto err_cred;
 9715	}
 9716
 9717	if (flags & PERF_FLAG_PID_CGROUP)
 9718		cgroup_fd = pid;
 9719
 9720	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
 9721				 NULL, NULL, cgroup_fd);
 9722	if (IS_ERR(event)) {
 9723		err = PTR_ERR(event);
 9724		goto err_cred;
 9725	}
 9726
 9727	if (is_sampling_event(event)) {
 9728		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 9729			err = -EOPNOTSUPP;
 9730			goto err_alloc;
 9731		}
 9732	}
 9733
 9734	/*
 9735	 * Special case software events and allow them to be part of
 9736	 * any hardware group.
 9737	 */
 9738	pmu = event->pmu;
 9739
 9740	if (attr.use_clockid) {
 9741		err = perf_event_set_clock(event, attr.clockid);
 9742		if (err)
 9743			goto err_alloc;
 9744	}
 9745
 9746	if (pmu->task_ctx_nr == perf_sw_context)
 9747		event->event_caps |= PERF_EV_CAP_SOFTWARE;
 9748
 9749	if (group_leader &&
 9750	    (is_software_event(event) != is_software_event(group_leader))) {
 9751		if (is_software_event(event)) {
 9752			/*
 9753			 * If event and group_leader are not both a software
 9754			 * event, and event is, then group leader is not.
 9755			 *
 9756			 * Allow the addition of software events to !software
 9757			 * groups, this is safe because software events never
 9758			 * fail to schedule.
 9759			 */
 9760			pmu = group_leader->pmu;
 9761		} else if (is_software_event(group_leader) &&
 9762			   (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
 9763			/*
 9764			 * In case the group is a pure software group, and we
 9765			 * try to add a hardware event, move the whole group to
 9766			 * the hardware context.
 9767			 */
 9768			move_group = 1;
 9769		}
 9770	}
 9771
 9772	/*
 9773	 * Get the target context (task or percpu):
 9774	 */
 9775	ctx = find_get_context(pmu, task, event);
 9776	if (IS_ERR(ctx)) {
 9777		err = PTR_ERR(ctx);
 9778		goto err_alloc;
 9779	}
 9780
 9781	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
 9782		err = -EBUSY;
 9783		goto err_context;
 9784	}
 9785
 9786	/*
 9787	 * Look up the group leader (we will attach this event to it):
 9788	 */
 9789	if (group_leader) {
 9790		err = -EINVAL;
 9791
 9792		/*
 9793		 * Do not allow a recursive hierarchy (this new sibling
 9794		 * becoming part of another group-sibling):
 9795		 */
 9796		if (group_leader->group_leader != group_leader)
 9797			goto err_context;
 9798
 9799		/* All events in a group should have the same clock */
 9800		if (group_leader->clock != event->clock)
 9801			goto err_context;
 9802
 9803		/*
 9804		 * Do not allow attaching to a group in a different
 9805		 * task or CPU context:
 9806		 */
 9807		if (move_group) {
 9808			/*
 9809			 * Make sure we're both on the same task, or both
 9810			 * per-cpu events.
 9811			 */
 9812			if (group_leader->ctx->task != ctx->task)
 9813				goto err_context;
 9814
 9815			/*
 9816			 * Make sure we're both events for the same CPU;
 9817			 * grouping events for different CPUs is broken; since
 9818			 * you can never concurrently schedule them anyhow.
 9819			 */
 9820			if (group_leader->cpu != event->cpu)
 9821				goto err_context;
 9822		} else {
 9823			if (group_leader->ctx != ctx)
 9824				goto err_context;
 9825		}
 9826
 9827		/*
 9828		 * Only a group leader can be exclusive or pinned
 9829		 */
 9830		if (attr.exclusive || attr.pinned)
 9831			goto err_context;
 9832	}
 9833
 9834	if (output_event) {
 9835		err = perf_event_set_output(event, output_event);
 9836		if (err)
 9837			goto err_context;
 9838	}
 9839
 9840	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
 9841					f_flags);
 9842	if (IS_ERR(event_file)) {
 9843		err = PTR_ERR(event_file);
 9844		event_file = NULL;
 9845		goto err_context;
 9846	}
 9847
 9848	if (move_group) {
 9849		gctx = __perf_event_ctx_lock_double(group_leader, ctx);
 9850
 9851		if (gctx->task == TASK_TOMBSTONE) {
 9852			err = -ESRCH;
 9853			goto err_locked;
 9854		}
 9855
 9856		/*
 9857		 * Check if we raced against another sys_perf_event_open() call
 9858		 * moving the software group underneath us.
 9859		 */
 9860		if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
 9861			/*
 9862			 * If someone moved the group out from under us, check
 9863			 * if this new event wound up on the same ctx, if so
 9864			 * it's the regular !move_group case, otherwise fail.
 9865			 */
 9866			if (gctx != ctx) {
 9867				err = -EINVAL;
 9868				goto err_locked;
 9869			} else {
 9870				perf_event_ctx_unlock(group_leader, gctx);
 9871				move_group = 0;
 9872			}
 9873		}
 9874	} else {
 9875		mutex_lock(&ctx->mutex);
 9876	}
 9877
 9878	if (ctx->task == TASK_TOMBSTONE) {
 9879		err = -ESRCH;
 9880		goto err_locked;
 9881	}
 9882
 9883	if (!perf_event_validate_size(event)) {
 9884		err = -E2BIG;
 9885		goto err_locked;
 9886	}
 9887
 9888	/*
 9889	 * Must be under the same ctx::mutex as perf_install_in_context(),
 9890	 * because we need to serialize with concurrent event creation.
 9891	 */
 9892	if (!exclusive_event_installable(event, ctx)) {
 9893		/* exclusive and group stuff are assumed mutually exclusive */
 9894		WARN_ON_ONCE(move_group);
 9895
 9896		err = -EBUSY;
 9897		goto err_locked;
 9898	}
 9899
 9900	WARN_ON_ONCE(ctx->parent_ctx);
 9901
 9902	/*
 9903	 * This is the point of no return; we cannot fail hereafter. This is
 9904	 * where we start modifying current state.
 9905	 */
 9906
 9907	if (move_group) {
 9908		/*
 9909		 * See perf_event_ctx_lock() for comments on the details
 9910		 * of swizzling perf_event::ctx.
 9911		 */
 9912		perf_remove_from_context(group_leader, 0);
 9913
 9914		list_for_each_entry(sibling, &group_leader->sibling_list,
 9915				    group_entry) {
 9916			perf_remove_from_context(sibling, 0);
 9917			put_ctx(gctx);
 9918		}
 9919
 9920		/*
 9921		 * Wait for everybody to stop referencing the events through
 9922		 * the old lists, before installing it on new lists.
 9923		 */
 9924		synchronize_rcu();
 9925
 9926		/*
 9927		 * Install the group siblings before the group leader.
 9928		 *
 9929		 * Because a group leader will try and install the entire group
 9930		 * (through the sibling list, which is still intact), we can
 9931		 * end up with siblings installed in the wrong context.
 9932		 *
 9933		 * By installing siblings first we NO-OP because they're not
 9934		 * reachable through the group lists.
 9935		 */
 9936		list_for_each_entry(sibling, &group_leader->sibling_list,
 9937				    group_entry) {
 9938			perf_event__state_init(sibling);
 9939			perf_install_in_context(ctx, sibling, sibling->cpu);
 9940			get_ctx(ctx);
 9941		}
 9942
 9943		/*
 9944		 * Removing from the context ends up with a disabled
 9945		 * event. What we want here is the event in its initial
 9946		 * startup state, ready to be added into the new context.
 9947		 */
 9948		perf_event__state_init(group_leader);
 9949		perf_install_in_context(ctx, group_leader, group_leader->cpu);
 9950		get_ctx(ctx);
 9951
 9952		/*
 9953		 * Now that all events are installed in @ctx, nothing
 9954		 * references @gctx anymore, so drop the last reference we have
 9955		 * on it.
 9956		 */
 9957		put_ctx(gctx);
 9958	}
 9959
 9960	/*
 9961	 * Precalculate sample_data sizes; do while holding ctx::mutex such
 9962	 * that we're serialized against further additions and before
 9963	 * perf_install_in_context() which is the point the event is active and
 9964	 * can use these values.
 9965	 */
 9966	perf_event__header_size(event);
 9967	perf_event__id_header_size(event);
 9968
 9969	event->owner = current;
 9970
 9971	perf_install_in_context(ctx, event, event->cpu);
 9972	perf_unpin_context(ctx);
 9973
 9974	if (move_group)
 9975		perf_event_ctx_unlock(group_leader, gctx);
 9976	mutex_unlock(&ctx->mutex);
 9977
 9978	if (task) {
 9979		mutex_unlock(&task->signal->cred_guard_mutex);
 9980		put_task_struct(task);
 9981	}
 9982
 9983	put_online_cpus();
 9984
 9985	mutex_lock(&current->perf_event_mutex);
 9986	list_add_tail(&event->owner_entry, &current->perf_event_list);
 9987	mutex_unlock(&current->perf_event_mutex);
 9988
 9989	/*
 9990	 * Drop the reference on the group_event after placing the
 9991	 * new event on the sibling_list. This ensures destruction
 9992	 * of the group leader will find the pointer to itself in
 9993	 * perf_group_detach().
 9994	 */
 9995	fdput(group);
 9996	fd_install(event_fd, event_file);
 9997	return event_fd;
 9998
 9999err_locked:
10000	if (move_group)
10001		perf_event_ctx_unlock(group_leader, gctx);
10002	mutex_unlock(&ctx->mutex);
10003/* err_file: */
10004	fput(event_file);
10005err_context:
10006	perf_unpin_context(ctx);
10007	put_ctx(ctx);
10008err_alloc:
10009	/*
10010	 * If event_file is set, the fput() above will have called ->release()
10011	 * and that will take care of freeing the event.
10012	 */
10013	if (!event_file)
10014		free_event(event);
10015err_cred:
10016	if (task)
10017		mutex_unlock(&task->signal->cred_guard_mutex);
10018err_cpus:
10019	put_online_cpus();
10020err_task:
10021	if (task)
10022		put_task_struct(task);
10023err_group_fd:
10024	fdput(group);
10025err_fd:
10026	put_unused_fd(event_fd);
10027	return err;
10028}
10029
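
For reference, this is roughly what the syscall implemented above looks like from user space; the helper below is an editor's sketch (not part of this file) that opens a disabled cycle counter for the calling thread on any CPU.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* Hypothetical helper: count CPU cycles for the calling thread. */
static int open_cycles_counter(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;

	/* pid == 0: calling task, cpu == -1: any CPU, no group fd, no flags */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
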
10030/**
10031 * perf_event_create_kernel_counter
10032 *
10033 * @attr: attributes of the counter to create
10034 * @cpu: cpu on which the counter is bound
10035 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback invoked when the counter overflows (may be NULL)
 * @context: opaque cookie handed back to @overflow_handler
10036 */
10037struct perf_event *
10038perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10039				 struct task_struct *task,
10040				 perf_overflow_handler_t overflow_handler,
10041				 void *context)
10042{
10043	struct perf_event_context *ctx;
10044	struct perf_event *event;
10045	int err;
10046
10047	/*
10048	 * Get the target context (task or percpu):
10049	 */
10050
10051	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10052				 overflow_handler, context, -1);
10053	if (IS_ERR(event)) {
10054		err = PTR_ERR(event);
10055		goto err;
10056	}
10057
10058	/* Mark owner so we could distinguish it from user events. */
10059	event->owner = TASK_TOMBSTONE;
10060
10061	ctx = find_get_context(event->pmu, task, event);
10062	if (IS_ERR(ctx)) {
10063		err = PTR_ERR(ctx);
10064		goto err_free;
10065	}
10066
10067	WARN_ON_ONCE(ctx->parent_ctx);
10068	mutex_lock(&ctx->mutex);
10069	if (ctx->task == TASK_TOMBSTONE) {
10070		err = -ESRCH;
10071		goto err_unlock;
10072	}
10073
10074	if (!exclusive_event_installable(event, ctx)) {
10075		err = -EBUSY;
10076		goto err_unlock;
10077	}
10078
10079	perf_install_in_context(ctx, event, cpu);
10080	perf_unpin_context(ctx);
10081	mutex_unlock(&ctx->mutex);
10082
10083	return event;
10084
10085err_unlock:
10086	mutex_unlock(&ctx->mutex);
10087	perf_unpin_context(ctx);
10088	put_ctx(ctx);
10089err_free:
10090	free_event(event);
10091err:
10092	return ERR_PTR(err);
10093}
10094EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
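
A hedged in-kernel usage sketch (not part of this file): creating a pinned, counting-only per-CPU event with no overflow handler. The function and variable names are made up; such an event would later be read with perf_event_read_value() and torn down with perf_event_release_kernel().

static struct perf_event *example_cycles_event;

static int example_start_cycle_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(struct perf_event_attr),
		.pinned	= 1,
	};

	/* NULL task: per-CPU event; NULL handler and context: pure counter. */
	example_cycles_event = perf_event_create_kernel_counter(&attr, cpu,
								 NULL, NULL,
								 NULL);
	if (IS_ERR(example_cycles_event))
		return PTR_ERR(example_cycles_event);

	return 0;
}
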
10095
10096void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10097{
10098	struct perf_event_context *src_ctx;
10099	struct perf_event_context *dst_ctx;
10100	struct perf_event *event, *tmp;
10101	LIST_HEAD(events);
10102
10103	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10104	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10105
10106	/*
10107	 * See perf_event_ctx_lock() for comments on the details
10108	 * of swizzling perf_event::ctx.
10109	 */
10110	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
10111	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10112				 event_entry) {
10113		perf_remove_from_context(event, 0);
10114		unaccount_event_cpu(event, src_cpu);
10115		put_ctx(src_ctx);
10116		list_add(&event->migrate_entry, &events);
10117	}
10118
10119	/*
10120	 * Wait for the events to quiesce before re-instating them.
10121	 */
10122	synchronize_rcu();
10123
10124	/*
10125	 * Re-instate events in 2 passes.
10126	 *
10127	 * Skip over group leaders and only install siblings on this first
10128	 * pass; siblings will not get enabled without a leader, but a
10129	 * leader will enable its siblings, even if those are still on the old
10130	 * context.
10131	 */
10132	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10133		if (event->group_leader == event)
10134			continue;
10135
10136		list_del(&event->migrate_entry);
10137		if (event->state >= PERF_EVENT_STATE_OFF)
10138			event->state = PERF_EVENT_STATE_INACTIVE;
10139		account_event_cpu(event, dst_cpu);
10140		perf_install_in_context(dst_ctx, event, dst_cpu);
10141		get_ctx(dst_ctx);
10142	}
10143
10144	/*
10145	 * Once all the siblings are setup properly, install the group leaders
10146	 * to make it go.
10147	 */
10148	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10149		list_del(&event->migrate_entry);
10150		if (event->state >= PERF_EVENT_STATE_OFF)
10151			event->state = PERF_EVENT_STATE_INACTIVE;
10152		account_event_cpu(event, dst_cpu);
10153		perf_install_in_context(dst_ctx, event, dst_cpu);
10154		get_ctx(dst_ctx);
10155	}
10156	mutex_unlock(&dst_ctx->mutex);
10157	mutex_unlock(&src_ctx->mutex);
10158}
10159EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
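
A sketch of the intended caller (editor's illustration, not from this file): an uncore-style PMU driver whose events live on one designated CPU hands them to another online CPU when that CPU goes down. The function name is hypothetical.

static void example_pmu_cpu_offline(struct pmu *pmu, int dying_cpu)
{
	int target;

	/* Pick any other online CPU to inherit the events. */
	target = cpumask_any_but(cpu_online_mask, dying_cpu);
	if (target >= nr_cpu_ids)
		return;

	perf_pmu_migrate_context(pmu, dying_cpu, target);
}
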
10160
10161static void sync_child_event(struct perf_event *child_event,
10162			       struct task_struct *child)
10163{
10164	struct perf_event *parent_event = child_event->parent;
10165	u64 child_val;
10166
10167	if (child_event->attr.inherit_stat)
10168		perf_event_read_event(child_event, child);
10169
10170	child_val = perf_event_count(child_event);
10171
10172	/*
10173	 * Add back the child's count to the parent's count:
10174	 */
10175	atomic64_add(child_val, &parent_event->child_count);
10176	atomic64_add(child_event->total_time_enabled,
10177		     &parent_event->child_total_time_enabled);
10178	atomic64_add(child_event->total_time_running,
10179		     &parent_event->child_total_time_running);
10180}
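
The totals folded back here are what a later read of the parent event reports. A simplified illustration of that composition (the real accessor, perf_event_count(), is defined earlier in this file; the helper below is hypothetical):

static u64 example_aggregate_count(struct perf_event *event)
{
	/* the parent's own value ... */
	u64 total = local64_read(&event->count);

	/* ... plus what exited children contributed via sync_child_event() */
	total += atomic64_read(&event->child_count);

	return total;
}
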
10181
10182static void
10183perf_event_exit_event(struct perf_event *child_event,
10184		      struct perf_event_context *child_ctx,
10185		      struct task_struct *child)
10186{
10187	struct perf_event *parent_event = child_event->parent;
10188
10189	/*
10190	 * Do not destroy the 'original' grouping; because of the context
10191	 * switch optimization the original events could've ended up in a
10192	 * random child task.
10193	 *
10194	 * If we were to destroy the original group, all group related
10195	 * operations would cease to function properly after this random
10196	 * child dies.
10197	 *
10198	 * Do destroy all inherited groups; we don't care about those
10199	 * and being thorough is better.
10200	 */
10201	raw_spin_lock_irq(&child_ctx->lock);
10202	WARN_ON_ONCE(child_ctx->is_active);
10203
10204	if (parent_event)
10205		perf_group_detach(child_event);
10206	list_del_event(child_event, child_ctx);
10207	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
10208	raw_spin_unlock_irq(&child_ctx->lock);
10209
10210	/*
10211	 * Parent events are governed by their filedesc, retain them.
10212	 */
10213	if (!parent_event) {
10214		perf_event_wakeup(child_event);
10215		return;
10216	}
10217	/*
10218	 * Child events can be cleaned up.
10219	 */
10220
10221	sync_child_event(child_event, child);
10222
10223	/*
10224	 * Remove this event from the parent's list
10225	 */
10226	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10227	mutex_lock(&parent_event->child_mutex);
10228	list_del_init(&child_event->child_list);
10229	mutex_unlock(&parent_event->child_mutex);
10230
10231	/*
10232	 * Kick perf_poll() for is_event_hup().
10233	 */
10234	perf_event_wakeup(parent_event);
10235	free_event(child_event);
10236	put_event(parent_event);
10237}
10238
10239static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
10240{
10241	struct perf_event_context *child_ctx, *clone_ctx = NULL;
10242	struct perf_event *child_event, *next;
10243
10244	WARN_ON_ONCE(child != current);
10245
10246	child_ctx = perf_pin_task_context(child, ctxn);
10247	if (!child_ctx)
10248		return;
10249
10250	/*
10251	 * In order to reduce the amount of trickiness in ctx tear-down, we hold
10252	 * ctx::mutex over the entire thing. This serializes against almost
10253	 * everything that wants to access the ctx.
10254	 *
10255	 * The exception is sys_perf_event_open() /
10256	 * perf_event_create_kernel_counter() which does find_get_context()
10257	 * without ctx::mutex (it cannot because of the move_group double mutex
10258	 * lock thing). See the comments in perf_install_in_context().
10259	 */
10260	mutex_lock(&child_ctx->mutex);
10261
10262	/*
10263	 * In a single ctx::lock section, de-schedule the events and detach the
10264	 * context from the task such that we cannot ever get it scheduled back
10265	 * in.
10266	 */
10267	raw_spin_lock_irq(&child_ctx->lock);
10268	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
10269
10270	/*
10271	 * Now that the context is inactive, destroy the task <-> ctx relation
10272	 * and mark the context dead.
10273	 */
10274	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10275	put_ctx(child_ctx); /* cannot be last */
10276	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10277	put_task_struct(current); /* cannot be last */
10278
10279	clone_ctx = unclone_ctx(child_ctx);
10280	raw_spin_unlock_irq(&child_ctx->lock);
10281
10282	if (clone_ctx)
10283		put_ctx(clone_ctx);
10284
10285	/*
10286	 * Report the task dead after unscheduling the events so that we
10287	 * won't get any samples after PERF_RECORD_EXIT. We can however still
10288	 * get a few PERF_RECORD_READ events.
10289	 */
10290	perf_event_task(child, child_ctx, 0);
10291
10292	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
10293		perf_event_exit_event(child_event, child_ctx, child);
10294
10295	mutex_unlock(&child_ctx->mutex);
10296
10297	put_ctx(child_ctx);
10298}
10299
10300/*
10301 * When a child task exits, feed back event values to parent events.
10302 *
10303 * Can be called with cred_guard_mutex held when called from
10304 * install_exec_creds().
10305 */
10306void perf_event_exit_task(struct task_struct *child)
10307{
10308	struct perf_event *event, *tmp;
10309	int ctxn;
10310
10311	mutex_lock(&child->perf_event_mutex);
10312	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10313				 owner_entry) {
10314		list_del_init(&event->owner_entry);
10315
10316		/*
10317		 * Ensure the list deletion is visible before we clear
10318		 * the owner, closes a race against perf_release() where
10319		 * we need to serialize on the owner->perf_event_mutex.
10320		 */
10321		smp_store_release(&event->owner, NULL);
10322	}
10323	mutex_unlock(&child->perf_event_mutex);
10324
10325	for_each_task_context_nr(ctxn)
10326		perf_event_exit_task_context(child, ctxn);
10327
10328	/*
10329	 * The perf_event_exit_task_context calls perf_event_task
10330	 * with child's task_ctx, which generates EXIT events for
10331	 * child contexts and sets child->perf_event_ctxp[] to NULL.
10332	 * At this point we need to send EXIT events to cpu contexts.
10333	 */
10334	perf_event_task(child, NULL, 0);
10335}
10336
10337static void perf_free_event(struct perf_event *event,
10338			    struct perf_event_context *ctx)
10339{
10340	struct perf_event *parent = event->parent;
10341
10342	if (WARN_ON_ONCE(!parent))
10343		return;
10344
10345	mutex_lock(&parent->child_mutex);
10346	list_del_init(&event->child_list);
10347	mutex_unlock(&parent->child_mutex);
10348
10349	put_event(parent);
10350
10351	raw_spin_lock_irq(&ctx->lock);
10352	perf_group_detach(event);
10353	list_del_event(event, ctx);
10354	raw_spin_unlock_irq(&ctx->lock);
10355	free_event(event);
10356}
10357
10358/*
10359 * Free an unexposed, unused context as created by inheritance by
10360 * perf_event_init_task below, used by fork() in case of failure.
10361 *
10362 * Not all locks are strictly required, but take them anyway to be nice and
10363 * help out with the lockdep assertions.
10364 */
10365void perf_event_free_task(struct task_struct *task)
10366{
10367	struct perf_event_context *ctx;
10368	struct perf_event *event, *tmp;
10369	int ctxn;
10370
10371	for_each_task_context_nr(ctxn) {
10372		ctx = task->perf_event_ctxp[ctxn];
10373		if (!ctx)
10374			continue;
10375
10376		mutex_lock(&ctx->mutex);
10377		raw_spin_lock_irq(&ctx->lock);
10378		/*
10379		 * Destroy the task <-> ctx relation and mark the context dead.
10380		 *
10381		 * This is important because even though the task hasn't been
10382		 * exposed yet the context has been (through child_list).
10383		 */
10384		RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
10385		WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
10386		put_task_struct(task); /* cannot be last */
10387		raw_spin_unlock_irq(&ctx->lock);
10388again:
10389		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
10390				group_entry)
10391			perf_free_event(event, ctx);
10392
10393		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
10394				group_entry)
10395			perf_free_event(event, ctx);
10396
10397		if (!list_empty(&ctx->pinned_groups) ||
10398				!list_empty(&ctx->flexible_groups))
10399			goto again;
10400
10401		mutex_unlock(&ctx->mutex);
10402
10403		put_ctx(ctx);
10404	}
10405}
10406
10407void perf_event_delayed_put(struct task_struct *task)
10408{
10409	int ctxn;
10410
10411	for_each_task_context_nr(ctxn)
10412		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10413}
10414
10415struct file *perf_event_get(unsigned int fd)
10416{
10417	struct file *file;
10418
10419	file = fget_raw(fd);
10420	if (!file)
10421		return ERR_PTR(-EBADF);
10422
10423	if (file->f_op != &perf_fops) {
10424		fput(file);
10425		return ERR_PTR(-EBADF);
10426	}
10427
10428	return file;
10429}
10430
10431const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10432{
10433	if (!event)
10434		return ERR_PTR(-EINVAL);
10435
10436	return &event->attr;
10437}
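
A hedged sketch of an in-kernel consumer of the two helpers above (not part of this file). For perf file descriptors, file->private_data points at the struct perf_event:

static int example_inspect_perf_fd(unsigned int fd)
{
	const struct perf_event_attr *attr;
	struct file *file;

	file = perf_event_get(fd);
	if (IS_ERR(file))
		return PTR_ERR(file);

	attr = perf_event_attrs(file->private_data);
	pr_info("perf fd %u: type=%u config=0x%llx\n", fd, attr->type,
		attr->config);

	fput(file);
	return 0;
}
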
10438
10439/*
10440 * inherit an event from parent task to child task:
10441 */
10442static struct perf_event *
10443inherit_event(struct perf_event *parent_event,
10444	      struct task_struct *parent,
10445	      struct perf_event_context *parent_ctx,
10446	      struct task_struct *child,
10447	      struct perf_event *group_leader,
10448	      struct perf_event_context *child_ctx)
10449{
10450	enum perf_event_active_state parent_state = parent_event->state;
10451	struct perf_event *child_event;
10452	unsigned long flags;
10453
10454	/*
10455	 * Instead of creating recursive hierarchies of events,
10456	 * we link inherited events back to the original parent,
10457	 * which has a filp for sure, which we use as the reference
10458	 * count:
10459	 */
10460	if (parent_event->parent)
10461		parent_event = parent_event->parent;
10462
10463	child_event = perf_event_alloc(&parent_event->attr,
10464					   parent_event->cpu,
10465					   child,
10466					   group_leader, parent_event,
10467					   NULL, NULL, -1);
10468	if (IS_ERR(child_event))
10469		return child_event;
10470
10471	/*
10472	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
10473	 * must be under the same lock in order to serialize against
10474	 * perf_event_release_kernel(), such that either we must observe
10475	 * is_orphaned_event() or they will observe us on the child_list.
10476	 */
10477	mutex_lock(&parent_event->child_mutex);
10478	if (is_orphaned_event(parent_event) ||
10479	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
10480		mutex_unlock(&parent_event->child_mutex);
10481		free_event(child_event);
10482		return NULL;
10483	}
10484
10485	get_ctx(child_ctx);
10486
10487	/*
10488	 * Make the child state follow the state of the parent event,
10489	 * not its attr.disabled bit.  We hold the parent's mutex,
10490	 * so we won't race with perf_event_{en, dis}able_family.
10491	 */
10492	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
10493		child_event->state = PERF_EVENT_STATE_INACTIVE;
10494	else
10495		child_event->state = PERF_EVENT_STATE_OFF;
10496
10497	if (parent_event->attr.freq) {
10498		u64 sample_period = parent_event->hw.sample_period;
10499		struct hw_perf_event *hwc = &child_event->hw;
10500
10501		hwc->sample_period = sample_period;
10502		hwc->last_period   = sample_period;
10503
10504		local64_set(&hwc->period_left, sample_period);
10505	}
10506
10507	child_event->ctx = child_ctx;
10508	child_event->overflow_handler = parent_event->overflow_handler;
10509	child_event->overflow_handler_context
10510		= parent_event->overflow_handler_context;
10511
10512	/*
10513	 * Precalculate sample_data sizes
10514	 */
10515	perf_event__header_size(child_event);
10516	perf_event__id_header_size(child_event);
10517
10518	/*
10519	 * Link it up in the child's context:
10520	 */
10521	raw_spin_lock_irqsave(&child_ctx->lock, flags);
10522	add_event_to_ctx(child_event, child_ctx);
10523	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
10524
10525	/*
10526	 * Link this into the parent event's child list
10527	 */
10528	list_add_tail(&child_event->child_list, &parent_event->child_list);
10529	mutex_unlock(&parent_event->child_mutex);
10530
10531	return child_event;
10532}
10533
10534static int inherit_group(struct perf_event *parent_event,
10535	      struct task_struct *parent,
10536	      struct perf_event_context *parent_ctx,
10537	      struct task_struct *child,
10538	      struct perf_event_context *child_ctx)
10539{
10540	struct perf_event *leader;
10541	struct perf_event *sub;
10542	struct perf_event *child_ctr;
10543
10544	leader = inherit_event(parent_event, parent, parent_ctx,
10545				 child, NULL, child_ctx);
10546	if (IS_ERR(leader))
10547		return PTR_ERR(leader);
10548	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
10549		child_ctr = inherit_event(sub, parent, parent_ctx,
10550					    child, leader, child_ctx);
10551		if (IS_ERR(child_ctr))
10552			return PTR_ERR(child_ctr);
10553	}
10554	return 0;
10555}
10556
10557static int
10558inherit_task_group(struct perf_event *event, struct task_struct *parent,
10559		   struct perf_event_context *parent_ctx,
10560		   struct task_struct *child, int ctxn,
10561		   int *inherited_all)
10562{
10563	int ret;
10564	struct perf_event_context *child_ctx;
10565
10566	if (!event->attr.inherit) {
10567		*inherited_all = 0;
10568		return 0;
10569	}
10570
10571	child_ctx = child->perf_event_ctxp[ctxn];
10572	if (!child_ctx) {
10573		/*
10574		 * This is executed from the parent task context, so
10575		 * inherit events that have been marked for cloning.
10576		 * First allocate and initialize a context for the
10577		 * child.
10578		 */
10579
10580		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
10581		if (!child_ctx)
10582			return -ENOMEM;
10583
10584		child->perf_event_ctxp[ctxn] = child_ctx;
10585	}
10586
10587	ret = inherit_group(event, parent, parent_ctx,
10588			    child, child_ctx);
10589
10590	if (ret)
10591		*inherited_all = 0;
10592
10593	return ret;
10594}
10595
10596/*
10597 * Initialize the perf_event context in task_struct
10598 */
10599static int perf_event_init_context(struct task_struct *child, int ctxn)
10600{
10601	struct perf_event_context *child_ctx, *parent_ctx;
10602	struct perf_event_context *cloned_ctx;
10603	struct perf_event *event;
10604	struct task_struct *parent = current;
10605	int inherited_all = 1;
10606	unsigned long flags;
10607	int ret = 0;
10608
10609	if (likely(!parent->perf_event_ctxp[ctxn]))
10610		return 0;
10611
10612	/*
10613	 * If the parent's context is a clone, pin it so it won't get
10614	 * swapped under us.
10615	 */
10616	parent_ctx = perf_pin_task_context(parent, ctxn);
10617	if (!parent_ctx)
10618		return 0;
10619
10620	/*
10621	 * No need to check if parent_ctx != NULL here; since we saw
10622	 * it non-NULL earlier, the only reason for it to become NULL
10623	 * is if we exit, and since we're currently in the middle of
10624	 * a fork we can't be exiting at the same time.
10625	 */
10626
10627	/*
10628	 * Lock the parent list. No need to lock the child - not PID
10629	 * hashed yet and not running, so nobody can access it.
10630	 */
10631	mutex_lock(&parent_ctx->mutex);
10632
10633	/*
10634	 * We don't have to disable NMIs - we are only looking at
10635	 * the list, not manipulating it:
10636	 */
10637	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
10638		ret = inherit_task_group(event, parent, parent_ctx,
10639					 child, ctxn, &inherited_all);
10640		if (ret)
10641			goto out_unlock;
10642	}
10643
10644	/*
10645	 * We can't hold ctx->lock when iterating the ->flexible_groups list due
10646	 * to allocations, but we need to prevent rotation because
10647	 * rotate_ctx() will change the list from interrupt context.
10648	 */
10649	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10650	parent_ctx->rotate_disable = 1;
10651	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10652
10653	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
10654		ret = inherit_task_group(event, parent, parent_ctx,
10655					 child, ctxn, &inherited_all);
10656		if (ret)
10657			goto out_unlock;
10658	}
10659
10660	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10661	parent_ctx->rotate_disable = 0;
10662
10663	child_ctx = child->perf_event_ctxp[ctxn];
10664
10665	if (child_ctx && inherited_all) {
10666		/*
10667		 * Mark the child context as a clone of the parent
10668		 * context, or of whatever the parent is a clone of.
10669		 *
10670		 * Note that if the parent is a clone, the holding of
10671		 * parent_ctx->lock avoids it from being uncloned.
10672		 */
10673		cloned_ctx = parent_ctx->parent_ctx;
10674		if (cloned_ctx) {
10675			child_ctx->parent_ctx = cloned_ctx;
10676			child_ctx->parent_gen = parent_ctx->parent_gen;
10677		} else {
10678			child_ctx->parent_ctx = parent_ctx;
10679			child_ctx->parent_gen = parent_ctx->generation;
10680		}
10681		get_ctx(child_ctx->parent_ctx);
10682	}
10683
10684	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10685out_unlock:
10686	mutex_unlock(&parent_ctx->mutex);
10687
10688	perf_unpin_context(parent_ctx);
10689	put_ctx(parent_ctx);
10690
10691	return ret;
10692}
10693
10694/*
10695 * Initialize the perf_event context in task_struct
10696 */
10697int perf_event_init_task(struct task_struct *child)
10698{
10699	int ctxn, ret;
10700
10701	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
10702	mutex_init(&child->perf_event_mutex);
10703	INIT_LIST_HEAD(&child->perf_event_list);
10704
10705	for_each_task_context_nr(ctxn) {
10706		ret = perf_event_init_context(child, ctxn);
10707		if (ret) {
10708			perf_event_free_task(child);
10709			return ret;
10710		}
10711	}
10712
10713	return 0;
10714}
10715
10716static void __init perf_event_init_all_cpus(void)
10717{
10718	struct swevent_htable *swhash;
10719	int cpu;
10720
10721	for_each_possible_cpu(cpu) {
10722		swhash = &per_cpu(swevent_htable, cpu);
10723		mutex_init(&swhash->hlist_mutex);
10724		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
10725
10726		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
10727		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
10728
10729		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
10730	}
10731}
10732
10733int perf_event_init_cpu(unsigned int cpu)
10734{
10735	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10736
10737	mutex_lock(&swhash->hlist_mutex);
10738	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
10739		struct swevent_hlist *hlist;
10740
10741		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
10742		WARN_ON(!hlist);
10743		rcu_assign_pointer(swhash->swevent_hlist, hlist);
10744	}
10745	mutex_unlock(&swhash->hlist_mutex);
10746	return 0;
10747}
10748
10749#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
10750static void __perf_event_exit_context(void *__info)
10751{
10752	struct perf_event_context *ctx = __info;
10753	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
10754	struct perf_event *event;
10755
10756	raw_spin_lock(&ctx->lock);
10757	list_for_each_entry(event, &ctx->event_list, event_entry)
10758		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
10759	raw_spin_unlock(&ctx->lock);
10760}
10761
10762static void perf_event_exit_cpu_context(int cpu)
10763{
10764	struct perf_event_context *ctx;
10765	struct pmu *pmu;
10766	int idx;
10767
10768	idx = srcu_read_lock(&pmus_srcu);
10769	list_for_each_entry_rcu(pmu, &pmus, entry) {
10770		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
10771
10772		mutex_lock(&ctx->mutex);
10773		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
10774		mutex_unlock(&ctx->mutex);
10775	}
10776	srcu_read_unlock(&pmus_srcu, idx);
10777}
10778#else
10779
10780static void perf_event_exit_cpu_context(int cpu) { }
10781
10782#endif
10783
10784int perf_event_exit_cpu(unsigned int cpu)
10785{
10786	perf_event_exit_cpu_context(cpu);
10787	return 0;
10788}
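
Both CPU callbacks above are meant to be driven by the CPU hotplug core. As a hedged sketch only (this is not how the file itself is wired up, and the dynamic state and name below are made up), such hooks would be registered with the hotplug state machine like this:

static int __init example_register_perf_hotplug(void)
{
	return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "perf/example:online",
				 perf_event_init_cpu, perf_event_exit_cpu);
}
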
10789
10790static int
10791perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
10792{
10793	int cpu;
10794
10795	for_each_online_cpu(cpu)
10796		perf_event_exit_cpu(cpu);
10797
10798	return NOTIFY_OK;
10799}
10800
10801/*
10802 * Run the perf reboot notifier at the very last possible moment so that
10803 * the generic watchdog code runs as long as possible.
10804 */
10805static struct notifier_block perf_reboot_notifier = {
10806	.notifier_call = perf_reboot,
10807	.priority = INT_MIN,
10808};
10809
10810void __init perf_event_init(void)
10811{
10812	int ret;
10813
10814	idr_init(&pmu_idr);
10815
10816	perf_event_init_all_cpus();
10817	init_srcu_struct(&pmus_srcu);
10818	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
10819	perf_pmu_register(&perf_cpu_clock, NULL, -1);
10820	perf_pmu_register(&perf_task_clock, NULL, -1);
10821	perf_tp_register();
10822	perf_event_init_cpu(smp_processor_id());
10823	register_reboot_notifier(&perf_reboot_notifier);
10824
10825	ret = init_hw_breakpoint();
10826	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
10827
10828	/*
10829	 * Build time assertion that we keep the data_head at the intended
10830	 * location.  IOW, validating that we got the __reserved[] size right.
10831	 */
10832	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
10833		     != 1024);
10834}
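
As an aside, the same offsetof()/BUILD_BUG_ON() technique can pin any ABI-critical field to a fixed offset at compile time. A small hypothetical example (editor's sketch, not part of this file):

struct example_mmap_abi {
	__u64	header[4];		/* hypothetical fixed header        */
	__u64	__reserved[124];	/* pad the rest of the region ...   */
	__u64	data_head;		/* ... so this lands at offset 1024 */
};

static inline void example_layout_check(void)
{
	/* 4 + 124 == 128 u64s == 1024 bytes before data_head. */
	BUILD_BUG_ON(offsetof(struct example_mmap_abi, data_head) != 1024);
}
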
10835
10836ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
10837			      char *page)
10838{
10839	struct perf_pmu_events_attr *pmu_attr =
10840		container_of(attr, struct perf_pmu_events_attr, attr);
10841
10842	if (pmu_attr->event_str)
10843		return sprintf(page, "%s\n", pmu_attr->event_str);
10844
10845	return 0;
10846}
10847EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
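
For context, a hedged sketch of how a PMU driver typically wires an event alias to this show routine; the attribute, group, and event string below are made up:

static struct perf_pmu_events_attr example_cycles_attr = {
	.attr		= __ATTR(cycles, 0444, perf_event_sysfs_show, NULL),
	.id		= 0,
	.event_str	= "event=0x3c",
};

static struct attribute *example_events_attrs[] = {
	&example_cycles_attr.attr.attr,
	NULL,
};

static const struct attribute_group example_events_group = {
	.name	= "events",
	.attrs	= example_events_attrs,
};
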
10848
10849static int __init perf_event_sysfs_init(void)
10850{
10851	struct pmu *pmu;
10852	int ret;
10853
10854	mutex_lock(&pmus_lock);
10855
10856	ret = bus_register(&pmu_bus);
10857	if (ret)
10858		goto unlock;
10859
10860	list_for_each_entry(pmu, &pmus, entry) {
10861		if (!pmu->name || pmu->type < 0)
10862			continue;
10863
10864		ret = pmu_dev_alloc(pmu);
10865		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
10866	}
10867	pmu_bus_running = 1;
10868	ret = 0;
10869
10870unlock:
10871	mutex_unlock(&pmus_lock);
10872
10873	return ret;
10874}
10875device_initcall(perf_event_sysfs_init);
10876
10877#ifdef CONFIG_CGROUP_PERF
10878static struct cgroup_subsys_state *
10879perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
10880{
10881	struct perf_cgroup *jc;
10882
10883	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
10884	if (!jc)
10885		return ERR_PTR(-ENOMEM);
10886
10887	jc->info = alloc_percpu(struct perf_cgroup_info);
10888	if (!jc->info) {
10889		kfree(jc);
10890		return ERR_PTR(-ENOMEM);
10891	}
10892
10893	return &jc->css;
10894}
10895
10896static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
10897{
10898	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
10899
10900	free_percpu(jc->info);
10901	kfree(jc);
10902}
10903
10904static int __perf_cgroup_move(void *info)
10905{
10906	struct task_struct *task = info;
10907	rcu_read_lock();
10908	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
10909	rcu_read_unlock();
10910	return 0;
10911}
10912
10913static void perf_cgroup_attach(struct cgroup_taskset *tset)
10914{
10915	struct task_struct *task;
10916	struct cgroup_subsys_state *css;
10917
10918	cgroup_taskset_for_each(task, css, tset)
10919		task_function_call(task, __perf_cgroup_move, task);
10920}
10921
10922struct cgroup_subsys perf_event_cgrp_subsys = {
10923	.css_alloc	= perf_cgroup_css_alloc,
10924	.css_free	= perf_cgroup_css_free,
10925	.attach		= perf_cgroup_attach,
10926};
10927#endif /* CONFIG_CGROUP_PERF */
v3.15
   1/*
   2 * Performance events core code:
   3 *
   4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   6 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8 *
   9 * For licensing details see kernel-base/COPYING
  10 */
  11
  12#include <linux/fs.h>
  13#include <linux/mm.h>
  14#include <linux/cpu.h>
  15#include <linux/smp.h>
  16#include <linux/idr.h>
  17#include <linux/file.h>
  18#include <linux/poll.h>
  19#include <linux/slab.h>
  20#include <linux/hash.h>
  21#include <linux/tick.h>
  22#include <linux/sysfs.h>
  23#include <linux/dcache.h>
  24#include <linux/percpu.h>
  25#include <linux/ptrace.h>
  26#include <linux/reboot.h>
  27#include <linux/vmstat.h>
  28#include <linux/device.h>
  29#include <linux/export.h>
  30#include <linux/vmalloc.h>
  31#include <linux/hardirq.h>
  32#include <linux/rculist.h>
  33#include <linux/uaccess.h>
  34#include <linux/syscalls.h>
  35#include <linux/anon_inodes.h>
  36#include <linux/kernel_stat.h>
 
  37#include <linux/perf_event.h>
  38#include <linux/ftrace_event.h>
  39#include <linux/hw_breakpoint.h>
  40#include <linux/mm_types.h>
  41#include <linux/cgroup.h>
 
 
 
 
 
 
  42
  43#include "internal.h"
  44
  45#include <asm/irq_regs.h>
  46
 
 
  47struct remote_function_call {
  48	struct task_struct	*p;
  49	int			(*func)(void *info);
  50	void			*info;
  51	int			ret;
  52};
  53
  54static void remote_function(void *data)
  55{
  56	struct remote_function_call *tfc = data;
  57	struct task_struct *p = tfc->p;
  58
  59	if (p) {
  60		tfc->ret = -EAGAIN;
  61		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
 
 
 
 
 
 
 
 
 
  62			return;
  63	}
  64
  65	tfc->ret = tfc->func(tfc->info);
  66}
  67
  68/**
  69 * task_function_call - call a function on the cpu on which a task runs
  70 * @p:		the task to evaluate
  71 * @func:	the function to be called
  72 * @info:	the function call argument
  73 *
  74 * Calls the function @func when the task is currently running. This might
  75 * be on the current CPU, which just calls the function directly
  76 *
  77 * returns: @func return value, or
  78 *	    -ESRCH  - when the process isn't running
  79 *	    -EAGAIN - when the process moved away
  80 */
  81static int
  82task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  83{
  84	struct remote_function_call data = {
  85		.p	= p,
  86		.func	= func,
  87		.info	= info,
  88		.ret	= -ESRCH, /* No such (running) process */
  89	};
 
  90
  91	if (task_curr(p))
  92		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
 
 
 
  93
  94	return data.ret;
  95}
  96
  97/**
  98 * cpu_function_call - call a function on the cpu
  99 * @func:	the function to be called
 100 * @info:	the function call argument
 101 *
 102 * Calls the function @func on the remote cpu.
 103 *
 104 * returns: @func return value or -ENXIO when the cpu is offline
 105 */
 106static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 107{
 108	struct remote_function_call data = {
 109		.p	= NULL,
 110		.func	= func,
 111		.info	= info,
 112		.ret	= -ENXIO, /* No such CPU */
 113	};
 114
 115	smp_call_function_single(cpu, remote_function, &data, 1);
 116
 117	return data.ret;
 118}
 119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 120#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 121		       PERF_FLAG_FD_OUTPUT  |\
 122		       PERF_FLAG_PID_CGROUP |\
 123		       PERF_FLAG_FD_CLOEXEC)
 124
 125/*
 126 * branch priv levels that need permission checks
 127 */
 128#define PERF_SAMPLE_BRANCH_PERM_PLM \
 129	(PERF_SAMPLE_BRANCH_KERNEL |\
 130	 PERF_SAMPLE_BRANCH_HV)
 131
 132enum event_type_t {
 133	EVENT_FLEXIBLE = 0x1,
 134	EVENT_PINNED = 0x2,
 
 135	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 136};
 137
 138/*
 139 * perf_sched_events : >0 events exist
 140 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 141 */
 142struct static_key_deferred perf_sched_events __read_mostly;
 
 
 
 
 
 
 143static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 144static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 
 145
 146static atomic_t nr_mmap_events __read_mostly;
 147static atomic_t nr_comm_events __read_mostly;
 148static atomic_t nr_task_events __read_mostly;
 149static atomic_t nr_freq_events __read_mostly;
 
 150
 151static LIST_HEAD(pmus);
 152static DEFINE_MUTEX(pmus_lock);
 153static struct srcu_struct pmus_srcu;
 154
 155/*
 156 * perf event paranoia level:
 157 *  -1 - not paranoid at all
 158 *   0 - disallow raw tracepoint access for unpriv
 159 *   1 - disallow cpu events for unpriv
 160 *   2 - disallow kernel profiling for unpriv
 161 */
 162int sysctl_perf_event_paranoid __read_mostly = 1;
 163
 164/* Minimum for 512 kiB + 1 user control page */
 165int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 166
 167/*
 168 * max perf event sample rate
 169 */
 170#define DEFAULT_MAX_SAMPLE_RATE		100000
 171#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 172#define DEFAULT_CPU_TIME_MAX_PERCENT	25
 173
 174int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
 175
 176static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 177static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
 178
 179static int perf_sample_allowed_ns __read_mostly =
 180	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 181
 182void update_perf_cpu_limits(void)
 183{
 184	u64 tmp = perf_sample_period_ns;
 185
 186	tmp *= sysctl_perf_cpu_time_max_percent;
 187	do_div(tmp, 100);
 188	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 
 
 
 189}
 190
 191static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 192
 193int perf_proc_update_handler(struct ctl_table *table, int write,
 194		void __user *buffer, size_t *lenp,
 195		loff_t *ppos)
 196{
 197	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 198
 199	if (ret || !write)
 200		return ret;
 201
 
 
 
 
 
 
 
 202	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 203	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 204	update_perf_cpu_limits();
 205
 206	return 0;
 207}
 208
 209int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 210
 211int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 212				void __user *buffer, size_t *lenp,
 213				loff_t *ppos)
 214{
 215	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
 216
 217	if (ret || !write)
 218		return ret;
 219
 220	update_perf_cpu_limits();
 
 
 
 
 
 
 
 221
 222	return 0;
 223}
 224
 225/*
 226 * perf samples are done in some very critical code paths (NMIs).
 227 * If they take too much CPU time, the system can lock up and not
 228 * get any real work done.  This will drop the sample rate when
 229 * we detect that events are taking too long.
 230 */
 231#define NR_ACCUMULATED_SAMPLES 128
 232static DEFINE_PER_CPU(u64, running_sample_length);
 233
 
 
 
 234static void perf_duration_warn(struct irq_work *w)
 235{
 236	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 237	u64 avg_local_sample_len;
 238	u64 local_samples_len;
 239
 240	local_samples_len = __get_cpu_var(running_sample_length);
 241	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 242
 243	printk_ratelimited(KERN_WARNING
 244			"perf interrupt took too long (%lld > %lld), lowering "
 245			"kernel.perf_event_max_sample_rate to %d\n",
 246			avg_local_sample_len, allowed_ns >> 1,
 247			sysctl_perf_event_sample_rate);
 248}
 249
 250static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 251
 252void perf_sample_event_took(u64 sample_len_ns)
 253{
 254	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 255	u64 avg_local_sample_len;
 256	u64 local_samples_len;
 
 257
 258	if (allowed_ns == 0)
 259		return;
 260
 261	/* decay the counter by 1 average sample */
 262	local_samples_len = __get_cpu_var(running_sample_length);
 263	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
 264	local_samples_len += sample_len_ns;
 265	__get_cpu_var(running_sample_length) = local_samples_len;
 266
 267	/*
 268	 * note: this will be biased artifically low until we have
 269	 * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
 270	 * from having to maintain a count.
 271	 */
 272	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
 
 273
 274	if (avg_local_sample_len <= allowed_ns)
 275		return;
 
 
 
 
 
 
 
 
 
 
 276
 277	if (max_samples_per_tick <= 1)
 278		return;
 279
 280	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
 281	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 282	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 283
 284	update_perf_cpu_limits();
 285
 286	if (!irq_work_queue(&perf_duration_work)) {
 287		early_printk("perf interrupt took too long (%lld > %lld), lowering "
 288			     "kernel.perf_event_max_sample_rate to %d\n",
 289			     avg_local_sample_len, allowed_ns >> 1,
 290			     sysctl_perf_event_sample_rate);
 291	}
 292}
 293
 294static atomic64_t perf_event_id;
 295
 296static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 297			      enum event_type_t event_type);
 298
 299static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 300			     enum event_type_t event_type,
 301			     struct task_struct *task);
 302
 303static void update_context_time(struct perf_event_context *ctx);
 304static u64 perf_event_time(struct perf_event *event);
 305
 306void __weak perf_event_print_debug(void)	{ }
 307
 308extern __weak const char *perf_pmu_name(void)
 309{
 310	return "pmu";
 311}
 312
 313static inline u64 perf_clock(void)
 314{
 315	return local_clock();
 316}
 317
 318static inline struct perf_cpu_context *
 319__get_cpu_context(struct perf_event_context *ctx)
 320{
 321	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 322}
 323
 324static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 325			  struct perf_event_context *ctx)
 326{
 327	raw_spin_lock(&cpuctx->ctx.lock);
 328	if (ctx)
 329		raw_spin_lock(&ctx->lock);
 330}
 331
 332static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 333			    struct perf_event_context *ctx)
 334{
 335	if (ctx)
 336		raw_spin_unlock(&ctx->lock);
 337	raw_spin_unlock(&cpuctx->ctx.lock);
 338}
 339
 340#ifdef CONFIG_CGROUP_PERF
 341
 342/*
 343 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 344 * This is a per-cpu dynamically allocated data structure.
 345 */
 346struct perf_cgroup_info {
 347	u64				time;
 348	u64				timestamp;
 349};
 350
 351struct perf_cgroup {
 352	struct cgroup_subsys_state	css;
 353	struct perf_cgroup_info	__percpu *info;
 354};
 355
 356/*
 357 * Must ensure cgroup is pinned (css_get) before calling
 358 * this function. In other words, we cannot call this function
 359 * if there is no cgroup event for the current CPU context.
 360 */
 361static inline struct perf_cgroup *
 362perf_cgroup_from_task(struct task_struct *task)
 363{
 364	return container_of(task_css(task, perf_event_cgrp_id),
 365			    struct perf_cgroup, css);
 366}
 367
 368static inline bool
 369perf_cgroup_match(struct perf_event *event)
 370{
 371	struct perf_event_context *ctx = event->ctx;
 372	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 373
 374	/* @event doesn't care about cgroup */
 375	if (!event->cgrp)
 376		return true;
 377
 378	/* wants specific cgroup scope but @cpuctx isn't associated with any */
 379	if (!cpuctx->cgrp)
 380		return false;
 381
 382	/*
 383	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
 384	 * also enabled for all its descendant cgroups.  If @cpuctx's
 385	 * cgroup is a descendant of @event's (the test covers identity
 386	 * case), it's a match.
 387	 */
 388	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 389				    event->cgrp->css.cgroup);
 390}
 391
 392static inline void perf_put_cgroup(struct perf_event *event)
 393{
 394	css_put(&event->cgrp->css);
 395}
 396
 397static inline void perf_detach_cgroup(struct perf_event *event)
 398{
 399	perf_put_cgroup(event);
 400	event->cgrp = NULL;
 401}
 402
 403static inline int is_cgroup_event(struct perf_event *event)
 404{
 405	return event->cgrp != NULL;
 406}
 407
 408static inline u64 perf_cgroup_event_time(struct perf_event *event)
 409{
 410	struct perf_cgroup_info *t;
 411
 412	t = per_cpu_ptr(event->cgrp->info, event->cpu);
 413	return t->time;
 414}
 415
 416static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 417{
 418	struct perf_cgroup_info *info;
 419	u64 now;
 420
 421	now = perf_clock();
 422
 423	info = this_cpu_ptr(cgrp->info);
 424
 425	info->time += now - info->timestamp;
 426	info->timestamp = now;
 427}
 428
 429static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 430{
 431	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
 432	if (cgrp_out)
 433		__update_cgrp_time(cgrp_out);
 434}
 435
 436static inline void update_cgrp_time_from_event(struct perf_event *event)
 437{
 438	struct perf_cgroup *cgrp;
 439
 440	/*
 441	 * ensure we access cgroup data only when needed and
 442	 * when we know the cgroup is pinned (css_get)
 443	 */
 444	if (!is_cgroup_event(event))
 445		return;
 446
 447	cgrp = perf_cgroup_from_task(current);
 448	/*
 449	 * Do not update time when cgroup is not active
 450	 */
 451	if (cgrp == event->cgrp)
 452		__update_cgrp_time(event->cgrp);
 453}
 454
 455static inline void
 456perf_cgroup_set_timestamp(struct task_struct *task,
 457			  struct perf_event_context *ctx)
 458{
 459	struct perf_cgroup *cgrp;
 460	struct perf_cgroup_info *info;
 461
 462	/*
 463	 * ctx->lock held by caller
 464	 * ensure we do not access cgroup data
 465	 * unless we have the cgroup pinned (css_get)
 466	 */
 467	if (!task || !ctx->nr_cgroups)
 468		return;
 469
 470	cgrp = perf_cgroup_from_task(task);
 471	info = this_cpu_ptr(cgrp->info);
 472	info->timestamp = ctx->timestamp;
 473}
 474
 475#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 476#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 477
 478/*
 479 * reschedule events based on the cgroup constraint of task.
 480 *
 481 * mode SWOUT : schedule out everything
 482 * mode SWIN : schedule in based on cgroup for next
 483 */
 484void perf_cgroup_switch(struct task_struct *task, int mode)
 485{
 486	struct perf_cpu_context *cpuctx;
 487	struct pmu *pmu;
 488	unsigned long flags;
 489
 490	/*
 491	 * disable interrupts to avoid geting nr_cgroup
 492	 * changes via __perf_event_disable(). Also
 493	 * avoids preemption.
 494	 */
 495	local_irq_save(flags);
 496
 497	/*
 498	 * we reschedule only in the presence of cgroup
 499	 * constrained events.
 500	 */
 501	rcu_read_lock();
 502
 503	list_for_each_entry_rcu(pmu, &pmus, entry) {
 504		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 505		if (cpuctx->unique_pmu != pmu)
 506			continue; /* ensure we process each cpuctx once */
 507
 508		/*
 509		 * perf_cgroup_events says at least one
 510		 * context on this CPU has cgroup events.
 511		 *
 512		 * ctx->nr_cgroups reports the number of cgroup
 513		 * events for a context.
 514		 */
 515		if (cpuctx->ctx.nr_cgroups > 0) {
 516			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 517			perf_pmu_disable(cpuctx->ctx.pmu);
 518
 519			if (mode & PERF_CGROUP_SWOUT) {
 520				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 521				/*
 522				 * must not be done before ctxswout due
 523				 * to event_filter_match() in event_sched_out()
 524				 */
 525				cpuctx->cgrp = NULL;
 526			}
 527
 528			if (mode & PERF_CGROUP_SWIN) {
 529				WARN_ON_ONCE(cpuctx->cgrp);
 530				/*
 531				 * set cgrp before ctxsw in to allow
 532				 * event_filter_match() to not have to pass
 533				 * task around
 
 
 534				 */
 535				cpuctx->cgrp = perf_cgroup_from_task(task);
 536				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 537			}
 538			perf_pmu_enable(cpuctx->ctx.pmu);
 539			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 540		}
 541	}
 542
 543	rcu_read_unlock();
 544
 545	local_irq_restore(flags);
 546}
 547
 548static inline void perf_cgroup_sched_out(struct task_struct *task,
 549					 struct task_struct *next)
 550{
 551	struct perf_cgroup *cgrp1;
 552	struct perf_cgroup *cgrp2 = NULL;
 553
 
 554	/*
 555	 * we come here when we know perf_cgroup_events > 0
 
 
 556	 */
 557	cgrp1 = perf_cgroup_from_task(task);
 558
 559	/*
 560	 * next is NULL when called from perf_event_enable_on_exec()
 561	 * that will systematically cause a cgroup_switch()
 562	 */
 563	if (next)
 564		cgrp2 = perf_cgroup_from_task(next);
 565
 566	/*
 567	 * only schedule out current cgroup events if we know
 568	 * that we are switching to a different cgroup. Otherwise,
 569	 * do no touch the cgroup events.
 570	 */
 571	if (cgrp1 != cgrp2)
 572		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 
 
 573}
 574
 575static inline void perf_cgroup_sched_in(struct task_struct *prev,
 576					struct task_struct *task)
 577{
 578	struct perf_cgroup *cgrp1;
 579	struct perf_cgroup *cgrp2 = NULL;
 580
 
 581	/*
 582	 * we come here when we know perf_cgroup_events > 0
 
 
 583	 */
 584	cgrp1 = perf_cgroup_from_task(task);
 585
 586	/* prev can never be NULL */
 587	cgrp2 = perf_cgroup_from_task(prev);
 588
 589	/*
 590	 * only need to schedule in cgroup events if we are changing
 591	 * cgroup during ctxsw. Cgroup events were not scheduled
 592	 * out of ctxsw out if that was not the case.
 593	 */
 594	if (cgrp1 != cgrp2)
 595		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 
 
 596}
 597
 598static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 599				      struct perf_event_attr *attr,
 600				      struct perf_event *group_leader)
 601{
 602	struct perf_cgroup *cgrp;
 603	struct cgroup_subsys_state *css;
 604	struct fd f = fdget(fd);
 605	int ret = 0;
 606
 607	if (!f.file)
 608		return -EBADF;
 609
 610	css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
 
 611	if (IS_ERR(css)) {
 612		ret = PTR_ERR(css);
 613		goto out;
 614	}
 615
 616	cgrp = container_of(css, struct perf_cgroup, css);
 617	event->cgrp = cgrp;
 618
 619	/*
 620	 * all events in a group must monitor
 621	 * the same cgroup because a task belongs
 622	 * to only one perf cgroup at a time
 623	 */
 624	if (group_leader && group_leader->cgrp != cgrp) {
 625		perf_detach_cgroup(event);
 626		ret = -EINVAL;
 627	}
 628out:
 629	fdput(f);
 630	return ret;
 631}
 632
 633static inline void
 634perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 635{
 636	struct perf_cgroup_info *t;
 637	t = per_cpu_ptr(event->cgrp->info, event->cpu);
 638	event->shadow_ctx_time = now - t->timestamp;
 639}
 640
 641static inline void
 642perf_cgroup_defer_enabled(struct perf_event *event)
 643{
 644	/*
 645	 * when the current task's perf cgroup does not match
 646	 * the event's, we need to remember to call the
 647	 * perf_mark_enable() function the first time a task with
 648	 * a matching perf cgroup is scheduled in.
 649	 */
 650	if (is_cgroup_event(event) && !perf_cgroup_match(event))
 651		event->cgrp_defer_enabled = 1;
 652}
 653
 654static inline void
 655perf_cgroup_mark_enabled(struct perf_event *event,
 656			 struct perf_event_context *ctx)
 657{
 658	struct perf_event *sub;
 659	u64 tstamp = perf_event_time(event);
 660
 661	if (!event->cgrp_defer_enabled)
 662		return;
 663
 664	event->cgrp_defer_enabled = 0;
 665
 666	event->tstamp_enabled = tstamp - event->total_time_enabled;
 667	list_for_each_entry(sub, &event->sibling_list, group_entry) {
 668		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 669			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 670			sub->cgrp_defer_enabled = 0;
 671		}
 672	}
 673}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 674#else /* !CONFIG_CGROUP_PERF */
 675
 676static inline bool
 677perf_cgroup_match(struct perf_event *event)
 678{
 679	return true;
 680}
 681
 682static inline void perf_detach_cgroup(struct perf_event *event)
 683{}
 684
 685static inline int is_cgroup_event(struct perf_event *event)
 686{
 687	return 0;
 688}
 689
 690static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
 691{
 692	return 0;
 693}
 694
 695static inline void update_cgrp_time_from_event(struct perf_event *event)
 696{
 697}
 698
 699static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 700{
 701}
 702
 703static inline void perf_cgroup_sched_out(struct task_struct *task,
 704					 struct task_struct *next)
 705{
 706}
 707
 708static inline void perf_cgroup_sched_in(struct task_struct *prev,
 709					struct task_struct *task)
 710{
 711}
 712
 713static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 714				      struct perf_event_attr *attr,
 715				      struct perf_event *group_leader)
 716{
 717	return -EINVAL;
 718}
 719
 720static inline void
 721perf_cgroup_set_timestamp(struct task_struct *task,
 722			  struct perf_event_context *ctx)
 723{
 724}
 725
 726void
 727perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
 728{
 729}
 730
 731static inline void
 732perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 733{
 734}
 735
 736static inline u64 perf_cgroup_event_time(struct perf_event *event)
 737{
 738	return 0;
 739}
 740
 741static inline void
 742perf_cgroup_defer_enabled(struct perf_event *event)
 743{
 744}
 745
 746static inline void
 747perf_cgroup_mark_enabled(struct perf_event *event,
 748			 struct perf_event_context *ctx)
 749{
 750}
 
 
 
 
 
 
 
 751#endif
 752
 753/*
 754 * set default to be dependent on timer tick just
 755 * like original code
 756 */
 757#define PERF_CPU_HRTIMER (1000 / HZ)
 758/*
 759 * function must be called with interrupts disbled
 760 */
 761static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
 762{
 763	struct perf_cpu_context *cpuctx;
 764	enum hrtimer_restart ret = HRTIMER_NORESTART;
 765	int rotations = 0;
 766
 767	WARN_ON(!irqs_disabled());
 768
 769	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 770
 771	rotations = perf_rotate_context(cpuctx);
 772
 773	/*
 774	 * arm timer if needed
 775	 */
 776	if (rotations) {
 777		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
 778		ret = HRTIMER_RESTART;
 779	}
 780
 781	return ret;
 782}
 783
 784/* CPU is going down */
 785void perf_cpu_hrtimer_cancel(int cpu)
 786{
 787	struct perf_cpu_context *cpuctx;
 788	struct pmu *pmu;
 789	unsigned long flags;
 790
 791	if (WARN_ON(cpu != smp_processor_id()))
 792		return;
 793
 794	local_irq_save(flags);
 795
 796	rcu_read_lock();
 797
 798	list_for_each_entry_rcu(pmu, &pmus, entry) {
 799		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 800
 801		if (pmu->task_ctx_nr == perf_sw_context)
 802			continue;
 803
 804		hrtimer_cancel(&cpuctx->hrtimer);
 805	}
 806
 807	rcu_read_unlock();
 808
 809	local_irq_restore(flags);
 810}
 811
 812static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 813{
 814	struct hrtimer *hr = &cpuctx->hrtimer;
 815	struct pmu *pmu = cpuctx->ctx.pmu;
 816	int timer;
 817
 818	/* no multiplexing needed for SW PMU */
 819	if (pmu->task_ctx_nr == perf_sw_context)
 820		return;
 821
 822	/*
 823	 * check default is sane, if not set then force to
 824	 * default interval (1/tick)
 825	 */
 826	timer = pmu->hrtimer_interval_ms;
 827	if (timer < 1)
 828		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 829
 830	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 831
 832	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 833	hr->function = perf_cpu_hrtimer_handler;
 834}
 835
 836static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
 837{
 838	struct hrtimer *hr = &cpuctx->hrtimer;
 839	struct pmu *pmu = cpuctx->ctx.pmu;
 840
 841	/* not for SW PMU */
 842	if (pmu->task_ctx_nr == perf_sw_context)
 843		return;
 844
 845	if (hrtimer_active(hr))
 846		return;
 847
 848	if (!hrtimer_callback_running(hr))
 849		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
 850					 0, HRTIMER_MODE_REL_PINNED, 0);
 851}
 852
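/*
 * perf_pmu_disable()/perf_pmu_enable() nest: the PMU is disabled on the
 * first call and only re-enabled once the per-cpu disable count drops
 * back to zero.
 */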
 853void perf_pmu_disable(struct pmu *pmu)
 854{
 855	int *count = this_cpu_ptr(pmu->pmu_disable_count);
 856	if (!(*count)++)
 857		pmu->pmu_disable(pmu);
 858}
 859
 860void perf_pmu_enable(struct pmu *pmu)
 861{
 862	int *count = this_cpu_ptr(pmu->pmu_disable_count);
 863	if (!--(*count))
 864		pmu->pmu_enable(pmu);
 865}
 866
 867static DEFINE_PER_CPU(struct list_head, rotation_list);
 868
 869/*
 870 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 871 * because they're strictly cpu affine and rotate_start is called with IRQs
 872 * disabled, while rotate_context is called from IRQ context.
 873 */
 874static void perf_pmu_rotate_start(struct pmu *pmu)
 875{
 876	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 877	struct list_head *head = &__get_cpu_var(rotation_list);
 878
 879	WARN_ON(!irqs_disabled());
 880
 881	if (list_empty(&cpuctx->rotation_list))
 882		list_add(&cpuctx->rotation_list, head);
 883}
 884
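/*
 * Grab a reference on a context; the WARN fires if the refcount had
 * already dropped to zero, i.e. the context was being freed.
 */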
 885static void get_ctx(struct perf_event_context *ctx)
 886{
 887	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 888}
 889
 890static void put_ctx(struct perf_event_context *ctx)
 891{
 892	if (atomic_dec_and_test(&ctx->refcount)) {
 893		if (ctx->parent_ctx)
 894			put_ctx(ctx->parent_ctx);
 895		if (ctx->task)
 896			put_task_struct(ctx->task);
 897		kfree_rcu(ctx, rcu_head);
 898	}
 899}
 900
 901static void unclone_ctx(struct perf_event_context *ctx)
 902{
 903	if (ctx->parent_ctx) {
 904		put_ctx(ctx->parent_ctx);
 905		ctx->parent_ctx = NULL;
 906	}
 907	ctx->generation++;
 908}
 909
 910static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 911{
 912	/*
 913	 * only top level events have the pid namespace they were created in
 914	 */
 915	if (event->parent)
 916		event = event->parent;
 917
 918	return task_tgid_nr_ns(p, event->ns);
 919}
 920
 921static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 922{
 923	/*
 924	 * only top level events have the pid namespace they were created in
 925	 */
 926	if (event->parent)
 927		event = event->parent;
 928
 929	return task_pid_nr_ns(p, event->ns);
 930}
 931
 932/*
 933 * If we inherit events we want to return the parent event id
 934 * to userspace.
 935 */
 936static u64 primary_event_id(struct perf_event *event)
 937{
 938	u64 id = event->id;
 939
 940	if (event->parent)
 941		id = event->parent->id;
 942
 943	return id;
 944}
 945
 946/*
 947 * Get the perf_event_context for a task and lock it.
  948 * This has to cope with the fact that until it is locked,
 949 * the context could get moved to another task.
 950 */
 951static struct perf_event_context *
 952perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 953{
 954	struct perf_event_context *ctx;
 955
 956retry:
 957	/*
 958	 * One of the few rules of preemptible RCU is that one cannot do
 959	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
 960	 * part of the read side critical section was preemptible -- see
 961	 * rcu_read_unlock_special().
 962	 *
 963	 * Since ctx->lock nests under rq->lock we must ensure the entire read
 964	 * side critical section is non-preemptible.
 965	 */
 966	preempt_disable();
 967	rcu_read_lock();
 968	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 969	if (ctx) {
 970		/*
 971		 * If this context is a clone of another, it might
 972		 * get swapped for another underneath us by
 973		 * perf_event_task_sched_out, though the
 974		 * rcu_read_lock() protects us from any context
 975		 * getting freed.  Lock the context and check if it
 976		 * got swapped before we could get the lock, and retry
 977		 * if so.  If we locked the right context, then it
 978		 * can't get swapped on us any more.
 979		 */
 980		raw_spin_lock_irqsave(&ctx->lock, *flags);
 981		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 982			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 983			rcu_read_unlock();
 984			preempt_enable();
 985			goto retry;
 986		}
 987
 988		if (!atomic_inc_not_zero(&ctx->refcount)) {
 989			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 990			ctx = NULL;
 991		}
 992	}
 993	rcu_read_unlock();
 994	preempt_enable();
 995	return ctx;
 996}
 997
 998/*
 999 * Get the context for a task and increment its pin_count so it
1000 * can't get swapped to another task.  This also increments its
1001 * reference count so that the context can't get freed.
1002 */
1003static struct perf_event_context *
1004perf_pin_task_context(struct task_struct *task, int ctxn)
1005{
1006	struct perf_event_context *ctx;
1007	unsigned long flags;
1008
1009	ctx = perf_lock_task_context(task, ctxn, &flags);
1010	if (ctx) {
1011		++ctx->pin_count;
1012		raw_spin_unlock_irqrestore(&ctx->lock, flags);
1013	}
1014	return ctx;
1015}
1016
1017static void perf_unpin_context(struct perf_event_context *ctx)
1018{
1019	unsigned long flags;
1020
1021	raw_spin_lock_irqsave(&ctx->lock, flags);
1022	--ctx->pin_count;
1023	raw_spin_unlock_irqrestore(&ctx->lock, flags);
1024}
1025
1026/*
1027 * Update the record of the current time in a context.
1028 */
1029static void update_context_time(struct perf_event_context *ctx)
1030{
1031	u64 now = perf_clock();
1032
1033	ctx->time += now - ctx->timestamp;
1034	ctx->timestamp = now;
1035}
1036
1037static u64 perf_event_time(struct perf_event *event)
1038{
1039	struct perf_event_context *ctx = event->ctx;
1040
1041	if (is_cgroup_event(event))
1042		return perf_cgroup_event_time(event);
1043
1044	return ctx ? ctx->time : 0;
1045}
1046
1047/*
 1048 * Update the total_time_enabled and total_time_running fields for an event.
1049 * The caller of this function needs to hold the ctx->lock.
1050 */
1051static void update_event_times(struct perf_event *event)
1052{
1053	struct perf_event_context *ctx = event->ctx;
1054	u64 run_end;
1055
1056	if (event->state < PERF_EVENT_STATE_INACTIVE ||
1057	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1058		return;
1059	/*
1060	 * in cgroup mode, time_enabled represents
1061	 * the time the event was enabled AND active
1062	 * tasks were in the monitored cgroup. This is
1063	 * independent of the activity of the context as
1064	 * there may be a mix of cgroup and non-cgroup events.
1065	 *
1066	 * That is why we treat cgroup events differently
1067	 * here.
1068	 */
1069	if (is_cgroup_event(event))
1070		run_end = perf_cgroup_event_time(event);
1071	else if (ctx->is_active)
1072		run_end = ctx->time;
1073	else
1074		run_end = event->tstamp_stopped;
1075
1076	event->total_time_enabled = run_end - event->tstamp_enabled;
1077
1078	if (event->state == PERF_EVENT_STATE_INACTIVE)
1079		run_end = event->tstamp_stopped;
1080	else
1081		run_end = perf_event_time(event);
1082
1083	event->total_time_running = run_end - event->tstamp_running;
1084
1085}
1086
1087/*
1088 * Update total_time_enabled and total_time_running for all events in a group.
1089 */
1090static void update_group_times(struct perf_event *leader)
1091{
1092	struct perf_event *event;
1093
1094	update_event_times(leader);
1095	list_for_each_entry(event, &leader->sibling_list, group_entry)
1096		update_event_times(event);
1097}
1098
1099static struct list_head *
1100ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1101{
1102	if (event->attr.pinned)
1103		return &ctx->pinned_groups;
1104	else
1105		return &ctx->flexible_groups;
1106}
1107
1108/*
 1109 * Add an event to the lists for its context.
1110 * Must be called with ctx->mutex and ctx->lock held.
1111 */
1112static void
1113list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1114{
1115	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1116	event->attach_state |= PERF_ATTACH_CONTEXT;
1117
1118	/*
1119	 * If we're a stand alone event or group leader, we go to the context
1120	 * list, group events are kept attached to the group so that
1121	 * perf_group_detach can, at all times, locate all siblings.
1122	 */
1123	if (event->group_leader == event) {
1124		struct list_head *list;
1125
1126		if (is_software_event(event))
1127			event->group_flags |= PERF_GROUP_SOFTWARE;
1128
1129		list = ctx_group_list(event, ctx);
1130		list_add_tail(&event->group_entry, list);
1131	}
1132
1133	if (is_cgroup_event(event))
1134		ctx->nr_cgroups++;
1135
1136	if (has_branch_stack(event))
1137		ctx->nr_branch_stack++;
1138
1139	list_add_rcu(&event->event_entry, &ctx->event_list);
1140	if (!ctx->nr_events)
1141		perf_pmu_rotate_start(ctx->pmu);
1142	ctx->nr_events++;
1143	if (event->attr.inherit_stat)
1144		ctx->nr_stat++;
1145
1146	ctx->generation++;
1147}
1148
1149/*
1150 * Initialize event state based on the perf_event_attr::disabled.
1151 */
1152static inline void perf_event__state_init(struct perf_event *event)
1153{
1154	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1155					      PERF_EVENT_STATE_INACTIVE;
1156}
1157
1158/*
1159 * Called at perf_event creation and when events are attached/detached from a
1160 * group.
1161 */
1162static void perf_event__read_size(struct perf_event *event)
1163{
1164	int entry = sizeof(u64); /* value */
1165	int size = 0;
1166	int nr = 1;
1167
1168	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1169		size += sizeof(u64);
1170
1171	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1172		size += sizeof(u64);
1173
1174	if (event->attr.read_format & PERF_FORMAT_ID)
1175		entry += sizeof(u64);
1176
1177	if (event->attr.read_format & PERF_FORMAT_GROUP) {
1178		nr += event->group_leader->nr_siblings;
1179		size += sizeof(u64);
1180	}
1181
1182	size += entry * nr;
1183	event->read_size = size;
1184}
1185
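/*
 * Precompute the fixed part of the sample record size implied by
 * attr.sample_type, so the output code can size records up front.
 */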
1186static void perf_event__header_size(struct perf_event *event)
1187{
1188	struct perf_sample_data *data;
1189	u64 sample_type = event->attr.sample_type;
1190	u16 size = 0;
1191
1192	perf_event__read_size(event);
1193
1194	if (sample_type & PERF_SAMPLE_IP)
1195		size += sizeof(data->ip);
1196
1197	if (sample_type & PERF_SAMPLE_ADDR)
1198		size += sizeof(data->addr);
1199
1200	if (sample_type & PERF_SAMPLE_PERIOD)
1201		size += sizeof(data->period);
1202
1203	if (sample_type & PERF_SAMPLE_WEIGHT)
1204		size += sizeof(data->weight);
1205
1206	if (sample_type & PERF_SAMPLE_READ)
1207		size += event->read_size;
1208
1209	if (sample_type & PERF_SAMPLE_DATA_SRC)
1210		size += sizeof(data->data_src.val);
1211
1212	if (sample_type & PERF_SAMPLE_TRANSACTION)
1213		size += sizeof(data->txn);
1214
1215	event->header_size = size;
1216}
1217
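/*
 * Likewise for the sample identifier fields (TID, TIME, ID, STREAM_ID,
 * CPU, IDENTIFIER) selected in attr.sample_type.
 */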
1218static void perf_event__id_header_size(struct perf_event *event)
1219{
1220	struct perf_sample_data *data;
1221	u64 sample_type = event->attr.sample_type;
1222	u16 size = 0;
1223
1224	if (sample_type & PERF_SAMPLE_TID)
1225		size += sizeof(data->tid_entry);
1226
1227	if (sample_type & PERF_SAMPLE_TIME)
1228		size += sizeof(data->time);
1229
1230	if (sample_type & PERF_SAMPLE_IDENTIFIER)
1231		size += sizeof(data->id);
1232
1233	if (sample_type & PERF_SAMPLE_ID)
1234		size += sizeof(data->id);
1235
1236	if (sample_type & PERF_SAMPLE_STREAM_ID)
1237		size += sizeof(data->stream_id);
1238
1239	if (sample_type & PERF_SAMPLE_CPU)
1240		size += sizeof(data->cpu_entry);
1241
1242	event->id_header_size = size;
1243}
1244
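/*
 * Attach an event to its group leader's sibling list and recompute the
 * header size of every event in the group.
 */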
1245static void perf_group_attach(struct perf_event *event)
1246{
1247	struct perf_event *group_leader = event->group_leader, *pos;
1248
1249	/*
1250	 * We can have double attach due to group movement in perf_event_open.
1251	 */
1252	if (event->attach_state & PERF_ATTACH_GROUP)
1253		return;
1254
1255	event->attach_state |= PERF_ATTACH_GROUP;
1256
1257	if (group_leader == event)
1258		return;
1259
1260	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1261			!is_software_event(event))
1262		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1263
1264	list_add_tail(&event->group_entry, &group_leader->sibling_list);
1265	group_leader->nr_siblings++;
1266
1267	perf_event__header_size(group_leader);
1268
1269	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1270		perf_event__header_size(pos);
1271}
1272
1273/*
 1274 * Remove an event from the lists for its context.
1275 * Must be called with ctx->mutex and ctx->lock held.
1276 */
1277static void
1278list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1279{
1280	struct perf_cpu_context *cpuctx;
1281	/*
1282	 * We can have double detach due to exit/hot-unplug + close.
1283	 */
1284	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1285		return;
1286
1287	event->attach_state &= ~PERF_ATTACH_CONTEXT;
1288
1289	if (is_cgroup_event(event)) {
1290		ctx->nr_cgroups--;
1291		cpuctx = __get_cpu_context(ctx);
1292		/*
1293		 * if there are no more cgroup events
 1294		 * then clear cgrp to avoid a stale pointer
1295		 * in update_cgrp_time_from_cpuctx()
1296		 */
1297		if (!ctx->nr_cgroups)
1298			cpuctx->cgrp = NULL;
1299	}
1300
1301	if (has_branch_stack(event))
1302		ctx->nr_branch_stack--;
1303
1304	ctx->nr_events--;
1305	if (event->attr.inherit_stat)
1306		ctx->nr_stat--;
1307
1308	list_del_rcu(&event->event_entry);
1309
1310	if (event->group_leader == event)
1311		list_del_init(&event->group_entry);
1312
1313	update_group_times(event);
1314
1315	/*
1316	 * If event was in error state, then keep it
1317	 * that way, otherwise bogus counts will be
1318	 * returned on read(). The only way to get out
1319	 * of error state is by explicit re-enabling
1320	 * of the event
1321	 */
1322	if (event->state > PERF_EVENT_STATE_OFF)
1323		event->state = PERF_EVENT_STATE_OFF;
1324
1325	ctx->generation++;
1326}
1327
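/*
 * Detach an event from its group: a sibling is simply unlinked from the
 * leader, while removing the leader upgrades its siblings to singleton
 * events.
 */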
1328static void perf_group_detach(struct perf_event *event)
1329{
1330	struct perf_event *sibling, *tmp;
1331	struct list_head *list = NULL;
1332
1333	/*
1334	 * We can have double detach due to exit/hot-unplug + close.
1335	 */
1336	if (!(event->attach_state & PERF_ATTACH_GROUP))
1337		return;
1338
1339	event->attach_state &= ~PERF_ATTACH_GROUP;
1340
1341	/*
1342	 * If this is a sibling, remove it from its group.
1343	 */
1344	if (event->group_leader != event) {
1345		list_del_init(&event->group_entry);
1346		event->group_leader->nr_siblings--;
1347		goto out;
1348	}
1349
1350	if (!list_empty(&event->group_entry))
1351		list = &event->group_entry;
1352
1353	/*
1354	 * If this was a group event with sibling events then
1355	 * upgrade the siblings to singleton events by adding them
1356	 * to whatever list we are on.
1357	 */
1358	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1359		if (list)
1360			list_move_tail(&sibling->group_entry, list);
1361		sibling->group_leader = sibling;
1362
1363		/* Inherit group flags from the previous leader */
1364		sibling->group_flags = event->group_flags;
1365	}
1366
1367out:
1368	perf_event__header_size(event->group_leader);
1369
1370	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1371		perf_event__header_size(tmp);
1372}
1373
1374static inline int
1375event_filter_match(struct perf_event *event)
1376{
1377	return (event->cpu == -1 || event->cpu == smp_processor_id())
1378	    && perf_cgroup_match(event);
1379}
1380
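/*
 * Take one event off the PMU: update its timestamps, mark it INACTIVE
 * (or OFF if a disable was pending) and drop the context's active counts.
 */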
1381static void
1382event_sched_out(struct perf_event *event,
1383		  struct perf_cpu_context *cpuctx,
1384		  struct perf_event_context *ctx)
1385{
1386	u64 tstamp = perf_event_time(event);
1387	u64 delta;
1388	/*
1389	 * An event which could not be activated because of
1390	 * filter mismatch still needs to have its timings
 1391	 * maintained, otherwise bogus information is returned
1392	 * via read() for time_enabled, time_running:
1393	 */
1394	if (event->state == PERF_EVENT_STATE_INACTIVE
1395	    && !event_filter_match(event)) {
1396		delta = tstamp - event->tstamp_stopped;
1397		event->tstamp_running += delta;
1398		event->tstamp_stopped = tstamp;
1399	}
1400
1401	if (event->state != PERF_EVENT_STATE_ACTIVE)
1402		return;
1403
1404	perf_pmu_disable(event->pmu);
1405
1406	event->state = PERF_EVENT_STATE_INACTIVE;
1407	if (event->pending_disable) {
1408		event->pending_disable = 0;
1409		event->state = PERF_EVENT_STATE_OFF;
1410	}
1411	event->tstamp_stopped = tstamp;
1412	event->pmu->del(event, 0);
1413	event->oncpu = -1;
1414
1415	if (!is_software_event(event))
1416		cpuctx->active_oncpu--;
1417	ctx->nr_active--;
1418	if (event->attr.freq && event->attr.sample_freq)
1419		ctx->nr_freq--;
1420	if (event->attr.exclusive || !cpuctx->active_oncpu)
1421		cpuctx->exclusive = 0;
1422
1423	perf_pmu_enable(event->pmu);
1424}
1425
1426static void
1427group_sched_out(struct perf_event *group_event,
1428		struct perf_cpu_context *cpuctx,
1429		struct perf_event_context *ctx)
1430{
1431	struct perf_event *event;
1432	int state = group_event->state;
1433
1434	event_sched_out(group_event, cpuctx, ctx);
1435
1436	/*
1437	 * Schedule out siblings (if any):
1438	 */
1439	list_for_each_entry(event, &group_event->sibling_list, group_entry)
1440		event_sched_out(event, cpuctx, ctx);
1441
1442	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1443		cpuctx->exclusive = 0;
1444}
1445
1446struct remove_event {
1447	struct perf_event *event;
1448	bool detach_group;
1449};
1450
1451/*
1452 * Cross CPU call to remove a performance event
1453 *
1454 * We disable the event on the hardware level first. After that we
1455 * remove it from the context list.
1456 */
1457static int __perf_remove_from_context(void *info)
1458{
1459	struct remove_event *re = info;
1460	struct perf_event *event = re->event;
1461	struct perf_event_context *ctx = event->ctx;
1462	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1463
1464	raw_spin_lock(&ctx->lock);
1465	event_sched_out(event, cpuctx, ctx);
1466	if (re->detach_group)
1467		perf_group_detach(event);
1468	list_del_event(event, ctx);
1469	if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1470		ctx->is_active = 0;
1471		cpuctx->task_ctx = NULL;
1472	}
1473	raw_spin_unlock(&ctx->lock);
1474
1475	return 0;
1476}
1477
1478
1479/*
1480 * Remove the event from a task's (or a CPU's) list of events.
1481 *
1482 * CPU events are removed with a smp call. For task events we only
1483 * call when the task is on a CPU.
1484 *
1485 * If event->ctx is a cloned context, callers must make sure that
1486 * every task struct that event->ctx->task could possibly point to
1487 * remains valid.  This is OK when called from perf_release since
1488 * that only calls us on the top-level context, which can't be a clone.
1489 * When called from perf_event_exit_task, it's OK because the
1490 * context has been detached from its task.
1491 */
1492static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1493{
1494	struct perf_event_context *ctx = event->ctx;
1495	struct task_struct *task = ctx->task;
1496	struct remove_event re = {
1497		.event = event,
1498		.detach_group = detach_group,
1499	};
1500
1501	lockdep_assert_held(&ctx->mutex);
1502
1503	if (!task) {
1504		/*
1505		 * Per cpu events are removed via an smp call and
1506		 * the removal is always successful.
1507		 */
1508		cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1509		return;
1510	}
1511
1512retry:
1513	if (!task_function_call(task, __perf_remove_from_context, &re))
1514		return;
1515
1516	raw_spin_lock_irq(&ctx->lock);
1517	/*
1518	 * If we failed to find a running task, but find the context active now
1519	 * that we've acquired the ctx->lock, retry.
1520	 */
1521	if (ctx->is_active) {
1522		raw_spin_unlock_irq(&ctx->lock);
1523		goto retry;
1524	}
1525
1526	/*
 1527	 * Since the task isn't running, it's safe to remove the event; our
 1528	 * holding the ctx->lock ensures the task won't get scheduled in.
1529	 */
1530	if (detach_group)
1531		perf_group_detach(event);
1532	list_del_event(event, ctx);
1533	raw_spin_unlock_irq(&ctx->lock);
1534}
1535
1536/*
1537 * Cross CPU call to disable a performance event
1538 */
1539int __perf_event_disable(void *info)
1540{
1541	struct perf_event *event = info;
1542	struct perf_event_context *ctx = event->ctx;
1543	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1544
1545	/*
1546	 * If this is a per-task event, need to check whether this
1547	 * event's task is the current task on this cpu.
1548	 *
1549	 * Can trigger due to concurrent perf_event_context_sched_out()
1550	 * flipping contexts around.
1551	 */
1552	if (ctx->task && cpuctx->task_ctx != ctx)
1553		return -EINVAL;
1554
1555	raw_spin_lock(&ctx->lock);
1556
1557	/*
1558	 * If the event is on, turn it off.
1559	 * If it is in error state, leave it in error state.
1560	 */
1561	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1562		update_context_time(ctx);
1563		update_cgrp_time_from_event(event);
1564		update_group_times(event);
1565		if (event == event->group_leader)
1566			group_sched_out(event, cpuctx, ctx);
1567		else
1568			event_sched_out(event, cpuctx, ctx);
1569		event->state = PERF_EVENT_STATE_OFF;
1570	}
1571
1572	raw_spin_unlock(&ctx->lock);
1573
1574	return 0;
1575}
1576
1577/*
 1578 * Disable an event.
1579 *
1580 * If event->ctx is a cloned context, callers must make sure that
1581 * every task struct that event->ctx->task could possibly point to
 1582 * remains valid.  This condition is satisfied when called through
1583 * perf_event_for_each_child or perf_event_for_each because they
1584 * hold the top-level event's child_mutex, so any descendant that
1585 * goes to exit will block in sync_child_event.
1586 * When called from perf_pending_event it's OK because event->ctx
1587 * is the current context on this CPU and preemption is disabled,
1588 * hence we can't get into perf_event_task_sched_out for this context.
1589 */
1590void perf_event_disable(struct perf_event *event)
1591{
1592	struct perf_event_context *ctx = event->ctx;
1593	struct task_struct *task = ctx->task;
1594
1595	if (!task) {
1596		/*
1597		 * Disable the event on the cpu that it's on
1598		 */
1599		cpu_function_call(event->cpu, __perf_event_disable, event);
1600		return;
1601	}
1602
1603retry:
1604	if (!task_function_call(task, __perf_event_disable, event))
1605		return;
1606
1607	raw_spin_lock_irq(&ctx->lock);
1608	/*
1609	 * If the event is still active, we need to retry the cross-call.
1610	 */
1611	if (event->state == PERF_EVENT_STATE_ACTIVE) {
1612		raw_spin_unlock_irq(&ctx->lock);
1613		/*
1614		 * Reload the task pointer, it might have been changed by
1615		 * a concurrent perf_event_context_sched_out().
1616		 */
1617		task = ctx->task;
1618		goto retry;
1619	}
1620
1621	/*
1622	 * Since we have the lock this context can't be scheduled
1623	 * in, so we can change the state safely.
1624	 */
1625	if (event->state == PERF_EVENT_STATE_INACTIVE) {
1626		update_group_times(event);
1627		event->state = PERF_EVENT_STATE_OFF;
1628	}
1629	raw_spin_unlock_irq(&ctx->lock);
1630}
1631EXPORT_SYMBOL_GPL(perf_event_disable);
1632
1633static void perf_set_shadow_time(struct perf_event *event,
1634				 struct perf_event_context *ctx,
1635				 u64 tstamp)
1636{
1637	/*
1638	 * use the correct time source for the time snapshot
1639	 *
1640	 * We could get by without this by leveraging the
1641	 * fact that to get to this function, the caller
1642	 * has most likely already called update_context_time()
 1643	 * and update_cgrp_time_xx() and thus both timestamps
 1644	 * are identical (or very close). Given that tstamp is
 1645	 * already adjusted for cgroup, we could say that:
1646	 *    tstamp - ctx->timestamp
1647	 * is equivalent to
1648	 *    tstamp - cgrp->timestamp.
1649	 *
1650	 * Then, in perf_output_read(), the calculation would
1651	 * work with no changes because:
1652	 * - event is guaranteed scheduled in
1653	 * - no scheduled out in between
1654	 * - thus the timestamp would be the same
1655	 *
1656	 * But this is a bit hairy.
1657	 *
1658	 * So instead, we have an explicit cgroup call to remain
 1659	 * within the time source all along. We believe it
1660	 * is cleaner and simpler to understand.
1661	 */
1662	if (is_cgroup_event(event))
1663		perf_cgroup_set_shadow_time(event, tstamp);
1664	else
1665		event->shadow_ctx_time = tstamp - ctx->timestamp;
1666}
1667
1668#define MAX_INTERRUPTS (~0ULL)
1669
1670static void perf_log_throttle(struct perf_event *event, int enable);
1671
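/*
 * Put a single event on the PMU. Returns 0 on success, or -EAGAIN when
 * pmu->add() finds no room, in which case the event stays INACTIVE.
 */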
1672static int
1673event_sched_in(struct perf_event *event,
1674		 struct perf_cpu_context *cpuctx,
1675		 struct perf_event_context *ctx)
1676{
1677	u64 tstamp = perf_event_time(event);
1678	int ret = 0;
1679
1680	if (event->state <= PERF_EVENT_STATE_OFF)
1681		return 0;
1682
1683	event->state = PERF_EVENT_STATE_ACTIVE;
1684	event->oncpu = smp_processor_id();
1685
1686	/*
1687	 * Unthrottle events, since we scheduled we might have missed several
1688	 * ticks already, also for a heavily scheduling task there is little
1689	 * guarantee it'll get a tick in a timely manner.
1690	 */
1691	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1692		perf_log_throttle(event, 1);
1693		event->hw.interrupts = 0;
1694	}
1695
1696	/*
1697	 * The new state must be visible before we turn it on in the hardware:
1698	 */
1699	smp_wmb();
1700
1701	perf_pmu_disable(event->pmu);
1702
1703	if (event->pmu->add(event, PERF_EF_START)) {
1704		event->state = PERF_EVENT_STATE_INACTIVE;
1705		event->oncpu = -1;
1706		ret = -EAGAIN;
1707		goto out;
1708	}
1709
1710	event->tstamp_running += tstamp - event->tstamp_stopped;
1711
1712	perf_set_shadow_time(event, ctx, tstamp);
1713
1714	if (!is_software_event(event))
1715		cpuctx->active_oncpu++;
1716	ctx->nr_active++;
1717	if (event->attr.freq && event->attr.sample_freq)
1718		ctx->nr_freq++;
1719
1720	if (event->attr.exclusive)
1721		cpuctx->exclusive = 1;
1722
1723out:
1724	perf_pmu_enable(event->pmu);
1725
1726	return ret;
1727}
1728
1729static int
1730group_sched_in(struct perf_event *group_event,
1731	       struct perf_cpu_context *cpuctx,
1732	       struct perf_event_context *ctx)
1733{
1734	struct perf_event *event, *partial_group = NULL;
1735	struct pmu *pmu = ctx->pmu;
1736	u64 now = ctx->time;
1737	bool simulate = false;
1738
1739	if (group_event->state == PERF_EVENT_STATE_OFF)
1740		return 0;
1741
1742	pmu->start_txn(pmu);
1743
1744	if (event_sched_in(group_event, cpuctx, ctx)) {
1745		pmu->cancel_txn(pmu);
1746		perf_cpu_hrtimer_restart(cpuctx);
1747		return -EAGAIN;
1748	}
1749
1750	/*
1751	 * Schedule in siblings as one group (if any):
1752	 */
1753	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1754		if (event_sched_in(event, cpuctx, ctx)) {
1755			partial_group = event;
1756			goto group_error;
1757		}
1758	}
1759
1760	if (!pmu->commit_txn(pmu))
1761		return 0;
1762
1763group_error:
1764	/*
1765	 * Groups can be scheduled in as one unit only, so undo any
1766	 * partial group before returning:
1767	 * The events up to the failed event are scheduled out normally,
1768	 * tstamp_stopped will be updated.
1769	 *
1770	 * The failed events and the remaining siblings need to have
1771	 * their timings updated as if they had gone thru event_sched_in()
1772	 * and event_sched_out(). This is required to get consistent timings
1773	 * across the group. This also takes care of the case where the group
1774	 * could never be scheduled by ensuring tstamp_stopped is set to mark
1775	 * the time the event was actually stopped, such that time delta
1776	 * calculation in update_event_times() is correct.
1777	 */
1778	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1779		if (event == partial_group)
1780			simulate = true;
1781
1782		if (simulate) {
1783			event->tstamp_running += now - event->tstamp_stopped;
1784			event->tstamp_stopped = now;
1785		} else {
1786			event_sched_out(event, cpuctx, ctx);
1787		}
1788	}
1789	event_sched_out(group_event, cpuctx, ctx);
1790
1791	pmu->cancel_txn(pmu);
1792
1793	perf_cpu_hrtimer_restart(cpuctx);
1794
1795	return -EAGAIN;
1796}
1797
1798/*
1799 * Work out whether we can put this event group on the CPU now.
1800 */
1801static int group_can_go_on(struct perf_event *event,
1802			   struct perf_cpu_context *cpuctx,
1803			   int can_add_hw)
1804{
1805	/*
1806	 * Groups consisting entirely of software events can always go on.
1807	 */
1808	if (event->group_flags & PERF_GROUP_SOFTWARE)
1809		return 1;
1810	/*
1811	 * If an exclusive group is already on, no other hardware
1812	 * events can go on.
1813	 */
1814	if (cpuctx->exclusive)
1815		return 0;
1816	/*
1817	 * If this group is exclusive and there are already
1818	 * events on the CPU, it can't go on.
1819	 */
1820	if (event->attr.exclusive && cpuctx->active_oncpu)
1821		return 0;
1822	/*
1823	 * Otherwise, try to add it if all previous groups were able
1824	 * to go on.
1825	 */
1826	return can_add_hw;
1827}
1828
1829static void add_event_to_ctx(struct perf_event *event,
1830			       struct perf_event_context *ctx)
1831{
1832	u64 tstamp = perf_event_time(event);
1833
1834	list_add_event(event, ctx);
1835	perf_group_attach(event);
1836	event->tstamp_enabled = tstamp;
1837	event->tstamp_running = tstamp;
1838	event->tstamp_stopped = tstamp;
1839}
1840
1841static void task_ctx_sched_out(struct perf_event_context *ctx);
1842static void
1843ctx_sched_in(struct perf_event_context *ctx,
1844	     struct perf_cpu_context *cpuctx,
1845	     enum event_type_t event_type,
1846	     struct task_struct *task);
1847
1848static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1849				struct perf_event_context *ctx,
1850				struct task_struct *task)
1851{
1852	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1853	if (ctx)
1854		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1855	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1856	if (ctx)
1857		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1858}
1859
1860/*
1861 * Cross CPU call to install and enable a performance event
1862 *
1863 * Must be called with ctx->mutex held
1864 */
1865static int  __perf_install_in_context(void *info)
1866{
1867	struct perf_event *event = info;
1868	struct perf_event_context *ctx = event->ctx;
1869	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1870	struct perf_event_context *task_ctx = cpuctx->task_ctx;
1871	struct task_struct *task = current;
1872
1873	perf_ctx_lock(cpuctx, task_ctx);
1874	perf_pmu_disable(cpuctx->ctx.pmu);
1875
1876	/*
1877	 * If there was an active task_ctx schedule it out.
1878	 */
1879	if (task_ctx)
1880		task_ctx_sched_out(task_ctx);
1881
1882	/*
1883	 * If the context we're installing events in is not the
1884	 * active task_ctx, flip them.
1885	 */
1886	if (ctx->task && task_ctx != ctx) {
1887		if (task_ctx)
1888			raw_spin_unlock(&task_ctx->lock);
1889		raw_spin_lock(&ctx->lock);
1890		task_ctx = ctx;
1891	}
1892
1893	if (task_ctx) {
1894		cpuctx->task_ctx = task_ctx;
1895		task = task_ctx->task;
1896	}
1897
1898	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1899
1900	update_context_time(ctx);
1901	/*
1902	 * update cgrp time only if current cgrp
1903	 * matches event->cgrp. Must be done before
1904	 * calling add_event_to_ctx()
1905	 */
1906	update_cgrp_time_from_event(event);
1907
1908	add_event_to_ctx(event, ctx);
1909
1910	/*
1911	 * Schedule everything back in
1912	 */
1913	perf_event_sched_in(cpuctx, task_ctx, task);
1914
1915	perf_pmu_enable(cpuctx->ctx.pmu);
1916	perf_ctx_unlock(cpuctx, task_ctx);
1917
1918	return 0;
1919}
1920
1921/*
1922 * Attach a performance event to a context
1923 *
1924 * First we add the event to the list with the hardware enable bit
1925 * in event->hw_config cleared.
1926 *
1927 * If the event is attached to a task which is on a CPU we use a smp
1928 * call to enable it in the task context. The task might have been
1929 * scheduled away, but we check this in the smp call again.
1930 */
1931static void
1932perf_install_in_context(struct perf_event_context *ctx,
1933			struct perf_event *event,
1934			int cpu)
1935{
1936	struct task_struct *task = ctx->task;
1937
1938	lockdep_assert_held(&ctx->mutex);
1939
1940	event->ctx = ctx;
1941	if (event->cpu != -1)
1942		event->cpu = cpu;
1943
1944	if (!task) {
1945		/*
1946		 * Per cpu events are installed via an smp call and
1947		 * the install is always successful.
1948		 */
1949		cpu_function_call(cpu, __perf_install_in_context, event);
1950		return;
1951	}
1952
1953retry:
1954	if (!task_function_call(task, __perf_install_in_context, event))
1955		return;
1956
1957	raw_spin_lock_irq(&ctx->lock);
1958	/*
1959	 * If we failed to find a running task, but find the context active now
1960	 * that we've acquired the ctx->lock, retry.
1961	 */
1962	if (ctx->is_active) {
1963		raw_spin_unlock_irq(&ctx->lock);
1964		goto retry;
1965	}
1966
1967	/*
 1968	 * Since the task isn't running, it's safe to add the event; our holding
1969	 * the ctx->lock ensures the task won't get scheduled in.
1970	 */
1971	add_event_to_ctx(event, ctx);
1972	raw_spin_unlock_irq(&ctx->lock);
1973}
1974
1975/*
 1976 * Put an event into inactive state and update time fields.
1977 * Enabling the leader of a group effectively enables all
1978 * the group members that aren't explicitly disabled, so we
1979 * have to update their ->tstamp_enabled also.
1980 * Note: this works for group members as well as group leaders
1981 * since the non-leader members' sibling_lists will be empty.
1982 */
1983static void __perf_event_mark_enabled(struct perf_event *event)
1984{
1985	struct perf_event *sub;
1986	u64 tstamp = perf_event_time(event);
1987
1988	event->state = PERF_EVENT_STATE_INACTIVE;
1989	event->tstamp_enabled = tstamp - event->total_time_enabled;
1990	list_for_each_entry(sub, &event->sibling_list, group_entry) {
1991		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1992			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1993	}
1994}
1995
1996/*
1997 * Cross CPU call to enable a performance event
1998 */
1999static int __perf_event_enable(void *info)
2000{
2001	struct perf_event *event = info;
2002	struct perf_event_context *ctx = event->ctx;
2003	struct perf_event *leader = event->group_leader;
2004	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2005	int err;
2006
2007	/*
2008	 * There's a time window between 'ctx->is_active' check
2009	 * in perf_event_enable function and this place having:
2010	 *   - IRQs on
2011	 *   - ctx->lock unlocked
2012	 *
2013	 * where the task could be killed and 'ctx' deactivated
2014	 * by perf_event_exit_task.
2015	 */
2016	if (!ctx->is_active)
2017		return -EINVAL;
2018
2019	raw_spin_lock(&ctx->lock);
2020	update_context_time(ctx);
2021
2022	if (event->state >= PERF_EVENT_STATE_INACTIVE)
2023		goto unlock;
2024
2025	/*
2026	 * set current task's cgroup time reference point
2027	 */
2028	perf_cgroup_set_timestamp(current, ctx);
2029
2030	__perf_event_mark_enabled(event);
2031
2032	if (!event_filter_match(event)) {
2033		if (is_cgroup_event(event))
2034			perf_cgroup_defer_enabled(event);
2035		goto unlock;
2036	}
2037
2038	/*
2039	 * If the event is in a group and isn't the group leader,
2040	 * then don't put it on unless the group is on.
2041	 */
2042	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2043		goto unlock;
2044
2045	if (!group_can_go_on(event, cpuctx, 1)) {
2046		err = -EEXIST;
2047	} else {
2048		if (event == leader)
2049			err = group_sched_in(event, cpuctx, ctx);
2050		else
2051			err = event_sched_in(event, cpuctx, ctx);
2052	}
2053
2054	if (err) {
2055		/*
2056		 * If this event can't go on and it's part of a
2057		 * group, then the whole group has to come off.
2058		 */
2059		if (leader != event) {
2060			group_sched_out(leader, cpuctx, ctx);
2061			perf_cpu_hrtimer_restart(cpuctx);
2062		}
2063		if (leader->attr.pinned) {
2064			update_group_times(leader);
2065			leader->state = PERF_EVENT_STATE_ERROR;
2066		}
2067	}
2068
2069unlock:
2070	raw_spin_unlock(&ctx->lock);
2071
2072	return 0;
2073}
2074
2075/*
 2076 * Enable an event.
2077 *
2078 * If event->ctx is a cloned context, callers must make sure that
2079 * every task struct that event->ctx->task could possibly point to
2080 * remains valid.  This condition is satisfied when called through
2081 * perf_event_for_each_child or perf_event_for_each as described
2082 * for perf_event_disable.
2083 */
2084void perf_event_enable(struct perf_event *event)
2085{
2086	struct perf_event_context *ctx = event->ctx;
2087	struct task_struct *task = ctx->task;
2088
2089	if (!task) {
2090		/*
2091		 * Enable the event on the cpu that it's on
2092		 */
2093		cpu_function_call(event->cpu, __perf_event_enable, event);
2094		return;
2095	}
2096
2097	raw_spin_lock_irq(&ctx->lock);
2098	if (event->state >= PERF_EVENT_STATE_INACTIVE)
2099		goto out;
2100
2101	/*
2102	 * If the event is in error state, clear that first.
2103	 * That way, if we see the event in error state below, we
2104	 * know that it has gone back into error state, as distinct
2105	 * from the task having been scheduled away before the
2106	 * cross-call arrived.
2107	 */
2108	if (event->state == PERF_EVENT_STATE_ERROR)
2109		event->state = PERF_EVENT_STATE_OFF;
2110
2111retry:
2112	if (!ctx->is_active) {
2113		__perf_event_mark_enabled(event);
2114		goto out;
2115	}
2116
2117	raw_spin_unlock_irq(&ctx->lock);
2118
2119	if (!task_function_call(task, __perf_event_enable, event))
2120		return;
2121
2122	raw_spin_lock_irq(&ctx->lock);
2123
2124	/*
2125	 * If the context is active and the event is still off,
2126	 * we need to retry the cross-call.
2127	 */
2128	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2129		/*
2130		 * task could have been flipped by a concurrent
2131		 * perf_event_context_sched_out()
2132		 */
2133		task = ctx->task;
2134		goto retry;
2135	}
2136
2137out:
2138	raw_spin_unlock_irq(&ctx->lock);
2139}
2140EXPORT_SYMBOL_GPL(perf_event_enable);
2141
2142int perf_event_refresh(struct perf_event *event, int refresh)
2143{
2144	/*
2145	 * not supported on inherited events
2146	 */
2147	if (event->attr.inherit || !is_sampling_event(event))
2148		return -EINVAL;
2149
2150	atomic_add(refresh, &event->event_limit);
2151	perf_event_enable(event);
2152
2153	return 0;
2154}
2155EXPORT_SYMBOL_GPL(perf_event_refresh);
2156
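/*
 * Schedule out the requested class of events (pinned and/or flexible)
 * from a context, after updating the context and cgroup clocks.
 */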
2157static void ctx_sched_out(struct perf_event_context *ctx,
2158			  struct perf_cpu_context *cpuctx,
2159			  enum event_type_t event_type)
2160{
2161	struct perf_event *event;
2162	int is_active = ctx->is_active;
2163
2164	ctx->is_active &= ~event_type;
2165	if (likely(!ctx->nr_events))
2166		return;
2167
2168	update_context_time(ctx);
2169	update_cgrp_time_from_cpuctx(cpuctx);
2170	if (!ctx->nr_active)
2171		return;
2172
2173	perf_pmu_disable(ctx->pmu);
2174	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2175		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2176			group_sched_out(event, cpuctx, ctx);
2177	}
2178
2179	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2180		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2181			group_sched_out(event, cpuctx, ctx);
2182	}
2183	perf_pmu_enable(ctx->pmu);
2184}
2185
2186/*
2187 * Test whether two contexts are equivalent, i.e. whether they have both been
2188 * cloned from the same version of the same context.
2189 *
2190 * Equivalence is measured using a generation number in the context that is
2191 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2192 * and list_del_event().
2193 */
2194static int context_equiv(struct perf_event_context *ctx1,
2195			 struct perf_event_context *ctx2)
2196{
2197	/* Pinning disables the swap optimization */
2198	if (ctx1->pin_count || ctx2->pin_count)
2199		return 0;
2200
2201	/* If ctx1 is the parent of ctx2 */
2202	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2203		return 1;
2204
2205	/* If ctx2 is the parent of ctx1 */
2206	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2207		return 1;
2208
2209	/*
2210	 * If ctx1 and ctx2 have the same parent; we flatten the parent
2211	 * hierarchy, see perf_event_init_context().
2212	 */
2213	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2214			ctx1->parent_gen == ctx2->parent_gen)
2215		return 1;
2216
2217	/* Unmatched */
2218	return 0;
2219}
2220
2221static void __perf_event_sync_stat(struct perf_event *event,
2222				     struct perf_event *next_event)
2223{
2224	u64 value;
2225
2226	if (!event->attr.inherit_stat)
2227		return;
2228
2229	/*
2230	 * Update the event value, we cannot use perf_event_read()
2231	 * because we're in the middle of a context switch and have IRQs
2232	 * disabled, which upsets smp_call_function_single(), however
2233	 * we know the event must be on the current CPU, therefore we
2234	 * don't need to use it.
2235	 */
2236	switch (event->state) {
2237	case PERF_EVENT_STATE_ACTIVE:
2238		event->pmu->read(event);
2239		/* fall-through */
2240
2241	case PERF_EVENT_STATE_INACTIVE:
2242		update_event_times(event);
2243		break;
2244
2245	default:
2246		break;
2247	}
2248
2249	/*
2250	 * In order to keep per-task stats reliable we need to flip the event
2251	 * values when we flip the contexts.
2252	 */
2253	value = local64_read(&next_event->count);
2254	value = local64_xchg(&event->count, value);
2255	local64_set(&next_event->count, value);
2256
2257	swap(event->total_time_enabled, next_event->total_time_enabled);
2258	swap(event->total_time_running, next_event->total_time_running);
2259
2260	/*
2261	 * Since we swizzled the values, update the user visible data too.
2262	 */
2263	perf_event_update_userpage(event);
2264	perf_event_update_userpage(next_event);
2265}
2266
2267static void perf_event_sync_stat(struct perf_event_context *ctx,
2268				   struct perf_event_context *next_ctx)
2269{
2270	struct perf_event *event, *next_event;
2271
2272	if (!ctx->nr_stat)
2273		return;
2274
2275	update_context_time(ctx);
2276
2277	event = list_first_entry(&ctx->event_list,
2278				   struct perf_event, event_entry);
2279
2280	next_event = list_first_entry(&next_ctx->event_list,
2281					struct perf_event, event_entry);
2282
2283	while (&event->event_entry != &ctx->event_list &&
2284	       &next_event->event_entry != &next_ctx->event_list) {
2285
2286		__perf_event_sync_stat(event, next_event);
2287
2288		event = list_next_entry(event, event_entry);
2289		next_event = list_next_entry(next_event, event_entry);
2290	}
2291}
2292
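/*
 * Schedule out a task's context on a context switch. When the outgoing
 * and incoming tasks have equivalent (cloned) contexts, the contexts are
 * swapped between the tasks instead of being torn down and rebuilt.
 */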
2293static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2294					 struct task_struct *next)
2295{
2296	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2297	struct perf_event_context *next_ctx;
2298	struct perf_event_context *parent, *next_parent;
2299	struct perf_cpu_context *cpuctx;
2300	int do_switch = 1;
2301
2302	if (likely(!ctx))
2303		return;
2304
2305	cpuctx = __get_cpu_context(ctx);
2306	if (!cpuctx->task_ctx)
2307		return;
2308
2309	rcu_read_lock();
2310	next_ctx = next->perf_event_ctxp[ctxn];
2311	if (!next_ctx)
2312		goto unlock;
2313
2314	parent = rcu_dereference(ctx->parent_ctx);
2315	next_parent = rcu_dereference(next_ctx->parent_ctx);
2316
 2317	/* If neither context has a parent context, they cannot be clones. */
2318	if (!parent && !next_parent)
2319		goto unlock;
2320
2321	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2322		/*
2323		 * Looks like the two contexts are clones, so we might be
2324		 * able to optimize the context switch.  We lock both
2325		 * contexts and check that they are clones under the
2326		 * lock (including re-checking that neither has been
2327		 * uncloned in the meantime).  It doesn't matter which
2328		 * order we take the locks because no other cpu could
2329		 * be trying to lock both of these tasks.
2330		 */
2331		raw_spin_lock(&ctx->lock);
2332		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2333		if (context_equiv(ctx, next_ctx)) {
2334			/*
2335			 * XXX do we need a memory barrier of sorts
 2336			 * wrt rcu_dereference() of perf_event_ctxp
2337			 */
2338			task->perf_event_ctxp[ctxn] = next_ctx;
2339			next->perf_event_ctxp[ctxn] = ctx;
2340			ctx->task = next;
2341			next_ctx->task = task;
2342			do_switch = 0;
2343
2344			perf_event_sync_stat(ctx, next_ctx);
2345		}
2346		raw_spin_unlock(&next_ctx->lock);
2347		raw_spin_unlock(&ctx->lock);
2348	}
2349unlock:
2350	rcu_read_unlock();
2351
2352	if (do_switch) {
2353		raw_spin_lock(&ctx->lock);
2354		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2355		cpuctx->task_ctx = NULL;
2356		raw_spin_unlock(&ctx->lock);
2357	}
2358}
2359
2360#define for_each_task_context_nr(ctxn)					\
2361	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2362
2363/*
2364 * Called from scheduler to remove the events of the current task,
2365 * with interrupts disabled.
2366 *
2367 * We stop each event and update the event value in event->count.
2368 *
2369 * This does not protect us against NMI, but disable()
2370 * sets the disabled bit in the control field of event _before_
2371 * accessing the event control register. If a NMI hits, then it will
2372 * not restart the event.
2373 */
2374void __perf_event_task_sched_out(struct task_struct *task,
2375				 struct task_struct *next)
2376{
2377	int ctxn;
2378
2379	for_each_task_context_nr(ctxn)
2380		perf_event_context_sched_out(task, ctxn, next);
2381
2382	/*
2383	 * if cgroup events exist on this CPU, then we need
2384	 * to check if we have to switch out PMU state.
 2385	 * cgroup events are system-wide only
2386	 */
2387	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2388		perf_cgroup_sched_out(task, next);
2389}
2390
2391static void task_ctx_sched_out(struct perf_event_context *ctx)
2392{
2393	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2394
2395	if (!cpuctx->task_ctx)
2396		return;
2397
2398	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2399		return;
2400
2401	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2402	cpuctx->task_ctx = NULL;
2403}
2404
2405/*
2406 * Called with IRQs disabled
2407 */
2408static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2409			      enum event_type_t event_type)
2410{
2411	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2412}
2413
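/*
 * Schedule in all pinned groups of a context; a pinned group that cannot
 * get onto the PMU is put into ERROR state.
 */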
2414static void
2415ctx_pinned_sched_in(struct perf_event_context *ctx,
2416		    struct perf_cpu_context *cpuctx)
2417{
2418	struct perf_event *event;
2419
2420	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2421		if (event->state <= PERF_EVENT_STATE_OFF)
2422			continue;
2423		if (!event_filter_match(event))
2424			continue;
2425
2426		/* may need to reset tstamp_enabled */
2427		if (is_cgroup_event(event))
2428			perf_cgroup_mark_enabled(event, ctx);
2429
2430		if (group_can_go_on(event, cpuctx, 1))
2431			group_sched_in(event, cpuctx, ctx);
2432
2433		/*
2434		 * If this pinned group hasn't been scheduled,
2435		 * put it in error state.
2436		 */
2437		if (event->state == PERF_EVENT_STATE_INACTIVE) {
2438			update_group_times(event);
2439			event->state = PERF_EVENT_STATE_ERROR;
2440		}
2441	}
2442}
2443
2444static void
2445ctx_flexible_sched_in(struct perf_event_context *ctx,
2446		      struct perf_cpu_context *cpuctx)
2447{
2448	struct perf_event *event;
2449	int can_add_hw = 1;
2450
2451	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2452		/* Ignore events in OFF or ERROR state */
2453		if (event->state <= PERF_EVENT_STATE_OFF)
2454			continue;
2455		/*
2456		 * Listen to the 'cpu' scheduling filter constraint
2457		 * of events:
2458		 */
2459		if (!event_filter_match(event))
2460			continue;
2461
2462		/* may need to reset tstamp_enabled */
2463		if (is_cgroup_event(event))
2464			perf_cgroup_mark_enabled(event, ctx);
2465
2466		if (group_can_go_on(event, cpuctx, can_add_hw)) {
2467			if (group_sched_in(event, cpuctx, ctx))
2468				can_add_hw = 0;
2469		}
2470	}
2471}
2472
2473static void
2474ctx_sched_in(struct perf_event_context *ctx,
2475	     struct perf_cpu_context *cpuctx,
2476	     enum event_type_t event_type,
2477	     struct task_struct *task)
2478{
2479	u64 now;
2480	int is_active = ctx->is_active;
2481
2482	ctx->is_active |= event_type;
2483	if (likely(!ctx->nr_events))
2484		return;
2485
2486	now = perf_clock();
2487	ctx->timestamp = now;
2488	perf_cgroup_set_timestamp(task, ctx);
2489	/*
2490	 * First go through the list and put on any pinned groups
2491	 * in order to give them the best chance of going on.
2492	 */
2493	if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2494		ctx_pinned_sched_in(ctx, cpuctx);
2495
2496	/* Then walk through the lower prio flexible groups */
2497	if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2498		ctx_flexible_sched_in(ctx, cpuctx);
2499}
2500
2501static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2502			     enum event_type_t event_type,
2503			     struct task_struct *task)
2504{
2505	struct perf_event_context *ctx = &cpuctx->ctx;
2506
2507	ctx_sched_in(ctx, cpuctx, event_type, task);
2508}
2509
2510static void perf_event_context_sched_in(struct perf_event_context *ctx,
2511					struct task_struct *task)
2512{
2513	struct perf_cpu_context *cpuctx;
2514
2515	cpuctx = __get_cpu_context(ctx);
2516	if (cpuctx->task_ctx == ctx)
2517		return;
2518
2519	perf_ctx_lock(cpuctx, ctx);
2520	perf_pmu_disable(ctx->pmu);
2521	/*
2522	 * We want to keep the following priority order:
2523	 * cpu pinned (that don't need to move), task pinned,
2524	 * cpu flexible, task flexible.
2525	 */
2526	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2527
2528	if (ctx->nr_events)
2529		cpuctx->task_ctx = ctx;
2530
2531	perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2532
2533	perf_pmu_enable(ctx->pmu);
2534	perf_ctx_unlock(cpuctx, ctx);
2535
2536	/*
2537	 * Since these rotations are per-cpu, we need to ensure the
2538	 * cpu-context we got scheduled on is actually rotating.
2539	 */
2540	perf_pmu_rotate_start(ctx->pmu);
2541}
2542
2543/*
 2544 * When sampling the branch stack in system-wide mode, it may be necessary
2545 * to flush the stack on context switch. This happens when the branch
2546 * stack does not tag its entries with the pid of the current task.
2547 * Otherwise it becomes impossible to associate a branch entry with a
2548 * task. This ambiguity is more likely to appear when the branch stack
2549 * supports priv level filtering and the user sets it to monitor only
2550 * at the user level (which could be a useful measurement in system-wide
2551 * mode). In that case, the risk is high of having a branch stack with
2552 * branch from multiple tasks. Flushing may mean dropping the existing
2553 * entries or stashing them somewhere in the PMU specific code layer.
2554 *
2555 * This function provides the context switch callback to the lower code
2556 * layer. It is invoked ONLY when there is at least one system-wide context
2557 * with at least one active event using taken branch sampling.
2558 */
2559static void perf_branch_stack_sched_in(struct task_struct *prev,
2560				       struct task_struct *task)
2561{
2562	struct perf_cpu_context *cpuctx;
2563	struct pmu *pmu;
2564	unsigned long flags;
2565
2566	/* no need to flush branch stack if not changing task */
2567	if (prev == task)
2568		return;
2569
2570	local_irq_save(flags);
2571
2572	rcu_read_lock();
2573
2574	list_for_each_entry_rcu(pmu, &pmus, entry) {
2575		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2576
2577		/*
2578		 * check if the context has at least one
2579		 * event using PERF_SAMPLE_BRANCH_STACK
2580		 */
2581		if (cpuctx->ctx.nr_branch_stack > 0
2582		    && pmu->flush_branch_stack) {
2583
2584			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2585
2586			perf_pmu_disable(pmu);
2587
2588			pmu->flush_branch_stack();
2589
2590			perf_pmu_enable(pmu);
2591
2592			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2593		}
2594	}
2595
2596	rcu_read_unlock();
2597
2598	local_irq_restore(flags);
2599}
2600
2601/*
2602 * Called from scheduler to add the events of the current task
2603 * with interrupts disabled.
2604 *
2605 * We restore the event value and then enable it.
2606 *
2607 * This does not protect us against NMI, but enable()
2608 * sets the enabled bit in the control field of event _before_
2609 * accessing the event control register. If a NMI hits, then it will
2610 * keep the event running.
2611 */
2612void __perf_event_task_sched_in(struct task_struct *prev,
2613				struct task_struct *task)
2614{
2615	struct perf_event_context *ctx;
2616	int ctxn;
2617
2618	for_each_task_context_nr(ctxn) {
2619		ctx = task->perf_event_ctxp[ctxn];
2620		if (likely(!ctx))
2621			continue;
2622
2623		perf_event_context_sched_in(ctx, task);
2624	}
2625	/*
2626	 * if cgroup events exist on this CPU, then we need
2627	 * to check if we have to switch in PMU state.
 2628	 * cgroup events are system-wide only
2629	 */
2630	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2631		perf_cgroup_sched_in(prev, task);
2632
2633	/* check for system-wide branch_stack events */
2634	if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2635		perf_branch_stack_sched_in(prev, task);
2636}
2637
2638static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2639{
2640	u64 frequency = event->attr.sample_freq;
2641	u64 sec = NSEC_PER_SEC;
2642	u64 divisor, dividend;
2643
2644	int count_fls, nsec_fls, frequency_fls, sec_fls;
2645
2646	count_fls = fls64(count);
2647	nsec_fls = fls64(nsec);
2648	frequency_fls = fls64(frequency);
2649	sec_fls = 30;
2650
2651	/*
2652	 * We got @count in @nsec, with a target of sample_freq HZ
2653	 * the target period becomes:
2654	 *
2655	 *             @count * 10^9
2656	 * period = -------------------
2657	 *          @nsec * sample_freq
2658	 *
2659	 */
2660
2661	/*
2662	 * Reduce accuracy by one bit such that @a and @b converge
2663	 * to a similar magnitude.
2664	 */
2665#define REDUCE_FLS(a, b)		\
2666do {					\
2667	if (a##_fls > b##_fls) {	\
2668		a >>= 1;		\
2669		a##_fls--;		\
2670	} else {			\
2671		b >>= 1;		\
2672		b##_fls--;		\
2673	}				\
2674} while (0)
2675
2676	/*
2677	 * Reduce accuracy until either term fits in a u64, then proceed with
2678	 * the other, so that finally we can do a u64/u64 division.
2679	 */
2680	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2681		REDUCE_FLS(nsec, frequency);
2682		REDUCE_FLS(sec, count);
2683	}
2684
2685	if (count_fls + sec_fls > 64) {
2686		divisor = nsec * frequency;
2687
2688		while (count_fls + sec_fls > 64) {
2689			REDUCE_FLS(count, sec);
2690			divisor >>= 1;
2691		}
2692
2693		dividend = count * sec;
2694	} else {
2695		dividend = count * sec;
2696
2697		while (nsec_fls + frequency_fls > 64) {
2698			REDUCE_FLS(nsec, frequency);
2699			dividend >>= 1;
2700		}
2701
2702		divisor = nsec * frequency;
2703	}
2704
2705	if (!divisor)
2706		return dividend;
2707
2708	return div64_u64(dividend, divisor);
2709}
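/*
 * Worked example: @count = 2,000,000 events observed over @nsec = 4,000,000 ns
 * with sample_freq = 1000 gives period = 2e6 * 1e9 / (4e6 * 1000) = 500,000.
 *
 * A naive userspace reference for the same formula (an illustrative sketch,
 * not part of this file; assumes a compiler providing unsigned __int128,
 * which the fls64()-based reduction above deliberately avoids relying on):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t naive_period(uint64_t count, uint64_t nsec, uint64_t freq)
 *	{
 *		unsigned __int128 dividend = (unsigned __int128)count * 1000000000ULL;
 *		unsigned __int128 divisor  = (unsigned __int128)nsec * freq;
 *
 *		return divisor ? (uint64_t)(dividend / divisor) : (uint64_t)dividend;
 *	}
 */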
2710
2711static DEFINE_PER_CPU(int, perf_throttled_count);
2712static DEFINE_PER_CPU(u64, perf_throttled_seq);
2713
2714static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2715{
2716	struct hw_perf_event *hwc = &event->hw;
2717	s64 period, sample_period;
2718	s64 delta;
2719
2720	period = perf_calculate_period(event, nsec, count);
2721
2722	delta = (s64)(period - hwc->sample_period);
2723	delta = (delta + 7) / 8; /* low pass filter */
2724
2725	sample_period = hwc->sample_period + delta;
2726
2727	if (!sample_period)
2728		sample_period = 1;
2729
2730	hwc->sample_period = sample_period;
2731
2732	if (local64_read(&hwc->period_left) > 8*sample_period) {
2733		if (disable)
2734			event->pmu->stop(event, PERF_EF_UPDATE);
2735
2736		local64_set(&hwc->period_left, 0);
2737
2738		if (disable)
2739			event->pmu->start(event, PERF_EF_RELOAD);
2740	}
2741}
2742
2743/*
2744 * combine freq adjustment with unthrottling to avoid two passes over the
2745 * events. At the same time, make sure that having freq events does not change
2746 * the rate of unthrottling as that would introduce bias.
2747 */
2748static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2749					   int needs_unthr)
2750{
2751	struct perf_event *event;
2752	struct hw_perf_event *hwc;
2753	u64 now, period = TICK_NSEC;
2754	s64 delta;
2755
2756	/*
2757	 * only need to iterate over all events iff:
2758	 * - the context has events in frequency mode (needs freq adjust)
2759	 * - there are events to unthrottle on this cpu
2760	 */
2761	if (!(ctx->nr_freq || needs_unthr))
2762		return;
2763
2764	raw_spin_lock(&ctx->lock);
2765	perf_pmu_disable(ctx->pmu);
2766
2767	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2768		if (event->state != PERF_EVENT_STATE_ACTIVE)
2769			continue;
2770
2771		if (!event_filter_match(event))
2772			continue;
2773
2774		perf_pmu_disable(event->pmu);
2775
2776		hwc = &event->hw;
2777
2778		if (hwc->interrupts == MAX_INTERRUPTS) {
2779			hwc->interrupts = 0;
2780			perf_log_throttle(event, 1);
2781			event->pmu->start(event, 0);
2782		}
2783
2784		if (!event->attr.freq || !event->attr.sample_freq)
2785			goto next;
2786
2787		/*
2788		 * stop the event and update event->count
2789		 */
2790		event->pmu->stop(event, PERF_EF_UPDATE);
2791
2792		now = local64_read(&event->count);
2793		delta = now - hwc->freq_count_stamp;
2794		hwc->freq_count_stamp = now;
2795
2796		/*
2797	 * restart the event;
2798	 * reload only if the value has changed.
2799	 * We have already stopped the event, so tell
2800	 * perf_adjust_period() not to stop it
2801	 * a second time.
2802		 */
2803		if (delta > 0)
2804			perf_adjust_period(event, period, delta, false);
2805
2806		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2807	next:
2808		perf_pmu_enable(event->pmu);
2809	}
2810
2811	perf_pmu_enable(ctx->pmu);
2812	raw_spin_unlock(&ctx->lock);
2813}
2814
2815/*
2816 * Round-robin a context's events:
2817 */
2818static void rotate_ctx(struct perf_event_context *ctx)
2819{
2820	/*
2821	 * Rotate the non-pinned groups, moving the first entry to the end.
2822	 * Rotation might be disabled by the inheritance code.
2823	 */
2824	if (!ctx->rotate_disable)
2825		list_rotate_left(&ctx->flexible_groups);
2826}
2827
2828/*
2829 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2830 * because they're strictly cpu affine and rotate_start is called with IRQs
2831 * disabled, while rotate_context is called from IRQ context.
2832 */
2833static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2834{
2835	struct perf_event_context *ctx = NULL;
2836	int rotate = 0, remove = 1;
2837
2838	if (cpuctx->ctx.nr_events) {
2839		remove = 0;
2840		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2841			rotate = 1;
2842	}
2843
2844	ctx = cpuctx->task_ctx;
2845	if (ctx && ctx->nr_events) {
2846		remove = 0;
2847		if (ctx->nr_events != ctx->nr_active)
2848			rotate = 1;
2849	}
2850
2851	if (!rotate)
2852		goto done;
2853
2854	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2855	perf_pmu_disable(cpuctx->ctx.pmu);
2856
2857	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2858	if (ctx)
2859		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2860
2861	rotate_ctx(&cpuctx->ctx);
2862	if (ctx)
2863		rotate_ctx(ctx);
2864
2865	perf_event_sched_in(cpuctx, ctx, current);
2866
2867	perf_pmu_enable(cpuctx->ctx.pmu);
2868	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2869done:
2870	if (remove)
2871		list_del_init(&cpuctx->rotation_list);
2872
2873	return rotate;
2874}
2875
2876#ifdef CONFIG_NO_HZ_FULL
2877bool perf_event_can_stop_tick(void)
2878{
2879	if (atomic_read(&nr_freq_events) ||
2880	    __this_cpu_read(perf_throttled_count))
2881		return false;
2882	else
2883		return true;
2884}
2885#endif
2886
2887void perf_event_task_tick(void)
2888{
2889	struct list_head *head = &__get_cpu_var(rotation_list);
2890	struct perf_cpu_context *cpuctx, *tmp;
2891	struct perf_event_context *ctx;
2892	int throttled;
2893
2894	WARN_ON(!irqs_disabled());
2895
2896	__this_cpu_inc(perf_throttled_seq);
2897	throttled = __this_cpu_xchg(perf_throttled_count, 0);
 
2898
2899	list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2900		ctx = &cpuctx->ctx;
2901		perf_adjust_freq_unthr_context(ctx, throttled);
2902
2903		ctx = cpuctx->task_ctx;
2904		if (ctx)
2905			perf_adjust_freq_unthr_context(ctx, throttled);
2906	}
2907}
2908
2909static int event_enable_on_exec(struct perf_event *event,
2910				struct perf_event_context *ctx)
2911{
2912	if (!event->attr.enable_on_exec)
2913		return 0;
2914
2915	event->attr.enable_on_exec = 0;
2916	if (event->state >= PERF_EVENT_STATE_INACTIVE)
2917		return 0;
2918
2919	__perf_event_mark_enabled(event);
2920
2921	return 1;
2922}
2923
2924/*
2925 * Enable all of a task's events that have been marked enable-on-exec.
2926 * This expects task == current.
2927 */
2928static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2929{
 
 
2930	struct perf_event *event;
2931	unsigned long flags;
2932	int enabled = 0;
2933	int ret;
2934
2935	local_irq_save(flags);
 
2936	if (!ctx || !ctx->nr_events)
2937		goto out;
2938
2939	/*
2940	 * We must context-switch out cgroup events to avoid a conflict
2941	 * when invoking perf_event_context_sched_in() later on
2942	 * in this function. Otherwise we end up trying to
2943	 * switch in cgroup events which are already scheduled
2944	 * in.
2945	 */
2946	perf_cgroup_sched_out(current, NULL);
2947
2948	raw_spin_lock(&ctx->lock);
2949	task_ctx_sched_out(ctx);
2950
2951	list_for_each_entry(event, &ctx->event_list, event_entry) {
2952		ret = event_enable_on_exec(event, ctx);
2953		if (ret)
2954			enabled = 1;
2955	}
2956
2957	/*
2958	 * Unclone this context if we enabled any event.
2959	 */
2960	if (enabled)
2961		unclone_ctx(ctx);
2962
2963	raw_spin_unlock(&ctx->lock);
 
 
2964
2965	/*
2966	 * This also switches in cgroup events, if any:
2967	 */
2968	perf_event_context_sched_in(ctx, ctx->task);
2969out:
2970	local_irq_restore(flags);
2971}
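/*
 * A userspace sketch of the enable-on-exec pattern handled above (error
 * handling omitted; fd and attr are illustrative names): open the event on
 * the current task in a disabled state, mark it enable_on_exec, and it only
 * starts counting once the measured program is exec()ed:
 *
 *	struct perf_event_attr attr = {
 *		.size		= sizeof(attr),
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.disabled	= 1,
 *		.enable_on_exec	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);	// pid == 0: self
 *
 *	// the event stays disabled until the exec below flips it on
 *	execlp("ls", "ls", (char *)NULL);
 */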
2972
2973/*
2974 * Cross CPU call to read the hardware event
2975 */
2976static void __perf_event_read(void *info)
2977{
2978	struct perf_event *event = info;
 
2979	struct perf_event_context *ctx = event->ctx;
2980	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
2981
2982	/*
2983	 * If this is a task context, we need to check whether it is
2984	 * the current task context of this cpu.  If not, it has been
2985	 * scheduled out before the smp call arrived.  In that case
2986	 * event->count would have been updated to a recent sample
2987	 * when the event was scheduled out.
2988	 */
2989	if (ctx->task && cpuctx->task_ctx != ctx)
2990		return;
2991
2992	raw_spin_lock(&ctx->lock);
2993	if (ctx->is_active) {
2994		update_context_time(ctx);
2995		update_cgrp_time_from_event(event);
2996	}
 
2997	update_event_times(event);
2998	if (event->state == PERF_EVENT_STATE_ACTIVE)
2999		event->pmu->read(event);
3000	raw_spin_unlock(&ctx->lock);
3001}
3002
3003static inline u64 perf_event_count(struct perf_event *event)
3004{
3005	return local64_read(&event->count) + atomic64_read(&event->child_count);
 
 
 
3006}
3007
3008static u64 perf_event_read(struct perf_event *event)
3009{
3010	/*
3011	 * If event is enabled and currently active on a CPU, update the
3012	 * value in the event structure:
3013	 */
3014	if (event->state == PERF_EVENT_STATE_ACTIVE) {
3015		smp_call_function_single(event->oncpu,
3016					 __perf_event_read, event, 1);
3017	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3018		struct perf_event_context *ctx = event->ctx;
3019		unsigned long flags;
3020
3021		raw_spin_lock_irqsave(&ctx->lock, flags);
3022		/*
3023		 * We may read while the context is not active
3024		 * (e.g., the thread is blocked); in that case
3025		 * we cannot update the context time
3026		 */
3027		if (ctx->is_active) {
3028			update_context_time(ctx);
3029			update_cgrp_time_from_event(event);
3030		}
3031		update_event_times(event);
 
 
 
3032		raw_spin_unlock_irqrestore(&ctx->lock, flags);
3033	}
3034
3035	return perf_event_count(event);
3036}
3037
3038/*
3039 * Initialize the perf_event context in a task_struct:
3040 */
3041static void __perf_event_init_context(struct perf_event_context *ctx)
3042{
3043	raw_spin_lock_init(&ctx->lock);
3044	mutex_init(&ctx->mutex);
 
3045	INIT_LIST_HEAD(&ctx->pinned_groups);
3046	INIT_LIST_HEAD(&ctx->flexible_groups);
3047	INIT_LIST_HEAD(&ctx->event_list);
3048	atomic_set(&ctx->refcount, 1);
3049}
3050
3051static struct perf_event_context *
3052alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3053{
3054	struct perf_event_context *ctx;
3055
3056	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3057	if (!ctx)
3058		return NULL;
3059
3060	__perf_event_init_context(ctx);
3061	if (task) {
3062		ctx->task = task;
3063		get_task_struct(task);
3064	}
3065	ctx->pmu = pmu;
3066
3067	return ctx;
3068}
3069
3070static struct task_struct *
3071find_lively_task_by_vpid(pid_t vpid)
3072{
3073	struct task_struct *task;
3074	int err;
3075
3076	rcu_read_lock();
3077	if (!vpid)
3078		task = current;
3079	else
3080		task = find_task_by_vpid(vpid);
3081	if (task)
3082		get_task_struct(task);
3083	rcu_read_unlock();
3084
3085	if (!task)
3086		return ERR_PTR(-ESRCH);
3087
3088	/* Reuse ptrace permission checks for now. */
3089	err = -EACCES;
3090	if (!ptrace_may_access(task, PTRACE_MODE_READ))
3091		goto errout;
3092
3093	return task;
3094errout:
3095	put_task_struct(task);
3096	return ERR_PTR(err);
3097
3098}
3099
3100/*
3101 * Returns a matching context with refcount and pincount.
3102 */
3103static struct perf_event_context *
3104find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 
3105{
3106	struct perf_event_context *ctx;
3107	struct perf_cpu_context *cpuctx;
 
3108	unsigned long flags;
3109	int ctxn, err;
 
3110
3111	if (!task) {
3112		/* Must be root to operate on a CPU event: */
3113		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3114			return ERR_PTR(-EACCES);
3115
3116		/*
3117	 * We could be clever and allow attaching an event to an
3118		 * offline CPU and activate it when the CPU comes up, but
3119		 * that's for later.
3120		 */
3121		if (!cpu_online(cpu))
3122			return ERR_PTR(-ENODEV);
3123
3124		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3125		ctx = &cpuctx->ctx;
3126		get_ctx(ctx);
3127		++ctx->pin_count;
3128
3129		return ctx;
3130	}
3131
3132	err = -EINVAL;
3133	ctxn = pmu->task_ctx_nr;
3134	if (ctxn < 0)
3135		goto errout;
3136
3137retry:
3138	ctx = perf_lock_task_context(task, ctxn, &flags);
3139	if (ctx) {
3140		unclone_ctx(ctx);
3141		++ctx->pin_count;
3142		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 
 
 
3143	} else {
3144		ctx = alloc_perf_context(pmu, task);
3145		err = -ENOMEM;
3146		if (!ctx)
3147			goto errout;
3148
3149		err = 0;
3150		mutex_lock(&task->perf_event_mutex);
3151		/*
3152		 * If it has already passed perf_event_exit_task(),
3153		 * we must see PF_EXITING; it takes this mutex too.
3154		 */
3155		if (task->flags & PF_EXITING)
3156			err = -ESRCH;
3157		else if (task->perf_event_ctxp[ctxn])
3158			err = -EAGAIN;
3159		else {
3160			get_ctx(ctx);
3161			++ctx->pin_count;
3162			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3163		}
3164		mutex_unlock(&task->perf_event_mutex);
3165
3166		if (unlikely(err)) {
3167			put_ctx(ctx);
3168
3169			if (err == -EAGAIN)
3170				goto retry;
3171			goto errout;
3172		}
3173	}
3174
 
3175	return ctx;
3176
3177errout:
 
3178	return ERR_PTR(err);
3179}
3180
3181static void perf_event_free_filter(struct perf_event *event);
 
3182
3183static void free_event_rcu(struct rcu_head *head)
3184{
3185	struct perf_event *event;
3186
3187	event = container_of(head, struct perf_event, rcu_head);
3188	if (event->ns)
3189		put_pid_ns(event->ns);
3190	perf_event_free_filter(event);
3191	kfree(event);
3192}
3193
3194static void ring_buffer_put(struct ring_buffer *rb);
3195static void ring_buffer_attach(struct perf_event *event,
3196			       struct ring_buffer *rb);
3197
3198static void unaccount_event_cpu(struct perf_event *event, int cpu)
3199{
3200	if (event->parent)
3201		return;
3202
3203	if (has_branch_stack(event)) {
3204		if (!(event->attach_state & PERF_ATTACH_TASK))
3205			atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3206	}
3207	if (is_cgroup_event(event))
3208		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3209}
3210
3211static void unaccount_event(struct perf_event *event)
3212{
 
 
3213	if (event->parent)
3214		return;
3215
3216	if (event->attach_state & PERF_ATTACH_TASK)
3217		static_key_slow_dec_deferred(&perf_sched_events);
3218	if (event->attr.mmap || event->attr.mmap_data)
3219		atomic_dec(&nr_mmap_events);
3220	if (event->attr.comm)
3221		atomic_dec(&nr_comm_events);
3222	if (event->attr.task)
3223		atomic_dec(&nr_task_events);
3224	if (event->attr.freq)
3225		atomic_dec(&nr_freq_events);
 
 
 
 
3226	if (is_cgroup_event(event))
3227		static_key_slow_dec_deferred(&perf_sched_events);
3228	if (has_branch_stack(event))
3229		static_key_slow_dec_deferred(&perf_sched_events);
3230
3231	unaccount_event_cpu(event, event->cpu);
 
 
3232}
3233
3234static void __free_event(struct perf_event *event)
3235{
3236	if (!event->parent) {
3237		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3238			put_callchain_buffers();
3239	}
3240
3241	if (event->destroy)
3242		event->destroy(event);
3243
3244	if (event->ctx)
3245		put_ctx(event->ctx);
 
 
3246
3247	call_rcu(&event->rcu_head, free_event_rcu);
3248}
3249static void free_event(struct perf_event *event)
 
 
 
 
3250{
3251	irq_work_sync(&event->pending);
3252
3253	unaccount_event(event);
3254
3255	if (event->rb) {
3256		/*
3257		 * Can happen when we close an event with re-directed output.
3258		 *
3259		 * Since we have a 0 refcount, perf_mmap_close() will skip
3260		 * over us; possibly making our ring_buffer_put() the last.
3261		 */
3262		mutex_lock(&event->mmap_mutex);
3263		ring_buffer_attach(event, NULL);
3264		mutex_unlock(&event->mmap_mutex);
3265	}
3266
3267	if (is_cgroup_event(event))
3268		perf_detach_cgroup(event);
3269
3270
3271	__free_event(event);
3272}
3273
3274int perf_event_release_kernel(struct perf_event *event)
 
 
 
 
3275{
3276	struct perf_event_context *ctx = event->ctx;
3277
3278	WARN_ON_ONCE(ctx->parent_ctx);
3279	/*
3280	 * There are two ways this annotation is useful:
3281	 *
3282	 *  1) there is a lock recursion from perf_event_exit_task
3283	 *     see the comment there.
3284	 *
3285	 *  2) there is a lock-inversion with mmap_sem through
3286	 *     perf_event_read_group(), which takes faults while
3287	 *     holding ctx->mutex, however this is called after
3288	 *     the last filedesc died, so there is no possibility
3289	 *     to trigger the AB-BA case.
3290	 */
3291	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3292	perf_remove_from_context(event, true);
3293	mutex_unlock(&ctx->mutex);
3294
3295	free_event(event);
3296
3297	return 0;
3298}
3299EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3300
3301/*
3302 * Called when the last reference to the file is gone.
3303 */
3304static void put_event(struct perf_event *event)
3305{
3306	struct task_struct *owner;
3307
3308	if (!atomic_long_dec_and_test(&event->refcount))
3309		return;
3310
3311	rcu_read_lock();
3312	owner = ACCESS_ONCE(event->owner);
3313	/*
3314	 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3315	 * !owner it means the list deletion is complete and we can indeed
3316	 * free this event, otherwise we need to serialize on
3317	 * owner->perf_event_mutex.
3318	 */
3319	smp_read_barrier_depends();
3320	if (owner) {
3321		/*
3322		 * Since delayed_put_task_struct() also drops the last
3323		 * task reference we can safely take a new reference
3324		 * while holding the rcu_read_lock().
3325		 */
3326		get_task_struct(owner);
3327	}
3328	rcu_read_unlock();
3329
3330	if (owner) {
3331		mutex_lock(&owner->perf_event_mutex);
3332		/*
3333		 * We have to re-check the event->owner field; if it is cleared
3334		 * we raced with perf_event_exit_task(). Acquiring the mutex
3335		 * ensures they're done, and we can proceed with freeing the
3336		 * event.
3337		 */
3338		if (event->owner)
3339			list_del_init(&event->owner_entry);
 
 
3340		mutex_unlock(&owner->perf_event_mutex);
3341		put_task_struct(owner);
3342	}
 
3343
3344	perf_event_release_kernel(event);
3345}
3346
3347static int perf_release(struct inode *inode, struct file *file)
3348{
3349	put_event(file->private_data);
3350	return 0;
3351}
3352
3353u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3354{
3355	struct perf_event *child;
3356	u64 total = 0;
3357
3358	*enabled = 0;
3359	*running = 0;
3360
3361	mutex_lock(&event->child_mutex);
3362	total += perf_event_read(event);
 
 
 
3363	*enabled += event->total_time_enabled +
3364			atomic64_read(&event->child_total_time_enabled);
3365	*running += event->total_time_running +
3366			atomic64_read(&event->child_total_time_running);
3367
3368	list_for_each_entry(child, &event->child_list, child_list) {
3369		total += perf_event_read(child);
 
3370		*enabled += child->total_time_enabled;
3371		*running += child->total_time_running;
3372	}
3373	mutex_unlock(&event->child_mutex);
3374
3375	return total;
3376}
3377EXPORT_SYMBOL_GPL(perf_event_read_value);
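/*
 * A sketch of how an in-kernel caller of perf_event_read_value() might scale
 * the returned count when the event was time-multiplexed (the usual
 * enabled/running interpolation; not lifted from any particular caller, and
 * it ignores the possible multiplication overflow):
 *
 *	u64 enabled, running, count;
 *
 *	count = perf_event_read_value(event, &enabled, &running);
 *	if (running && running < enabled)
 *		count = div64_u64(count * enabled, running);
 */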
3378
3379static int perf_event_read_group(struct perf_event *event,
3380				   u64 read_format, char __user *buf)
3381{
3382	struct perf_event *leader = event->group_leader, *sub;
3383	int n = 0, size = 0, ret = -EFAULT;
3384	struct perf_event_context *ctx = leader->ctx;
3385	u64 values[5];
3386	u64 count, enabled, running;
3387
3388	mutex_lock(&ctx->mutex);
3389	count = perf_event_read_value(leader, &enabled, &running);
 
 
3390
3391	values[n++] = 1 + leader->nr_siblings;
3392	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3393		values[n++] = enabled;
3394	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3395		values[n++] = running;
3396	values[n++] = count;
3397	if (read_format & PERF_FORMAT_ID)
3398		values[n++] = primary_event_id(leader);
3399
3400	size = n * sizeof(u64);
3401
3402	if (copy_to_user(buf, values, size))
3403		goto unlock;
3404
3405	ret = size;
 
 
3406
3407	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3408		n = 0;
3409
3410		values[n++] = perf_event_read_value(sub, &enabled, &running);
3411		if (read_format & PERF_FORMAT_ID)
3412			values[n++] = primary_event_id(sub);
 
 
3413
3414		size = n * sizeof(u64);
 
 
3415
3416		if (copy_to_user(buf + ret, values, size)) {
3417			ret = -EFAULT;
 
3418			goto unlock;
3419		}
3420
3421		ret += size;
3422	}
3423unlock:
3424	mutex_unlock(&ctx->mutex);
3425
 
3426	return ret;
3427}
3428
3429static int perf_event_read_one(struct perf_event *event,
3430				 u64 read_format, char __user *buf)
3431{
3432	u64 enabled, running;
3433	u64 values[4];
3434	int n = 0;
3435
3436	values[n++] = perf_event_read_value(event, &enabled, &running);
3437	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3438		values[n++] = enabled;
3439	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3440		values[n++] = running;
3441	if (read_format & PERF_FORMAT_ID)
3442		values[n++] = primary_event_id(event);
3443
3444	if (copy_to_user(buf, values, n * sizeof(u64)))
3445		return -EFAULT;
3446
3447	return n * sizeof(u64);
3448}
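/*
 * The values[] layout written above maps directly onto a plain read(2) in
 * userspace. A sketch for a non-group event opened with
 * attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 *		      PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID
 * (fd is assumed to be the event file descriptor):
 *
 *	struct read_format {
 *		uint64_t value;
 *		uint64_t time_enabled;
 *		uint64_t time_running;
 *		uint64_t id;
 *	} rf;
 *
 *	if (read(fd, &rf, sizeof(rf)) == sizeof(rf) && rf.time_running)
 *		printf("%llu (scaled: %llu)\n",
 *		       (unsigned long long)rf.value,
 *		       (unsigned long long)(rf.value * rf.time_enabled /
 *					    rf.time_running));
 */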
3449
3450/*
3451 * Read the performance event - simple non-blocking version for now
3452 */
3453static ssize_t
3454perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3455{
3456	u64 read_format = event->attr.read_format;
3457	int ret;
3458
3459	/*
3460	 * Return end-of-file for a read on an event that is in an
3461	 * error state (i.e. because it was pinned but it couldn't be
3462	 * scheduled onto the CPU at some point).
3463	 */
3464	if (event->state == PERF_EVENT_STATE_ERROR)
3465		return 0;
3466
3467	if (count < event->read_size)
3468		return -ENOSPC;
3469
3470	WARN_ON_ONCE(event->ctx->parent_ctx);
3471	if (read_format & PERF_FORMAT_GROUP)
3472		ret = perf_event_read_group(event, read_format, buf);
3473	else
3474		ret = perf_event_read_one(event, read_format, buf);
3475
3476	return ret;
3477}
3478
3479static ssize_t
3480perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3481{
3482	struct perf_event *event = file->private_data;
3483
3484	return perf_read_hw(event, buf, count);
3485}
3486
3487static unsigned int perf_poll(struct file *file, poll_table *wait)
3488{
3489	struct perf_event *event = file->private_data;
3490	struct ring_buffer *rb;
3491	unsigned int events = POLL_HUP;
3492
3493	/*
3494	 * Pin the event->rb by taking event->mmap_mutex; otherwise
3495	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3496	 */
3497	mutex_lock(&event->mmap_mutex);
3498	rb = event->rb;
3499	if (rb)
3500		events = atomic_xchg(&rb->poll, 0);
3501	mutex_unlock(&event->mmap_mutex);
3502
3503	poll_wait(file, &event->waitq, wait);
3504
3505	return events;
3506}
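/*
 * Userspace typically pairs this with attr.wakeup_events (or attr.watermark
 * plus wakeup_watermark) so that poll(2) only returns once enough data has
 * been written. A minimal sketch; drain_ring_buffer() is a hypothetical
 * helper, not part of any API:
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) >= 0) {
 *		if (pfd.revents & (POLLHUP | POLLERR))
 *			break;
 *		if (pfd.revents & POLLIN)
 *			drain_ring_buffer();
 *	}
 */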
3507
3508static void perf_event_reset(struct perf_event *event)
3509{
3510	(void)perf_event_read(event);
3511	local64_set(&event->count, 0);
3512	perf_event_update_userpage(event);
3513}
3514
3515/*
3516 * Holding the top-level event's child_mutex means that any
3517 * descendant process that has inherited this event will block
3518 * in sync_child_event if it goes to exit, thus satisfying the
3519 * task existence requirements of perf_event_enable/disable.
3520 */
3521static void perf_event_for_each_child(struct perf_event *event,
3522					void (*func)(struct perf_event *))
3523{
3524	struct perf_event *child;
3525
3526	WARN_ON_ONCE(event->ctx->parent_ctx);
 
3527	mutex_lock(&event->child_mutex);
3528	func(event);
3529	list_for_each_entry(child, &event->child_list, child_list)
3530		func(child);
3531	mutex_unlock(&event->child_mutex);
3532}
3533
3534static void perf_event_for_each(struct perf_event *event,
3535				  void (*func)(struct perf_event *))
3536{
3537	struct perf_event_context *ctx = event->ctx;
3538	struct perf_event *sibling;
3539
3540	WARN_ON_ONCE(ctx->parent_ctx);
3541	mutex_lock(&ctx->mutex);
3542	event = event->group_leader;
3543
3544	perf_event_for_each_child(event, func);
3545	list_for_each_entry(sibling, &event->sibling_list, group_entry)
3546		perf_event_for_each_child(sibling, func);
3547	mutex_unlock(&ctx->mutex);
3548}
3549
3550static int perf_event_period(struct perf_event *event, u64 __user *arg)
 
 
 
3551{
3552	struct perf_event_context *ctx = event->ctx;
3553	int ret = 0, active;
3554	u64 value;
3555
3556	if (!is_sampling_event(event))
3557		return -EINVAL;
3558
3559	if (copy_from_user(&value, arg, sizeof(value)))
3560		return -EFAULT;
3561
3562	if (!value)
3563		return -EINVAL;
3564
3565	raw_spin_lock_irq(&ctx->lock);
3566	if (event->attr.freq) {
3567		if (value > sysctl_perf_event_sample_rate) {
3568			ret = -EINVAL;
3569			goto unlock;
3570		}
3571
3572		event->attr.sample_freq = value;
3573	} else {
3574		event->attr.sample_period = value;
3575		event->hw.sample_period = value;
3576	}
3577
3578	active = (event->state == PERF_EVENT_STATE_ACTIVE);
3579	if (active) {
3580		perf_pmu_disable(ctx->pmu);
3581		event->pmu->stop(event, PERF_EF_UPDATE);
3582	}
3583
3584	local64_set(&event->hw.period_left, 0);
3585
3586	if (active) {
3587		event->pmu->start(event, PERF_EF_RELOAD);
3588		perf_pmu_enable(ctx->pmu);
3589	}
3590
3591unlock:
3592	raw_spin_unlock_irq(&ctx->lock);
3593
3594	return ret;
3595}
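/*
 * From userspace this path is reached through the PERF_EVENT_IOC_PERIOD
 * ioctl, which takes a pointer to a u64. A sketch (for a freq-mode event the
 * same call updates sample_freq instead, as handled above):
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/perf_event.h>
 *
 *	uint64_t new_period = 200000;
 *
 *	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period) < 0)
 *		perror("PERF_EVENT_IOC_PERIOD");
 */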
3596
3597static const struct file_operations perf_fops;
3598
3599static inline int perf_fget_light(int fd, struct fd *p)
3600{
3601	struct fd f = fdget(fd);
3602	if (!f.file)
3603		return -EBADF;
3604
3605	if (f.file->f_op != &perf_fops) {
3606		fdput(f);
3607		return -EBADF;
3608	}
3609	*p = f;
3610	return 0;
3611}
3612
3613static int perf_event_set_output(struct perf_event *event,
3614				 struct perf_event *output_event);
3615static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 
3616
3617static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3618{
3619	struct perf_event *event = file->private_data;
3620	void (*func)(struct perf_event *);
3621	u32 flags = arg;
3622
3623	switch (cmd) {
3624	case PERF_EVENT_IOC_ENABLE:
3625		func = perf_event_enable;
3626		break;
3627	case PERF_EVENT_IOC_DISABLE:
3628		func = perf_event_disable;
3629		break;
3630	case PERF_EVENT_IOC_RESET:
3631		func = perf_event_reset;
3632		break;
3633
3634	case PERF_EVENT_IOC_REFRESH:
3635		return perf_event_refresh(event, arg);
3636
3637	case PERF_EVENT_IOC_PERIOD:
3638		return perf_event_period(event, (u64 __user *)arg);
3639
3640	case PERF_EVENT_IOC_ID:
3641	{
3642		u64 id = primary_event_id(event);
3643
3644		if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3645			return -EFAULT;
3646		return 0;
3647	}
3648
3649	case PERF_EVENT_IOC_SET_OUTPUT:
3650	{
3651		int ret;
3652		if (arg != -1) {
3653			struct perf_event *output_event;
3654			struct fd output;
3655			ret = perf_fget_light(arg, &output);
3656			if (ret)
3657				return ret;
3658			output_event = output.file->private_data;
3659			ret = perf_event_set_output(event, output_event);
3660			fdput(output);
3661		} else {
3662			ret = perf_event_set_output(event, NULL);
3663		}
3664		return ret;
3665	}
3666
3667	case PERF_EVENT_IOC_SET_FILTER:
3668		return perf_event_set_filter(event, (void __user *)arg);
3669
3670	default:
3671		return -ENOTTY;
3672	}
3673
3674	if (flags & PERF_IOC_FLAG_GROUP)
3675		perf_event_for_each(event, func);
3676	else
3677		perf_event_for_each_child(event, func);
3678
3679	return 0;
3680}
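/*
 * For the simple enable/disable/reset commands, passing PERF_IOC_FLAG_GROUP
 * in arg applies the operation to the whole group via perf_event_for_each().
 * A userspace sketch on a group leader's fd (run_workload() is a hypothetical
 * placeholder for the code being measured):
 *
 *	ioctl(group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
 *	ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 *	run_workload();
 *	ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 */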
3681
3682int perf_event_task_enable(void)
3683{
 
3684	struct perf_event *event;
3685
3686	mutex_lock(&current->perf_event_mutex);
3687	list_for_each_entry(event, &current->perf_event_list, owner_entry)
3688		perf_event_for_each_child(event, perf_event_enable);
 
 
 
3689	mutex_unlock(&current->perf_event_mutex);
3690
3691	return 0;
3692}
3693
3694int perf_event_task_disable(void)
3695{
 
3696	struct perf_event *event;
3697
3698	mutex_lock(&current->perf_event_mutex);
3699	list_for_each_entry(event, &current->perf_event_list, owner_entry)
3700		perf_event_for_each_child(event, perf_event_disable);
 
 
 
3701	mutex_unlock(&current->perf_event_mutex);
3702
3703	return 0;
3704}
3705
3706static int perf_event_index(struct perf_event *event)
3707{
3708	if (event->hw.state & PERF_HES_STOPPED)
3709		return 0;
3710
3711	if (event->state != PERF_EVENT_STATE_ACTIVE)
3712		return 0;
3713
3714	return event->pmu->event_idx(event);
3715}
3716
3717static void calc_timer_values(struct perf_event *event,
3718				u64 *now,
3719				u64 *enabled,
3720				u64 *running)
3721{
3722	u64 ctx_time;
3723
3724	*now = perf_clock();
3725	ctx_time = event->shadow_ctx_time + *now;
3726	*enabled = ctx_time - event->tstamp_enabled;
3727	*running = ctx_time - event->tstamp_running;
3728}
3729
3730static void perf_event_init_userpage(struct perf_event *event)
3731{
3732	struct perf_event_mmap_page *userpg;
3733	struct ring_buffer *rb;
3734
3735	rcu_read_lock();
3736	rb = rcu_dereference(event->rb);
3737	if (!rb)
3738		goto unlock;
3739
3740	userpg = rb->user_page;
3741
3742	/* Allow new userspace to detect that bit 0 is deprecated */
3743	userpg->cap_bit0_is_deprecated = 1;
3744	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
 
 
3745
3746unlock:
3747	rcu_read_unlock();
3748}
3749
3750void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 
3751{
3752}
3753
3754/*
3755 * Callers need to ensure there can be no nesting of this function, otherwise
3756 * the seqlock logic goes bad. We cannot serialize this because the arch
3757 * code calls this from NMI context.
3758 */
3759void perf_event_update_userpage(struct perf_event *event)
3760{
3761	struct perf_event_mmap_page *userpg;
3762	struct ring_buffer *rb;
3763	u64 enabled, running, now;
3764
3765	rcu_read_lock();
3766	rb = rcu_dereference(event->rb);
3767	if (!rb)
3768		goto unlock;
3769
3770	/*
3771	 * compute total_time_enabled, total_time_running
3772	 * based on snapshot values taken when the event
3773	 * was last scheduled in.
3774	 *
3775	 * we cannot simply call update_context_time()
3776	 * because of locking issues, as we can be called in
3777	 * NMI context
3778	 */
3779	calc_timer_values(event, &now, &enabled, &running);
3780
3781	userpg = rb->user_page;
3782	/*
3783	 * Disable preemption so as to not let the corresponding user-space
3784	 * spin too long if we get preempted.
3785	 */
3786	preempt_disable();
3787	++userpg->lock;
3788	barrier();
3789	userpg->index = perf_event_index(event);
3790	userpg->offset = perf_event_count(event);
3791	if (userpg->index)
3792		userpg->offset -= local64_read(&event->hw.prev_count);
3793
3794	userpg->time_enabled = enabled +
3795			atomic64_read(&event->child_total_time_enabled);
3796
3797	userpg->time_running = running +
3798			atomic64_read(&event->child_total_time_running);
3799
3800	arch_perf_update_userpage(userpg, now);
3801
3802	barrier();
3803	++userpg->lock;
3804	preempt_enable();
3805unlock:
3806	rcu_read_unlock();
3807}
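/*
 * Userspace consumes the lock/offset protocol above with a seqlock-style
 * retry loop over the mmap()ed metadata page. A sketch (base is assumed to
 * point at the first mmap()ed page; the rdpmc-style self-monitoring path via
 * userpg->index is architecture specific and omitted):
 *
 *	struct perf_event_mmap_page *pc = base;
 *	uint64_t count, enabled, running;
 *	uint32_t seq;
 *
 *	do {
 *		seq = pc->lock;
 *		__sync_synchronize();
 *		count   = pc->offset;
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		__sync_synchronize();
 *	} while (pc->lock != seq || (seq & 1));
 */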
3808
3809static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3810{
3811	struct perf_event *event = vma->vm_file->private_data;
3812	struct ring_buffer *rb;
3813	int ret = VM_FAULT_SIGBUS;
3814
3815	if (vmf->flags & FAULT_FLAG_MKWRITE) {
3816		if (vmf->pgoff == 0)
3817			ret = 0;
3818		return ret;
3819	}
3820
3821	rcu_read_lock();
3822	rb = rcu_dereference(event->rb);
3823	if (!rb)
3824		goto unlock;
3825
3826	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3827		goto unlock;
3828
3829	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3830	if (!vmf->page)
3831		goto unlock;
3832
3833	get_page(vmf->page);
3834	vmf->page->mapping = vma->vm_file->f_mapping;
3835	vmf->page->index   = vmf->pgoff;
3836
3837	ret = 0;
3838unlock:
3839	rcu_read_unlock();
3840
3841	return ret;
3842}
3843
3844static void ring_buffer_attach(struct perf_event *event,
3845			       struct ring_buffer *rb)
3846{
3847	struct ring_buffer *old_rb = NULL;
3848	unsigned long flags;
3849
3850	if (event->rb) {
3851		/*
3852		 * Should be impossible; we set this when removing
3853		 * event->rb_entry and wait/clear when adding event->rb_entry.
3854		 */
3855		WARN_ON_ONCE(event->rcu_pending);
3856
3857		old_rb = event->rb;
3858		event->rcu_batches = get_state_synchronize_rcu();
3859		event->rcu_pending = 1;
3860
3861		spin_lock_irqsave(&old_rb->event_lock, flags);
3862		list_del_rcu(&event->rb_entry);
3863		spin_unlock_irqrestore(&old_rb->event_lock, flags);
3864	}
3865
3866	if (event->rcu_pending && rb) {
3867		cond_synchronize_rcu(event->rcu_batches);
3868		event->rcu_pending = 0;
3869	}
3870
3871	if (rb) {
3872		spin_lock_irqsave(&rb->event_lock, flags);
3873		list_add_rcu(&event->rb_entry, &rb->event_list);
3874		spin_unlock_irqrestore(&rb->event_lock, flags);
3875	}
3876
3877	rcu_assign_pointer(event->rb, rb);
3878
3879	if (old_rb) {
3880		ring_buffer_put(old_rb);
3881		 * Since we detached the old rb before attaching the new
3882		 * one, we could have missed a wakeup.
3883		 * could attach the new rb, we could have missed a wakeup.
3884		 * Provide it now.
3885		 */
3886		wake_up_all(&event->waitq);
3887	}
3888}
3889
3890static void ring_buffer_wakeup(struct perf_event *event)
3891{
3892	struct ring_buffer *rb;
3893
3894	rcu_read_lock();
3895	rb = rcu_dereference(event->rb);
3896	if (rb) {
3897		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3898			wake_up_all(&event->waitq);
3899	}
3900	rcu_read_unlock();
3901}
3902
3903static void rb_free_rcu(struct rcu_head *rcu_head)
3904{
3905	struct ring_buffer *rb;
3906
3907	rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3908	rb_free(rb);
3909}
3910
3911static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3912{
3913	struct ring_buffer *rb;
3914
3915	rcu_read_lock();
3916	rb = rcu_dereference(event->rb);
3917	if (rb) {
3918		if (!atomic_inc_not_zero(&rb->refcount))
3919			rb = NULL;
3920	}
3921	rcu_read_unlock();
3922
3923	return rb;
3924}
3925
3926static void ring_buffer_put(struct ring_buffer *rb)
3927{
3928	if (!atomic_dec_and_test(&rb->refcount))
3929		return;
3930
3931	WARN_ON_ONCE(!list_empty(&rb->event_list));
3932
3933	call_rcu(&rb->rcu_head, rb_free_rcu);
3934}
3935
3936static void perf_mmap_open(struct vm_area_struct *vma)
3937{
3938	struct perf_event *event = vma->vm_file->private_data;
3939
3940	atomic_inc(&event->mmap_count);
3941	atomic_inc(&event->rb->mmap_count);
3942}
3943
 
 
3944/*
3945 * A buffer can be mmap()ed multiple times; either directly through the same
3946 * event, or through other events by use of perf_event_set_output().
3947 *
3948 * In order to undo the VM accounting done by perf_mmap() we need to destroy
3949 * the buffer here, where we still have a VM context. This means we need
3950 * to detach all events redirecting to us.
3951 */
3952static void perf_mmap_close(struct vm_area_struct *vma)
3953{
3954	struct perf_event *event = vma->vm_file->private_data;
3955
3956	struct ring_buffer *rb = ring_buffer_get(event);
3957	struct user_struct *mmap_user = rb->mmap_user;
3958	int mmap_locked = rb->mmap_locked;
3959	unsigned long size = perf_data_size(rb);
3960
3961	atomic_dec(&rb->mmap_count);
3962
3963	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3964		goto out_put;
3965
3966	ring_buffer_attach(event, NULL);
3967	mutex_unlock(&event->mmap_mutex);
3968
3969	/* If there's still other mmap()s of this buffer, we're done. */
3970	if (atomic_read(&rb->mmap_count))
3971		goto out_put;
3972
3973	/*
3974	 * No other mmap()s, detach from all other events that might redirect
3975	 * into the now unreachable buffer. Somewhat complicated by the
3976	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
3977	 */
3978again:
3979	rcu_read_lock();
3980	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
3981		if (!atomic_long_inc_not_zero(&event->refcount)) {
3982			/*
3983			 * This event is en-route to free_event() which will
3984			 * detach it and remove it from the list.
3985			 */
3986			continue;
3987		}
3988		rcu_read_unlock();
3989
3990		mutex_lock(&event->mmap_mutex);
3991		/*
3992		 * Check we didn't race with perf_event_set_output() which can
3993		 * swizzle the rb from under us while we were waiting to
3994		 * acquire mmap_mutex.
3995		 *
3996		 * If we find a different rb, ignore this event; the next
3997		 * iteration will no longer find it on the list. We have to
3998		 * still restart the iteration to make sure we're not now
3999		 * iterating the wrong list.
4000		 */
4001		if (event->rb == rb)
4002			ring_buffer_attach(event, NULL);
4003
4004		mutex_unlock(&event->mmap_mutex);
4005		put_event(event);
4006
4007		/*
4008		 * Restart the iteration; either we're on the wrong list or
4009		 * destroyed its integrity by doing a deletion.
4010		 */
4011		goto again;
4012	}
4013	rcu_read_unlock();
4014
4015	/*
4016	 * There could still be a few 0-ref events on the list; they'll
4017	 * get cleaned up by free_event() -- they'll also still have their
4018	 * ref on the rb and will free it whenever they are done with it.
4019	 *
4020	 * Aside from that, this buffer is 'fully' detached and unmapped,
4021	 * undo the VM accounting.
4022	 */
4023
4024	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4025	vma->vm_mm->pinned_vm -= mmap_locked;
4026	free_uid(mmap_user);
4027
4028out_put:
4029	ring_buffer_put(rb); /* could be last */
4030}
4031
4032static const struct vm_operations_struct perf_mmap_vmops = {
4033	.open		= perf_mmap_open,
4034	.close		= perf_mmap_close,
4035	.fault		= perf_mmap_fault,
4036	.page_mkwrite	= perf_mmap_fault,
4037};
4038
4039static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4040{
4041	struct perf_event *event = file->private_data;
4042	unsigned long user_locked, user_lock_limit;
4043	struct user_struct *user = current_user();
4044	unsigned long locked, lock_limit;
4045	struct ring_buffer *rb;
4046	unsigned long vma_size;
4047	unsigned long nr_pages;
4048	long user_extra, extra;
4049	int ret = 0, flags = 0;
4050
4051	/*
4052	 * Don't allow mmap() of inherited per-task counters. This would
4053	 * create a performance issue due to all children writing to the
4054	 * same rb.
4055	 */
4056	if (event->cpu == -1 && event->attr.inherit)
4057		return -EINVAL;
4058
4059	if (!(vma->vm_flags & VM_SHARED))
4060		return -EINVAL;
4061
4062	vma_size = vma->vm_end - vma->vm_start;
4063	nr_pages = (vma_size / PAGE_SIZE) - 1;
4064
4065	/*
4066	 * If we have rb pages ensure they're a power-of-two number, so we
4067	 * can do bitmasks instead of modulo.
4068	 */
4069	if (nr_pages != 0 && !is_power_of_2(nr_pages))
4070		return -EINVAL;
4071
4072	if (vma_size != PAGE_SIZE * (1 + nr_pages))
4073		return -EINVAL;
4074
4075	if (vma->vm_pgoff != 0)
4076		return -EINVAL;
4077
4078	WARN_ON_ONCE(event->ctx->parent_ctx);
4079again:
4080	mutex_lock(&event->mmap_mutex);
4081	if (event->rb) {
4082		if (event->rb->nr_pages != nr_pages) {
4083			ret = -EINVAL;
4084			goto unlock;
4085		}
4086
4087		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4088			/*
4089			 * Raced against perf_mmap_close() through
4090			 * perf_event_set_output(). Try again, hope for better
4091			 * luck.
4092			 */
4093			mutex_unlock(&event->mmap_mutex);
4094			goto again;
4095		}
4096
4097		goto unlock;
4098	}
4099
4100	user_extra = nr_pages + 1;
 
 
4101	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4102
4103	/*
4104	 * Increase the limit linearly with more CPUs:
4105	 */
4106	user_lock_limit *= num_online_cpus();
4107
4108	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4109
4110	extra = 0;
4111	if (user_locked > user_lock_limit)
4112		extra = user_locked - user_lock_limit;
4113
4114	lock_limit = rlimit(RLIMIT_MEMLOCK);
4115	lock_limit >>= PAGE_SHIFT;
4116	locked = vma->vm_mm->pinned_vm + extra;
4117
4118	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4119		!capable(CAP_IPC_LOCK)) {
4120		ret = -EPERM;
4121		goto unlock;
4122	}
4123
4124	WARN_ON(event->rb);
4125
4126	if (vma->vm_flags & VM_WRITE)
4127		flags |= RING_BUFFER_WRITABLE;
4128
4129	rb = rb_alloc(nr_pages,
4130		event->attr.watermark ? event->attr.wakeup_watermark : 0,
4131		event->cpu, flags);
4132
4133	if (!rb) {
4134		ret = -ENOMEM;
4135		goto unlock;
4136	}
4137
4138	atomic_set(&rb->mmap_count, 1);
4139	rb->mmap_locked = extra;
4140	rb->mmap_user = get_current_user();
 
4141
4142	atomic_long_add(user_extra, &user->locked_vm);
4143	vma->vm_mm->pinned_vm += extra;
 
4144
4145	ring_buffer_attach(event, rb);
4146
4147	perf_event_init_userpage(event);
4148	perf_event_update_userpage(event);
4149
4150unlock:
4151	if (!ret)
 
 
 
4152		atomic_inc(&event->mmap_count);
 
 
 
 
4153	mutex_unlock(&event->mmap_mutex);
4154
4155	/*
4156	 * Since pinned accounting is per vm we cannot allow fork() to copy our
4157	 * vma.
4158	 */
4159	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4160	vma->vm_ops = &perf_mmap_vmops;
4161
 
 
 
4162	return ret;
4163}
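/*
 * The constraints checked above (MAP_SHARED, offset 0, a power-of-two number
 * of data pages plus one metadata page) translate into a userspace mmap(2)
 * along these lines (sketch; error handling mostly omitted):
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	size_t page = sysconf(_SC_PAGESIZE);
 *	size_t n_data_pages = 8;		// must be a power of two
 *	size_t len = (1 + n_data_pages) * page;	// +1 for the metadata page
 *
 *	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	if (base == MAP_FAILED)
 *		perror("mmap");
 */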
4164
4165static int perf_fasync(int fd, struct file *filp, int on)
4166{
4167	struct inode *inode = file_inode(filp);
4168	struct perf_event *event = filp->private_data;
4169	int retval;
4170
4171	mutex_lock(&inode->i_mutex);
4172	retval = fasync_helper(fd, filp, on, &event->fasync);
4173	mutex_unlock(&inode->i_mutex);
4174
4175	if (retval < 0)
4176		return retval;
4177
4178	return 0;
4179}
4180
4181static const struct file_operations perf_fops = {
4182	.llseek			= no_llseek,
4183	.release		= perf_release,
4184	.read			= perf_read,
4185	.poll			= perf_poll,
4186	.unlocked_ioctl		= perf_ioctl,
4187	.compat_ioctl		= perf_ioctl,
4188	.mmap			= perf_mmap,
4189	.fasync			= perf_fasync,
4190};
4191
4192/*
4193 * Perf event wakeup
4194 *
4195 * If there's data, ensure we set the poll() state and publish everything
4196 * to user-space before waking everybody up.
4197 */
4198
4199void perf_event_wakeup(struct perf_event *event)
4200{
4201	ring_buffer_wakeup(event);
4202
4203	if (event->pending_kill) {
4204		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
4205		event->pending_kill = 0;
4206	}
4207}
4208
4209static void perf_pending_event(struct irq_work *entry)
4210{
4211	struct perf_event *event = container_of(entry,
4212			struct perf_event, pending);
4213
4214	if (event->pending_disable) {
4215		event->pending_disable = 0;
4216		__perf_event_disable(event);
4217	}
4218
4219	if (event->pending_wakeup) {
4220		event->pending_wakeup = 0;
4221		perf_event_wakeup(event);
4222	}
 
 
 
4223}
4224
4225/*
4226 * We assume there is only KVM supporting the callbacks.
4227 * Later on, we might change it to a list if there is
4228 * another virtualization implementation supporting the callbacks.
4229 */
4230struct perf_guest_info_callbacks *perf_guest_cbs;
4231
4232int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4233{
4234	perf_guest_cbs = cbs;
4235	return 0;
4236}
4237EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4238
4239int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4240{
4241	perf_guest_cbs = NULL;
4242	return 0;
4243}
4244EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4245
4246static void
4247perf_output_sample_regs(struct perf_output_handle *handle,
4248			struct pt_regs *regs, u64 mask)
4249{
4250	int bit;
 
4251
4252	for_each_set_bit(bit, (const unsigned long *) &mask,
4253			 sizeof(mask) * BITS_PER_BYTE) {
4254		u64 val;
4255
4256		val = perf_reg_value(regs, bit);
4257		perf_output_put(handle, val);
4258	}
4259}
4260
4261static void perf_sample_regs_user(struct perf_regs_user *regs_user,
4262				  struct pt_regs *regs)
 
4263{
4264	if (!user_mode(regs)) {
4265		if (current->mm)
4266			regs = task_pt_regs(current);
4267		else
4268			regs = NULL;
 
 
 
4269	}
 
4270
4271	if (regs) {
4272		regs_user->regs = regs;
4273		regs_user->abi  = perf_reg_abi(current);
4274	}
 
4275}
4276
 
4277/*
4278 * Get remaining task size from user stack pointer.
4279 *
4280 * It'd be better to take the stack vma map and limit this more
4281 * precisely, but there's no way to get it safely under interrupt,
4282 * so we use TASK_SIZE as the limit.
4283 */
4284static u64 perf_ustack_task_size(struct pt_regs *regs)
4285{
4286	unsigned long addr = perf_user_stack_pointer(regs);
4287
4288	if (!addr || addr >= TASK_SIZE)
4289		return 0;
4290
4291	return TASK_SIZE - addr;
4292}
4293
4294static u16
4295perf_sample_ustack_size(u16 stack_size, u16 header_size,
4296			struct pt_regs *regs)
4297{
4298	u64 task_size;
4299
4300	/* No regs, no stack pointer, no dump. */
4301	if (!regs)
4302		return 0;
4303
4304	/*
4305	 * Check whether the requested stack size fits into:
4306	 * - TASK_SIZE
4307	 *   If it doesn't, we limit the size to TASK_SIZE.
4308	 *
4309	 * - the remaining sample size
4310	 *   If it doesn't, we shrink the stack size to
4311	 *   fit into the remaining sample size.
4312	 */
4313
4314	task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4315	stack_size = min(stack_size, (u16) task_size);
4316
4317	/* Current header size plus static size and dynamic size. */
4318	header_size += 2 * sizeof(u64);
4319
4320	/* Do we fit in with the current stack dump size? */
4321	if ((u16) (header_size + stack_size) < header_size) {
4322		/*
4323		 * If we overflow the maximum size for the sample,
4324		 * we customize the stack dump size to fit in.
4325		 */
4326		stack_size = USHRT_MAX - header_size - sizeof(u64);
4327		stack_size = round_up(stack_size, sizeof(u64));
4328	}
4329
4330	return stack_size;
4331}
4332
4333static void
4334perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4335			  struct pt_regs *regs)
4336{
4337	/* Case of a kernel thread, nothing to dump */
4338	if (!regs) {
4339		u64 size = 0;
4340		perf_output_put(handle, size);
4341	} else {
4342		unsigned long sp;
4343		unsigned int rem;
4344		u64 dyn_size;
4345
4346		/*
4347		 * We dump:
4348		 * static size
4349		 *   - the size requested by user or the best one we can fit
4350		 *     in to the sample max size
4351		 * data
4352		 *   - user stack dump data
4353		 * dynamic size
4354		 *   - the actual dumped size
4355		 */
4356
4357		/* Static size. */
4358		perf_output_put(handle, dump_size);
4359
4360		/* Data. */
4361		sp = perf_user_stack_pointer(regs);
4362		rem = __output_copy_user(handle, (void *) sp, dump_size);
4363		dyn_size = dump_size - rem;
4364
4365		perf_output_skip(handle, rem);
4366
4367		/* Dynamic size. */
4368		perf_output_put(handle, dyn_size);
4369	}
4370}
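/*
 * A consumer walking the sample record parses the section emitted above as
 * "u64 size; char data[size]; u64 dyn_size" (dyn_size is absent when size is
 * zero). A sketch, where p is assumed to be a byte cursor positioned at the
 * PERF_SAMPLE_STACK_USER section of a record:
 *
 *	uint64_t size, dyn_size = 0;
 *	const char *stack;
 *
 *	memcpy(&size, p, sizeof(size));
 *	p += sizeof(size);
 *	stack = p;			// user stack bytes, size of them
 *	p += size;
 *	if (size) {
 *		memcpy(&dyn_size, p, sizeof(dyn_size));
 *		p += sizeof(dyn_size);	// bytes actually dumped
 *	}
 */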
4371
4372static void __perf_event_header__init_id(struct perf_event_header *header,
4373					 struct perf_sample_data *data,
4374					 struct perf_event *event)
4375{
4376	u64 sample_type = event->attr.sample_type;
4377
4378	data->type = sample_type;
4379	header->size += event->id_header_size;
4380
4381	if (sample_type & PERF_SAMPLE_TID) {
4382		/* namespace issues */
4383		data->tid_entry.pid = perf_event_pid(event, current);
4384		data->tid_entry.tid = perf_event_tid(event, current);
4385	}
4386
4387	if (sample_type & PERF_SAMPLE_TIME)
4388		data->time = perf_clock();
4389
4390	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4391		data->id = primary_event_id(event);
4392
4393	if (sample_type & PERF_SAMPLE_STREAM_ID)
4394		data->stream_id = event->id;
4395
4396	if (sample_type & PERF_SAMPLE_CPU) {
4397		data->cpu_entry.cpu	 = raw_smp_processor_id();
4398		data->cpu_entry.reserved = 0;
4399	}
4400}
4401
4402void perf_event_header__init_id(struct perf_event_header *header,
4403				struct perf_sample_data *data,
4404				struct perf_event *event)
4405{
4406	if (event->attr.sample_id_all)
4407		__perf_event_header__init_id(header, data, event);
4408}
4409
4410static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4411					   struct perf_sample_data *data)
4412{
4413	u64 sample_type = data->type;
4414
4415	if (sample_type & PERF_SAMPLE_TID)
4416		perf_output_put(handle, data->tid_entry);
4417
4418	if (sample_type & PERF_SAMPLE_TIME)
4419		perf_output_put(handle, data->time);
4420
4421	if (sample_type & PERF_SAMPLE_ID)
4422		perf_output_put(handle, data->id);
4423
4424	if (sample_type & PERF_SAMPLE_STREAM_ID)
4425		perf_output_put(handle, data->stream_id);
4426
4427	if (sample_type & PERF_SAMPLE_CPU)
4428		perf_output_put(handle, data->cpu_entry);
4429
4430	if (sample_type & PERF_SAMPLE_IDENTIFIER)
4431		perf_output_put(handle, data->id);
4432}
4433
4434void perf_event__output_id_sample(struct perf_event *event,
4435				  struct perf_output_handle *handle,
4436				  struct perf_sample_data *sample)
4437{
4438	if (event->attr.sample_id_all)
4439		__perf_event__output_id_sample(handle, sample);
4440}
4441
4442static void perf_output_read_one(struct perf_output_handle *handle,
4443				 struct perf_event *event,
4444				 u64 enabled, u64 running)
4445{
4446	u64 read_format = event->attr.read_format;
4447	u64 values[4];
4448	int n = 0;
4449
4450	values[n++] = perf_event_count(event);
4451	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4452		values[n++] = enabled +
4453			atomic64_read(&event->child_total_time_enabled);
4454	}
4455	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4456		values[n++] = running +
4457			atomic64_read(&event->child_total_time_running);
4458	}
4459	if (read_format & PERF_FORMAT_ID)
4460		values[n++] = primary_event_id(event);
4461
4462	__output_copy(handle, values, n * sizeof(u64));
4463}
4464
4465/*
4466 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4467 */
4468static void perf_output_read_group(struct perf_output_handle *handle,
4469			    struct perf_event *event,
4470			    u64 enabled, u64 running)
4471{
4472	struct perf_event *leader = event->group_leader, *sub;
4473	u64 read_format = event->attr.read_format;
4474	u64 values[5];
4475	int n = 0;
4476
4477	values[n++] = 1 + leader->nr_siblings;
4478
4479	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4480		values[n++] = enabled;
4481
4482	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4483		values[n++] = running;
4484
4485	if (leader != event)
4486		leader->pmu->read(leader);
4487
4488	values[n++] = perf_event_count(leader);
4489	if (read_format & PERF_FORMAT_ID)
4490		values[n++] = primary_event_id(leader);
4491
4492	__output_copy(handle, values, n * sizeof(u64));
4493
4494	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4495		n = 0;
4496
4497		if ((sub != event) &&
4498		    (sub->state == PERF_EVENT_STATE_ACTIVE))
4499			sub->pmu->read(sub);
4500
4501		values[n++] = perf_event_count(sub);
4502		if (read_format & PERF_FORMAT_ID)
4503			values[n++] = primary_event_id(sub);
4504
4505		__output_copy(handle, values, n * sizeof(u64));
4506	}
4507}
4508
4509#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4510				 PERF_FORMAT_TOTAL_TIME_RUNNING)
4511
4512static void perf_output_read(struct perf_output_handle *handle,
4513			     struct perf_event *event)
4514{
4515	u64 enabled = 0, running = 0, now;
4516	u64 read_format = event->attr.read_format;
4517
4518	/*
4519	 * compute total_time_enabled, total_time_running
4520	 * based on snapshot values taken when the event
4521	 * was last scheduled in.
4522	 *
4523	 * we cannot simply call update_context_time()
4524	 * because of locking issues, as we are called in
4525	 * NMI context
4526	 */
4527	if (read_format & PERF_FORMAT_TOTAL_TIMES)
4528		calc_timer_values(event, &now, &enabled, &running);
4529
4530	if (event->attr.read_format & PERF_FORMAT_GROUP)
4531		perf_output_read_group(handle, event, enabled, running);
4532	else
4533		perf_output_read_one(handle, event, enabled, running);
4534}
4535
4536void perf_output_sample(struct perf_output_handle *handle,
4537			struct perf_event_header *header,
4538			struct perf_sample_data *data,
4539			struct perf_event *event)
4540{
4541	u64 sample_type = data->type;
4542
4543	perf_output_put(handle, *header);
4544
4545	if (sample_type & PERF_SAMPLE_IDENTIFIER)
4546		perf_output_put(handle, data->id);
4547
4548	if (sample_type & PERF_SAMPLE_IP)
4549		perf_output_put(handle, data->ip);
4550
4551	if (sample_type & PERF_SAMPLE_TID)
4552		perf_output_put(handle, data->tid_entry);
4553
4554	if (sample_type & PERF_SAMPLE_TIME)
4555		perf_output_put(handle, data->time);
4556
4557	if (sample_type & PERF_SAMPLE_ADDR)
4558		perf_output_put(handle, data->addr);
4559
4560	if (sample_type & PERF_SAMPLE_ID)
4561		perf_output_put(handle, data->id);
4562
4563	if (sample_type & PERF_SAMPLE_STREAM_ID)
4564		perf_output_put(handle, data->stream_id);
4565
4566	if (sample_type & PERF_SAMPLE_CPU)
4567		perf_output_put(handle, data->cpu_entry);
4568
4569	if (sample_type & PERF_SAMPLE_PERIOD)
4570		perf_output_put(handle, data->period);
4571
4572	if (sample_type & PERF_SAMPLE_READ)
4573		perf_output_read(handle, event);
4574
4575	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4576		if (data->callchain) {
4577			int size = 1;
4578
4579			if (data->callchain)
4580				size += data->callchain->nr;
4581
4582			size *= sizeof(u64);
4583
4584			__output_copy(handle, data->callchain, size);
4585		} else {
4586			u64 nr = 0;
4587			perf_output_put(handle, nr);
4588		}
4589	}
4590
4591	if (sample_type & PERF_SAMPLE_RAW) {
4592		if (data->raw) {
4593			perf_output_put(handle, data->raw->size);
4594			__output_copy(handle, data->raw->data,
4595					   data->raw->size);
4596		} else {
4597			struct {
4598				u32	size;
4599				u32	data;
4600			} raw = {
4601				.size = sizeof(u32),
4602				.data = 0,
4603			};
4604			perf_output_put(handle, raw);
4605		}
4606	}
4607
4608	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4609		if (data->br_stack) {
4610			size_t size;
4611
4612			size = data->br_stack->nr
4613			     * sizeof(struct perf_branch_entry);
4614
4615			perf_output_put(handle, data->br_stack->nr);
4616			perf_output_copy(handle, data->br_stack->entries, size);
4617		} else {
4618			/*
4619			 * we always store at least the value of nr
4620			 */
4621			u64 nr = 0;
4622			perf_output_put(handle, nr);
4623		}
4624	}
4625
4626	if (sample_type & PERF_SAMPLE_REGS_USER) {
4627		u64 abi = data->regs_user.abi;
4628
4629		/*
4630		 * If there are no regs to dump, notice it through
4631		 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4632		 */
4633		perf_output_put(handle, abi);
4634
4635		if (abi) {
4636			u64 mask = event->attr.sample_regs_user;
4637			perf_output_sample_regs(handle,
4638						data->regs_user.regs,
4639						mask);
4640		}
4641	}
4642
4643	if (sample_type & PERF_SAMPLE_STACK_USER) {
4644		perf_output_sample_ustack(handle,
4645					  data->stack_user_size,
4646					  data->regs_user.regs);
4647	}
4648
4649	if (sample_type & PERF_SAMPLE_WEIGHT)
4650		perf_output_put(handle, data->weight);
4651
4652	if (sample_type & PERF_SAMPLE_DATA_SRC)
4653		perf_output_put(handle, data->data_src.val);
4654
4655	if (sample_type & PERF_SAMPLE_TRANSACTION)
4656		perf_output_put(handle, data->txn);
4657
4658	if (!event->attr.watermark) {
4659		int wakeup_events = event->attr.wakeup_events;
4660
4661		if (wakeup_events) {
4662			struct ring_buffer *rb = handle->rb;
4663			int events = local_inc_return(&rb->events);
4664
4665			if (events >= wakeup_events) {
4666				local_sub(wakeup_events, &rb->events);
4667				local_inc(&rb->wakeup);
4668			}
4669		}
4670	}
4671}
4672
4673void perf_prepare_sample(struct perf_event_header *header,
4674			 struct perf_sample_data *data,
4675			 struct perf_event *event,
4676			 struct pt_regs *regs)
4677{
4678	u64 sample_type = event->attr.sample_type;
4679
4680	header->type = PERF_RECORD_SAMPLE;
4681	header->size = sizeof(*header) + event->header_size;
4682
4683	header->misc = 0;
4684	header->misc |= perf_misc_flags(regs);
4685
4686	__perf_event_header__init_id(header, data, event);
4687
4688	if (sample_type & PERF_SAMPLE_IP)
4689		data->ip = perf_instruction_pointer(regs);
4690
4691	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4692		int size = 1;
4693
4694		data->callchain = perf_callchain(event, regs);
4695
4696		if (data->callchain)
4697			size += data->callchain->nr;
4698
4699		header->size += size * sizeof(u64);
4700	}
4701
4702	if (sample_type & PERF_SAMPLE_RAW) {
4703		int size = sizeof(u32);
4704
4705		if (data->raw)
4706			size += data->raw->size;
4707		else
4708			size += sizeof(u32);
4709
4710		WARN_ON_ONCE(size & (sizeof(u64)-1));
4711		header->size += size;
4712	}
4713
4714	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4715		int size = sizeof(u64); /* nr */
4716		if (data->br_stack) {
4717			size += data->br_stack->nr
4718			      * sizeof(struct perf_branch_entry);
4719		}
4720		header->size += size;
4721	}
4722
4723	if (sample_type & PERF_SAMPLE_REGS_USER) {
4724		/* regs dump ABI info */
4725		int size = sizeof(u64);
4726
4727		perf_sample_regs_user(&data->regs_user, regs);
4728
4729		if (data->regs_user.regs) {
4730			u64 mask = event->attr.sample_regs_user;
4731			size += hweight64(mask) * sizeof(u64);
4732		}
4733
4734		header->size += size;
4735	}
4736
4737	if (sample_type & PERF_SAMPLE_STACK_USER) {
4738		/*
4739		 * Either the PERF_SAMPLE_STACK_USER bit needs to always be
4740		 * processed as the last one, or an additional check must be
4741		 * added whenever a new sample type is introduced, because we
4742		 * could eat up the rest of the sample size.
4743		 */
4744		struct perf_regs_user *uregs = &data->regs_user;
4745		u16 stack_size = event->attr.sample_stack_user;
4746		u16 size = sizeof(u64);
4747
4748		if (!uregs->abi)
4749			perf_sample_regs_user(uregs, regs);
4750
4751		stack_size = perf_sample_ustack_size(stack_size, header->size,
4752						     uregs->regs);
4753
4754		/*
4755		 * If there is something to dump, add space for the dump
4756		 * itself and for the field that tells the dynamic size,
4757		 * which is how many bytes have actually been dumped.
4758		 */
4759		if (stack_size)
4760			size += sizeof(u64) + stack_size;
4761
4762		data->stack_user_size = stack_size;
4763		header->size += size;
4764	}
4765}
4766
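/*
 * Default sample emission path: prepare the sample, reserve ring-buffer
 * space and write the record, all under rcu_read_lock() so the callchain
 * buffers cannot go away underneath us.
 */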
4767static void perf_event_output(struct perf_event *event,
4768				struct perf_sample_data *data,
4769				struct pt_regs *regs)
4770{
4771	struct perf_output_handle handle;
4772	struct perf_event_header header;
4773
4774	/* protect the callchain buffers */
4775	rcu_read_lock();
4776
4777	perf_prepare_sample(&header, data, event, regs);
4778
4779	if (perf_output_begin(&handle, event, header.size))
4780		goto exit;
4781
4782	perf_output_sample(&handle, &header, data, event);
4783
4784	perf_output_end(&handle);
4785
4786exit:
4787	rcu_read_unlock();
4788}
4789
4790/*
4791 * read event_id
4792 */
4793
4794struct perf_read_event {
4795	struct perf_event_header	header;
4796
4797	u32				pid;
4798	u32				tid;
4799};
4800
4801static void
4802perf_event_read_event(struct perf_event *event,
4803			struct task_struct *task)
4804{
4805	struct perf_output_handle handle;
4806	struct perf_sample_data sample;
4807	struct perf_read_event read_event = {
4808		.header = {
4809			.type = PERF_RECORD_READ,
4810			.misc = 0,
4811			.size = sizeof(read_event) + event->read_size,
4812		},
4813		.pid = perf_event_pid(event, task),
4814		.tid = perf_event_tid(event, task),
4815	};
4816	int ret;
4817
4818	perf_event_header__init_id(&read_event.header, &sample, event);
4819	ret = perf_output_begin(&handle, event, read_event.header.size);
4820	if (ret)
4821		return;
4822
4823	perf_output_put(&handle, read_event);
4824	perf_output_read(&handle, event);
4825	perf_event__output_id_sample(event, &handle, &sample);
4826
4827	perf_output_end(&handle);
4828}
4829
4830typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4831
4832static void
4833perf_event_aux_ctx(struct perf_event_context *ctx,
4834		   perf_event_aux_output_cb output,
4835		   void *data)
4836{
4837	struct perf_event *event;
4838
4839	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4840		if (event->state < PERF_EVENT_STATE_INACTIVE)
4841			continue;
4842		if (!event_filter_match(event))
4843			continue;
4844		output(event, data);
4845	}
4846}
4847
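/*
 * Deliver a side-band record to every matching event: walk each PMU's
 * CPU context and, unless an explicit @task_ctx was given, the current
 * task's contexts; an explicit @task_ctx is walked last.
 */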
4848static void
4849perf_event_aux(perf_event_aux_output_cb output, void *data,
4850	       struct perf_event_context *task_ctx)
4851{
4852	struct perf_cpu_context *cpuctx;
4853	struct perf_event_context *ctx;
4854	struct pmu *pmu;
4855	int ctxn;
4856
4857	rcu_read_lock();
4858	list_for_each_entry_rcu(pmu, &pmus, entry) {
4859		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4860		if (cpuctx->unique_pmu != pmu)
4861			goto next;
4862		perf_event_aux_ctx(&cpuctx->ctx, output, data);
4863		if (task_ctx)
4864			goto next;
4865		ctxn = pmu->task_ctx_nr;
4866		if (ctxn < 0)
4867			goto next;
4868		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4869		if (ctx)
4870			perf_event_aux_ctx(ctx, output, data);
4871next:
4872		put_cpu_ptr(pmu->pmu_cpu_context);
4873	}
4874
4875	if (task_ctx) {
4876		preempt_disable();
4877		perf_event_aux_ctx(task_ctx, output, data);
4878		preempt_enable();
4879	}
4880	rcu_read_unlock();
4881}
4882
4883/*
4884 * task tracking -- fork/exit
4885 *
4886 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
4887 */
4888
4889struct perf_task_event {
4890	struct task_struct		*task;
4891	struct perf_event_context	*task_ctx;
4892
4893	struct {
4894		struct perf_event_header	header;
4895
4896		u32				pid;
4897		u32				ppid;
4898		u32				tid;
4899		u32				ptid;
4900		u64				time;
4901	} event_id;
4902};
4903
4904static int perf_event_task_match(struct perf_event *event)
4905{
4906	return event->attr.comm  || event->attr.mmap ||
4907	       event->attr.mmap2 || event->attr.mmap_data ||
4908	       event->attr.task;
4909}
4910
4911static void perf_event_task_output(struct perf_event *event,
4912				   void *data)
4913{
4914	struct perf_task_event *task_event = data;
4915	struct perf_output_handle handle;
4916	struct perf_sample_data	sample;
4917	struct task_struct *task = task_event->task;
4918	int ret, size = task_event->event_id.header.size;
4919
4920	if (!perf_event_task_match(event))
4921		return;
4922
4923	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4924
4925	ret = perf_output_begin(&handle, event,
4926				task_event->event_id.header.size);
4927	if (ret)
4928		goto out;
4929
4930	task_event->event_id.pid = perf_event_pid(event, task);
4931	task_event->event_id.ppid = perf_event_pid(event, current);
4932
4933	task_event->event_id.tid = perf_event_tid(event, task);
4934	task_event->event_id.ptid = perf_event_tid(event, current);
4935
4936	perf_output_put(&handle, task_event->event_id);
4937
4938	perf_event__output_id_sample(event, &handle, &sample);
4939
4940	perf_output_end(&handle);
4941out:
4942	task_event->event_id.header.size = size;
4943}
4944
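/*
 * Emit a PERF_RECORD_FORK or PERF_RECORD_EXIT record for @task, provided
 * anybody is listening for comm, mmap or task events.
 */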
4945static void perf_event_task(struct task_struct *task,
4946			      struct perf_event_context *task_ctx,
4947			      int new)
4948{
4949	struct perf_task_event task_event;
4950
4951	if (!atomic_read(&nr_comm_events) &&
4952	    !atomic_read(&nr_mmap_events) &&
4953	    !atomic_read(&nr_task_events))
4954		return;
4955
4956	task_event = (struct perf_task_event){
4957		.task	  = task,
4958		.task_ctx = task_ctx,
4959		.event_id    = {
4960			.header = {
4961				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4962				.misc = 0,
4963				.size = sizeof(task_event.event_id),
4964			},
4965			/* .pid  */
4966			/* .ppid */
4967			/* .tid  */
4968			/* .ptid */
4969			.time = perf_clock(),
4970		},
4971	};
4972
4973	perf_event_aux(perf_event_task_output,
4974		       &task_event,
4975		       task_ctx);
4976}
4977
4978void perf_event_fork(struct task_struct *task)
4979{
4980	perf_event_task(task, NULL, 1);
4981}
4982
4983/*
4984 * comm tracking
4985 */
4986
4987struct perf_comm_event {
4988	struct task_struct	*task;
4989	char			*comm;
4990	int			comm_size;
4991
4992	struct {
4993		struct perf_event_header	header;
4994
4995		u32				pid;
4996		u32				tid;
4997	} event_id;
4998};
4999
5000static int perf_event_comm_match(struct perf_event *event)
5001{
5002	return event->attr.comm;
5003}
5004
5005static void perf_event_comm_output(struct perf_event *event,
5006				   void *data)
5007{
5008	struct perf_comm_event *comm_event = data;
5009	struct perf_output_handle handle;
5010	struct perf_sample_data sample;
5011	int size = comm_event->event_id.header.size;
5012	int ret;
5013
5014	if (!perf_event_comm_match(event))
5015		return;
5016
5017	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5018	ret = perf_output_begin(&handle, event,
5019				comm_event->event_id.header.size);
5020
5021	if (ret)
5022		goto out;
5023
5024	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5025	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5026
5027	perf_output_put(&handle, comm_event->event_id);
5028	__output_copy(&handle, comm_event->comm,
5029				   comm_event->comm_size);
5030
5031	perf_event__output_id_sample(event, &handle, &sample);
5032
5033	perf_output_end(&handle);
5034out:
5035	comm_event->event_id.header.size = size;
5036}
5037
5038static void perf_event_comm_event(struct perf_comm_event *comm_event)
5039{
5040	char comm[TASK_COMM_LEN];
5041	unsigned int size;
5042
5043	memset(comm, 0, sizeof(comm));
5044	strlcpy(comm, comm_event->task->comm, sizeof(comm));
5045	size = ALIGN(strlen(comm)+1, sizeof(u64));
5046
5047	comm_event->comm = comm;
5048	comm_event->comm_size = size;
5049
5050	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5051
5052	perf_event_aux(perf_event_comm_output,
5053		       comm_event,
5054		       NULL);
5055}
5056
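/*
 * Called when a task's comm changes: re-arm enable_on_exec events in each
 * of the task's contexts and, if anybody cares, emit a PERF_RECORD_COMM.
 */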
5057void perf_event_comm(struct task_struct *task)
5058{
5059	struct perf_comm_event comm_event;
5060	struct perf_event_context *ctx;
5061	int ctxn;
5062
5063	rcu_read_lock();
5064	for_each_task_context_nr(ctxn) {
5065		ctx = task->perf_event_ctxp[ctxn];
5066		if (!ctx)
5067			continue;
5068
5069		perf_event_enable_on_exec(ctx);
5070	}
5071	rcu_read_unlock();
5072
5073	if (!atomic_read(&nr_comm_events))
5074		return;
5075
5076	comm_event = (struct perf_comm_event){
5077		.task	= task,
5078		/* .comm      */
5079		/* .comm_size */
5080		.event_id  = {
5081			.header = {
5082				.type = PERF_RECORD_COMM,
5083				.misc = 0,
5084				/* .size */
5085			},
5086			/* .pid */
5087			/* .tid */
5088		},
5089	};
5090
5091	perf_event_comm_event(&comm_event);
5092}
5093
5094/*
5095 * mmap tracking
5096 */
5097
5098struct perf_mmap_event {
5099	struct vm_area_struct	*vma;
5100
5101	const char		*file_name;
5102	int			file_size;
5103	int			maj, min;
5104	u64			ino;
5105	u64			ino_generation;
5106
5107	struct {
5108		struct perf_event_header	header;
5109
5110		u32				pid;
5111		u32				tid;
5112		u64				start;
5113		u64				len;
5114		u64				pgoff;
5115	} event_id;
5116};
5117
5118static int perf_event_mmap_match(struct perf_event *event,
5119				 void *data)
5120{
5121	struct perf_mmap_event *mmap_event = data;
5122	struct vm_area_struct *vma = mmap_event->vma;
5123	int executable = vma->vm_flags & VM_EXEC;
5124
5125	return (!executable && event->attr.mmap_data) ||
5126	       (executable && (event->attr.mmap || event->attr.mmap2));
5127}
5128
5129static void perf_event_mmap_output(struct perf_event *event,
5130				   void *data)
5131{
5132	struct perf_mmap_event *mmap_event = data;
5133	struct perf_output_handle handle;
5134	struct perf_sample_data sample;
5135	int size = mmap_event->event_id.header.size;
5136	int ret;
5137
5138	if (!perf_event_mmap_match(event, data))
5139		return;
5140
5141	if (event->attr.mmap2) {
5142		mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5143		mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5144		mmap_event->event_id.header.size += sizeof(mmap_event->min);
5145		mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5146		mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5147	}
5148
5149	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5150	ret = perf_output_begin(&handle, event,
5151				mmap_event->event_id.header.size);
5152	if (ret)
5153		goto out;
5154
5155	mmap_event->event_id.pid = perf_event_pid(event, current);
5156	mmap_event->event_id.tid = perf_event_tid(event, current);
5157
5158	perf_output_put(&handle, mmap_event->event_id);
5159
5160	if (event->attr.mmap2) {
5161		perf_output_put(&handle, mmap_event->maj);
5162		perf_output_put(&handle, mmap_event->min);
5163		perf_output_put(&handle, mmap_event->ino);
5164		perf_output_put(&handle, mmap_event->ino_generation);
5165	}
5166
5167	__output_copy(&handle, mmap_event->file_name,
5168				   mmap_event->file_size);
5169
5170	perf_event__output_id_sample(event, &handle, &sample);
5171
5172	perf_output_end(&handle);
5173out:
5174	mmap_event->event_id.header.size = size;
5175}
5176
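/*
 * Resolve the vma to a name -- d_path() for file mappings, otherwise an
 * arch-specific or synthetic name ([heap], [stack], //anon) -- pad it to
 * a multiple of u64 and broadcast the resulting MMAP record.
 */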
5177static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5178{
5179	struct vm_area_struct *vma = mmap_event->vma;
5180	struct file *file = vma->vm_file;
5181	int maj = 0, min = 0;
5182	u64 ino = 0, gen = 0;
5183	unsigned int size;
5184	char tmp[16];
5185	char *buf = NULL;
5186	char *name;
5187
5188	if (file) {
5189		struct inode *inode;
5190		dev_t dev;
5191
5192		buf = kmalloc(PATH_MAX, GFP_KERNEL);
5193		if (!buf) {
5194			name = "//enomem";
5195			goto cpy_name;
5196		}
5197		/*
5198		 * d_path() works from the end of the rb backwards, so we
5199		 * need to add enough zero bytes after the string to handle
5200		 * the 64bit alignment we do later.
5201		 */
5202		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5203		if (IS_ERR(name)) {
5204			name = "//toolong";
5205			goto cpy_name;
5206		}
5207		inode = file_inode(vma->vm_file);
5208		dev = inode->i_sb->s_dev;
5209		ino = inode->i_ino;
5210		gen = inode->i_generation;
5211		maj = MAJOR(dev);
5212		min = MINOR(dev);
5213		goto got_name;
5214	} else {
5215		name = (char *)arch_vma_name(vma);
5216		if (name)
5217			goto cpy_name;
5218
5219		if (vma->vm_start <= vma->vm_mm->start_brk &&
5220				vma->vm_end >= vma->vm_mm->brk) {
5221			name = "[heap]";
5222			goto cpy_name;
5223		}
5224		if (vma->vm_start <= vma->vm_mm->start_stack &&
5225				vma->vm_end >= vma->vm_mm->start_stack) {
5226			name = "[stack]";
5227			goto cpy_name;
5228		}
5229
5230		name = "//anon";
5231		goto cpy_name;
5232	}
5233
5234cpy_name:
5235	strlcpy(tmp, name, sizeof(tmp));
5236	name = tmp;
5237got_name:
5238	/*
5239	 * Since our buffer works in 8 byte units we need to align our string
5240	 * size to a multiple of 8. However, we must guarantee the tail end is
5241	 * zero'd out to avoid leaking random bits to userspace.
5242	 */
5243	size = strlen(name)+1;
5244	while (!IS_ALIGNED(size, sizeof(u64)))
5245		name[size++] = '\0';
5246
5247	mmap_event->file_name = name;
5248	mmap_event->file_size = size;
5249	mmap_event->maj = maj;
5250	mmap_event->min = min;
5251	mmap_event->ino = ino;
5252	mmap_event->ino_generation = gen;
5253
5254	if (!(vma->vm_flags & VM_EXEC))
5255		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5256
5257	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5258
5259	perf_event_aux(perf_event_mmap_output,
5260		       mmap_event,
5261		       NULL);
5262
5263	kfree(buf);
5264}
5265
5266void perf_event_mmap(struct vm_area_struct *vma)
5267{
5268	struct perf_mmap_event mmap_event;
5269
5270	if (!atomic_read(&nr_mmap_events))
5271		return;
5272
5273	mmap_event = (struct perf_mmap_event){
5274		.vma	= vma,
5275		/* .file_name */
5276		/* .file_size */
5277		.event_id  = {
5278			.header = {
5279				.type = PERF_RECORD_MMAP,
5280				.misc = PERF_RECORD_MISC_USER,
5281				/* .size */
5282			},
5283			/* .pid */
5284			/* .tid */
5285			.start  = vma->vm_start,
5286			.len    = vma->vm_end - vma->vm_start,
5287			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
5288		},
5289		/* .maj (attr_mmap2 only) */
5290		/* .min (attr_mmap2 only) */
5291		/* .ino (attr_mmap2 only) */
5292		/* .ino_generation (attr_mmap2 only) */
5293	};
5294
5295	perf_event_mmap_event(&mmap_event);
5296}
5297
5298/*
5299 * IRQ throttle logging
5300 */
5301
5302static void perf_log_throttle(struct perf_event *event, int enable)
5303{
5304	struct perf_output_handle handle;
5305	struct perf_sample_data sample;
5306	int ret;
5307
5308	struct {
5309		struct perf_event_header	header;
5310		u64				time;
5311		u64				id;
5312		u64				stream_id;
5313	} throttle_event = {
5314		.header = {
5315			.type = PERF_RECORD_THROTTLE,
5316			.misc = 0,
5317			.size = sizeof(throttle_event),
5318		},
5319		.time		= perf_clock(),
5320		.id		= primary_event_id(event),
5321		.stream_id	= event->id,
5322	};
5323
5324	if (enable)
5325		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
5326
5327	perf_event_header__init_id(&throttle_event.header, &sample, event);
5328
5329	ret = perf_output_begin(&handle, event,
5330				throttle_event.header.size);
5331	if (ret)
5332		return;
5333
5334	perf_output_put(&handle, throttle_event);
5335	perf_event__output_id_sample(event, &handle, &sample);
5336	perf_output_end(&handle);
5337}
5338
5339/*
5340 * Generic event overflow handling, sampling.
5341 */
5342
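/*
 * Common overflow path: throttle events that exceed max_samples_per_tick,
 * re-adjust the period of freq based events, honour the event_limit and
 * finally hand the sample to the overflow handler (or the default output
 * path).
 */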
5343static int __perf_event_overflow(struct perf_event *event,
5344				   int throttle, struct perf_sample_data *data,
5345				   struct pt_regs *regs)
5346{
5347	int events = atomic_read(&event->event_limit);
5348	struct hw_perf_event *hwc = &event->hw;
5349	u64 seq;
5350	int ret = 0;
5351
5352	/*
5353	 * Non-sampling counters might still use the PMI to fold short
5354	 * hardware counters, ignore those.
5355	 */
5356	if (unlikely(!is_sampling_event(event)))
5357		return 0;
5358
5359	seq = __this_cpu_read(perf_throttled_seq);
5360	if (seq != hwc->interrupts_seq) {
5361		hwc->interrupts_seq = seq;
5362		hwc->interrupts = 1;
5363	} else {
5364		hwc->interrupts++;
5365		if (unlikely(throttle
5366			     && hwc->interrupts >= max_samples_per_tick)) {
5367			__this_cpu_inc(perf_throttled_count);
5368			hwc->interrupts = MAX_INTERRUPTS;
5369			perf_log_throttle(event, 0);
5370			tick_nohz_full_kick();
5371			ret = 1;
5372		}
5373	}
5374
5375	if (event->attr.freq) {
5376		u64 now = perf_clock();
5377		s64 delta = now - hwc->freq_time_stamp;
5378
5379		hwc->freq_time_stamp = now;
5380
5381		if (delta > 0 && delta < 2*TICK_NSEC)
5382			perf_adjust_period(event, delta, hwc->last_period, true);
5383	}
5384
5385	/*
5386	 * XXX event_limit might not quite work as expected on inherited
5387	 * events
5388	 */
5389
5390	event->pending_kill = POLL_IN;
5391	if (events && atomic_dec_and_test(&event->event_limit)) {
5392		ret = 1;
5393		event->pending_kill = POLL_HUP;
5394		event->pending_disable = 1;
5395		irq_work_queue(&event->pending);
5396	}
5397
5398	if (event->overflow_handler)
5399		event->overflow_handler(event, data, regs);
5400	else
5401		perf_event_output(event, data, regs);
5402
5403	if (event->fasync && event->pending_kill) {
5404		event->pending_wakeup = 1;
5405		irq_work_queue(&event->pending);
5406	}
5407
5408	return ret;
5409}
5410
5411int perf_event_overflow(struct perf_event *event,
5412			  struct perf_sample_data *data,
5413			  struct pt_regs *regs)
5414{
5415	return __perf_event_overflow(event, 1, data, regs);
5416}
5417
5418/*
5419 * Generic software event infrastructure
5420 */
5421
5422struct swevent_htable {
5423	struct swevent_hlist		*swevent_hlist;
5424	struct mutex			hlist_mutex;
5425	int				hlist_refcount;
5426
5427	/* Recursion avoidance in each context */
5428	int				recursion[PERF_NR_CONTEXTS];
5429
5430	/* Keeps track of cpu being initialized/exited */
5431	bool				online;
5432};
5433
5434static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5435
5436/*
5437 * We directly increment event->count and keep a second value in
5438 * event->hw.period_left to count intervals. This period event
5439 * is kept in the range [-sample_period, 0] so that we can use the
5440 * sign as trigger.
5441 */
5442
5443u64 perf_swevent_set_period(struct perf_event *event)
5444{
5445	struct hw_perf_event *hwc = &event->hw;
5446	u64 period = hwc->last_period;
5447	u64 nr, offset;
5448	s64 old, val;
5449
5450	hwc->last_period = hwc->sample_period;
5451
5452again:
5453	old = val = local64_read(&hwc->period_left);
5454	if (val < 0)
5455		return 0;
5456
5457	nr = div64_u64(period + val, period);
5458	offset = nr * period;
5459	val -= offset;
5460	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
5461		goto again;
5462
5463	return nr;
5464}
5465
5466static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5467				    struct perf_sample_data *data,
5468				    struct pt_regs *regs)
5469{
5470	struct hw_perf_event *hwc = &event->hw;
5471	int throttle = 0;
5472
5473	if (!overflow)
5474		overflow = perf_swevent_set_period(event);
5475
5476	if (hwc->interrupts == MAX_INTERRUPTS)
5477		return;
5478
5479	for (; overflow; overflow--) {
5480		if (__perf_event_overflow(event, throttle,
5481					    data, regs)) {
5482			/*
5483			 * We inhibit the overflow from happening when
5484			 * hwc->interrupts == MAX_INTERRUPTS.
5485			 */
5486			break;
5487		}
5488		throttle = 1;
5489	}
5490}
5491
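/*
 * Account @nr occurrences to a software event and, for sampling events,
 * work out whether the sample period has elapsed and an overflow record
 * must be generated.
 */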
5492static void perf_swevent_event(struct perf_event *event, u64 nr,
5493			       struct perf_sample_data *data,
5494			       struct pt_regs *regs)
5495{
5496	struct hw_perf_event *hwc = &event->hw;
5497
5498	local64_add(nr, &event->count);
5499
5500	if (!regs)
5501		return;
5502
5503	if (!is_sampling_event(event))
5504		return;
5505
5506	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
5507		data->period = nr;
5508		return perf_swevent_overflow(event, 1, data, regs);
5509	} else
5510		data->period = event->hw.last_period;
5511
5512	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5513		return perf_swevent_overflow(event, 1, data, regs);
5514
5515	if (local64_add_negative(nr, &hwc->period_left))
5516		return;
5517
5518	perf_swevent_overflow(event, 0, data, regs);
5519}
5520
5521static int perf_exclude_event(struct perf_event *event,
5522			      struct pt_regs *regs)
5523{
5524	if (event->hw.state & PERF_HES_STOPPED)
5525		return 1;
5526
5527	if (regs) {
5528		if (event->attr.exclude_user && user_mode(regs))
5529			return 1;
5530
5531		if (event->attr.exclude_kernel && !user_mode(regs))
5532			return 1;
5533	}
5534
5535	return 0;
5536}
5537
5538static int perf_swevent_match(struct perf_event *event,
5539				enum perf_type_id type,
5540				u32 event_id,
5541				struct perf_sample_data *data,
5542				struct pt_regs *regs)
5543{
5544	if (event->attr.type != type)
5545		return 0;
5546
5547	if (event->attr.config != event_id)
5548		return 0;
5549
5550	if (perf_exclude_event(event, regs))
5551		return 0;
5552
5553	return 1;
5554}
5555
5556static inline u64 swevent_hash(u64 type, u32 event_id)
5557{
5558	u64 val = event_id | (type << 32);
5559
5560	return hash_64(val, SWEVENT_HLIST_BITS);
5561}
5562
5563static inline struct hlist_head *
5564__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
5565{
5566	u64 hash = swevent_hash(type, event_id);
5567
5568	return &hlist->heads[hash];
5569}
5570
5571/* For the read side: events when they trigger */
5572static inline struct hlist_head *
5573find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
5574{
5575	struct swevent_hlist *hlist;
5576
5577	hlist = rcu_dereference(swhash->swevent_hlist);
5578	if (!hlist)
5579		return NULL;
5580
5581	return __find_swevent_head(hlist, type, event_id);
5582}
5583
5584/* For the event head insertion and removal in the hlist */
5585static inline struct hlist_head *
5586find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5587{
5588	struct swevent_hlist *hlist;
5589	u32 event_id = event->attr.config;
5590	u64 type = event->attr.type;
5591
5592	/*
5593	 * Event scheduling is always serialized against hlist allocation
5594	 * and release, which makes the protected version suitable here.
5595	 * The context lock guarantees that.
5596	 */
5597	hlist = rcu_dereference_protected(swhash->swevent_hlist,
5598					  lockdep_is_held(&event->ctx->lock));
5599	if (!hlist)
5600		return NULL;
5601
5602	return __find_swevent_head(hlist, type, event_id);
5603}
5604
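/*
 * Look up the hlist bucket for (type, event_id) on this CPU and deliver
 * the hit to every software event hashed there that matches.
 */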
5605static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5606				    u64 nr,
5607				    struct perf_sample_data *data,
5608				    struct pt_regs *regs)
5609{
5610	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5611	struct perf_event *event;
5612	struct hlist_head *head;
5613
5614	rcu_read_lock();
5615	head = find_swevent_head_rcu(swhash, type, event_id);
5616	if (!head)
5617		goto end;
5618
5619	hlist_for_each_entry_rcu(event, head, hlist_entry) {
5620		if (perf_swevent_match(event, type, event_id, data, regs))
5621			perf_swevent_event(event, nr, data, regs);
5622	}
5623end:
5624	rcu_read_unlock();
5625}
5626
5627int perf_swevent_get_recursion_context(void)
5628{
5629	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5630
5631	return get_recursion_context(swhash->recursion);
5632}
5633EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
5634
5635inline void perf_swevent_put_recursion_context(int rctx)
5636{
5637	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5638
5639	put_recursion_context(swhash->recursion, rctx);
5640}
5641
5642void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5643{
5644	struct perf_sample_data data;
5645	int rctx;
5646
5647	preempt_disable_notrace();
5648	rctx = perf_swevent_get_recursion_context();
5649	if (rctx < 0)
5650		return;
5651
5652	perf_sample_data_init(&data, addr, 0);
5653
5654	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5655
5656	perf_swevent_put_recursion_context(rctx);
5657	preempt_enable_notrace();
5658}
5659
5660static void perf_swevent_read(struct perf_event *event)
5661{
5662}
5663
5664static int perf_swevent_add(struct perf_event *event, int flags)
5665{
5666	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5667	struct hw_perf_event *hwc = &event->hw;
5668	struct hlist_head *head;
5669
5670	if (is_sampling_event(event)) {
5671		hwc->last_period = hwc->sample_period;
5672		perf_swevent_set_period(event);
5673	}
5674
5675	hwc->state = !(flags & PERF_EF_START);
5676
5677	head = find_swevent_head(swhash, event);
5678	if (!head) {
5679		/*
5680		 * We can race with cpu hotplug code. Do not
5681		 * WARN if the cpu just got unplugged.
5682		 */
5683		WARN_ON_ONCE(swhash->online);
5684		return -EINVAL;
5685	}
5686
5687	hlist_add_head_rcu(&event->hlist_entry, head);
5688
5689	return 0;
5690}
5691
5692static void perf_swevent_del(struct perf_event *event, int flags)
5693{
5694	hlist_del_rcu(&event->hlist_entry);
5695}
5696
5697static void perf_swevent_start(struct perf_event *event, int flags)
5698{
5699	event->hw.state = 0;
5700}
5701
5702static void perf_swevent_stop(struct perf_event *event, int flags)
5703{
5704	event->hw.state = PERF_HES_STOPPED;
5705}
5706
5707/* Deref the hlist from the update side */
5708static inline struct swevent_hlist *
5709swevent_hlist_deref(struct swevent_htable *swhash)
5710{
5711	return rcu_dereference_protected(swhash->swevent_hlist,
5712					 lockdep_is_held(&swhash->hlist_mutex));
5713}
5714
5715static void swevent_hlist_release(struct swevent_htable *swhash)
5716{
5717	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
5718
5719	if (!hlist)
5720		return;
5721
5722	rcu_assign_pointer(swhash->swevent_hlist, NULL);
5723	kfree_rcu(hlist, rcu_head);
5724}
5725
5726static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5727{
5728	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5729
5730	mutex_lock(&swhash->hlist_mutex);
5731
5732	if (!--swhash->hlist_refcount)
5733		swevent_hlist_release(swhash);
5734
5735	mutex_unlock(&swhash->hlist_mutex);
5736}
5737
5738static void swevent_hlist_put(struct perf_event *event)
5739{
5740	int cpu;
5741
5742	for_each_possible_cpu(cpu)
5743		swevent_hlist_put_cpu(event, cpu);
5744}
5745
5746static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5747{
5748	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5749	int err = 0;
5750
5751	mutex_lock(&swhash->hlist_mutex);
5752
5753	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
5754		struct swevent_hlist *hlist;
5755
5756		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5757		if (!hlist) {
5758			err = -ENOMEM;
5759			goto exit;
5760		}
5761		rcu_assign_pointer(swhash->swevent_hlist, hlist);
5762	}
5763	swhash->hlist_refcount++;
5764exit:
5765	mutex_unlock(&swhash->hlist_mutex);
5766
5767	return err;
5768}
5769
5770static int swevent_hlist_get(struct perf_event *event)
5771{
5772	int err;
5773	int cpu, failed_cpu;
5774
5775	get_online_cpus();
5776	for_each_possible_cpu(cpu) {
5777		err = swevent_hlist_get_cpu(event, cpu);
5778		if (err) {
5779			failed_cpu = cpu;
5780			goto fail;
5781		}
5782	}
5783	put_online_cpus();
5784
5785	return 0;
5786fail:
5787	for_each_possible_cpu(cpu) {
5788		if (cpu == failed_cpu)
5789			break;
5790		swevent_hlist_put_cpu(event, cpu);
5791	}
5792
5793	put_online_cpus();
5794	return err;
5795}
5796
5797struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5798
5799static void sw_perf_event_destroy(struct perf_event *event)
5800{
5801	u64 event_id = event->attr.config;
5802
5803	WARN_ON(event->parent);
5804
5805	static_key_slow_dec(&perf_swevent_enabled[event_id]);
5806	swevent_hlist_put(event);
5807}
5808
5809static int perf_swevent_init(struct perf_event *event)
5810{
5811	u64 event_id = event->attr.config;
5812
5813	if (event->attr.type != PERF_TYPE_SOFTWARE)
5814		return -ENOENT;
5815
5816	/*
5817	 * no branch sampling for software events
5818	 */
5819	if (has_branch_stack(event))
5820		return -EOPNOTSUPP;
5821
5822	switch (event_id) {
5823	case PERF_COUNT_SW_CPU_CLOCK:
5824	case PERF_COUNT_SW_TASK_CLOCK:
5825		return -ENOENT;
5826
5827	default:
5828		break;
5829	}
5830
5831	if (event_id >= PERF_COUNT_SW_MAX)
5832		return -ENOENT;
5833
5834	if (!event->parent) {
5835		int err;
5836
5837		err = swevent_hlist_get(event);
5838		if (err)
5839			return err;
5840
5841		static_key_slow_inc(&perf_swevent_enabled[event_id]);
5842		event->destroy = sw_perf_event_destroy;
5843	}
5844
5845	return 0;
5846}
5847
5848static int perf_swevent_event_idx(struct perf_event *event)
5849{
5850	return 0;
5851}
5852
5853static struct pmu perf_swevent = {
5854	.task_ctx_nr	= perf_sw_context,
5855
5856	.event_init	= perf_swevent_init,
5857	.add		= perf_swevent_add,
5858	.del		= perf_swevent_del,
5859	.start		= perf_swevent_start,
5860	.stop		= perf_swevent_stop,
5861	.read		= perf_swevent_read,
5862
5863	.event_idx	= perf_swevent_event_idx,
5864};
5865
5866#ifdef CONFIG_EVENT_TRACING
5867
5868static int perf_tp_filter_match(struct perf_event *event,
5869				struct perf_sample_data *data)
5870{
5871	void *record = data->raw->data;
5872
5873	if (likely(!event->filter) || filter_match_preds(event->filter, record))
5874		return 1;
5875	return 0;
5876}
5877
5878static int perf_tp_event_match(struct perf_event *event,
5879				struct perf_sample_data *data,
5880				struct pt_regs *regs)
5881{
5882	if (event->hw.state & PERF_HES_STOPPED)
5883		return 0;
5884	/*
5885	 * All tracepoints are from kernel-space.
5886	 */
5887	if (event->attr.exclude_kernel)
5888		return 0;
5889
5890	if (!perf_tp_filter_match(event, data))
5891		return 0;
5892
5893	return 1;
5894}
5895
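/*
 * Tracepoint hit: wrap @record in a raw sample and feed it to every
 * tracepoint event hashed on @head, and optionally to matching events
 * in @task's software context.
 */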
5896void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5897		   struct pt_regs *regs, struct hlist_head *head, int rctx,
5898		   struct task_struct *task)
5899{
5900	struct perf_sample_data data;
5901	struct perf_event *event;
5902
5903	struct perf_raw_record raw = {
5904		.size = entry_size,
5905		.data = record,
5906	};
5907
5908	perf_sample_data_init(&data, addr, 0);
5909	data.raw = &raw;
5910
5911	hlist_for_each_entry_rcu(event, head, hlist_entry) {
5912		if (perf_tp_event_match(event, &data, regs))
5913			perf_swevent_event(event, count, &data, regs);
5914	}
5915
5916	/*
5917	 * If we were given a target task, also iterate its context and
5918	 * deliver this event there too.
5919	 */
5920	if (task && task != current) {
5921		struct perf_event_context *ctx;
5922		struct trace_entry *entry = record;
5923
5924		rcu_read_lock();
5925		ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5926		if (!ctx)
5927			goto unlock;
5928
5929		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5930			if (event->attr.type != PERF_TYPE_TRACEPOINT)
5931				continue;
5932			if (event->attr.config != entry->type)
5933				continue;
5934			if (perf_tp_event_match(event, &data, regs))
5935				perf_swevent_event(event, count, &data, regs);
5936		}
5937unlock:
5938		rcu_read_unlock();
5939	}
5940
5941	perf_swevent_put_recursion_context(rctx);
5942}
5943EXPORT_SYMBOL_GPL(perf_tp_event);
5944
5945static void tp_perf_event_destroy(struct perf_event *event)
5946{
5947	perf_trace_destroy(event);
5948}
5949
5950static int perf_tp_event_init(struct perf_event *event)
5951{
5952	int err;
5953
5954	if (event->attr.type != PERF_TYPE_TRACEPOINT)
5955		return -ENOENT;
5956
5957	/*
5958	 * no branch sampling for tracepoint events
5959	 */
5960	if (has_branch_stack(event))
5961		return -EOPNOTSUPP;
5962
5963	err = perf_trace_init(event);
5964	if (err)
5965		return err;
5966
5967	event->destroy = tp_perf_event_destroy;
5968
5969	return 0;
5970}
5971
5972static struct pmu perf_tracepoint = {
5973	.task_ctx_nr	= perf_sw_context,
5974
5975	.event_init	= perf_tp_event_init,
5976	.add		= perf_trace_add,
5977	.del		= perf_trace_del,
5978	.start		= perf_swevent_start,
5979	.stop		= perf_swevent_stop,
5980	.read		= perf_swevent_read,
5981
5982	.event_idx	= perf_swevent_event_idx,
5983};
5984
5985static inline void perf_tp_register(void)
5986{
5987	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
5988}
5989
5990static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5991{
5992	char *filter_str;
5993	int ret;
5994
5995	if (event->attr.type != PERF_TYPE_TRACEPOINT)
5996		return -EINVAL;
5997
5998	filter_str = strndup_user(arg, PAGE_SIZE);
5999	if (IS_ERR(filter_str))
6000		return PTR_ERR(filter_str);
6001
6002	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6003
6004	kfree(filter_str);
6005	return ret;
6006}
6007
6008static void perf_event_free_filter(struct perf_event *event)
6009{
6010	ftrace_profile_free_filter(event);
6011}
6012
6013#else
6014
6015static inline void perf_tp_register(void)
6016{
6017}
6018
6019static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6020{
6021	return -ENOENT;
6022}
6023
6024static void perf_event_free_filter(struct perf_event *event)
6025{
6026}
6027
6028#endif /* CONFIG_EVENT_TRACING */
6029
6030#ifdef CONFIG_HAVE_HW_BREAKPOINT
6031void perf_bp_event(struct perf_event *bp, void *data)
6032{
6033	struct perf_sample_data sample;
6034	struct pt_regs *regs = data;
6035
6036	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
6037
6038	if (!bp->hw.state && !perf_exclude_event(bp, regs))
6039		perf_swevent_event(bp, 1, &sample, regs);
6040}
6041#endif
6042
6043/*
6044 * hrtimer based swevent callback
6045 */
6046
6047static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
6048{
6049	enum hrtimer_restart ret = HRTIMER_RESTART;
6050	struct perf_sample_data data;
6051	struct pt_regs *regs;
6052	struct perf_event *event;
6053	u64 period;
6054
6055	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
6056
6057	if (event->state != PERF_EVENT_STATE_ACTIVE)
6058		return HRTIMER_NORESTART;
6059
6060	event->pmu->read(event);
6061
6062	perf_sample_data_init(&data, 0, event->hw.last_period);
6063	regs = get_irq_regs();
6064
6065	if (regs && !perf_exclude_event(event, regs)) {
6066		if (!(event->attr.exclude_idle && is_idle_task(current)))
6067			if (__perf_event_overflow(event, 1, &data, regs))
6068				ret = HRTIMER_NORESTART;
6069	}
6070
6071	period = max_t(u64, 10000, event->hw.sample_period);
6072	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
6073
6074	return ret;
6075}
6076
6077static void perf_swevent_start_hrtimer(struct perf_event *event)
6078{
6079	struct hw_perf_event *hwc = &event->hw;
6080	s64 period;
6081
6082	if (!is_sampling_event(event))
6083		return;
6084
6085	period = local64_read(&hwc->period_left);
6086	if (period) {
6087		if (period < 0)
6088			period = 10000;
6089
6090		local64_set(&hwc->period_left, 0);
6091	} else {
6092		period = max_t(u64, 10000, hwc->sample_period);
6093	}
6094	__hrtimer_start_range_ns(&hwc->hrtimer,
6095				ns_to_ktime(period), 0,
6096				HRTIMER_MODE_REL_PINNED, 0);
6097}
6098
6099static void perf_swevent_cancel_hrtimer(struct perf_event *event)
6100{
6101	struct hw_perf_event *hwc = &event->hw;
6102
6103	if (is_sampling_event(event)) {
6104		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
6105		local64_set(&hwc->period_left, ktime_to_ns(remaining));
6106
6107		hrtimer_cancel(&hwc->hrtimer);
6108	}
6109}
6110
6111static void perf_swevent_init_hrtimer(struct perf_event *event)
6112{
6113	struct hw_perf_event *hwc = &event->hw;
6114
6115	if (!is_sampling_event(event))
6116		return;
6117
6118	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6119	hwc->hrtimer.function = perf_swevent_hrtimer;
6120
6121	/*
6122	 * Since hrtimers have a fixed rate, we can do a static freq->period
6123	 * mapping and avoid the whole period adjust feedback stuff.
6124	 */
6125	if (event->attr.freq) {
6126		long freq = event->attr.sample_freq;
6127
6128		event->attr.sample_period = NSEC_PER_SEC / freq;
6129		hwc->sample_period = event->attr.sample_period;
6130		local64_set(&hwc->period_left, hwc->sample_period);
6131		hwc->last_period = hwc->sample_period;
6132		event->attr.freq = 0;
6133	}
6134}
6135
6136/*
6137 * Software event: cpu wall time clock
6138 */
6139
6140static void cpu_clock_event_update(struct perf_event *event)
6141{
6142	s64 prev;
6143	u64 now;
6144
6145	now = local_clock();
6146	prev = local64_xchg(&event->hw.prev_count, now);
6147	local64_add(now - prev, &event->count);
6148}
6149
6150static void cpu_clock_event_start(struct perf_event *event, int flags)
6151{
6152	local64_set(&event->hw.prev_count, local_clock());
6153	perf_swevent_start_hrtimer(event);
6154}
6155
6156static void cpu_clock_event_stop(struct perf_event *event, int flags)
6157{
6158	perf_swevent_cancel_hrtimer(event);
6159	cpu_clock_event_update(event);
6160}
6161
6162static int cpu_clock_event_add(struct perf_event *event, int flags)
6163{
6164	if (flags & PERF_EF_START)
6165		cpu_clock_event_start(event, flags);
6166
6167	return 0;
6168}
6169
6170static void cpu_clock_event_del(struct perf_event *event, int flags)
6171{
6172	cpu_clock_event_stop(event, flags);
6173}
6174
6175static void cpu_clock_event_read(struct perf_event *event)
6176{
6177	cpu_clock_event_update(event);
6178}
6179
6180static int cpu_clock_event_init(struct perf_event *event)
6181{
6182	if (event->attr.type != PERF_TYPE_SOFTWARE)
6183		return -ENOENT;
6184
6185	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
6186		return -ENOENT;
6187
6188	/*
6189	 * no branch sampling for software events
6190	 */
6191	if (has_branch_stack(event))
6192		return -EOPNOTSUPP;
6193
6194	perf_swevent_init_hrtimer(event);
6195
6196	return 0;
6197}
6198
6199static struct pmu perf_cpu_clock = {
6200	.task_ctx_nr	= perf_sw_context,
6201
6202	.event_init	= cpu_clock_event_init,
6203	.add		= cpu_clock_event_add,
6204	.del		= cpu_clock_event_del,
6205	.start		= cpu_clock_event_start,
6206	.stop		= cpu_clock_event_stop,
6207	.read		= cpu_clock_event_read,
6208
6209	.event_idx	= perf_swevent_event_idx,
6210};
6211
6212/*
6213 * Software event: task time clock
6214 */
6215
6216static void task_clock_event_update(struct perf_event *event, u64 now)
6217{
6218	u64 prev;
6219	s64 delta;
6220
6221	prev = local64_xchg(&event->hw.prev_count, now);
6222	delta = now - prev;
6223	local64_add(delta, &event->count);
6224}
6225
6226static void task_clock_event_start(struct perf_event *event, int flags)
6227{
6228	local64_set(&event->hw.prev_count, event->ctx->time);
6229	perf_swevent_start_hrtimer(event);
6230}
6231
6232static void task_clock_event_stop(struct perf_event *event, int flags)
6233{
6234	perf_swevent_cancel_hrtimer(event);
6235	task_clock_event_update(event, event->ctx->time);
6236}
6237
6238static int task_clock_event_add(struct perf_event *event, int flags)
6239{
6240	if (flags & PERF_EF_START)
6241		task_clock_event_start(event, flags);
6242
6243	return 0;
6244}
6245
6246static void task_clock_event_del(struct perf_event *event, int flags)
6247{
6248	task_clock_event_stop(event, PERF_EF_UPDATE);
6249}
6250
6251static void task_clock_event_read(struct perf_event *event)
6252{
6253	u64 now = perf_clock();
6254	u64 delta = now - event->ctx->timestamp;
6255	u64 time = event->ctx->time + delta;
6256
6257	task_clock_event_update(event, time);
6258}
6259
6260static int task_clock_event_init(struct perf_event *event)
6261{
6262	if (event->attr.type != PERF_TYPE_SOFTWARE)
6263		return -ENOENT;
6264
6265	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
6266		return -ENOENT;
6267
6268	/*
6269	 * no branch sampling for software events
6270	 */
6271	if (has_branch_stack(event))
6272		return -EOPNOTSUPP;
6273
6274	perf_swevent_init_hrtimer(event);
6275
6276	return 0;
6277}
6278
6279static struct pmu perf_task_clock = {
6280	.task_ctx_nr	= perf_sw_context,
6281
6282	.event_init	= task_clock_event_init,
6283	.add		= task_clock_event_add,
6284	.del		= task_clock_event_del,
6285	.start		= task_clock_event_start,
6286	.stop		= task_clock_event_stop,
6287	.read		= task_clock_event_read,
6288
6289	.event_idx	= perf_swevent_event_idx,
6290};
6291
6292static void perf_pmu_nop_void(struct pmu *pmu)
6293{
6294}
6295
6296static int perf_pmu_nop_int(struct pmu *pmu)
6297{
6298	return 0;
6299}
6300
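/*
 * Default transaction helpers for PMUs that only provide pmu_enable /
 * pmu_disable: bracket group scheduling in a disable/enable pair so the
 * hardware accesses can be batched.
 */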
6301static void perf_pmu_start_txn(struct pmu *pmu)
6302{
6303	perf_pmu_disable(pmu);
6304}
6305
6306static int perf_pmu_commit_txn(struct pmu *pmu)
6307{
6308	perf_pmu_enable(pmu);
6309	return 0;
6310}
6311
6312static void perf_pmu_cancel_txn(struct pmu *pmu)
6313{
6314	perf_pmu_enable(pmu);
6315}
6316
6317static int perf_event_idx_default(struct perf_event *event)
6318{
6319	return event->hw.idx + 1;
6320}
6321
6322/*
6323 * Ensures all contexts with the same task_ctx_nr have the same
6324 * pmu_cpu_context too.
6325 */
6326static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
6327{
6328	struct pmu *pmu;
6329
6330	if (ctxn < 0)
6331		return NULL;
6332
6333	list_for_each_entry(pmu, &pmus, entry) {
6334		if (pmu->task_ctx_nr == ctxn)
6335			return pmu->pmu_cpu_context;
6336	}
6337
6338	return NULL;
6339}
6340
6341static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
6342{
6343	int cpu;
6344
6345	for_each_possible_cpu(cpu) {
6346		struct perf_cpu_context *cpuctx;
6347
6348		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6349
6350		if (cpuctx->unique_pmu == old_pmu)
6351			cpuctx->unique_pmu = pmu;
6352	}
6353}
6354
6355static void free_pmu_context(struct pmu *pmu)
6356{
6357	struct pmu *i;
6358
6359	mutex_lock(&pmus_lock);
6360	/*
6361	 * Like a real lame refcount.
6362	 */
6363	list_for_each_entry(i, &pmus, entry) {
6364		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
6365			update_pmu_context(i, pmu);
6366			goto out;
6367		}
6368	}
6369
6370	free_percpu(pmu->pmu_cpu_context);
6371out:
6372	mutex_unlock(&pmus_lock);
6373}
6374static struct idr pmu_idr;
6375
6376static ssize_t
6377type_show(struct device *dev, struct device_attribute *attr, char *page)
6378{
6379	struct pmu *pmu = dev_get_drvdata(dev);
6380
6381	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6382}
6383static DEVICE_ATTR_RO(type);
6384
6385static ssize_t
6386perf_event_mux_interval_ms_show(struct device *dev,
6387				struct device_attribute *attr,
6388				char *page)
6389{
6390	struct pmu *pmu = dev_get_drvdata(dev);
6391
6392	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
6393}
6394
6395static ssize_t
6396perf_event_mux_interval_ms_store(struct device *dev,
6397				 struct device_attribute *attr,
6398				 const char *buf, size_t count)
6399{
6400	struct pmu *pmu = dev_get_drvdata(dev);
6401	int timer, cpu, ret;
6402
6403	ret = kstrtoint(buf, 0, &timer);
6404	if (ret)
6405		return ret;
6406
6407	if (timer < 1)
6408		return -EINVAL;
6409
6410	/* same value, nothing to do */
6411	if (timer == pmu->hrtimer_interval_ms)
6412		return count;
6413
6414	pmu->hrtimer_interval_ms = timer;
6415
6416	/* update all cpuctx for this PMU */
6417	for_each_possible_cpu(cpu) {
6418		struct perf_cpu_context *cpuctx;
6419		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6420		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
6421
6422		if (hrtimer_active(&cpuctx->hrtimer))
6423			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
6424	}
6425
6426	return count;
6427}
6428static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6429
6430static struct attribute *pmu_dev_attrs[] = {
6431	&dev_attr_type.attr,
6432	&dev_attr_perf_event_mux_interval_ms.attr,
6433	NULL,
6434};
6435ATTRIBUTE_GROUPS(pmu_dev);
6436
6437static int pmu_bus_running;
6438static struct bus_type pmu_bus = {
6439	.name		= "event_source",
6440	.dev_groups	= pmu_dev_groups,
6441};
6442
6443static void pmu_dev_release(struct device *dev)
6444{
6445	kfree(dev);
6446}
6447
6448static int pmu_dev_alloc(struct pmu *pmu)
6449{
6450	int ret = -ENOMEM;
6451
6452	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
6453	if (!pmu->dev)
6454		goto out;
6455
6456	pmu->dev->groups = pmu->attr_groups;
6457	device_initialize(pmu->dev);
6458	ret = dev_set_name(pmu->dev, "%s", pmu->name);
6459	if (ret)
6460		goto free_dev;
6461
6462	dev_set_drvdata(pmu->dev, pmu);
6463	pmu->dev->bus = &pmu_bus;
6464	pmu->dev->release = pmu_dev_release;
6465	ret = device_add(pmu->dev);
6466	if (ret)
6467		goto free_dev;
6468
6469out:
6470	return ret;
6471
6472free_dev:
6473	put_device(pmu->dev);
6474	goto out;
6475}
6476
6477static struct lock_class_key cpuctx_mutex;
6478static struct lock_class_key cpuctx_lock;
6479
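/*
 * Register a new PMU: allocate its per-cpu bookkeeping, assign a dynamic
 * type via the idr when the caller passed a negative one, add the sysfs
 * device once the bus is up, share or allocate the per-cpu context and
 * fill in nop defaults for any callbacks the PMU left out.
 */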
6480int perf_pmu_register(struct pmu *pmu, const char *name, int type)
6481{
6482	int cpu, ret;
6483
6484	mutex_lock(&pmus_lock);
6485	ret = -ENOMEM;
6486	pmu->pmu_disable_count = alloc_percpu(int);
6487	if (!pmu->pmu_disable_count)
6488		goto unlock;
6489
6490	pmu->type = -1;
6491	if (!name)
6492		goto skip_type;
6493	pmu->name = name;
6494
6495	if (type < 0) {
6496		type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
6497		if (type < 0) {
6498			ret = type;
6499			goto free_pdc;
6500		}
6501	}
6502	pmu->type = type;
6503
6504	if (pmu_bus_running) {
6505		ret = pmu_dev_alloc(pmu);
6506		if (ret)
6507			goto free_idr;
6508	}
6509
6510skip_type:
6511	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6512	if (pmu->pmu_cpu_context)
6513		goto got_cpu_context;
6514
6515	ret = -ENOMEM;
6516	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6517	if (!pmu->pmu_cpu_context)
6518		goto free_dev;
6519
6520	for_each_possible_cpu(cpu) {
6521		struct perf_cpu_context *cpuctx;
6522
6523		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6524		__perf_event_init_context(&cpuctx->ctx);
6525		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6526		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6527		cpuctx->ctx.type = cpu_context;
6528		cpuctx->ctx.pmu = pmu;
6529
6530		__perf_cpu_hrtimer_init(cpuctx, cpu);
6531
6532		INIT_LIST_HEAD(&cpuctx->rotation_list);
6533		cpuctx->unique_pmu = pmu;
6534	}
6535
6536got_cpu_context:
6537	if (!pmu->start_txn) {
6538		if (pmu->pmu_enable) {
6539			/*
6540			 * If we have pmu_enable/pmu_disable calls, install
6541			 * transaction stubs that use that to try and batch
6542			 * hardware accesses.
6543			 */
6544			pmu->start_txn  = perf_pmu_start_txn;
6545			pmu->commit_txn = perf_pmu_commit_txn;
6546			pmu->cancel_txn = perf_pmu_cancel_txn;
6547		} else {
6548			pmu->start_txn  = perf_pmu_nop_void;
6549			pmu->commit_txn = perf_pmu_nop_int;
6550			pmu->cancel_txn = perf_pmu_nop_void;
6551		}
6552	}
6553
6554	if (!pmu->pmu_enable) {
6555		pmu->pmu_enable  = perf_pmu_nop_void;
6556		pmu->pmu_disable = perf_pmu_nop_void;
6557	}
6558
6559	if (!pmu->event_idx)
6560		pmu->event_idx = perf_event_idx_default;
6561
6562	list_add_rcu(&pmu->entry, &pmus);
6563	ret = 0;
6564unlock:
6565	mutex_unlock(&pmus_lock);
6566
6567	return ret;
6568
6569free_dev:
6570	device_del(pmu->dev);
6571	put_device(pmu->dev);
6572
6573free_idr:
6574	if (pmu->type >= PERF_TYPE_MAX)
6575		idr_remove(&pmu_idr, pmu->type);
6576
6577free_pdc:
6578	free_percpu(pmu->pmu_disable_count);
6579	goto unlock;
6580}
6581
6582void perf_pmu_unregister(struct pmu *pmu)
6583{
6584	mutex_lock(&pmus_lock);
6585	list_del_rcu(&pmu->entry);
6586	mutex_unlock(&pmus_lock);
6587
6588	/*
6589	 * We dereference the pmu list under both SRCU and regular RCU, so
6590	 * synchronize against both of those.
6591	 */
6592	synchronize_srcu(&pmus_srcu);
6593	synchronize_rcu();
6594
6595	free_percpu(pmu->pmu_disable_count);
6596	if (pmu->type >= PERF_TYPE_MAX)
6597		idr_remove(&pmu_idr, pmu->type);
6598	device_del(pmu->dev);
6599	put_device(pmu->dev);
6600	free_pmu_context(pmu);
6601}
6602
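/*
 * Find the PMU for a new event: try the type idr first, then ask each
 * registered PMU in turn until one accepts (-ENOENT means "not mine").
 */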
6603struct pmu *perf_init_event(struct perf_event *event)
6604{
6605	struct pmu *pmu = NULL;
6606	int idx;
6607	int ret;
6608
6609	idx = srcu_read_lock(&pmus_srcu);
6610
6611	rcu_read_lock();
6612	pmu = idr_find(&pmu_idr, event->attr.type);
6613	rcu_read_unlock();
6614	if (pmu) {
6615		event->pmu = pmu;
6616		ret = pmu->event_init(event);
6617		if (ret)
6618			pmu = ERR_PTR(ret);
6619		goto unlock;
6620	}
6621
6622	list_for_each_entry_rcu(pmu, &pmus, entry) {
6623		event->pmu = pmu;
6624		ret = pmu->event_init(event);
6625		if (!ret)
6626			goto unlock;
6627
6628		if (ret != -ENOENT) {
6629			pmu = ERR_PTR(ret);
6630			goto unlock;
6631		}
6632	}
6633	pmu = ERR_PTR(-ENOENT);
6634unlock:
6635	srcu_read_unlock(&pmus_srcu, idx);
6636
6637	return pmu;
6638}
6639
6640static void account_event_cpu(struct perf_event *event, int cpu)
6641{
6642	if (event->parent)
6643		return;
6644
6645	if (has_branch_stack(event)) {
6646		if (!(event->attach_state & PERF_ATTACH_TASK))
6647			atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
6648	}
6649	if (is_cgroup_event(event))
6650		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
6651}
6652
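/*
 * Bump the global counters, static keys and per-cpu counters that gate
 * the side-band records and scheduling hooks this event needs.
 */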
6653static void account_event(struct perf_event *event)
6654{
6655	if (event->parent)
6656		return;
6657
6658	if (event->attach_state & PERF_ATTACH_TASK)
6659		static_key_slow_inc(&perf_sched_events.key);
6660	if (event->attr.mmap || event->attr.mmap_data)
6661		atomic_inc(&nr_mmap_events);
6662	if (event->attr.comm)
6663		atomic_inc(&nr_comm_events);
6664	if (event->attr.task)
6665		atomic_inc(&nr_task_events);
6666	if (event->attr.freq) {
6667		if (atomic_inc_return(&nr_freq_events) == 1)
6668			tick_nohz_full_kick_all();
6669	}
6670	if (has_branch_stack(event))
6671		static_key_slow_inc(&perf_sched_events.key);
6672	if (is_cgroup_event(event))
6673		static_key_slow_inc(&perf_sched_events.key);
6674
6675	account_event_cpu(event, event->cpu);
6676}
6677
6678/*
6679 * Allocate and initialize an event structure
6680 */
6681static struct perf_event *
6682perf_event_alloc(struct perf_event_attr *attr, int cpu,
6683		 struct task_struct *task,
6684		 struct perf_event *group_leader,
6685		 struct perf_event *parent_event,
6686		 perf_overflow_handler_t overflow_handler,
6687		 void *context)
6688{
6689	struct pmu *pmu;
6690	struct perf_event *event;
6691	struct hw_perf_event *hwc;
6692	long err = -EINVAL;
6693
6694	if ((unsigned)cpu >= nr_cpu_ids) {
6695		if (!task || cpu != -1)
6696			return ERR_PTR(-EINVAL);
6697	}
6698
6699	event = kzalloc(sizeof(*event), GFP_KERNEL);
6700	if (!event)
6701		return ERR_PTR(-ENOMEM);
6702
6703	/*
6704	 * Single events are their own group leaders, with an
6705	 * empty sibling list:
6706	 */
6707	if (!group_leader)
6708		group_leader = event;
6709
6710	mutex_init(&event->child_mutex);
6711	INIT_LIST_HEAD(&event->child_list);
6712
6713	INIT_LIST_HEAD(&event->group_entry);
6714	INIT_LIST_HEAD(&event->event_entry);
6715	INIT_LIST_HEAD(&event->sibling_list);
6716	INIT_LIST_HEAD(&event->rb_entry);
6717	INIT_LIST_HEAD(&event->active_entry);
6718	INIT_HLIST_NODE(&event->hlist_entry);
6719
6720
6721	init_waitqueue_head(&event->waitq);
6722	init_irq_work(&event->pending, perf_pending_event);
6723
6724	mutex_init(&event->mmap_mutex);
6725
6726	atomic_long_set(&event->refcount, 1);
6727	event->cpu		= cpu;
6728	event->attr		= *attr;
6729	event->group_leader	= group_leader;
6730	event->pmu		= NULL;
6731	event->oncpu		= -1;
6732
6733	event->parent		= parent_event;
6734
6735	event->ns		= get_pid_ns(task_active_pid_ns(current));
6736	event->id		= atomic64_inc_return(&perf_event_id);
6737
6738	event->state		= PERF_EVENT_STATE_INACTIVE;
6739
6740	if (task) {
6741		event->attach_state = PERF_ATTACH_TASK;
6742
6743		if (attr->type == PERF_TYPE_TRACEPOINT)
6744			event->hw.tp_target = task;
6745#ifdef CONFIG_HAVE_HW_BREAKPOINT
6746		/*
6747		 * hw_breakpoint is a bit difficult here..
6748		 */
6749		else if (attr->type == PERF_TYPE_BREAKPOINT)
6750			event->hw.bp_target = task;
6751#endif
6752	}
6753
6754	if (!overflow_handler && parent_event) {
6755		overflow_handler = parent_event->overflow_handler;
6756		context = parent_event->overflow_handler_context;
6757	}
6758
6759	event->overflow_handler	= overflow_handler;
6760	event->overflow_handler_context = context;
6761
6762	perf_event__state_init(event);
6763
6764	pmu = NULL;
6765
6766	hwc = &event->hw;
6767	hwc->sample_period = attr->sample_period;
6768	if (attr->freq && attr->sample_freq)
6769		hwc->sample_period = 1;
6770	hwc->last_period = hwc->sample_period;
6771
6772	local64_set(&hwc->period_left, hwc->sample_period);
6773
6774	/*
6775	 * we currently do not support PERF_FORMAT_GROUP on inherited events
6776	 */
6777	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6778		goto err_ns;
6779
6780	pmu = perf_init_event(event);
6781	if (!pmu)
6782		goto err_ns;
6783	else if (IS_ERR(pmu)) {
6784		err = PTR_ERR(pmu);
6785		goto err_ns;
6786	}
6787
6788	if (!event->parent) {
6789		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6790			err = get_callchain_buffers();
6791			if (err)
6792				goto err_pmu;
6793		}
6794	}
6795
6796	return event;
6797
6798err_pmu:
6799	if (event->destroy)
6800		event->destroy(event);
6801err_ns:
6802	if (event->ns)
6803		put_pid_ns(event->ns);
6804	kfree(event);
6805
6806	return ERR_PTR(err);
6807}
6808
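/*
 * Copy the perf_event_attr from userspace, honouring the size-versioned ABI:
 * older userspace may pass a smaller struct (the remainder is zero-filled
 * here), newer userspace may pass a larger one as long as every byte beyond
 * what this kernel knows about is zero.
 */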
6809static int perf_copy_attr(struct perf_event_attr __user *uattr,
6810			  struct perf_event_attr *attr)
6811{
6812	u32 size;
6813	int ret;
6814
6815	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6816		return -EFAULT;
6817
6818	/*
6819	 * Zero the full structure, so that a short copy leaves the tail zeroed.
6820	 */
6821	memset(attr, 0, sizeof(*attr));
6822
6823	ret = get_user(size, &uattr->size);
6824	if (ret)
6825		return ret;
6826
6827	if (size > PAGE_SIZE)	/* silly large */
6828		goto err_size;
6829
6830	if (!size)		/* abi compat */
6831		size = PERF_ATTR_SIZE_VER0;
6832
6833	if (size < PERF_ATTR_SIZE_VER0)
6834		goto err_size;
6835
6836	/*
6837	 * If we're handed a bigger struct than we know of,
6838	 * ensure all the unknown bits are 0 - i.e. new
6839	 * user-space does not rely on any kernel feature
6840	 * extensions we don't know about yet.
6841	 */
6842	if (size > sizeof(*attr)) {
6843		unsigned char __user *addr;
6844		unsigned char __user *end;
6845		unsigned char val;
6846
6847		addr = (void __user *)uattr + sizeof(*attr);
6848		end  = (void __user *)uattr + size;
6849
6850		for (; addr < end; addr++) {
6851			ret = get_user(val, addr);
6852			if (ret)
6853				return ret;
6854			if (val)
6855				goto err_size;
6856		}
6857		size = sizeof(*attr);
6858	}
6859
6860	ret = copy_from_user(attr, uattr, size);
6861	if (ret)
6862		return -EFAULT;
6863
6864	/* disabled for now */
6865	if (attr->mmap2)
6866		return -EINVAL;
6867
6868	if (attr->__reserved_1)
6869		return -EINVAL;
6870
6871	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6872		return -EINVAL;
6873
6874	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6875		return -EINVAL;
6876
6877	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6878		u64 mask = attr->branch_sample_type;
6879
6880		/* only using defined bits */
6881		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6882			return -EINVAL;
6883
6884		/* at least one branch bit must be set */
6885		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6886			return -EINVAL;
6887
6888		/* propagate priv level, when not set for branch */
6889		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6890
6891			/* exclude_kernel checked on syscall entry */
6892			if (!attr->exclude_kernel)
6893				mask |= PERF_SAMPLE_BRANCH_KERNEL;
6894
6895			if (!attr->exclude_user)
6896				mask |= PERF_SAMPLE_BRANCH_USER;
6897
6898			if (!attr->exclude_hv)
6899				mask |= PERF_SAMPLE_BRANCH_HV;
6900			/*
6901			 * adjust user setting (for HW filter setup)
6902			 */
6903			attr->branch_sample_type = mask;
6904		}
6905		/* privileged levels capture (kernel, hv): check permissions */
6906		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6907		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6908			return -EACCES;
6909	}
6910
6911	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
6912		ret = perf_reg_validate(attr->sample_regs_user);
6913		if (ret)
6914			return ret;
6915	}
6916
6917	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
6918		if (!arch_perf_have_user_stack_dump())
6919			return -ENOSYS;
6920
6921		/*
6922		 * We have __u32 type for the size, but so far
6923		 * we can only use __u16 as maximum due to the
6924		 * __u16 sample size limit.
6925		 */
6926		if (attr->sample_stack_user >= USHRT_MAX)
6927			ret = -EINVAL;
6928		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
6929			ret = -EINVAL;
6930	}
6931
6932out:
6933	return ret;
6934
6935err_size:
6936	put_user(sizeof(*attr), &uattr->size);
6937	ret = -E2BIG;
6938	goto out;
6939}
6940
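/*
 * Redirect @event's output into @output_event's ring buffer (requested via
 * PERF_FLAG_FD_OUTPUT in sys_perf_event_open() below); a NULL @output_event
 * detaches the event from any previously attached buffer instead.
 */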
6941static int
6942perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6943{
6944	struct ring_buffer *rb = NULL;
6945	int ret = -EINVAL;
6946
6947	if (!output_event)
6948		goto set;
6949
6950	/* don't allow circular references */
6951	if (event == output_event)
6952		goto out;
6953
6954	/*
6955	 * Don't allow cross-cpu buffers
6956	 */
6957	if (output_event->cpu != event->cpu)
6958		goto out;
6959
6960	/*
6961	 * If it's not a per-cpu rb, it must be the same task.
6962	 */
6963	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6964		goto out;
6965
6966set:
6967	mutex_lock(&event->mmap_mutex);
6968	/* Can't redirect output if we've got an active mmap() */
6969	if (atomic_read(&event->mmap_count))
6970		goto unlock;
6971
6972	if (output_event) {
6973		/* get the rb we want to redirect to */
6974		rb = ring_buffer_get(output_event);
6975		if (!rb)
6976			goto unlock;
6977	}
6978
6979	ring_buffer_attach(event, rb);
6980
6981	ret = 0;
6982unlock:
6983	mutex_unlock(&event->mmap_mutex);
6984
6985out:
6986	return ret;
6987}
6988
6989/**
6990 * sys_perf_event_open - open a performance event, associate it to a task/cpu
6991 *
6992 * @attr_uptr:	event_id type attributes for monitoring/sampling
6993 * @pid:		target pid
6994 * @cpu:		target cpu
6995 * @group_fd:		group leader event fd
 * @flags:		PERF_FLAG_* modifier flags
6996 */
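/*
 * Illustrative userspace sketch (not part of the original source): count
 * CPU cycles for the calling thread on any CPU, with no group leader:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.size		= sizeof(attr),
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.disabled	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	... run the workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));	(count is a __u64)
 */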
6997SYSCALL_DEFINE5(perf_event_open,
6998		struct perf_event_attr __user *, attr_uptr,
6999		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
7000{
7001	struct perf_event *group_leader = NULL, *output_event = NULL;
7002	struct perf_event *event, *sibling;
7003	struct perf_event_attr attr;
7004	struct perf_event_context *ctx;
7005	struct file *event_file = NULL;
7006	struct fd group = {NULL, 0};
7007	struct task_struct *task = NULL;
7008	struct pmu *pmu;
7009	int event_fd;
7010	int move_group = 0;
7011	int err;
7012	int f_flags = O_RDWR;
7013
7014	/* for future expandability... */
7015	if (flags & ~PERF_FLAG_ALL)
7016		return -EINVAL;
7017
7018	err = perf_copy_attr(attr_uptr, &attr);
7019	if (err)
7020		return err;
7021
7022	if (!attr.exclude_kernel) {
7023		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7024			return -EACCES;
7025	}
7026
7027	if (attr.freq) {
7028		if (attr.sample_freq > sysctl_perf_event_sample_rate)
7029			return -EINVAL;
7030	} else {
7031		if (attr.sample_period & (1ULL << 63))
7032			return -EINVAL;
7033	}
7034
7035	/*
7036	 * In cgroup mode, the pid argument is used to pass the fd
7037	 * opened to the cgroup directory in cgroupfs. The cpu argument
7038	 * designates the cpu on which to monitor threads from that
7039	 * cgroup.
7040	 */
7041	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7042		return -EINVAL;
7043
7044	if (flags & PERF_FLAG_FD_CLOEXEC)
7045		f_flags |= O_CLOEXEC;
7046
7047	event_fd = get_unused_fd_flags(f_flags);
7048	if (event_fd < 0)
7049		return event_fd;
7050
7051	if (group_fd != -1) {
7052		err = perf_fget_light(group_fd, &group);
7053		if (err)
7054			goto err_fd;
7055		group_leader = group.file->private_data;
7056		if (flags & PERF_FLAG_FD_OUTPUT)
7057			output_event = group_leader;
7058		if (flags & PERF_FLAG_FD_NO_GROUP)
7059			group_leader = NULL;
7060	}
7061
7062	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
7063		task = find_lively_task_by_vpid(pid);
7064		if (IS_ERR(task)) {
7065			err = PTR_ERR(task);
7066			goto err_group_fd;
7067		}
7068	}
7069
7070	get_online_cpus();
7071
7072	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7073				 NULL, NULL);
7074	if (IS_ERR(event)) {
7075		err = PTR_ERR(event);
7076		goto err_task;
7077	}
7078
7079	if (flags & PERF_FLAG_PID_CGROUP) {
7080		err = perf_cgroup_connect(pid, event, &attr, group_leader);
7081		if (err) {
7082			__free_event(event);
7083			goto err_task;
7084		}
7085	}
7086
7087	account_event(event);
7088
7089	/*
7090	 * Special case software events and allow them to be part of
7091	 * any hardware group.
7092	 */
7093	pmu = event->pmu;
7094
7095	if (group_leader &&
7096	    (is_software_event(event) != is_software_event(group_leader))) {
7097		if (is_software_event(event)) {
7098			/*
7099			 * If event and group_leader are not both a software
7100			 * event, and event is, then group leader is not.
7101			 *
7102			 * Allow the addition of software events to !software
7103			 * groups, this is safe because software events never
7104			 * fail to schedule.
7105			 */
7106			pmu = group_leader->pmu;
7107		} else if (is_software_event(group_leader) &&
7108			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7109			/*
7110			 * In case the group is a pure software group, and we
7111			 * try to add a hardware event, move the whole group to
7112			 * the hardware context.
7113			 */
7114			move_group = 1;
7115		}
7116	}
7117
7118	/*
7119	 * Get the target context (task or percpu):
7120	 */
7121	ctx = find_get_context(pmu, task, event->cpu);
7122	if (IS_ERR(ctx)) {
7123		err = PTR_ERR(ctx);
7124		goto err_alloc;
7125	}
7126
7127	if (task) {
7128		put_task_struct(task);
7129		task = NULL;
7130	}
7131
7132	/*
7133	 * Look up the group leader (we will attach this event to it):
7134	 */
7135	if (group_leader) {
7136		err = -EINVAL;
7137
7138		/*
7139		 * Do not allow a recursive hierarchy (this new sibling
7140		 * becoming part of another group-sibling):
7141		 */
7142		if (group_leader->group_leader != group_leader)
7143			goto err_context;
7144		/*
7145		 * Do not allow attaching to a group in a different
7146		 * task or CPU context:
7147		 */
7148		if (move_group) {
7149			if (group_leader->ctx->type != ctx->type)
7150				goto err_context;
7151		} else {
7152			if (group_leader->ctx != ctx)
7153				goto err_context;
7154		}
7155
7156		/*
7157		 * Only a group leader can be exclusive or pinned
7158		 */
7159		if (attr.exclusive || attr.pinned)
7160			goto err_context;
7161	}
7162
7163	if (output_event) {
7164		err = perf_event_set_output(event, output_event);
7165		if (err)
7166			goto err_context;
7167	}
7168
7169	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
7170					f_flags);
7171	if (IS_ERR(event_file)) {
7172		err = PTR_ERR(event_file);
7173		goto err_context;
7174	}
7175
7176	if (move_group) {
7177		struct perf_event_context *gctx = group_leader->ctx;
7178
7179		mutex_lock(&gctx->mutex);
7180		perf_remove_from_context(group_leader, false);
7181
7182		/*
7183		 * Removing from the context ends up with a disabled
7184		 * event. What we want here is the event in its initial
7185		 * startup state, ready to be added into the new context.
7186		 */
7187		perf_event__state_init(group_leader);
7188		list_for_each_entry(sibling, &group_leader->sibling_list,
7189				    group_entry) {
7190			perf_remove_from_context(sibling, false);
7191			perf_event__state_init(sibling);
7192			put_ctx(gctx);
7193		}
7194		mutex_unlock(&gctx->mutex);
7195		put_ctx(gctx);
7196	}
7197
7198	WARN_ON_ONCE(ctx->parent_ctx);
7199	mutex_lock(&ctx->mutex);
7200
7201	if (move_group) {
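		/*
		 * Make sure any RCU readers still walking the old (software)
		 * context's lists are done with group_leader and its siblings
		 * before they get installed into the new context.
		 */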
7202		synchronize_rcu();
7203		perf_install_in_context(ctx, group_leader, event->cpu);
7204		get_ctx(ctx);
7205		list_for_each_entry(sibling, &group_leader->sibling_list,
7206				    group_entry) {
7207			perf_install_in_context(ctx, sibling, event->cpu);
7208			get_ctx(ctx);
7209		}
7210	}
7211
7212	perf_install_in_context(ctx, event, event->cpu);
7213	perf_unpin_context(ctx);
7214	mutex_unlock(&ctx->mutex);
7215
7216	put_online_cpus();
7217
7218	event->owner = current;
7219
7220	mutex_lock(&current->perf_event_mutex);
7221	list_add_tail(&event->owner_entry, &current->perf_event_list);
7222	mutex_unlock(&current->perf_event_mutex);
7223
7224	/*
7225	 * Precalculate sample_data sizes
7226	 */
7227	perf_event__header_size(event);
7228	perf_event__id_header_size(event);
7229
7230	/*
7231	 * Drop the reference on the group_event after placing the
7232	 * new event on the sibling_list. This ensures destruction
7233	 * of the group leader will find the pointer to itself in
7234	 * perf_group_detach().
7235	 */
7236	fdput(group);
7237	fd_install(event_fd, event_file);
7238	return event_fd;
7239
7240err_context:
7241	perf_unpin_context(ctx);
7242	put_ctx(ctx);
7243err_alloc:
7244	free_event(event);
7245err_task:
7246	put_online_cpus();
7247	if (task)
7248		put_task_struct(task);
7249err_group_fd:
7250	fdput(group);
7251err_fd:
7252	put_unused_fd(event_fd);
7253	return err;
7254}
7255
7256/**
7257 * perf_event_create_kernel_counter
7258 *
7259 * @attr: attributes of the counter to create
7260 * @cpu: cpu to which the counter is bound
7261 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback invoked when the event overflows (may be NULL)
 * @context: opaque data passed to @overflow_handler
7262 */
7263struct perf_event *
7264perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7265				 struct task_struct *task,
7266				 perf_overflow_handler_t overflow_handler,
7267				 void *context)
7268{
7269	struct perf_event_context *ctx;
7270	struct perf_event *event;
7271	int err;
7272
7273	/*
7274	 * Get the target context (task or percpu):
7275	 */
7276
7277	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
7278				 overflow_handler, context);
7279	if (IS_ERR(event)) {
7280		err = PTR_ERR(event);
7281		goto err;
7282	}
7283
7284	account_event(event);
7285
7286	ctx = find_get_context(event->pmu, task, cpu);
7287	if (IS_ERR(ctx)) {
7288		err = PTR_ERR(ctx);
7289		goto err_free;
7290	}
7291
7292	WARN_ON_ONCE(ctx->parent_ctx);
7293	mutex_lock(&ctx->mutex);
7294	perf_install_in_context(ctx, event, cpu);
7295	perf_unpin_context(ctx);
7296	mutex_unlock(&ctx->mutex);
7297
7298	return event;
7299
7300err_free:
7301	free_event(event);
7302err:
7303	return ERR_PTR(err);
7304}
7305EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
7306
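/*
 * Move every event of @pmu from @src_cpu's per-CPU context to @dst_cpu's:
 * events are detached and collected on a local list under the source
 * context mutex, then re-accounted and re-installed under the destination
 * context mutex.
 */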
7307void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7308{
7309	struct perf_event_context *src_ctx;
7310	struct perf_event_context *dst_ctx;
7311	struct perf_event *event, *tmp;
7312	LIST_HEAD(events);
7313
7314	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
7315	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
7316
7317	mutex_lock(&src_ctx->mutex);
7318	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7319				 event_entry) {
7320		perf_remove_from_context(event, false);
7321		unaccount_event_cpu(event, src_cpu);
7322		put_ctx(src_ctx);
7323		list_add(&event->migrate_entry, &events);
7324	}
7325	mutex_unlock(&src_ctx->mutex);
7326
7327	synchronize_rcu();
7328
7329	mutex_lock(&dst_ctx->mutex);
7330	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7331		list_del(&event->migrate_entry);
7332		if (event->state >= PERF_EVENT_STATE_OFF)
7333			event->state = PERF_EVENT_STATE_INACTIVE;
7334		account_event_cpu(event, dst_cpu);
7335		perf_install_in_context(dst_ctx, event, dst_cpu);
7336		get_ctx(dst_ctx);
7337	}
7338	mutex_unlock(&dst_ctx->mutex);
7339}
7340EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
7341
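/*
 * Fold a dying child event's count and enabled/running times back into its
 * parent event, then unlink it from the parent's child_list.
 */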
7342static void sync_child_event(struct perf_event *child_event,
7343			       struct task_struct *child)
7344{
7345	struct perf_event *parent_event = child_event->parent;
7346	u64 child_val;
7347
7348	if (child_event->attr.inherit_stat)
7349		perf_event_read_event(child_event, child);
7350
7351	child_val = perf_event_count(child_event);
7352
7353	/*
7354	 * Add back the child's count to the parent's count:
7355	 */
7356	atomic64_add(child_val, &parent_event->child_count);
7357	atomic64_add(child_event->total_time_enabled,
7358		     &parent_event->child_total_time_enabled);
7359	atomic64_add(child_event->total_time_running,
7360		     &parent_event->child_total_time_running);
7361
7362	/*
7363	 * Remove this event from the parent's list
7364	 */
7365	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7366	mutex_lock(&parent_event->child_mutex);
7367	list_del_init(&child_event->child_list);
7368	mutex_unlock(&parent_event->child_mutex);
7369
7370	/*
7371	 * Release the parent event, if this was the last
7372	 * reference to it.
7373	 */
7374	put_event(parent_event);
7375}
7376
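/*
 * Detach @child_event from @child_ctx on task exit; if it was inherited
 * from a parent event, sync its counts into the parent and free it.
 */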
7377static void
7378__perf_event_exit_task(struct perf_event *child_event,
7379			 struct perf_event_context *child_ctx,
7380			 struct task_struct *child)
7381{
7382	perf_remove_from_context(child_event, !!child_event->parent);
7383
7384	/*
7385	 * It can happen that the parent exits first, and has events
7386	 * that are still around due to the child reference. These
7387	 * events need to be zapped.
7388	 */
7389	if (child_event->parent) {
7390		sync_child_event(child_event, child);
7391		free_event(child_event);
7392	}
7393}
7394
7395static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7396{
7397	struct perf_event *child_event, *tmp;
7398	struct perf_event_context *child_ctx;
7399	unsigned long flags;
7400
7401	if (likely(!child->perf_event_ctxp[ctxn])) {
7402		perf_event_task(child, NULL, 0);
7403		return;
7404	}
7405
7406	local_irq_save(flags);
7407	/*
7408	 * We can't reschedule here because interrupts are disabled,
7409	 * and either child is current or it is a task that can't be
7410	 * scheduled, so we are now safe from rescheduling changing
7411	 * our context.
7412	 */
7413	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
7414
7415	/*
7416	 * Take the context lock here so that if find_get_context is
7417	 * reading child->perf_event_ctxp, we wait until it has
7418	 * incremented the context's refcount before we do put_ctx below.
7419	 */
7420	raw_spin_lock(&child_ctx->lock);
7421	task_ctx_sched_out(child_ctx);
7422	child->perf_event_ctxp[ctxn] = NULL;
7423	/*
7424	 * If this context is a clone; unclone it so it can't get
7425	 * swapped to another process while we're removing all
7426	 * the events from it.
7427	 */
7428	unclone_ctx(child_ctx);
7429	update_context_time(child_ctx);
7430	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7431
7432	/*
7433	 * Report the task dead after unscheduling the events so that we
7434	 * won't get any samples after PERF_RECORD_EXIT. We can however still
7435	 * get a few PERF_RECORD_READ events.
7436	 */
7437	perf_event_task(child, child_ctx, 0);
7438
7439	/*
7440	 * We can recurse on the same lock type through:
7441	 *
7442	 *   __perf_event_exit_task()
7443	 *     sync_child_event()
7444	 *       put_event()
7445	 *         mutex_lock(&ctx->mutex)
7446	 *
7447	 * But since it's the parent context it won't be the same instance.
7448	 */
7449	mutex_lock(&child_ctx->mutex);
7450
7451again:
7452	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
7453				 group_entry)
7454		__perf_event_exit_task(child_event, child_ctx, child);
7455
7456	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
7457				 group_entry)
7458		__perf_event_exit_task(child_event, child_ctx, child);
7459
7460	/*
7461	 * If the last event was a group event, it will have appended all
7462	 * its siblings to the list, but we obtained 'tmp' before that, so it
7463	 * still points to the list head that terminates the iteration.
7464	 */
7465	if (!list_empty(&child_ctx->pinned_groups) ||
7466	    !list_empty(&child_ctx->flexible_groups))
7467		goto again;
7468
7469	mutex_unlock(&child_ctx->mutex);
7470
7471	put_ctx(child_ctx);
7472}
7473
7474/*
7475 * When a child task exits, feed back event values to parent events.
7476 */
7477void perf_event_exit_task(struct task_struct *child)
7478{
7479	struct perf_event *event, *tmp;
7480	int ctxn;
7481
7482	mutex_lock(&child->perf_event_mutex);
7483	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
7484				 owner_entry) {
7485		list_del_init(&event->owner_entry);
7486
7487		/*
7488		 * Ensure the list deletion is visible before we clear
7489		 * the owner; this closes a race against perf_release(), where
7490		 * we need to serialize on the owner->perf_event_mutex.
7491		 */
7492		smp_wmb();
7493		event->owner = NULL;
7494	}
7495	mutex_unlock(&child->perf_event_mutex);
7496
7497	for_each_task_context_nr(ctxn)
7498		perf_event_exit_task_context(child, ctxn);
7499}
7500
7501static void perf_free_event(struct perf_event *event,
7502			    struct perf_event_context *ctx)
7503{
7504	struct perf_event *parent = event->parent;
7505
7506	if (WARN_ON_ONCE(!parent))
7507		return;
7508
7509	mutex_lock(&parent->child_mutex);
7510	list_del_init(&event->child_list);
7511	mutex_unlock(&parent->child_mutex);
7512
7513	put_event(parent);
7514
7515	perf_group_detach(event);
7516	list_del_event(event, ctx);
7517	free_event(event);
7518}
7519
7520/*
7521 * free an unexposed, unused context as created by inheritance by
7522 * perf_event_init_task() below, used by fork() in case of failure.
7523 */
7524void perf_event_free_task(struct task_struct *task)
7525{
7526	struct perf_event_context *ctx;
7527	struct perf_event *event, *tmp;
7528	int ctxn;
7529
7530	for_each_task_context_nr(ctxn) {
7531		ctx = task->perf_event_ctxp[ctxn];
7532		if (!ctx)
7533			continue;
7534
7535		mutex_lock(&ctx->mutex);
7536again:
7537		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
7538				group_entry)
7539			perf_free_event(event, ctx);
7540
7541		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
7542				group_entry)
7543			perf_free_event(event, ctx);
7544
7545		if (!list_empty(&ctx->pinned_groups) ||
7546				!list_empty(&ctx->flexible_groups))
7547			goto again;
7548
7549		mutex_unlock(&ctx->mutex);
7550
7551		put_ctx(ctx);
7552	}
7553}
7554
7555void perf_event_delayed_put(struct task_struct *task)
7556{
7557	int ctxn;
7558
7559	for_each_task_context_nr(ctxn)
7560		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
7561}
7562
7563/*
7564 * inherit an event from the parent task to the child task:
7565 */
7566static struct perf_event *
7567inherit_event(struct perf_event *parent_event,
7568	      struct task_struct *parent,
7569	      struct perf_event_context *parent_ctx,
7570	      struct task_struct *child,
7571	      struct perf_event *group_leader,
7572	      struct perf_event_context *child_ctx)
7573{
7574	struct perf_event *child_event;
7575	unsigned long flags;
7576
7577	/*
7578	 * Instead of creating recursive hierarchies of events,
7579	 * we link inherited events back to the original parent,
7580	 * which is guaranteed to have a filp and which we use as the
7581	 * reference count:
7582	 */
7583	if (parent_event->parent)
7584		parent_event = parent_event->parent;
7585
7586	child_event = perf_event_alloc(&parent_event->attr,
7587					   parent_event->cpu,
7588					   child,
7589					   group_leader, parent_event,
7590				           NULL, NULL);
7591	if (IS_ERR(child_event))
7592		return child_event;
7593
7594	if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
7595		free_event(child_event);
7596		return NULL;
7597	}
7598
7599	get_ctx(child_ctx);
7600
7601	/*
7602	 * Make the child state follow the state of the parent event,
7603	 * not its attr.disabled bit.  We hold the parent's mutex,
7604	 * so we won't race with perf_event_{en, dis}able_family.
7605	 */
7606	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
7607		child_event->state = PERF_EVENT_STATE_INACTIVE;
7608	else
7609		child_event->state = PERF_EVENT_STATE_OFF;
7610
7611	if (parent_event->attr.freq) {
7612		u64 sample_period = parent_event->hw.sample_period;
7613		struct hw_perf_event *hwc = &child_event->hw;
7614
7615		hwc->sample_period = sample_period;
7616		hwc->last_period   = sample_period;
7617
7618		local64_set(&hwc->period_left, sample_period);
7619	}
7620
7621	child_event->ctx = child_ctx;
7622	child_event->overflow_handler = parent_event->overflow_handler;
7623	child_event->overflow_handler_context
7624		= parent_event->overflow_handler_context;
7625
7626	/*
7627	 * Precalculate sample_data sizes
7628	 */
7629	perf_event__header_size(child_event);
7630	perf_event__id_header_size(child_event);
7631
7632	/*
7633	 * Link it up in the child's context:
7634	 */
7635	raw_spin_lock_irqsave(&child_ctx->lock, flags);
7636	add_event_to_ctx(child_event, child_ctx);
7637	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7638
7639	/*
7640	 * Link this into the parent event's child list
7641	 */
7642	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7643	mutex_lock(&parent_event->child_mutex);
7644	list_add_tail(&child_event->child_list, &parent_event->child_list);
7645	mutex_unlock(&parent_event->child_mutex);
7646
7647	return child_event;
7648}
7649
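/*
 * Inherit a whole group: clone the group leader into the child context
 * first, then clone each sibling underneath the new leader.
 */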
7650static int inherit_group(struct perf_event *parent_event,
7651	      struct task_struct *parent,
7652	      struct perf_event_context *parent_ctx,
7653	      struct task_struct *child,
7654	      struct perf_event_context *child_ctx)
7655{
7656	struct perf_event *leader;
7657	struct perf_event *sub;
7658	struct perf_event *child_ctr;
7659
7660	leader = inherit_event(parent_event, parent, parent_ctx,
7661				 child, NULL, child_ctx);
7662	if (IS_ERR(leader))
7663		return PTR_ERR(leader);
7664	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7665		child_ctr = inherit_event(sub, parent, parent_ctx,
7666					    child, leader, child_ctx);
7667		if (IS_ERR(child_ctr))
7668			return PTR_ERR(child_ctr);
7669	}
7670	return 0;
7671}
7672
7673static int
7674inherit_task_group(struct perf_event *event, struct task_struct *parent,
7675		   struct perf_event_context *parent_ctx,
7676		   struct task_struct *child, int ctxn,
7677		   int *inherited_all)
7678{
7679	int ret;
7680	struct perf_event_context *child_ctx;
7681
7682	if (!event->attr.inherit) {
7683		*inherited_all = 0;
7684		return 0;
7685	}
7686
7687	child_ctx = child->perf_event_ctxp[ctxn];
7688	if (!child_ctx) {
7689		/*
7690		 * This is executed from the parent task context, so
7691		 * inherit events that have been marked for cloning.
7692		 * First allocate and initialize a context for the
7693		 * child.
7694		 */
7695
7696		child_ctx = alloc_perf_context(parent_ctx->pmu, child);
7697		if (!child_ctx)
7698			return -ENOMEM;
7699
7700		child->perf_event_ctxp[ctxn] = child_ctx;
7701	}
7702
7703	ret = inherit_group(event, parent, parent_ctx,
7704			    child, child_ctx);
7705
7706	if (ret)
7707		*inherited_all = 0;
7708
7709	return ret;
7710}
7711
7712/*
7713 * Initialize the perf_event context in task_struct
7714 */
7715int perf_event_init_context(struct task_struct *child, int ctxn)
7716{
7717	struct perf_event_context *child_ctx, *parent_ctx;
7718	struct perf_event_context *cloned_ctx;
7719	struct perf_event *event;
7720	struct task_struct *parent = current;
7721	int inherited_all = 1;
7722	unsigned long flags;
7723	int ret = 0;
7724
7725	if (likely(!parent->perf_event_ctxp[ctxn]))
7726		return 0;
7727
7728	/*
7729	 * If the parent's context is a clone, pin it so it won't get
7730	 * swapped under us.
7731	 */
7732	parent_ctx = perf_pin_task_context(parent, ctxn);
7733	if (!parent_ctx)
7734		return 0;
7735
7736	/*
7737	 * No need to check if parent_ctx != NULL here; since we saw
7738	 * it non-NULL earlier, the only reason for it to become NULL
7739	 * is if we exit, and since we're currently in the middle of
7740	 * a fork we can't be exiting at the same time.
7741	 */
7742
7743	/*
7744	 * Lock the parent list. No need to lock the child - not PID
7745	 * hashed yet and not running, so nobody can access it.
7746	 */
7747	mutex_lock(&parent_ctx->mutex);
7748
7749	/*
7750	 * We don't have to disable NMIs - we are only looking at
7751	 * the list, not manipulating it:
7752	 */
7753	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
7754		ret = inherit_task_group(event, parent, parent_ctx,
7755					 child, ctxn, &inherited_all);
7756		if (ret)
7757			break;
7758	}
7759
7760	/*
7761	 * We can't hold ctx->lock when iterating the ->flexible_groups list due
7762	 * to allocations, but we need to prevent rotation because
7763	 * rotate_ctx() will change the list from interrupt context.
7764	 */
7765	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7766	parent_ctx->rotate_disable = 1;
7767	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7768
7769	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
7770		ret = inherit_task_group(event, parent, parent_ctx,
7771					 child, ctxn, &inherited_all);
7772		if (ret)
7773			break;
7774	}
7775
7776	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7777	parent_ctx->rotate_disable = 0;
7778
7779	child_ctx = child->perf_event_ctxp[ctxn];
7780
7781	if (child_ctx && inherited_all) {
7782		/*
7783		 * Mark the child context as a clone of the parent
7784		 * context, or of whatever the parent is a clone of.
7785		 *
7786		 * Note that if the parent is a clone, the holding of
7787		 * parent_ctx->lock avoids it from being uncloned.
7788		 */
7789		cloned_ctx = parent_ctx->parent_ctx;
7790		if (cloned_ctx) {
7791			child_ctx->parent_ctx = cloned_ctx;
7792			child_ctx->parent_gen = parent_ctx->parent_gen;
7793		} else {
7794			child_ctx->parent_ctx = parent_ctx;
7795			child_ctx->parent_gen = parent_ctx->generation;
7796		}
7797		get_ctx(child_ctx->parent_ctx);
7798	}
7799
7800	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7801	mutex_unlock(&parent_ctx->mutex);
7802
7803	perf_unpin_context(parent_ctx);
7804	put_ctx(parent_ctx);
7805
7806	return ret;
7807}
7808
7809/*
7810 * Initialize the perf_event context in task_struct
7811 */
7812int perf_event_init_task(struct task_struct *child)
7813{
7814	int ctxn, ret;
7815
7816	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7817	mutex_init(&child->perf_event_mutex);
7818	INIT_LIST_HEAD(&child->perf_event_list);
7819
7820	for_each_task_context_nr(ctxn) {
7821		ret = perf_event_init_context(child, ctxn);
7822		if (ret)
7823			return ret;
7824	}
7825
7826	return 0;
7827}
7828
7829static void __init perf_event_init_all_cpus(void)
7830{
7831	struct swevent_htable *swhash;
7832	int cpu;
7833
7834	for_each_possible_cpu(cpu) {
7835		swhash = &per_cpu(swevent_htable, cpu);
7836		mutex_init(&swhash->hlist_mutex);
7837		INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
7838	}
7839}
7840
7841static void perf_event_init_cpu(int cpu)
7842{
7843	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7844
7845	mutex_lock(&swhash->hlist_mutex);
7846	swhash->online = true;
7847	if (swhash->hlist_refcount > 0) {
7848		struct swevent_hlist *hlist;
7849
7850		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
7851		WARN_ON(!hlist);
7852		rcu_assign_pointer(swhash->swevent_hlist, hlist);
7853	}
7854	mutex_unlock(&swhash->hlist_mutex);
7855}
7856
7857#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
7858static void perf_pmu_rotate_stop(struct pmu *pmu)
7859{
7860	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7861
7862	WARN_ON(!irqs_disabled());
7863
7864	list_del_init(&cpuctx->rotation_list);
7865}
7866
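/*
 * Runs via smp_call_function_single() on the CPU that is going away: stop
 * context rotation and remove every remaining event from the CPU context.
 */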
7867static void __perf_event_exit_context(void *__info)
7868{
7869	struct remove_event re = { .detach_group = false };
7870	struct perf_event_context *ctx = __info;
7871
7872	perf_pmu_rotate_stop(ctx->pmu);
7873
7874	rcu_read_lock();
7875	list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
7876		__perf_remove_from_context(&re);
7877	rcu_read_unlock();
7878}
7879
7880static void perf_event_exit_cpu_context(int cpu)
7881{
7882	struct perf_event_context *ctx;
7883	struct pmu *pmu;
7884	int idx;
7885
7886	idx = srcu_read_lock(&pmus_srcu);
7887	list_for_each_entry_rcu(pmu, &pmus, entry) {
7888		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
7889
7890		mutex_lock(&ctx->mutex);
7891		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7892		mutex_unlock(&ctx->mutex);
7893	}
7894	srcu_read_unlock(&pmus_srcu, idx);
7895}
7896
7897static void perf_event_exit_cpu(int cpu)
7898{
7899	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7900
7901	perf_event_exit_cpu_context(cpu);
7902
7903	mutex_lock(&swhash->hlist_mutex);
7904	swhash->online = false;
7905	swevent_hlist_release(swhash);
7906	mutex_unlock(&swhash->hlist_mutex);
7907}
7908#else
7909static inline void perf_event_exit_cpu(int cpu) { }
7910#endif
7911
7912static int
7913perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7914{
7915	int cpu;
7916
7917	for_each_online_cpu(cpu)
7918		perf_event_exit_cpu(cpu);
7919
7920	return NOTIFY_OK;
7921}
7922
7923/*
7924 * Run the perf reboot notifier at the very last possible moment so that
7925 * the generic watchdog code runs as long as possible.
7926 */
7927static struct notifier_block perf_reboot_notifier = {
7928	.notifier_call = perf_reboot,
7929	.priority = INT_MIN,
7930};
7931
7932static int
7933perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7934{
7935	unsigned int cpu = (long)hcpu;
7936
7937	switch (action & ~CPU_TASKS_FROZEN) {
7938
7939	case CPU_UP_PREPARE:
7940	case CPU_DOWN_FAILED:
7941		perf_event_init_cpu(cpu);
7942		break;
7943
7944	case CPU_UP_CANCELED:
7945	case CPU_DOWN_PREPARE:
7946		perf_event_exit_cpu(cpu);
7947		break;
7948	default:
7949		break;
7950	}
7951
7952	return NOTIFY_OK;
7953}
7954
7955void __init perf_event_init(void)
7956{
7957	int ret;
7958
7959	idr_init(&pmu_idr);
7960
7961	perf_event_init_all_cpus();
7962	init_srcu_struct(&pmus_srcu);
7963	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
7964	perf_pmu_register(&perf_cpu_clock, NULL, -1);
7965	perf_pmu_register(&perf_task_clock, NULL, -1);
7966	perf_tp_register();
7967	perf_cpu_notifier(perf_cpu_notify);
7968	register_reboot_notifier(&perf_reboot_notifier);
7969
7970	ret = init_hw_breakpoint();
7971	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
7972
7973	/* do not patch jump label more than once per second */
7974	jump_label_rate_limit(&perf_sched_events, HZ);
7975
7976	/*
7977	 * Build time assertion that we keep the data_head at the intended
7978	 * location. IOW, validation that we got the __reserved[] size right.
7979	 */
7980	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
7981		     != 1024);
7982}
7983
7984static int __init perf_event_sysfs_init(void)
7985{
7986	struct pmu *pmu;
7987	int ret;
7988
7989	mutex_lock(&pmus_lock);
7990
7991	ret = bus_register(&pmu_bus);
7992	if (ret)
7993		goto unlock;
7994
7995	list_for_each_entry(pmu, &pmus, entry) {
7996		if (!pmu->name || pmu->type < 0)
7997			continue;
7998
7999		ret = pmu_dev_alloc(pmu);
8000		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
8001	}
8002	pmu_bus_running = 1;
8003	ret = 0;
8004
8005unlock:
8006	mutex_unlock(&pmus_lock);
8007
8008	return ret;
8009}
8010device_initcall(perf_event_sysfs_init);
8011
8012#ifdef CONFIG_CGROUP_PERF
8013static struct cgroup_subsys_state *
8014perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8015{
8016	struct perf_cgroup *jc;
8017
8018	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
8019	if (!jc)
8020		return ERR_PTR(-ENOMEM);
8021
8022	jc->info = alloc_percpu(struct perf_cgroup_info);
8023	if (!jc->info) {
8024		kfree(jc);
8025		return ERR_PTR(-ENOMEM);
8026	}
8027
8028	return &jc->css;
8029}
8030
8031static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
8032{
8033	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
8034
8035	free_percpu(jc->info);
8036	kfree(jc);
8037}
8038
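/*
 * Runs on the CPU where @info (a task entering or leaving a perf_event
 * cgroup) is executing, forcing a cgroup switch so its cgroup events are
 * scheduled out of the old cgroup and back in under the new one.
 */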
8039static int __perf_cgroup_move(void *info)
8040{
8041	struct task_struct *task = info;
8042	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
8043	return 0;
8044}
8045
8046static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8047			       struct cgroup_taskset *tset)
8048{
8049	struct task_struct *task;
8050
8051	cgroup_taskset_for_each(task, tset)
8052		task_function_call(task, __perf_cgroup_move, task);
8053}
8054
8055static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8056			     struct cgroup_subsys_state *old_css,
8057			     struct task_struct *task)
8058{
8059	/*
8060	 * cgroup_exit() is called in the copy_process() failure path.
8061	 * Ignore this case since the task hasn't run yet; this avoids
8062	 * trying to poke a half freed task state from generic code.
8063	 */
8064	if (!(task->flags & PF_EXITING))
8065		return;
8066
8067	task_function_call(task, __perf_cgroup_move, task);
8068}
8069
8070struct cgroup_subsys perf_event_cgrp_subsys = {
8071	.css_alloc	= perf_cgroup_css_alloc,
8072	.css_free	= perf_cgroup_css_free,
8073	.exit		= perf_cgroup_exit,
8074	.attach		= perf_cgroup_attach,
8075};
8076#endif /* CONFIG_CGROUP_PERF */