#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
+#include <linux/reboot.h>
#include <linux/vmstat.h>
+#include <linux/device.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
#include <asm/irq_regs.h>
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x1,
+ EVENT_PINNED = 0x2,
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
atomic_t perf_task_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
-int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+/* Minimum for 512 kiB + 1 user control page */
+int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
/*
* max perf event sample rate
static atomic64_t perf_event_id;
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
return "pmu";
}
+static inline u64 perf_clock(void)
+{
+ return local_clock();
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
}
}
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_pid_nr_ns(p, event->ns);
+}
+
/*
* If we inherit events we want to return the parent event id
* to userspace.
put_ctx(ctx);
}
-static inline u64 perf_clock(void)
-{
- return local_clock();
-}
-
/*
* Update the record of the current time in a context.
*/
ctx->timestamp = now;
}
+static u64 perf_event_time(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ return ctx ? ctx->time : 0;
+}
+
/*
* Update the total_time_enabled and total_time_running fields for a event.
*/
return;
if (ctx->is_active)
- run_end = ctx->time;
+ run_end = perf_event_time(event);
else
run_end = event->tstamp_stopped;
if (event->state == PERF_EVENT_STATE_INACTIVE)
run_end = event->tstamp_stopped;
else
- run_end = ctx->time;
+ run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
}
ctx->nr_stat++;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += event->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+ event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ perf_event__read_size(event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ size += sizeof(data->ip);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ size += sizeof(data->addr);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ size += sizeof(data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ size += event->read_size;
+
+ event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ size += sizeof(data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ size += sizeof(data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ size += sizeof(data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ size += sizeof(data->cpu_entry);
+
+ event->id_header_size = size;
+}
+
static void perf_group_attach(struct perf_event *event)
{
- struct perf_event *group_leader = event->group_leader;
+ struct perf_event *group_leader = event->group_leader, *pos;
/*
* We can have double attach due to group movement in perf_event_open.
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
+
+ perf_event__header_size(group_leader);
+
+ list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ perf_event__header_size(pos);
}
/*
if (event->group_leader != event) {
list_del_init(&event->group_entry);
event->group_leader->nr_siblings--;
- return;
+ goto out;
}
if (!list_empty(&event->group_entry))
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
}
+
+out:
+ perf_event__header_size(event->group_leader);
+
+ list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ perf_event__header_size(tmp);
}
static inline int
return event->cpu == -1 || event->cpu == smp_processor_id();
}
-static int
-__event_sched_out(struct perf_event *event,
+static void
+event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
u64 delta;
/*
* An event which could not be activated because of
&& !event_filter_match(event)) {
delta = ctx->time - event->tstamp_stopped;
event->tstamp_running += delta;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
- return 0;
+ return;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
+ event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
ctx->nr_active--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- return 1;
-}
-
-static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- int ret;
-
- ret = __event_sched_out(event, cpuctx, ctx);
- if (ret)
- event->tstamp_stopped = ctx->time;
}
static void
raw_spin_unlock_irq(&ctx->lock);
}
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
static int
-__event_sched_in(struct perf_event *event,
+event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
event->state = PERF_EVENT_STATE_ACTIVE;
event->oncpu = smp_processor_id();
+
+ /*
+ * Unthrottle events, since we scheduled we might have missed several
+ * ticks already, also for a heavily scheduling task there is little
+ * guarantee it'll get a tick in a timely manner.
+ */
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
+ perf_log_throttle(event, 1);
+ event->hw.interrupts = 0;
+ }
+
/*
* The new state must be visible before we turn it on in the hardware:
*/
return -EAGAIN;
}
+ event->tstamp_running += tstamp - event->tstamp_stopped;
+
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
+
if (!is_software_event(event))
cpuctx->active_oncpu++;
ctx->nr_active++;
return 0;
}
-static inline int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- int ret = __event_sched_in(event, cpuctx, ctx);
- if (ret)
- return ret;
- event->tstamp_running += ctx->time - event->tstamp_stopped;
- return 0;
-}
-
-static void
-group_commit_event_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- struct perf_event *event;
- u64 now = ctx->time;
-
- group_event->tstamp_running += now - group_event->tstamp_stopped;
- /*
- * Schedule in siblings as one group (if any):
- */
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- event->tstamp_running += now - event->tstamp_stopped;
- }
-}
-
static int
group_sched_in(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = group_event->pmu;
+ u64 now = ctx->time;
+ bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu);
- /*
- * use __event_sched_in() to delay updating tstamp_running
- * until the transaction is committed. In case of failure
- * we will keep an unmodified tstamp_running which is a
- * requirement to get correct timing information
- */
- if (__event_sched_in(group_event, cpuctx, ctx)) {
+ if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
return -EAGAIN;
}
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- if (__event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
}
}
- if (!pmu->commit_txn(pmu)) {
- /* commit tstamp_running */
- group_commit_event_sched_in(group_event, cpuctx, ctx);
+ if (!pmu->commit_txn(pmu))
return 0;
- }
+
group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
+ * The events up to the failed event are scheduled out normally,
+ * tstamp_stopped will be updated.
*
- * use __event_sched_out() to avoid updating tstamp_stopped
- * because the event never actually ran
+ * The failed events and the remaining siblings need to have
+ * their timings updated as if they had gone thru event_sched_in()
+ * and event_sched_out(). This is required to get consistent timings
+ * across the group. This also takes care of the case where the group
+ * could never be scheduled by ensuring tstamp_stopped is set to mark
+ * the time the event was actually stopped, such that time delta
+ * calculation in update_event_times() is correct.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
- break;
- __event_sched_out(event, cpuctx, ctx);
+ simulate = true;
+
+ if (simulate) {
+ event->tstamp_running += now - event->tstamp_stopped;
+ event->tstamp_stopped = now;
+ } else {
+ event_sched_out(event, cpuctx, ctx);
+ }
}
- __event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, cpuctx, ctx);
pmu->cancel_txn(pmu);
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = ctx->time;
- event->tstamp_running = ctx->time;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_enabled = tstamp;
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
}
/*
add_event_to_ctx(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
goto unlock;
/*
struct perf_event_context *ctx)
{
struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = ctx->time - event->total_time_enabled;
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled =
- ctx->time - sub->total_time_enabled;
- }
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
}
}
goto unlock;
__perf_event_mark_enabled(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
goto unlock;
/*
/*
* not supported on inherited events
*/
- if (event->attr.inherit)
+ if (event->attr.inherit || !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
return 0;
}
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
int ctxn;
- perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
}
list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
if (group_can_go_on(event, cpuctx, 1))
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
if (group_can_go_on(event, cpuctx, can_add_hw)) {
}
}
-#define MAX_INTERRUPTS (~0ULL)
-
-static void perf_log_throttle(struct perf_event *event, int enable);
-
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
u64 frequency = event->attr.sample_freq;
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
hwc = &event->hw;
{
raw_spin_lock(&ctx->lock);
- /* Rotate the first entry last of non-pinned groups */
- list_rotate_left(&ctx->flexible_groups);
+ /*
+ * Rotate the first entry last of non-pinned groups. Rotation might be
+ * disabled by the inheritance code.
+ */
+ if (!ctx->rotate_disable)
+ list_rotate_left(&ctx->flexible_groups);
raw_spin_unlock(&ctx->lock);
}
return;
raw_spin_lock(&ctx->lock);
- update_context_time(ctx);
+ if (ctx->is_active)
+ update_context_time(ctx);
update_event_times(event);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
raw_spin_unlock(&ctx->lock);
-
- event->pmu->read(event);
}
static inline u64 perf_event_count(struct perf_event *event)
* accessed from NMI. Use a temporary manual per cpu allocation
* until that gets sorted out.
*/
- size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
- num_possible_cpus();
+ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
entries = kzalloc(size, GFP_KERNEL);
if (!entries)
if (!task)
return ERR_PTR(-ESRCH);
- /*
- * Can't attach events to a dying task.
- */
- err = -ESRCH;
- if (task->flags & PF_EXITING)
- goto errout;
-
/* Reuse ptrace permission checks for now. */
err = -EACCES;
if (!ptrace_may_access(task, PTRACE_MODE_READ))
unsigned long flags;
int ctxn, err;
- if (!task && cpu != -1) {
+ if (!task) {
/* Must be root to operate on a CPU event: */
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
- if (cpu < 0 || cpu >= nr_cpumask_bits)
- return ERR_PTR(-EINVAL);
-
/*
* We could be clever and allow to attach a event to an
* offline CPU and activate it when the CPU comes up, but
get_ctx(ctx);
- if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
- /*
- * We raced with some other task; use
- * the context they set.
- */
+ err = 0;
+ mutex_lock(&task->perf_event_mutex);
+ /*
+ * If it has already passed perf_event_exit_task().
+ * we must see PF_EXITING, it takes this mutex too.
+ */
+ if (task->flags & PF_EXITING)
+ err = -ESRCH;
+ else if (task->perf_event_ctxp[ctxn])
+ err = -EAGAIN;
+ else
+ rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ mutex_unlock(&task->perf_event_mutex);
+
+ if (unlikely(err)) {
put_task_struct(task);
kfree(ctx);
- goto retry;
+
+ if (err == -EAGAIN)
+ goto retry;
+ goto errout;
}
}
raw_spin_unlock_irq(&ctx->lock);
mutex_unlock(&ctx->mutex);
- mutex_lock(&event->owner->perf_event_mutex);
- list_del_init(&event->owner_entry);
- mutex_unlock(&event->owner->perf_event_mutex);
- put_task_struct(event->owner);
-
free_event(event);
return 0;
static int perf_release(struct inode *inode, struct file *file)
{
struct perf_event *event = file->private_data;
+ struct task_struct *owner;
file->private_data = NULL;
- return perf_event_release_kernel(event);
-}
-
-static int perf_event_read_size(struct perf_event *event)
-{
- int entry = sizeof(u64); /* value */
- int size = 0;
- int nr = 1;
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_ID)
- entry += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
- size += sizeof(u64);
+ rcu_read_lock();
+ owner = ACCESS_ONCE(event->owner);
+ /*
+ * Matches the smp_wmb() in perf_event_exit_task(). If we observe
+ * !owner it means the list deletion is complete and we can indeed
+ * free this event, otherwise we need to serialize on
+ * owner->perf_event_mutex.
+ */
+ smp_read_barrier_depends();
+ if (owner) {
+ /*
+ * Since delayed_put_task_struct() also drops the last
+ * task reference we can safely take a new reference
+ * while holding the rcu_read_lock().
+ */
+ get_task_struct(owner);
}
+ rcu_read_unlock();
- size += entry * nr;
+ if (owner) {
+ mutex_lock(&owner->perf_event_mutex);
+ /*
+ * We have to re-check the event->owner field, if it is cleared
+ * we raced with perf_event_exit_task(), acquiring the mutex
+ * ensured they're done, and we can proceed with freeing the
+ * event.
+ */
+ if (event->owner)
+ list_del_init(&event->owner_entry);
+ mutex_unlock(&owner->perf_event_mutex);
+ put_task_struct(owner);
+ }
- return size;
+ return perf_event_release_kernel(event);
}
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
if (event->state == PERF_EVENT_STATE_ERROR)
return 0;
- if (count < perf_event_read_size(event))
+ if (count < event->read_size)
return -ENOSPC;
WARN_ON_ONCE(event->ctx->parent_ctx);
int ret = 0;
u64 value;
- if (!event->attr.sample_period)
+ if (!is_sampling_event(event))
return -EINVAL;
if (copy_from_user(&value, arg, sizeof(value)))
} while (len);
}
-int perf_output_begin(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size,
- int nmi, int sample)
+static void __perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
{
- struct perf_buffer *buffer;
- unsigned long tail, offset, head;
- int have_lost;
- struct {
- struct perf_event_header header;
- u64 id;
- u64 lost;
- } lost_event;
+ u64 sample_type = event->attr.sample_type;
- rcu_read_lock();
- /*
- * For inherited events we send all the output towards the parent.
- */
+ data->type = sample_type;
+ header->size += event->id_header_size;
+
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ data->tid_entry.pid = perf_event_pid(event, current);
+ data->tid_entry.tid = perf_event_tid(event, current);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ data->time = perf_clock();
+
+ if (sample_type & PERF_SAMPLE_ID)
+ data->id = primary_event_id(event);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ data->stream_id = event->id;
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ data->cpu_entry.cpu = raw_smp_processor_id();
+ data->cpu_entry.reserved = 0;
+ }
+}
+
+static void perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ if (event->attr.sample_id_all)
+ __perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ u64 sample_type = data->type;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(handle, data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
+}
+
+static void perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample)
+{
+ if (event->attr.sample_id_all)
+ __perf_event__output_id_sample(handle, sample);
+}
+
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size,
+ int nmi, int sample)
+{
+ struct perf_buffer *buffer;
+ unsigned long tail, offset, head;
+ int have_lost;
+ struct perf_sample_data sample_data;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ } lost_event;
+
+ rcu_read_lock();
+ /*
+ * For inherited events we send all the output towards the parent.
+ */
if (event->parent)
event = event->parent;
goto out;
have_lost = local_read(&buffer->lost);
- if (have_lost)
- size += sizeof(lost_event);
+ if (have_lost) {
+ lost_event.header.size = sizeof(lost_event);
+ perf_event_header__init_id(&lost_event.header, &sample_data,
+ event);
+ size += lost_event.header.size;
+ }
perf_output_get_handle(handle);
if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
- lost_event.header.size = sizeof(lost_event);
lost_event.id = event->id;
lost_event.lost = local_xchg(&buffer->lost, 0);
perf_output_put(handle, lost_event);
+ perf_event__output_id_sample(event, handle, &sample_data);
}
return 0;
rcu_read_unlock();
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
-{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
-
- return task_tgid_nr_ns(p, event->ns);
-}
-
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
-{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
-
- return task_pid_nr_ns(p, event->ns);
-}
-
static void perf_output_read_one(struct perf_output_handle *handle,
- struct perf_event *event)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
u64 values[4];
values[n++] = perf_event_count(event);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- values[n++] = event->total_time_enabled +
+ values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
}
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- values[n++] = event->total_time_running +
+ values[n++] = running +
atomic64_read(&event->child_total_time_running);
}
if (read_format & PERF_FORMAT_ID)
* XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
*/
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- values[n++] = leader->total_time_enabled;
+ values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- values[n++] = leader->total_time_running;
+ values[n++] = running;
if (leader != event)
leader->pmu->read(leader);
}
}
+#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
+ PERF_FORMAT_TOTAL_TIME_RUNNING)
+
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
+ u64 enabled = 0, running = 0, now, ctx_time;
+ u64 read_format = event->attr.read_format;
+
+ /*
+ * compute total_time_enabled, total_time_running
+ * based on snapshot values taken when the event
+ * was last scheduled in.
+ *
+ * we cannot simply called update_context_time()
+ * because of locking issue as we are called in
+ * NMI context
+ */
+ if (read_format & PERF_FORMAT_TOTAL_TIMES) {
+ now = perf_clock();
+ ctx_time = event->shadow_ctx_time + now;
+ enabled = ctx_time - event->tstamp_enabled;
+ running = ctx_time - event->tstamp_running;
+ }
+
if (event->attr.read_format & PERF_FORMAT_GROUP)
- perf_output_read_group(handle, event);
+ perf_output_read_group(handle, event, enabled, running);
else
- perf_output_read_one(handle, event);
+ perf_output_read_one(handle, event, enabled, running);
}
void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = event->attr.sample_type;
- data->type = sample_type;
-
header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header);
+ header->size = sizeof(*header) + event->header_size;
header->misc = 0;
header->misc |= perf_misc_flags(regs);
- if (sample_type & PERF_SAMPLE_IP) {
- data->ip = perf_instruction_pointer(regs);
-
- header->size += sizeof(data->ip);
- }
-
- if (sample_type & PERF_SAMPLE_TID) {
- /* namespace issues */
- data->tid_entry.pid = perf_event_pid(event, current);
- data->tid_entry.tid = perf_event_tid(event, current);
-
- header->size += sizeof(data->tid_entry);
- }
-
- if (sample_type & PERF_SAMPLE_TIME) {
- data->time = perf_clock();
-
- header->size += sizeof(data->time);
- }
-
- if (sample_type & PERF_SAMPLE_ADDR)
- header->size += sizeof(data->addr);
-
- if (sample_type & PERF_SAMPLE_ID) {
- data->id = primary_event_id(event);
+ __perf_event_header__init_id(header, data, event);
- header->size += sizeof(data->id);
- }
-
- if (sample_type & PERF_SAMPLE_STREAM_ID) {
- data->stream_id = event->id;
-
- header->size += sizeof(data->stream_id);
- }
-
- if (sample_type & PERF_SAMPLE_CPU) {
- data->cpu_entry.cpu = raw_smp_processor_id();
- data->cpu_entry.reserved = 0;
-
- header->size += sizeof(data->cpu_entry);
- }
-
- if (sample_type & PERF_SAMPLE_PERIOD)
- header->size += sizeof(data->period);
-
- if (sample_type & PERF_SAMPLE_READ)
- header->size += perf_event_read_size(event);
+ if (sample_type & PERF_SAMPLE_IP)
+ data->ip = perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
struct task_struct *task)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
struct perf_read_event read_event = {
.header = {
.type = PERF_RECORD_READ,
.misc = 0,
- .size = sizeof(read_event) + perf_event_read_size(event),
+ .size = sizeof(read_event) + event->read_size,
},
.pid = perf_event_pid(event, task),
.tid = perf_event_tid(event, task),
};
int ret;
+ perf_event_header__init_id(&read_event.header, &sample, event);
ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
if (ret)
return;
perf_output_put(&handle, read_event);
perf_output_read(&handle, event);
+ perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
struct perf_task_event *task_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
struct task_struct *task = task_event->task;
- int size, ret;
+ int ret, size = task_event->event_id.header.size;
- size = task_event->event_id.header.size;
- ret = perf_output_begin(&handle, event, size, 0, 0);
+ perf_event_header__init_id(&task_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ task_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
perf_output_put(&handle, task_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ task_event->event_id.header.size = size;
}
static int perf_event_task_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm || event->attr.mmap ||
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_task_ctx(&cpuctx->ctx, task_event);
ctx = task_event->task_ctx;
struct perf_comm_event *comm_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
- int ret = perf_output_begin(&handle, event, size, 0, 0);
+ int ret;
+
+ perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ comm_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
perf_output_put(&handle, comm_event->event_id);
perf_output_copy(&handle, comm_event->comm,
comm_event->comm_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ comm_event->event_id.header.size = size;
}
static int perf_event_comm_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm)
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_comm_ctx(&cpuctx->ctx, comm_event);
ctxn = pmu->task_ctx_nr;
struct perf_mmap_event *mmap_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
- int ret = perf_output_begin(&handle, event, size, 0, 0);
+ int ret;
+ perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ mmap_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
perf_output_put(&handle, mmap_event->event_id);
perf_output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ mmap_event->event_id.header.size = size;
}
static int perf_event_mmap_match(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if ((!executable && event->attr.mmap_data) ||
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
vma->vm_flags & VM_EXEC);
static void perf_log_throttle(struct perf_event *event, int enable)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int ret;
struct {
if (enable)
throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
- ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
+ perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ throttle_event.header.size, 1, 0);
if (ret)
return;
perf_output_put(&handle, throttle_event);
+ perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
struct hw_perf_event *hwc = &event->hw;
int ret = 0;
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
if (!throttle) {
hwc->interrupts++;
} else {
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- if (nmi) {
- event->pending_disable = 1;
- irq_work_queue(&event->pending);
- } else
- perf_event_disable(event);
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending);
}
if (event->overflow_handler)
if (!regs)
return;
- if (!hwc->sample_period)
+ if (!is_sampling_event(event))
return;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
- return 0;
+ return 1;
if (regs) {
if (event->attr.exclude_user && user_mode(regs))
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-void inline perf_swevent_put_recursion_context(int rctx)
+inline void perf_swevent_put_recursion_context(int rctx)
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event);
}
break;
}
- if (event_id > PERF_COUNT_SW_MAX)
+ if (event_id >= PERF_COUNT_SW_MAX)
return -ENOENT;
if (!event->parent) {
struct perf_sample_data *data,
struct pt_regs *regs)
{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
/*
* All tracepoints are from kernel-space.
*/
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
- /*
- * Raw tracepoint data is a severe data leak, only allow root to
- * have these.
- */
- if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
- perf_paranoid_tracepoint_raw() &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
err = perf_trace_init(event);
if (err)
return err;
static inline void perf_tp_register(void)
{
- perf_pmu_register(&perf_tracepoint);
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
+ s64 period;
+
+ if (!is_sampling_event(event))
+ return;
hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hwc->hrtimer.function = perf_swevent_hrtimer;
- if (hwc->sample_period) {
- s64 period = local64_read(&hwc->period_left);
- if (period) {
- if (period < 0)
- period = 10000;
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
- local64_set(&hwc->period_left, 0);
- } else {
- period = max_t(u64, 10000, hwc->sample_period);
- }
- __hrtimer_start_range_ns(&hwc->hrtimer,
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
ns_to_ktime(period), 0,
HRTIMER_MODE_REL_PINNED, 0);
- }
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
return NULL;
}
-static void free_pmu_context(void * __percpu cpu_context)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
{
- struct pmu *pmu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+ if (cpuctx->active_pmu == old_pmu)
+ cpuctx->active_pmu = pmu;
+ }
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+ struct pmu *i;
mutex_lock(&pmus_lock);
/*
* Like a real lame refcount.
*/
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->pmu_cpu_context == cpu_context)
+ list_for_each_entry(i, &pmus, entry) {
+ if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+ update_pmu_context(i, pmu);
goto out;
+ }
}
- free_percpu(cpu_context);
+ free_percpu(pmu->pmu_cpu_context);
out:
mutex_unlock(&pmus_lock);
}
+static struct idr pmu_idr;
-int perf_pmu_register(struct pmu *pmu)
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+
+static struct device_attribute pmu_dev_attrs[] = {
+ __ATTR_RO(type),
+ __ATTR_NULL,
+};
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+ .name = "event_source",
+ .dev_attrs = pmu_dev_attrs,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+ int ret = -ENOMEM;
+
+ pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!pmu->dev)
+ goto out;
+
+ device_initialize(pmu->dev);
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
+ dev_set_drvdata(pmu->dev, pmu);
+ pmu->dev->bus = &pmu_bus;
+ pmu->dev->release = pmu_dev_release;
+ ret = device_add(pmu->dev);
+ if (ret)
+ goto free_dev;
+
+out:
+ return ret;
+
+free_dev:
+ put_device(pmu->dev);
+ goto out;
+}
+
+static struct lock_class_key cpuctx_mutex;
+
+int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
int cpu, ret;
if (!pmu->pmu_disable_count)
goto unlock;
+ pmu->type = -1;
+ if (!name)
+ goto skip_type;
+ pmu->name = name;
+
+ if (type < 0) {
+ int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
+ if (!err)
+ goto free_pdc;
+
+ err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
+ if (err) {
+ ret = err;
+ goto free_pdc;
+ }
+ }
+ pmu->type = type;
+
+ if (pmu_bus_running) {
+ ret = pmu_dev_alloc(pmu);
+ if (ret)
+ goto free_idr;
+ }
+
+skip_type:
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
if (pmu->pmu_cpu_context)
goto got_cpu_context;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
- goto free_pdc;
+ goto free_dev;
for_each_possible_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
__perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
INIT_LIST_HEAD(&cpuctx->rotation_list);
+ cpuctx->active_pmu = pmu;
}
got_cpu_context:
return ret;
+free_dev:
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+
+free_idr:
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+
free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- free_pmu_context(pmu->pmu_cpu_context);
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ free_pmu_context(pmu);
}
struct pmu *perf_init_event(struct perf_event *event)
int idx;
idx = srcu_read_lock(&pmus_srcu);
+
+ rcu_read_lock();
+ pmu = idr_find(&pmu_idr, event->attr.type);
+ rcu_read_unlock();
+ if (pmu)
+ goto unlock;
+
list_for_each_entry_rcu(pmu, &pmus, entry) {
int ret = pmu->event_init(event);
if (!ret)
struct hw_perf_event *hwc;
long err;
+ if ((unsigned)cpu >= nr_cpu_ids) {
+ if (!task || cpu != -1)
+ return ERR_PTR(-EINVAL);
+ }
+
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return ERR_PTR(-ENOMEM);
if (!overflow_handler && parent_event)
overflow_handler = parent_event->overflow_handler;
-
+
event->overflow_handler = overflow_handler;
if (attr->disabled)
goto err_alloc;
}
+ if (task) {
+ put_task_struct(task);
+ task = NULL;
+ }
+
/*
* Look up the group leader (we will attach this event to it):
*/
mutex_unlock(&ctx->mutex);
event->owner = current;
- get_task_struct(current);
+
mutex_lock(¤t->perf_event_mutex);
list_add_tail(&event->owner_entry, ¤t->perf_event_list);
mutex_unlock(¤t->perf_event_mutex);
/*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
+ /*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
* of the group leader will find the pointer to itself in
++ctx->generation;
mutex_unlock(&ctx->mutex);
- event->owner = current;
- get_task_struct(current);
- mutex_lock(¤t->perf_event_mutex);
- list_add_tail(&event->owner_entry, ¤t->perf_event_list);
- mutex_unlock(¤t->perf_event_mutex);
-
return event;
err_free:
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- struct perf_event *parent_event;
+ if (child_event->parent) {
+ raw_spin_lock_irq(&child_ctx->lock);
+ perf_group_detach(child_event);
+ raw_spin_unlock_irq(&child_ctx->lock);
+ }
perf_event_remove_from_context(child_event);
- parent_event = child_event->parent;
/*
- * It can happen that parent exits first, and has events
+ * It can happen that the parent exits first, and has events
* that are still around due to the child reference. These
- * events need to be zapped - but otherwise linger.
+ * events need to be zapped.
*/
- if (parent_event) {
+ if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
}
* scheduled, so we are now safe from rescheduling changing
* our context.
*/
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
*/
void perf_event_exit_task(struct task_struct *child)
{
+ struct perf_event *event, *tmp;
int ctxn;
+ mutex_lock(&child->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ owner_entry) {
+ list_del_init(&event->owner_entry);
+
+ /*
+ * Ensure the list deletion is visible before we clear
+ * the owner, closes a race against perf_release() where
+ * we need to serialize on the owner->perf_event_mutex.
+ */
+ smp_wmb();
+ event->owner = NULL;
+ }
+ mutex_unlock(&child->perf_event_mutex);
+
for_each_task_context_nr(ctxn)
perf_event_exit_task_context(child, ctxn);
}
child_event->overflow_handler = parent_event->overflow_handler;
/*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(child_event);
+ perf_event__id_header_size(child_event);
+
+ /*
* Link it up in the child's context:
*/
raw_spin_lock_irqsave(&child_ctx->lock, flags);
struct perf_event *event;
struct task_struct *parent = current;
int inherited_all = 1;
+ unsigned long flags;
int ret = 0;
- child->perf_event_ctxp[ctxn] = NULL;
-
- mutex_init(&child->perf_event_mutex);
- INIT_LIST_HEAD(&child->perf_event_list);
-
if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
break;
}
+ /*
+ * We can't hold ctx->lock when iterating the ->flexible_group list due
+ * to allocations, but we need to prevent rotation because
+ * rotate_ctx() will change the list from interrupt context.
+ */
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 1;
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
break;
}
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 0;
+
child_ctx = child->perf_event_ctxp[ctxn];
if (child_ctx && inherited_all) {
/*
* Mark the child context as a clone of the parent
* context, or of whatever the parent is a clone of.
- * Note that if the parent is a clone, it could get
- * uncloned at any point, but that doesn't matter
- * because the list of events and the generation
- * count can't have changed since we took the mutex.
+ *
+ * Note that if the parent is a clone, the holding of
+ * parent_ctx->lock avoids it from being uncloned.
*/
- cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+ cloned_ctx = parent_ctx->parent_ctx;
if (cloned_ctx) {
child_ctx->parent_ctx = cloned_ctx;
child_ctx->parent_gen = parent_ctx->parent_gen;
get_ctx(child_ctx->parent_ctx);
}
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
{
int ctxn, ret;
+ memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ mutex_init(&child->perf_event_mutex);
+ INIT_LIST_HEAD(&child->perf_event_list);
+
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
if (ret)
mutex_unlock(&swhash->hlist_mutex);
}
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
static void perf_pmu_rotate_stop(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
static inline void perf_event_exit_cpu(int cpu) { }
#endif
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ perf_event_exit_cpu(cpu);
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+ .notifier_call = perf_reboot,
+ .priority = INT_MIN,
+};
+
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
void __init perf_event_init(void)
{
+ int ret;
+
+ idr_init(&pmu_idr);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
- perf_pmu_register(&perf_swevent);
- perf_pmu_register(&perf_cpu_clock);
- perf_pmu_register(&perf_task_clock);
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, NULL, -1);
+ perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
+ register_reboot_notifier(&perf_reboot_notifier);
+
+ ret = init_hw_breakpoint();
+ WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+}
+
+static int __init perf_event_sysfs_init(void)
+{
+ struct pmu *pmu;
+ int ret;
+
+ mutex_lock(&pmus_lock);
+
+ ret = bus_register(&pmu_bus);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (!pmu->name || pmu->type < 0)
+ continue;
+
+ ret = pmu_dev_alloc(pmu);
+ WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+ }
+ pmu_bus_running = 1;
+ ret = 0;
+
+unlock:
+ mutex_unlock(&pmus_lock);
+
+ return ret;
}
+device_initcall(perf_event_sysfs_init);