perf: Do the big rename: Performance Counters -> Performance Events

author Ingo Molnar <mingo@elte.hu>
Mon, 21 Sep 2009 10:02:48 +0000 (12:02 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 21 Sep 2009 12:28:04 +0000 (14:28 +0200)
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h

index 9122c9e..89f7ead 100644 (file)
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -390,7 +390,7 @@
  #define __NR_preadv                    (__NR_SYSCALL_BASE+361)
  #define __NR_pwritev                   (__NR_SYSCALL_BASE+362)
  #define __NR_rt_tgsigqueueinfo         (__NR_SYSCALL_BASE+363)
-#define __NR_perf_counter_open         (__NR_SYSCALL_BASE+364)
+#define __NR_perf_event_open           (__NR_SYSCALL_BASE+364)
  
  /*
   * The following SWIs are ARM private.
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S

index ecfa989..fafce1b 100644 (file)
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -373,7 +373,7 @@
                 CALL(sys_preadv)
                 CALL(sys_pwritev)
                 CALL(sys_rt_tgsigqueueinfo)
-               CALL(sys_perf_counter_open)
+               CALL(sys_perf_event_open)
  #ifndef syscalls_counted
  .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
  #define syscalls_counted
diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h

index c8e7ee4..02b1529 100644 (file)
--- a/arch/blackfin/include/asm/unistd.h
+++ b/arch/blackfin/include/asm/unistd.h
@@ -381,7 +381,7 @@
  #define __NR_preadv            366
  #define __NR_pwritev           367
  #define __NR_rt_tgsigqueueinfo 368
-#define __NR_perf_counter_open 369
+#define __NR_perf_event_open   369
  
  #define __NR_syscall           370
  #define NR_syscalls            __NR_syscall
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S

index 01af24c..1e7cac2 100644 (file)
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1620,7 +1620,7 @@ ENTRY(_sys_call_table)
         .long _sys_preadv
         .long _sys_pwritev
         .long _sys_rt_tgsigqueueinfo
-       .long _sys_perf_counter_open
+       .long _sys_perf_event_open
  
         .rept NR_syscalls-(.-_sys_call_table)/4
         .long _sys_ni_syscall
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig

index b86e19c..4b5830b 100644 (file)
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -7,7 +7,7 @@ config FRV
         default y
         select HAVE_IDE
         select HAVE_ARCH_TRACEHOOK
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_EVENTS
  
  config ZONE_DMA
         bool
diff --git a/arch/frv/include/asm/perf_counter.h b/arch/frv/include/asm/perf_counter.h

deleted file mode 100644 (file)

index ccf726e..0000000
--- a/arch/frv/include/asm/perf_counter.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* FRV performance counter support
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _ASM_PERF_COUNTER_H
-#define _ASM_PERF_COUNTER_H
-
-#define PERF_COUNTER_INDEX_OFFSET      0
-
-#endif /* _ASM_PERF_COUNTER_H */
diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h

new file mode 100644 (file)

index 0000000..a69e015
--- /dev/null
+++ b/arch/frv/include/asm/perf_event.h
@@ -0,0 +1,17 @@
+/* FRV performance event support
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_PERF_EVENT_H
+#define _ASM_PERF_EVENT_H
+
+#define PERF_EVENT_INDEX_OFFSET        0
+
+#endif /* _ASM_PERF_EVENT_H */
diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h

index 4a8fb42..be6ef0f 100644 (file)
--- a/arch/frv/include/asm/unistd.h
+++ b/arch/frv/include/asm/unistd.h
@@ -342,7 +342,7 @@
  #define __NR_preadv            333
  #define __NR_pwritev           334
  #define __NR_rt_tgsigqueueinfo 335
-#define __NR_perf_counter_open 336
+#define __NR_perf_event_open   336
  
  #ifdef __KERNEL__
  
diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S

index fde1e44..189397e 100644 (file)
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1525,6 +1525,6 @@ sys_call_table:
         .long sys_preadv
         .long sys_pwritev
         .long sys_rt_tgsigqueueinfo     /* 335 */
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
  
  syscall_table_size = (. - sys_call_table)
diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile

index 0a37721..f470975 100644 (file)
--- a/arch/frv/lib/Makefile
+++ b/arch/frv/lib/Makefile
@@ -5,4 +5,4 @@
  lib-y := \
         __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \
         checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \
-       outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_counter.o
+       outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o
diff --git a/arch/frv/lib/perf_counter.c b/arch/frv/lib/perf_counter.c

deleted file mode 100644 (file)

index 2000fee..0000000
--- a/arch/frv/lib/perf_counter.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance counter handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/perf_counter.h>
-
-/*
- * mark the performance counter as pending
- */
-void set_perf_counter_pending(void)
-{
-}
diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c

new file mode 100644 (file)

index 0000000..9ac5acf
--- /dev/null
+++ b/arch/frv/lib/perf_event.c
@@ -0,0 +1,19 @@
+/* Performance event handling
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/perf_event.h>
+
+/*
+ * mark the performance event as pending
+ */
+void set_perf_event_pending(void)
+{
+}
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h

index 946d869..48b87f5 100644 (file)
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -335,7 +335,7 @@
  #define __NR_preadv            329
  #define __NR_pwritev           330
  #define __NR_rt_tgsigqueueinfo 331
-#define __NR_perf_counter_open 332
+#define __NR_perf_event_open   332
  
  #ifdef __KERNEL__
  
diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S

index 922f52e..c5b3363 100644 (file)
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -756,5 +756,5 @@ sys_call_table:
         .long sys_preadv
         .long sys_pwritev               /* 330 */
         .long sys_rt_tgsigqueueinfo
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
  
diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S

index 0ae123e..23535cc 100644 (file)
--- a/arch/m68knommu/kernel/syscalltable.S
+++ b/arch/m68knommu/kernel/syscalltable.S
@@ -350,7 +350,7 @@ ENTRY(sys_call_table)
         .long sys_preadv
         .long sys_pwritev               /* 330 */
         .long sys_rt_tgsigqueueinfo
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
  
         .rept NR_syscalls-(.-sys_call_table)/4
                 .long sys_ni_syscall
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h

index 0b85232..cb05a07 100644 (file)
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -381,7 +381,7 @@
  #define __NR_preadv            363 /* new */
  #define __NR_pwritev           364 /* new */
  #define __NR_rt_tgsigqueueinfo 365 /* new */
-#define __NR_perf_counter_open 366 /* new */
+#define __NR_perf_event_open   366 /* new */
  
  #define __NR_syscalls          367
  
diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S

index 4572160..ecec191 100644 (file)
--- a/arch/microblaze/kernel/syscall_table.S
+++ b/arch/microblaze/kernel/syscall_table.S
@@ -370,4 +370,4 @@ ENTRY(sys_call_table)
         .long sys_ni_syscall
         .long sys_ni_syscall
         .long sys_rt_tgsigqueueinfo     /* 365 */
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h

index e753a77..8c9dfa9 100644 (file)
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -353,7 +353,7 @@
  #define __NR_preadv                    (__NR_Linux + 330)
  #define __NR_pwritev                   (__NR_Linux + 331)
  #define __NR_rt_tgsigqueueinfo         (__NR_Linux + 332)
-#define __NR_perf_counter_open         (__NR_Linux + 333)
+#define __NR_perf_event_open           (__NR_Linux + 333)
  #define __NR_accept4                   (__NR_Linux + 334)
  
  /*
@@ -664,7 +664,7 @@
  #define __NR_preadv                    (__NR_Linux + 289)
  #define __NR_pwritev                   (__NR_Linux + 290)
  #define __NR_rt_tgsigqueueinfo         (__NR_Linux + 291)
-#define __NR_perf_counter_open         (__NR_Linux + 292)
+#define __NR_perf_event_open           (__NR_Linux + 292)
  #define __NR_accept4                   (__NR_Linux + 293)
  
  /*
@@ -979,7 +979,7 @@
  #define __NR_preadv                    (__NR_Linux + 293)
  #define __NR_pwritev                   (__NR_Linux + 294)
  #define __NR_rt_tgsigqueueinfo         (__NR_Linux + 295)
-#define __NR_perf_counter_open         (__NR_Linux + 296)
+#define __NR_perf_event_open           (__NR_Linux + 296)
  #define __NR_accept4                   (__NR_Linux + 297)
  
  /*
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S

index 7c2de4f..fd2a9bb 100644 (file)
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -581,7 +581,7 @@ einval:     li      v0, -ENOSYS
         sys     sys_preadv              6       /* 4330 */
         sys     sys_pwritev             6
         sys     sys_rt_tgsigqueueinfo   4
-       sys     sys_perf_counter_open   5
+       sys     sys_perf_event_open     5
         sys     sys_accept4             4
         .endm
  
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S

index b97b993..18bf7f3 100644 (file)
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -418,6 +418,6 @@ sys_call_table:
         PTR     sys_preadv
         PTR     sys_pwritev                     /* 5390 */
         PTR     sys_rt_tgsigqueueinfo
-       PTR     sys_perf_counter_open
+       PTR     sys_perf_event_open
         PTR     sys_accept4
         .size   sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S

index 1a6ae12..6ebc079 100644 (file)
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -416,6 +416,6 @@ EXPORT(sysn32_call_table)
         PTR     sys_preadv
         PTR     sys_pwritev
         PTR     compat_sys_rt_tgsigqueueinfo    /* 5295 */
-       PTR     sys_perf_counter_open
+       PTR     sys_perf_event_open
         PTR     sys_accept4
         .size   sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S

index cd31087..9bbf977 100644 (file)
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -536,6 +536,6 @@ sys_call_table:
         PTR     compat_sys_preadv               /* 4330 */
         PTR     compat_sys_pwritev
         PTR     compat_sys_rt_tgsigqueueinfo
-       PTR     sys_perf_counter_open
+       PTR     sys_perf_event_open
         PTR     sys_accept4
         .size   sys_call_table,.-sys_call_table
diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h

index fad6861..2a98393 100644 (file)
--- a/arch/mn10300/include/asm/unistd.h
+++ b/arch/mn10300/include/asm/unistd.h
@@ -347,7 +347,7 @@
  #define __NR_preadv            334
  #define __NR_pwritev           335
  #define __NR_rt_tgsigqueueinfo 336
-#define __NR_perf_counter_open 337
+#define __NR_perf_event_open   337
  
  #ifdef __KERNEL__
  
diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S

index e0d2563..a94e7ea 100644 (file)
--- a/arch/mn10300/kernel/entry.S
+++ b/arch/mn10300/kernel/entry.S
@@ -723,7 +723,7 @@ ENTRY(sys_call_table)
         .long sys_preadv
         .long sys_pwritev               /* 335 */
         .long sys_rt_tgsigqueueinfo
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
  
  
  nr_syscalls=(.-sys_call_table)/4
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig

index 06f8d5b..f388dc6 100644 (file)
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -16,7 +16,7 @@ config PARISC
         select RTC_DRV_GENERIC
         select INIT_ALL_POSSIBLE
         select BUG
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_EVENTS
         select GENERIC_ATOMIC64 if !64BIT
         help
           The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/parisc/include/asm/perf_counter.h b/arch/parisc/include/asm/perf_counter.h

deleted file mode 100644 (file)

index dc9e829..0000000
--- a/arch/parisc/include/asm/perf_counter.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __ASM_PARISC_PERF_COUNTER_H
-#define __ASM_PARISC_PERF_COUNTER_H
-
-/* parisc only supports software counters through this interface. */
-static inline void set_perf_counter_pending(void) { }
-
-#endif /* __ASM_PARISC_PERF_COUNTER_H */
diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h

new file mode 100644 (file)

index 0000000..cc14642
--- /dev/null
+++ b/arch/parisc/include/asm/perf_event.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_PARISC_PERF_EVENT_H
+#define __ASM_PARISC_PERF_EVENT_H
+
+/* parisc only supports software events through this interface. */
+static inline void set_perf_event_pending(void) { }
+
+#endif /* __ASM_PARISC_PERF_EVENT_H */
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h

index f3d3b8b..cda1583 100644 (file)
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -810,9 +810,9 @@
  #define __NR_preadv            (__NR_Linux + 315)
  #define __NR_pwritev           (__NR_Linux + 316)
  #define __NR_rt_tgsigqueueinfo (__NR_Linux + 317)
-#define __NR_perf_counter_open (__NR_Linux + 318)
+#define __NR_perf_event_open   (__NR_Linux + 318)
  
-#define __NR_Linux_syscalls    (__NR_perf_counter_open + 1)
+#define __NR_Linux_syscalls    (__NR_perf_event_open + 1)
  
  
  #define __IGNORE_select                /* newselect */
diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S

index cf145eb..843f423 100644 (file)
--- a/arch/parisc/kernel/syscall_table.S
+++ b/arch/parisc/kernel/syscall_table.S
@@ -416,7 +416,7 @@
         ENTRY_COMP(preadv)              /* 315 */
         ENTRY_COMP(pwritev)
         ENTRY_COMP(rt_tgsigqueueinfo)
-       ENTRY_SAME(perf_counter_open)
+       ENTRY_SAME(perf_event_open)
  
         /* Nothing yet */
  
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig

index 8250902..4fd4790 100644 (file)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,7 +129,7 @@ config PPC
         select HAVE_OPROFILE
         select HAVE_SYSCALL_WRAPPERS if PPC64
         select GENERIC_ATOMIC64 if PPC32
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_EVENTS
  
  config EARLY_PRINTK
         bool
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h

index e73d554..abbc2aa 100644 (file)
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -135,43 +135,43 @@ static inline int irqs_disabled_flags(unsigned long flags)
   */
  struct irq_chip;
  
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
  
  #ifdef CONFIG_PPC64
-static inline unsigned long test_perf_counter_pending(void)
+static inline unsigned long test_perf_event_pending(void)
  {
         unsigned long x;
  
         asm volatile("lbz %0,%1(13)"
                 : "=r" (x)
-               : "i" (offsetof(struct paca_struct, perf_counter_pending)));
+               : "i" (offsetof(struct paca_struct, perf_event_pending)));
         return x;
  }
  
-static inline void set_perf_counter_pending(void)
+static inline void set_perf_event_pending(void)
  {
         asm volatile("stb %0,%1(13)" : :
                 "r" (1),
-               "i" (offsetof(struct paca_struct, perf_counter_pending)));
+               "i" (offsetof(struct paca_struct, perf_event_pending)));
  }
  
-static inline void clear_perf_counter_pending(void)
+static inline void clear_perf_event_pending(void)
  {
         asm volatile("stb %0,%1(13)" : :
                 "r" (0),
-               "i" (offsetof(struct paca_struct, perf_counter_pending)));
+               "i" (offsetof(struct paca_struct, perf_event_pending)));
  }
  #endif /* CONFIG_PPC64 */
  
-#else  /* CONFIG_PERF_COUNTERS */
+#else  /* CONFIG_PERF_EVENTS */
  
-static inline unsigned long test_perf_counter_pending(void)
+static inline unsigned long test_perf_event_pending(void)
  {
         return 0;
  }
  
-static inline void clear_perf_counter_pending(void) {}
-#endif /* CONFIG_PERF_COUNTERS */
+static inline void clear_perf_event_pending(void) {}
+#endif /* CONFIG_PERF_EVENTS */
  
  #endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h

index b634456..154f405 100644 (file)
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -122,7 +122,7 @@ struct paca_struct {
         u8 soft_enabled;                /* irq soft-enable flag */
         u8 hard_enabled;                /* set if irqs are enabled in MSR */
         u8 io_sync;                     /* writel() needs spin_unlock sync */
-       u8 perf_counter_pending;        /* PM interrupt while soft-disabled */
+       u8 perf_event_pending;  /* PM interrupt while soft-disabled */
  
         /* Stuff for accurate time accounting */
         u64 user_time;                  /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h

deleted file mode 100644 (file)

index 0ea0639..0000000
--- a/arch/powerpc/include/asm/perf_counter.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Performance counter support - PowerPC-specific definitions.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/types.h>
-
-#include <asm/hw_irq.h>
-
-#define MAX_HWCOUNTERS         8
-#define MAX_EVENT_ALTERNATIVES 8
-#define MAX_LIMITED_HWCOUNTERS 2
-
-/*
- * This struct provides the constants and functions needed to
- * describe the PMU on a particular POWER-family CPU.
- */
-struct power_pmu {
-       const char      *name;
-       int             n_counter;
-       int             max_alternatives;
-       unsigned long   add_fields;
-       unsigned long   test_adder;
-       int             (*compute_mmcr)(u64 events[], int n_ev,
-                               unsigned int hwc[], unsigned long mmcr[]);
-       int             (*get_constraint)(u64 event, unsigned long *mskp,
-                               unsigned long *valp);
-       int             (*get_alternatives)(u64 event, unsigned int flags,
-                               u64 alt[]);
-       void            (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
-       int             (*limited_pmc_event)(u64 event);
-       u32             flags;
-       int             n_generic;
-       int             *generic_events;
-       int             (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
-                              [PERF_COUNT_HW_CACHE_OP_MAX]
-                              [PERF_COUNT_HW_CACHE_RESULT_MAX];
-};
-
-/*
- * Values for power_pmu.flags
- */
-#define PPMU_LIMITED_PMC5_6    1       /* PMC5/6 have limited function */
-#define PPMU_ALT_SIPR          2       /* uses alternate posn for SIPR/HV */
-
-/*
- * Values for flags to get_alternatives()
- */
-#define PPMU_LIMITED_PMC_OK    1       /* can put this on a limited PMC */
-#define PPMU_LIMITED_PMC_REQD  2       /* have to put this on a limited PMC */
-#define PPMU_ONLY_COUNT_RUN    4       /* only counting in run state */
-
-extern int register_power_pmu(struct power_pmu *);
-
-struct pt_regs;
-extern unsigned long perf_misc_flags(struct pt_regs *regs);
-extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
-
-#define PERF_COUNTER_INDEX_OFFSET      1
-
-/*
- * Only override the default definitions in include/linux/perf_counter.h
- * if we have hardware PMU support.
- */
-#ifdef CONFIG_PPC_PERF_CTRS
-#define perf_misc_flags(regs)  perf_misc_flags(regs)
-#endif
-
-/*
- * The power_pmu.get_constraint function returns a 32/64-bit value and
- * a 32/64-bit mask that express the constraints between this event and
- * other events.
- *
- * The value and mask are divided up into (non-overlapping) bitfields
- * of three different types:
- *
- * Select field: this expresses the constraint that some set of bits
- * in MMCR* needs to be set to a specific value for this event.  For a
- * select field, the mask contains 1s in every bit of the field, and
- * the value contains a unique value for each possible setting of the
- * MMCR* bits.  The constraint checking code will ensure that two events
- * that set the same field in their masks have the same value in their
- * value dwords.
- *
- * Add field: this expresses the constraint that there can be at most
- * N events in a particular class.  A field of k bits can be used for
- * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
- * set (and the other bits 0), and the value has only the least significant
- * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
- * in the struct power_pmu for this processor come into play.  The
- * add_fields value contains 1 in the LSB of the field, and the
- * test_adder contains 2^(k-1) - 1 - N in the field.
- *
- * NAND field: this expresses the constraint that you may not have events
- * in all of a set of classes.  (For example, on PPC970, you can't select
- * events from the FPU, ISU and IDU simultaneously, although any two are
- * possible.)  For N classes, the field is N+1 bits wide, and each class
- * is assigned one bit from the least-significant N bits.  The mask has
- * only the most-significant bit set, and the value has only the bit
- * for the event's class set.  The test_adder has the least significant
- * bit set in the field.
- *
- * If an event is not subject to the constraint expressed by a particular
- * field, then it will have 0 in both the mask and value for that field.
- */
diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h

new file mode 100644 (file)

index 0000000..2499aaa
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -0,0 +1,110 @@
+/*
+ * Performance event support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+
+#include <asm/hw_irq.h>
+
+#define MAX_HWEVENTS           8
+#define MAX_EVENT_ALTERNATIVES 8
+#define MAX_LIMITED_HWEVENTS   2
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+       const char      *name;
+       int             n_event;
+       int             max_alternatives;
+       unsigned long   add_fields;
+       unsigned long   test_adder;
+       int             (*compute_mmcr)(u64 events[], int n_ev,
+                               unsigned int hwc[], unsigned long mmcr[]);
+       int             (*get_constraint)(u64 event_id, unsigned long *mskp,
+                               unsigned long *valp);
+       int             (*get_alternatives)(u64 event_id, unsigned int flags,
+                               u64 alt[]);
+       void            (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
+       int             (*limited_pmc_event)(u64 event_id);
+       u32             flags;
+       int             n_generic;
+       int             *generic_events;
+       int             (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+                              [PERF_COUNT_HW_CACHE_OP_MAX]
+                              [PERF_COUNT_HW_CACHE_RESULT_MAX];
+};
+
+/*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6    1       /* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR          2       /* uses alternate posn for SIPR/HV */
+
+/*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK    1       /* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD  2       /* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN    4       /* only counting in run state */
+
+extern int register_power_pmu(struct power_pmu *);
+
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
+#define PERF_EVENT_INDEX_OFFSET        1
+
+/*
+ * Only override the default definitions in include/linux/perf_event.h
+ * if we have hardware PMU support.
+ */
+#ifdef CONFIG_PPC_PERF_CTRS
+#define perf_misc_flags(regs)  perf_misc_flags(regs)
+#endif
+
+/*
+ * The power_pmu.get_constraint function returns a 32/64-bit value and
+ * a 32/64-bit mask that express the constraints between this event_id and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event_id.  For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits.  The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class.  A field of k bits can be used for
+ * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play.  The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes.  (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.)  For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits.  The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event_id's class set.  The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event_id is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h

index ed24bd9..c7d671a 100644 (file)
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,7 +322,7 @@ SYSCALL_SPU(epoll_create1)
  SYSCALL_SPU(dup3)
  SYSCALL_SPU(pipe2)
  SYSCALL(inotify_init1)
-SYSCALL_SPU(perf_counter_open)
+SYSCALL_SPU(perf_event_open)
  COMPAT_SYS_SPU(preadv)
  COMPAT_SYS_SPU(pwritev)
  COMPAT_SYS(rt_tgsigqueueinfo)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h

index cef080b..f6ca761 100644 (file)
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,7 +341,7 @@
  #define __NR_dup3              316
  #define __NR_pipe2             317
  #define __NR_inotify_init1     318
-#define __NR_perf_counter_open 319
+#define __NR_perf_event_open   319
  #define __NR_preadv            320
  #define __NR_pwritev           321
  #define __NR_rt_tgsigqueueinfo 322
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile

index 569f79c..b23664a 100644 (file)
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT)         += compat_audit.o
  
  obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
  obj-$(CONFIG_FUNCTION_GRAPH_TRACER)    += ftrace.o
-obj-$(CONFIG_PPC_PERF_CTRS)    += perf_counter.o perf_callchain.o
+obj-$(CONFIG_PPC_PERF_CTRS)    += perf_event.o perf_callchain.o
  obj64-$(CONFIG_PPC_PERF_CTRS)  += power4-pmu.o ppc970-pmu.o power5-pmu.o \
                                    power5+-pmu.o power6-pmu.o power7-pmu.o
  obj32-$(CONFIG_PPC_PERF_CTRS)  += mpc7450-pmu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c

index f0df285..0812b0f 100644 (file)
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -133,7 +133,7 @@ int main(void)
         DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
         DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
         DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
-       DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
+       DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending));
         DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
  #ifdef CONFIG_PPC_MM_SLICES
         DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S

index 66bcda3..900e0ee 100644 (file)
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -556,14 +556,14 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
  2:
         TRACE_AND_RESTORE_IRQ(r5);
  
-#ifdef CONFIG_PERF_COUNTERS
-       /* check paca->perf_counter_pending if we're enabling ints */
+#ifdef CONFIG_PERF_EVENTS
+       /* check paca->perf_event_pending if we're enabling ints */
         lbz     r3,PACAPERFPEND(r13)
         and.    r3,r3,r5
         beq     27f
-       bl      .perf_counter_do_pending
+       bl      .perf_event_do_pending
  27:
-#endif /* CONFIG_PERF_COUNTERS */
+#endif /* CONFIG_PERF_EVENTS */
  
         /* extract EE bit and use it to restore paca->hard_enabled */
         ld      r3,_MSR(r1)
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c

index f7f376e..e5d1211 100644 (file)
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -53,7 +53,7 @@
  #include <linux/bootmem.h>
  #include <linux/pci.h>
  #include <linux/debugfs.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  
  #include <asm/uaccess.h>
  #include <asm/system.h>
@@ -138,9 +138,9 @@ notrace void raw_local_irq_restore(unsigned long en)
         }
  #endif /* CONFIG_PPC_STD_MMU_64 */
  
-       if (test_perf_counter_pending()) {
-               clear_perf_counter_pending();
-               perf_counter_do_pending();
+       if (test_perf_event_pending()) {
+               clear_perf_event_pending();
+               perf_event_do_pending();
         }
  
         /*
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c

index cc466d0..09d7202 100644 (file)
--- a/arch/powerpc/kernel/mpc7450-pmu.c
+++ b/arch/powerpc/kernel/mpc7450-pmu.c
@@ -9,7 +9,7 @@
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/string.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <asm/reg.h>
  #include <asm/cputable.h>
  
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c

index f74b62c..0a03cf7 100644 (file)
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -10,7 +10,7 @@
   */
  #include <linux/kernel.h>
  #include <linux/sched.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/percpu.h>
  #include <linux/uaccess.h>
  #include <linux/mm.h>
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c

deleted file mode 100644 (file)

index 5ccf9bc..0000000
--- a/arch/powerpc/kernel/perf_counter.c
+++ /dev/null
@@ -1,1315 +0,0 @@
-/*
- * Performance counter support - powerpc architecture code
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/perf_counter.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/reg.h>
-#include <asm/pmc.h>
-#include <asm/machdep.h>
-#include <asm/firmware.h>
-#include <asm/ptrace.h>
-
-struct cpu_hw_counters {
-       int n_counters;
-       int n_percpu;
-       int disabled;
-       int n_added;
-       int n_limited;
-       u8  pmcs_enabled;
-       struct perf_counter *counter[MAX_HWCOUNTERS];
-       u64 events[MAX_HWCOUNTERS];
-       unsigned int flags[MAX_HWCOUNTERS];
-       unsigned long mmcr[3];
-       struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
-       u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
-       u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-       unsigned long amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-       unsigned long avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
-};
-DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
-
-struct power_pmu *ppmu;
-
-/*
- * Normally, to ignore kernel events we set the FCS (freeze counters
- * in supervisor mode) bit in MMCR0, but if the kernel runs with the
- * hypervisor bit set in the MSR, or if we are running on a processor
- * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
- * then we need to use the FCHV bit to ignore kernel events.
- */
-static unsigned int freeze_counters_kernel = MMCR0_FCS;
-
-/*
- * 32-bit doesn't have MMCRA but does have an MMCR2,
- * and a few other names are different.
- */
-#ifdef CONFIG_PPC32
-
-#define MMCR0_FCHV             0
-#define MMCR0_PMCjCE           MMCR0_PMCnCE
-
-#define SPRN_MMCRA             SPRN_MMCR2
-#define MMCRA_SAMPLE_ENABLE    0
-
-static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
-{
-       return 0;
-}
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
-static inline u32 perf_get_misc_flags(struct pt_regs *regs)
-{
-       return 0;
-}
-static inline void perf_read_regs(struct pt_regs *regs) { }
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-       return 0;
-}
-
-#endif /* CONFIG_PPC32 */
-
-/*
- * Things that are specific to 64-bit implementations.
- */
-#ifdef CONFIG_PPC64
-
-static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
-{
-       unsigned long mmcra = regs->dsisr;
-
-       if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
-               unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
-               if (slot > 1)
-                       return 4 * (slot - 1);
-       }
-       return 0;
-}
-
-/*
- * The user wants a data address recorded.
- * If we're not doing instruction sampling, give them the SDAR
- * (sampled data address).  If we are doing instruction sampling, then
- * only give them the SDAR if it corresponds to the instruction
- * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
- * bit in MMCRA.
- */
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
-{
-       unsigned long mmcra = regs->dsisr;
-       unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
-               POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
-
-       if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
-               *addrp = mfspr(SPRN_SDAR);
-}
-
-static inline u32 perf_get_misc_flags(struct pt_regs *regs)
-{
-       unsigned long mmcra = regs->dsisr;
-
-       if (TRAP(regs) != 0xf00)
-               return 0;       /* not a PMU interrupt */
-
-       if (ppmu->flags & PPMU_ALT_SIPR) {
-               if (mmcra & POWER6_MMCRA_SIHV)
-                       return PERF_EVENT_MISC_HYPERVISOR;
-               return (mmcra & POWER6_MMCRA_SIPR) ?
-                       PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
-       }
-       if (mmcra & MMCRA_SIHV)
-               return PERF_EVENT_MISC_HYPERVISOR;
-       return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
-               PERF_EVENT_MISC_KERNEL;
-}
-
-/*
- * Overload regs->dsisr to store MMCRA so we only need to read it once
- * on each interrupt.
- */
-static inline void perf_read_regs(struct pt_regs *regs)
-{
-       regs->dsisr = mfspr(SPRN_MMCRA);
-}
-
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-       return !regs->softe;
-}
-
-#endif /* CONFIG_PPC64 */
-
-static void perf_counter_interrupt(struct pt_regs *regs);
-
-void perf_counter_print_debug(void)
-{
-}
-
-/*
- * Read one performance monitor counter (PMC).
- */
-static unsigned long read_pmc(int idx)
-{
-       unsigned long val;
-
-       switch (idx) {
-       case 1:
-               val = mfspr(SPRN_PMC1);
-               break;
-       case 2:
-               val = mfspr(SPRN_PMC2);
-               break;
-       case 3:
-               val = mfspr(SPRN_PMC3);
-               break;
-       case 4:
-               val = mfspr(SPRN_PMC4);
-               break;
-       case 5:
-               val = mfspr(SPRN_PMC5);
-               break;
-       case 6:
-               val = mfspr(SPRN_PMC6);
-               break;
-#ifdef CONFIG_PPC64
-       case 7:
-               val = mfspr(SPRN_PMC7);
-               break;
-       case 8:
-               val = mfspr(SPRN_PMC8);
-               break;
-#endif /* CONFIG_PPC64 */
-       default:
-               printk(KERN_ERR "oops trying to read PMC%d\n", idx);
-               val = 0;
-       }
-       return val;
-}
-
-/*
- * Write one PMC.
- */
-static void write_pmc(int idx, unsigned long val)
-{
-       switch (idx) {
-       case 1:
-               mtspr(SPRN_PMC1, val);
-               break;
-       case 2:
-               mtspr(SPRN_PMC2, val);
-               break;
-       case 3:
-               mtspr(SPRN_PMC3, val);
-               break;
-       case 4:
-               mtspr(SPRN_PMC4, val);
-               break;
-       case 5:
-               mtspr(SPRN_PMC5, val);
-               break;
-       case 6:
-               mtspr(SPRN_PMC6, val);
-               break;
-#ifdef CONFIG_PPC64
-       case 7:
-               mtspr(SPRN_PMC7, val);
-               break;
-       case 8:
-               mtspr(SPRN_PMC8, val);
-               break;
-#endif /* CONFIG_PPC64 */
-       default:
-               printk(KERN_ERR "oops trying to write PMC%d\n", idx);
-       }
-}
-
-/*
- * Check if a set of events can all go on the PMU at once.
- * If they can't, this will look at alternative codes for the events
- * and see if any combination of alternative codes is feasible.
- * The feasible set is returned in event[].
- */
-static int power_check_constraints(struct cpu_hw_counters *cpuhw,
-                                  u64 event[], unsigned int cflags[],
-                                  int n_ev)
-{
-       unsigned long mask, value, nv;
-       unsigned long smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
-       int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
-       int i, j;
-       unsigned long addf = ppmu->add_fields;
-       unsigned long tadd = ppmu->test_adder;
-
-       if (n_ev > ppmu->n_counter)
-               return -1;
-
-       /* First see if the events will go on as-is */
-       for (i = 0; i < n_ev; ++i) {
-               if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
-                   && !ppmu->limited_pmc_event(event[i])) {
-                       ppmu->get_alternatives(event[i], cflags[i],
-                                              cpuhw->alternatives[i]);
-                       event[i] = cpuhw->alternatives[i][0];
-               }
-               if (ppmu->get_constraint(event[i], &cpuhw->amasks[i][0],
-                                        &cpuhw->avalues[i][0]))
-                       return -1;
-       }
-       value = mask = 0;
-       for (i = 0; i < n_ev; ++i) {
-               nv = (value | cpuhw->avalues[i][0]) +
-                       (value & cpuhw->avalues[i][0] & addf);
-               if ((((nv + tadd) ^ value) & mask) != 0 ||
-                   (((nv + tadd) ^ cpuhw->avalues[i][0]) &
-                    cpuhw->amasks[i][0]) != 0)
-                       break;
-               value = nv;
-               mask |= cpuhw->amasks[i][0];
-       }
-       if (i == n_ev)
-               return 0;       /* all OK */
-
-       /* doesn't work, gather alternatives... */
-       if (!ppmu->get_alternatives)
-               return -1;
-       for (i = 0; i < n_ev; ++i) {
-               choice[i] = 0;
-               n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
-                                                 cpuhw->alternatives[i]);
-               for (j = 1; j < n_alt[i]; ++j)
-                       ppmu->get_constraint(cpuhw->alternatives[i][j],
-                                            &cpuhw->amasks[i][j],
-                                            &cpuhw->avalues[i][j]);
-       }
-
-       /* enumerate all possibilities and see if any will work */
-       i = 0;
-       j = -1;
-       value = mask = nv = 0;
-       while (i < n_ev) {
-               if (j >= 0) {
-                       /* we're backtracking, restore context */
-                       value = svalues[i];
-                       mask = smasks[i];
-                       j = choice[i];
-               }
-               /*
-                * See if any alternative k for event i,
-                * where k > j, will satisfy the constraints.
-                */
-               while (++j < n_alt[i]) {
-                       nv = (value | cpuhw->avalues[i][j]) +
-                               (value & cpuhw->avalues[i][j] & addf);
-                       if ((((nv + tadd) ^ value) & mask) == 0 &&
-                           (((nv + tadd) ^ cpuhw->avalues[i][j])
-                            & cpuhw->amasks[i][j]) == 0)
-                               break;
-               }
-               if (j >= n_alt[i]) {
-                       /*
-                        * No feasible alternative, backtrack
-                        * to event i-1 and continue enumerating its
-                        * alternatives from where we got up to.
-                        */
-                       if (--i < 0)
-                               return -1;
-               } else {
-                       /*
-                        * Found a feasible alternative for event i,
-                        * remember where we got up to with this event,
-                        * go on to the next event, and start with
-                        * the first alternative for it.
-                        */
-                       choice[i] = j;
-                       svalues[i] = value;
-                       smasks[i] = mask;
-                       value = nv;
-                       mask |= cpuhw->amasks[i][j];
-                       ++i;
-                       j = -1;
-               }
-       }
-
-       /* OK, we have a feasible combination, tell the caller the solution */
-       for (i = 0; i < n_ev; ++i)
-               event[i] = cpuhw->alternatives[i][choice[i]];
-       return 0;
-}
-
-/*
- * Check if newly-added counters have consistent settings for
- * exclude_{user,kernel,hv} with each other and any previously
- * added counters.
- */
-static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
-                         int n_prev, int n_new)
-{
-       int eu = 0, ek = 0, eh = 0;
-       int i, n, first;
-       struct perf_counter *counter;
-
-       n = n_prev + n_new;
-       if (n <= 1)
-               return 0;
-
-       first = 1;
-       for (i = 0; i < n; ++i) {
-               if (cflags[i] & PPMU_LIMITED_PMC_OK) {
-                       cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
-                       continue;
-               }
-               counter = ctrs[i];
-               if (first) {
-                       eu = counter->attr.exclude_user;
-                       ek = counter->attr.exclude_kernel;
-                       eh = counter->attr.exclude_hv;
-                       first = 0;
-               } else if (counter->attr.exclude_user != eu ||
-                          counter->attr.exclude_kernel != ek ||
-                          counter->attr.exclude_hv != eh) {
-                       return -EAGAIN;
-               }
-       }
-
-       if (eu || ek || eh)
-               for (i = 0; i < n; ++i)
-                       if (cflags[i] & PPMU_LIMITED_PMC_OK)
-                               cflags[i] |= PPMU_LIMITED_PMC_REQD;
-
-       return 0;
-}
-
-static void power_pmu_read(struct perf_counter *counter)
-{
-       s64 val, delta, prev;
-
-       if (!counter->hw.idx)
-               return;
-       /*
-        * Performance monitor interrupts come even when interrupts
-        * are soft-disabled, as long as interrupts are hard-enabled.
-        * Therefore we treat them like NMIs.
-        */
-       do {
-               prev = atomic64_read(&counter->hw.prev_count);
-               barrier();
-               val = read_pmc(counter->hw.idx);
-       } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
-
-       /* The counters are only 32 bits wide */
-       delta = (val - prev) & 0xfffffffful;
-       atomic64_add(delta, &counter->count);
-       atomic64_sub(delta, &counter->hw.period_left);
-}
-
-/*
- * On some machines, PMC5 and PMC6 can't be written, don't respect
- * the freeze conditions, and don't generate interrupts.  This tells
- * us if `counter' is using such a PMC.
- */
-static int is_limited_pmc(int pmcnum)
-{
-       return (ppmu->flags & PPMU_LIMITED_PMC5_6)
-               && (pmcnum == 5 || pmcnum == 6);
-}
-
-static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
-                                   unsigned long pmc5, unsigned long pmc6)
-{
-       struct perf_counter *counter;
-       u64 val, prev, delta;
-       int i;
-
-       for (i = 0; i < cpuhw->n_limited; ++i) {
-               counter = cpuhw->limited_counter[i];
-               if (!counter->hw.idx)
-                       continue;
-               val = (counter->hw.idx == 5) ? pmc5 : pmc6;
-               prev = atomic64_read(&counter->hw.prev_count);
-               counter->hw.idx = 0;
-               delta = (val - prev) & 0xfffffffful;
-               atomic64_add(delta, &counter->count);
-       }
-}
-
-static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
-                                 unsigned long pmc5, unsigned long pmc6)
-{
-       struct perf_counter *counter;
-       u64 val;
-       int i;
-
-       for (i = 0; i < cpuhw->n_limited; ++i) {
-               counter = cpuhw->limited_counter[i];
-               counter->hw.idx = cpuhw->limited_hwidx[i];
-               val = (counter->hw.idx == 5) ? pmc5 : pmc6;
-               atomic64_set(&counter->hw.prev_count, val);
-               perf_counter_update_userpage(counter);
-       }
-}
-
-/*
- * Since limited counters don't respect the freeze conditions, we
- * have to read them immediately after freezing or unfreezing the
- * other counters.  We try to keep the values from the limited
- * counters as consistent as possible by keeping the delay (in
- * cycles and instructions) between freezing/unfreezing and reading
- * the limited counters as small and consistent as possible.
- * Therefore, if any limited counters are in use, we read them
- * both, and always in the same order, to minimize variability,
- * and do it inside the same asm that writes MMCR0.
- */
-static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
-{
-       unsigned long pmc5, pmc6;
-
-       if (!cpuhw->n_limited) {
-               mtspr(SPRN_MMCR0, mmcr0);
-               return;
-       }
-
-       /*
-        * Write MMCR0, then read PMC5 and PMC6 immediately.
-        * To ensure we don't get a performance monitor interrupt
-        * between writing MMCR0 and freezing/thawing the limited
-        * counters, we first write MMCR0 with the counter overflow
-        * interrupt enable bits turned off.
-        */
-       asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
-                    : "=&r" (pmc5), "=&r" (pmc6)
-                    : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
-                      "i" (SPRN_MMCR0),
-                      "i" (SPRN_PMC5), "i" (SPRN_PMC6));
-
-       if (mmcr0 & MMCR0_FC)
-               freeze_limited_counters(cpuhw, pmc5, pmc6);
-       else
-               thaw_limited_counters(cpuhw, pmc5, pmc6);
-
-       /*
-        * Write the full MMCR0 including the counter overflow interrupt
-        * enable bits, if necessary.
-        */
-       if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
-               mtspr(SPRN_MMCR0, mmcr0);
-}
-
-/*
- * Disable all counters to prevent PMU interrupts and to allow
- * counters to be added or removed.
- */
-void hw_perf_disable(void)
-{
-       struct cpu_hw_counters *cpuhw;
-       unsigned long flags;
-
-       if (!ppmu)
-               return;
-       local_irq_save(flags);
-       cpuhw = &__get_cpu_var(cpu_hw_counters);
-
-       if (!cpuhw->disabled) {
-               cpuhw->disabled = 1;
-               cpuhw->n_added = 0;
-
-               /*
-                * Check if we ever enabled the PMU on this cpu.
-                */
-               if (!cpuhw->pmcs_enabled) {
-                       ppc_enable_pmcs();
-                       cpuhw->pmcs_enabled = 1;
-               }
-
-               /*
-                * Disable instruction sampling if it was enabled
-                */
-               if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
-                       mtspr(SPRN_MMCRA,
-                             cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-                       mb();
-               }
-
-               /*
-                * Set the 'freeze counters' bit.
-                * The barrier is to make sure the mtspr has been
-                * executed and the PMU has frozen the counters
-                * before we return.
-                */
-               write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
-               mb();
-       }
-       local_irq_restore(flags);
-}
-
-/*
- * Re-enable all counters if disable == 0.
- * If we were previously disabled and counters were added, then
- * put the new config on the PMU.
- */
-void hw_perf_enable(void)
-{
-       struct perf_counter *counter;
-       struct cpu_hw_counters *cpuhw;
-       unsigned long flags;
-       long i;
-       unsigned long val;
-       s64 left;
-       unsigned int hwc_index[MAX_HWCOUNTERS];
-       int n_lim;
-       int idx;
-
-       if (!ppmu)
-               return;
-       local_irq_save(flags);
-       cpuhw = &__get_cpu_var(cpu_hw_counters);
-       if (!cpuhw->disabled) {
-               local_irq_restore(flags);
-               return;
-       }
-       cpuhw->disabled = 0;
-
-       /*
-        * If we didn't change anything, or only removed counters,
-        * no need to recalculate MMCR* settings and reset the PMCs.
-        * Just reenable the PMU with the current MMCR* settings
-        * (possibly updated for removal of counters).
-        */
-       if (!cpuhw->n_added) {
-               mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-               mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-               if (cpuhw->n_counters == 0)
-                       ppc_set_pmu_inuse(0);
-               goto out_enable;
-       }
-
-       /*
-        * Compute MMCR* values for the new set of counters
-        */
-       if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
-                              cpuhw->mmcr)) {
-               /* shouldn't ever get here */
-               printk(KERN_ERR "oops compute_mmcr failed\n");
-               goto out;
-       }
-
-       /*
-        * Add in MMCR0 freeze bits corresponding to the
-        * attr.exclude_* bits for the first counter.
-        * We have already checked that all counters have the
-        * same values for these bits as the first counter.
-        */
-       counter = cpuhw->counter[0];
-       if (counter->attr.exclude_user)
-               cpuhw->mmcr[0] |= MMCR0_FCP;
-       if (counter->attr.exclude_kernel)
-               cpuhw->mmcr[0] |= freeze_counters_kernel;
-       if (counter->attr.exclude_hv)
-               cpuhw->mmcr[0] |= MMCR0_FCHV;
-
-       /*
-        * Write the new configuration to MMCR* with the freeze
-        * bit set and set the hardware counters to their initial values.
-        * Then unfreeze the counters.
-        */
-       ppc_set_pmu_inuse(1);
-       mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
-       mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-       mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
-                               | MMCR0_FC);
-
-       /*
-        * Read off any pre-existing counters that need to move
-        * to another PMC.
-        */
-       for (i = 0; i < cpuhw->n_counters; ++i) {
-               counter = cpuhw->counter[i];
-               if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
-                       power_pmu_read(counter);
-                       write_pmc(counter->hw.idx, 0);
-                       counter->hw.idx = 0;
-               }
-       }
-
-       /*
-        * Initialize the PMCs for all the new and moved counters.
-        */
-       cpuhw->n_limited = n_lim = 0;
-       for (i = 0; i < cpuhw->n_counters; ++i) {
-               counter = cpuhw->counter[i];
-               if (counter->hw.idx)
-                       continue;
-               idx = hwc_index[i] + 1;
-               if (is_limited_pmc(idx)) {
-                       cpuhw->limited_counter[n_lim] = counter;
-                       cpuhw->limited_hwidx[n_lim] = idx;
-                       ++n_lim;
-                       continue;
-               }
-               val = 0;
-               if (counter->hw.sample_period) {
-                       left = atomic64_read(&counter->hw.period_left);
-                       if (left < 0x80000000L)
-                               val = 0x80000000L - left;
-               }
-               atomic64_set(&counter->hw.prev_count, val);
-               counter->hw.idx = idx;
-               write_pmc(idx, val);
-               perf_counter_update_userpage(counter);
-       }
-       cpuhw->n_limited = n_lim;
-       cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
-
- out_enable:
-       mb();
-       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
-
-       /*
-        * Enable instruction sampling if necessary
-        */
-       if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
-               mb();
-               mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
-       }
-
- out:
-       local_irq_restore(flags);
-}
-
-static int collect_events(struct perf_counter *group, int max_count,
-                         struct perf_counter *ctrs[], u64 *events,
-                         unsigned int *flags)
-{
-       int n = 0;
-       struct perf_counter *counter;
-
-       if (!is_software_counter(group)) {
-               if (n >= max_count)
-                       return -1;
-               ctrs[n] = group;
-               flags[n] = group->hw.counter_base;
-               events[n++] = group->hw.config;
-       }
-       list_for_each_entry(counter, &group->sibling_list, list_entry) {
-               if (!is_software_counter(counter) &&
-                   counter->state != PERF_COUNTER_STATE_OFF) {
-                       if (n >= max_count)
-                               return -1;
-                       ctrs[n] = counter;
-                       flags[n] = counter->hw.counter_base;
-                       events[n++] = counter->hw.config;
-               }
-       }
-       return n;
-}
-
-static void counter_sched_in(struct perf_counter *counter, int cpu)
-{
-       counter->state = PERF_COUNTER_STATE_ACTIVE;
-       counter->oncpu = cpu;
-       counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
-       if (is_software_counter(counter))
-               counter->pmu->enable(counter);
-}
-
-/*
- * Called to enable a whole group of counters.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- */
-int hw_perf_group_sched_in(struct perf_counter *group_leader,
-              struct perf_cpu_context *cpuctx,
-              struct perf_counter_context *ctx, int cpu)
-{
-       struct cpu_hw_counters *cpuhw;
-       long i, n, n0;
-       struct perf_counter *sub;
-
-       if (!ppmu)
-               return 0;
-       cpuhw = &__get_cpu_var(cpu_hw_counters);
-       n0 = cpuhw->n_counters;
-       n = collect_events(group_leader, ppmu->n_counter - n0,
-                          &cpuhw->counter[n0], &cpuhw->events[n0],
-                          &cpuhw->flags[n0]);
-       if (n < 0)
-               return -EAGAIN;
-       if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
-               return -EAGAIN;
-       i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0);
-       if (i < 0)
-               return -EAGAIN;
-       cpuhw->n_counters = n0 + n;
-       cpuhw->n_added += n;
-
-       /*
-        * OK, this group can go on; update counter states etc.,
-        * and enable any software counters
-        */
-       for (i = n0; i < n0 + n; ++i)
-               cpuhw->counter[i]->hw.config = cpuhw->events[i];
-       cpuctx->active_oncpu += n;
-       n = 1;
-       counter_sched_in(group_leader, cpu);
-       list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
-               if (sub->state != PERF_COUNTER_STATE_OFF) {
-                       counter_sched_in(sub, cpu);
-                       ++n;
-               }
-       }
-       ctx->nr_active += n;
-
-       return 1;
-}
-
-/*
- * Add a counter to the PMU.
- * If all counters are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_enable to do the
- * actual work of reconfiguring the PMU.
- */
-static int power_pmu_enable(struct perf_counter *counter)
-{
-       struct cpu_hw_counters *cpuhw;
-       unsigned long flags;
-       int n0;
-       int ret = -EAGAIN;
-
-       local_irq_save(flags);
-       perf_disable();
-
-       /*
-        * Add the counter to the list (if there is room)
-        * and check whether the total set is still feasible.
-        */
-       cpuhw = &__get_cpu_var(cpu_hw_counters);
-       n0 = cpuhw->n_counters;
-       if (n0 >= ppmu->n_counter)
-               goto out;
-       cpuhw->counter[n0] = counter;
-       cpuhw->events[n0] = counter->hw.config;
-       cpuhw->flags[n0] = counter->hw.counter_base;
-       if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
-               goto out;
-       if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
-               goto out;
-
-       counter->hw.config = cpuhw->events[n0];
-       ++cpuhw->n_counters;
-       ++cpuhw->n_added;
-
-       ret = 0;
- out:
-       perf_enable();
-       local_irq_restore(flags);
-       return ret;
-}
-
-/*
- * Remove a counter from the PMU.
- */
-static void power_pmu_disable(struct perf_counter *counter)
-{
-       struct cpu_hw_counters *cpuhw;
-       long i;
-       unsigned long flags;
-
-       local_irq_save(flags);
-       perf_disable();
-
-       power_pmu_read(counter);
-
-       cpuhw = &__get_cpu_var(cpu_hw_counters);
-       for (i = 0; i < cpuhw->n_counters; ++i) {
-               if (counter == cpuhw->counter[i]) {
-                       while (++i < cpuhw->n_counters)
-                               cpuhw->counter[i-1] = cpuhw->counter[i];
-                       --cpuhw->n_counters;
-                       ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
-                       if (counter->hw.idx) {
-                               write_pmc(counter->hw.idx, 0);
-                               counter->hw.idx = 0;
-                       }
-                       perf_counter_update_userpage(counter);
-                       break;
-               }
-       }
-       for (i = 0; i < cpuhw->n_limited; ++i)
-               if (counter == cpuhw->limited_counter[i])
-                       break;
-       if (i < cpuhw->n_limited) {
-               while (++i < cpuhw->n_limited) {
-                       cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
-                       cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
-               }
-               --cpuhw->n_limited;
-       }
-       if (cpuhw->n_counters == 0) {
-               /* disable exceptions if no counters are running */
-               cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
-       }
-
-       perf_enable();
-       local_irq_restore(flags);
-}
-
-/*
- * Re-enable interrupts on a counter after they were throttled
- * because they were coming too fast.
- */
-static void power_pmu_unthrottle(struct perf_counter *counter)
-{
-       s64 val, left;
-       unsigned long flags;
-
-       if (!counter->hw.idx || !counter->hw.sample_period)
-               return;
-       local_irq_save(flags);
-       perf_disable();
-       power_pmu_read(counter);
-       left = counter->hw.sample_period;
-       counter->hw.last_period = left;
-       val = 0;
-       if (left < 0x80000000L)
-               val = 0x80000000L - left;
-       write_pmc(counter->hw.idx, val);
-       atomic64_set(&counter->hw.prev_count, val);
-       atomic64_set(&counter->hw.period_left, left);
-       perf_counter_update_userpage(counter);
-       perf_enable();
-       local_irq_restore(flags);
-}
-
-struct pmu power_pmu = {
-       .enable         = power_pmu_enable,
-       .disable        = power_pmu_disable,
-       .read           = power_pmu_read,
-       .unthrottle     = power_pmu_unthrottle,
-};
-
-/*
- * Return 1 if we might be able to put counter on a limited PMC,
- * or 0 if not.
- * A counter can only go on a limited PMC if it counts something
- * that a limited PMC can count, doesn't require interrupts, and
- * doesn't exclude any processor mode.
- */
-static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
-                                unsigned int flags)
-{
-       int n;
-       u64 alt[MAX_EVENT_ALTERNATIVES];
-
-       if (counter->attr.exclude_user
-           || counter->attr.exclude_kernel
-           || counter->attr.exclude_hv
-           || counter->attr.sample_period)
-               return 0;
-
-       if (ppmu->limited_pmc_event(ev))
-               return 1;
-
-       /*
-        * The requested event isn't on a limited PMC already;
-        * see if any alternative code goes on a limited PMC.
-        */
-       if (!ppmu->get_alternatives)
-               return 0;
-
-       flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
-       n = ppmu->get_alternatives(ev, flags, alt);
-
-       return n > 0;
-}
-
-/*
- * Find an alternative event that goes on a normal PMC, if possible,
- * and return the event code, or 0 if there is no such alternative.
- * (Note: event code 0 is "don't count" on all machines.)
- */
-static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
-{
-       u64 alt[MAX_EVENT_ALTERNATIVES];
-       int n;
-
-       flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
-       n = ppmu->get_alternatives(ev, flags, alt);
-       if (!n)
-               return 0;
-       return alt[0];
-}
-
-/* Number of perf_counters counting hardware events */
-static atomic_t num_counters;
-/* Used to avoid races in calling reserve/release_pmc_hardware */
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-/*
- * Release the PMU if this is the last perf_counter.
- */
-static void hw_perf_counter_destroy(struct perf_counter *counter)
-{
-       if (!atomic_add_unless(&num_counters, -1, 1)) {
-               mutex_lock(&pmc_reserve_mutex);
-               if (atomic_dec_return(&num_counters) == 0)
-                       release_pmc_hardware();
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-}
-
-/*
- * Translate a generic cache event config to a raw event code.
- */
-static int hw_perf_cache_event(u64 config, u64 *eventp)
-{
-       unsigned long type, op, result;
-       int ev;
-
-       if (!ppmu->cache_events)
-               return -EINVAL;
-
-       /* unpack config */
-       type = config & 0xff;
-       op = (config >> 8) & 0xff;
-       result = (config >> 16) & 0xff;
-
-       if (type >= PERF_COUNT_HW_CACHE_MAX ||
-           op >= PERF_COUNT_HW_CACHE_OP_MAX ||
-           result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-               return -EINVAL;
-
-       ev = (*ppmu->cache_events)[type][op][result];
-       if (ev == 0)
-               return -EOPNOTSUPP;
-       if (ev == -1)
-               return -EINVAL;
-       *eventp = ev;
-       return 0;
-}
-
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-       u64 ev;
-       unsigned long flags;
-       struct perf_counter *ctrs[MAX_HWCOUNTERS];
-       u64 events[MAX_HWCOUNTERS];
-       unsigned int cflags[MAX_HWCOUNTERS];
-       int n;
-       int err;
-       struct cpu_hw_counters *cpuhw;
-
-       if (!ppmu)
-               return ERR_PTR(-ENXIO);
-       switch (counter->attr.type) {
-       case PERF_TYPE_HARDWARE:
-               ev = counter->attr.config;
-               if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-                       return ERR_PTR(-EOPNOTSUPP);
-               ev = ppmu->generic_events[ev];
-               break;
-       case PERF_TYPE_HW_CACHE:
-               err = hw_perf_cache_event(counter->attr.config, &ev);
-               if (err)
-                       return ERR_PTR(err);
-               break;
-       case PERF_TYPE_RAW:
-               ev = counter->attr.config;
-               break;
-       default:
-               return ERR_PTR(-EINVAL);
-       }
-       counter->hw.config_base = ev;
-       counter->hw.idx = 0;
-
-       /*
-        * If we are not running on a hypervisor, force the
-        * exclude_hv bit to 0 so that we don't care what
-        * the user set it to.
-        */
-       if (!firmware_has_feature(FW_FEATURE_LPAR))
-               counter->attr.exclude_hv = 0;
-
-       /*
-        * If this is a per-task counter, then we can use
-        * PM_RUN_* events interchangeably with their non RUN_*
-        * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
-        * XXX we should check if the task is an idle task.
-        */
-       flags = 0;
-       if (counter->ctx->task)
-               flags |= PPMU_ONLY_COUNT_RUN;
-
-       /*
-        * If this machine has limited counters, check whether this
-        * event could go on a limited counter.
-        */
-       if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
-               if (can_go_on_limited_pmc(counter, ev, flags)) {
-                       flags |= PPMU_LIMITED_PMC_OK;
-               } else if (ppmu->limited_pmc_event(ev)) {
-                       /*
-                        * The requested event is on a limited PMC,
-                        * but we can't use a limited PMC; see if any
-                        * alternative goes on a normal PMC.
-                        */
-                       ev = normal_pmc_alternative(ev, flags);
-                       if (!ev)
-                               return ERR_PTR(-EINVAL);
-               }
-       }
-
-       /*
-        * If this is in a group, check if it can go on with all the
-        * other hardware counters in the group.  We assume the counter
-        * hasn't been linked into its leader's sibling list at this point.
-        */
-       n = 0;
-       if (counter->group_leader != counter) {
-               n = collect_events(counter->group_leader, ppmu->n_counter - 1,
-                                  ctrs, events, cflags);
-               if (n < 0)
-                       return ERR_PTR(-EINVAL);
-       }
-       events[n] = ev;
-       ctrs[n] = counter;
-       cflags[n] = flags;
-       if (check_excludes(ctrs, cflags, n, 1))
-               return ERR_PTR(-EINVAL);
-
-       cpuhw = &get_cpu_var(cpu_hw_counters);
-       err = power_check_constraints(cpuhw, events, cflags, n + 1);
-       put_cpu_var(cpu_hw_counters);
-       if (err)
-               return ERR_PTR(-EINVAL);
-
-       counter->hw.config = events[n];
-       counter->hw.counter_base = cflags[n];
-       counter->hw.last_period = counter->hw.sample_period;
-       atomic64_set(&counter->hw.period_left, counter->hw.last_period);
-
-       /*
-        * See if we need to reserve the PMU.
-        * If no counters are currently in use, then we have to take a
-        * mutex to ensure that we don't race with another task doing
-        * reserve_pmc_hardware or release_pmc_hardware.
-        */
-       err = 0;
-       if (!atomic_inc_not_zero(&num_counters)) {
-               mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&num_counters) == 0 &&
-                   reserve_pmc_hardware(perf_counter_interrupt))
-                       err = -EBUSY;
-               else
-                       atomic_inc(&num_counters);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-       counter->destroy = hw_perf_counter_destroy;
-
-       if (err)
-               return ERR_PTR(err);
-       return &power_pmu;
-}
-
-/*
- * A counter has overflowed; update its count and record
- * things if requested.  Note that interrupts are hard-disabled
- * here so there is no possibility of being interrupted.
- */
-static void record_and_restart(struct perf_counter *counter, unsigned long val,
-                              struct pt_regs *regs, int nmi)
-{
-       u64 period = counter->hw.sample_period;
-       s64 prev, delta, left;
-       int record = 0;
-
-       /* we don't have to worry about interrupts here */
-       prev = atomic64_read(&counter->hw.prev_count);
-       delta = (val - prev) & 0xfffffffful;
-       atomic64_add(delta, &counter->count);
-
-       /*
-        * See if the total period for this counter has expired,
-        * and update for the next period.
-        */
-       val = 0;
-       left = atomic64_read(&counter->hw.period_left) - delta;
-       if (period) {
-               if (left <= 0) {
-                       left += period;
-                       if (left <= 0)
-                               left = period;
-                       record = 1;
-               }
-               if (left < 0x80000000LL)
-                       val = 0x80000000LL - left;
-       }
-
-       /*
-        * Finally record data if requested.
-        */
-       if (record) {
-               struct perf_sample_data data = {
-                       .addr   = 0,
-                       .period = counter->hw.last_period,
-               };
-
-               if (counter->attr.sample_type & PERF_SAMPLE_ADDR)
-                       perf_get_data_addr(regs, &data.addr);
-
-               if (perf_counter_overflow(counter, nmi, &data, regs)) {
-                       /*
-                        * Interrupts are coming too fast - throttle them
-                        * by setting the counter to 0, so it will be
-                        * at least 2^30 cycles until the next interrupt
-                        * (assuming each counter counts at most 2 counts
-                        * per cycle).
-                        */
-                       val = 0;
-                       left = ~0ULL >> 1;
-               }
-       }
-
-       write_pmc(counter->hw.idx, val);
-       atomic64_set(&counter->hw.prev_count, val);
-       atomic64_set(&counter->hw.period_left, left);
-       perf_counter_update_userpage(counter);
-}
-
-/*
- * Called from generic code to get the misc flags (i.e. processor mode)
- * for an event.
- */
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
-       u32 flags = perf_get_misc_flags(regs);
-
-       if (flags)
-               return flags;
-       return user_mode(regs) ? PERF_EVENT_MISC_USER :
-               PERF_EVENT_MISC_KERNEL;
-}
-
-/*
- * Called from generic code to get the instruction pointer
- * for an event.
- */
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
-       unsigned long ip;
-
-       if (TRAP(regs) != 0xf00)
-               return regs->nip;       /* not a PMU interrupt */
-
-       ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
-       return ip;
-}
-
-/*
- * Performance monitor interrupt stuff
- */
-static void perf_counter_interrupt(struct pt_regs *regs)
-{
-       int i;
-       struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-       struct perf_counter *counter;
-       unsigned long val;
-       int found = 0;
-       int nmi;
-
-       if (cpuhw->n_limited)
-               freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
-                                       mfspr(SPRN_PMC6));
-
-       perf_read_regs(regs);
-
-       nmi = perf_intr_is_nmi(regs);
-       if (nmi)
-               nmi_enter();
-       else
-               irq_enter();
-
-       for (i = 0; i < cpuhw->n_counters; ++i) {
-               counter = cpuhw->counter[i];
-               if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
-                       continue;
-               val = read_pmc(counter->hw.idx);
-               if ((int)val < 0) {
-                       /* counter has overflowed */
-                       found = 1;
-                       record_and_restart(counter, val, regs, nmi);
-               }
-       }
-
-       /*
-        * In case we didn't find and reset the counter that caused
-        * the interrupt, scan all counters and reset any that are
-        * negative, to avoid getting continual interrupts.
-        * Any that we processed in the previous loop will not be negative.
-        */
-       if (!found) {
-               for (i = 0; i < ppmu->n_counter; ++i) {
-                       if (is_limited_pmc(i + 1))
-                               continue;
-                       val = read_pmc(i + 1);
-                       if ((int)val < 0)
-                               write_pmc(i + 1, 0);
-               }
-       }
-
-       /*
-        * Reset MMCR0 to its normal value.  This will set PMXE and
-        * clear FC (freeze counters) and PMAO (perf mon alert occurred)
-        * and thus allow interrupts to occur again.
-        * XXX might want to use MSR.PM to keep the counters frozen until
-        * we get back out of this interrupt.
-        */
-       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
-
-       if (nmi)
-               nmi_exit();
-       else
-               irq_exit();
-}
-
-void hw_perf_counter_setup(int cpu)
-{
-       struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
-
-       if (!ppmu)
-               return;
-       memset(cpuhw, 0, sizeof(*cpuhw));
-       cpuhw->mmcr[0] = MMCR0_FC;
-}
-
-int register_power_pmu(struct power_pmu *pmu)
-{
-       if (ppmu)
-               return -EBUSY;          /* something's already registered */
-
-       ppmu = pmu;
-       pr_info("%s performance monitor hardware support registered\n",
-               pmu->name);
-
-#ifdef MSR_HV
-       /*
-        * Use FCHV to ignore kernel events if MSR.HV is set.
-        */
-       if (mfmsr() & MSR_HV)
-               freeze_counters_kernel = MMCR0_FCHV;
-#endif /* CONFIG_PPC64 */
-
-       return 0;
-}
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c

new file mode 100644 (file)

index 0000000..c98321f
--- /dev/null
+++ b/arch/powerpc/kernel/perf_event.c
@@ -0,0 +1,1315 @@
+/*
+ * Performance event support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/ptrace.h>
+
+struct cpu_hw_events {
+       int n_events;
+       int n_percpu;
+       int disabled;
+       int n_added;
+       int n_limited;
+       u8  pmcs_enabled;
+       struct perf_event *event[MAX_HWEVENTS];
+       u64 events[MAX_HWEVENTS];
+       unsigned int flags[MAX_HWEVENTS];
+       unsigned long mmcr[3];
+       struct perf_event *limited_event[MAX_LIMITED_HWEVENTS];
+       u8  limited_hwidx[MAX_LIMITED_HWEVENTS];
+       u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
+       unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
+       unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
+};
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+struct power_pmu *ppmu;
+
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_events_kernel = MMCR0_FCS;
+
+/*
+ * 32-bit doesn't have MMCRA but does have an MMCR2,
+ * and a few other names are different.
+ */
+#ifdef CONFIG_PPC32
+
+#define MMCR0_FCHV             0
+#define MMCR0_PMCjCE           MMCR0_PMCnCE
+
+#define SPRN_MMCRA             SPRN_MMCR2
+#define MMCRA_SAMPLE_ENABLE    0
+
+static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
+{
+       return 0;
+}
+static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
+static inline u32 perf_get_misc_flags(struct pt_regs *regs)
+{
+       return 0;
+}
+static inline void perf_read_regs(struct pt_regs *regs) { }
+static inline int perf_intr_is_nmi(struct pt_regs *regs)
+{
+       return 0;
+}
+
+#endif /* CONFIG_PPC32 */
+
+/*
+ * Things that are specific to 64-bit implementations.
+ */
+#ifdef CONFIG_PPC64
+
+static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
+{
+       unsigned long mmcra = regs->dsisr;
+
+       if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+               unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+               if (slot > 1)
+                       return 4 * (slot - 1);
+       }
+       return 0;
+}
+
+/*
+ * The user wants a data address recorded.
+ * If we're not doing instruction sampling, give them the SDAR
+ * (sampled data address).  If we are doing instruction sampling, then
+ * only give them the SDAR if it corresponds to the instruction
+ * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
+ * bit in MMCRA.
+ */
+static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
+{
+       unsigned long mmcra = regs->dsisr;
+       unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+               POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+
+       if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+               *addrp = mfspr(SPRN_SDAR);
+}
+
+static inline u32 perf_get_misc_flags(struct pt_regs *regs)
+{
+       unsigned long mmcra = regs->dsisr;
+
+       if (TRAP(regs) != 0xf00)
+               return 0;       /* not a PMU interrupt */
+
+       if (ppmu->flags & PPMU_ALT_SIPR) {
+               if (mmcra & POWER6_MMCRA_SIHV)
+                       return PERF_RECORD_MISC_HYPERVISOR;
+               return (mmcra & POWER6_MMCRA_SIPR) ?
+                       PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL;
+       }
+       if (mmcra & MMCRA_SIHV)
+               return PERF_RECORD_MISC_HYPERVISOR;
+       return (mmcra & MMCRA_SIPR) ? PERF_RECORD_MISC_USER :
+               PERF_RECORD_MISC_KERNEL;
+}
+
+/*
+ * Overload regs->dsisr to store MMCRA so we only need to read it once
+ * on each interrupt.
+ */
+static inline void perf_read_regs(struct pt_regs *regs)
+{
+       regs->dsisr = mfspr(SPRN_MMCRA);
+}
+
+/*
+ * If interrupts were soft-disabled when a PMU interrupt occurs, treat
+ * it as an NMI.
+ */
+static inline int perf_intr_is_nmi(struct pt_regs *regs)
+{
+       return !regs->softe;
+}
+
+#endif /* CONFIG_PPC64 */
+
+static void perf_event_interrupt(struct pt_regs *regs);
+
+void perf_event_print_debug(void)
+{
+}
+
+/*
+ * Read one performance monitor counter (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+       unsigned long val;
+
+       switch (idx) {
+       case 1:
+               val = mfspr(SPRN_PMC1);
+               break;
+       case 2:
+               val = mfspr(SPRN_PMC2);
+               break;
+       case 3:
+               val = mfspr(SPRN_PMC3);
+               break;
+       case 4:
+               val = mfspr(SPRN_PMC4);
+               break;
+       case 5:
+               val = mfspr(SPRN_PMC5);
+               break;
+       case 6:
+               val = mfspr(SPRN_PMC6);
+               break;
+#ifdef CONFIG_PPC64
+       case 7:
+               val = mfspr(SPRN_PMC7);
+               break;
+       case 8:
+               val = mfspr(SPRN_PMC8);
+               break;
+#endif /* CONFIG_PPC64 */
+       default:
+               printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+               val = 0;
+       }
+       return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+       switch (idx) {
+       case 1:
+               mtspr(SPRN_PMC1, val);
+               break;
+       case 2:
+               mtspr(SPRN_PMC2, val);
+               break;
+       case 3:
+               mtspr(SPRN_PMC3, val);
+               break;
+       case 4:
+               mtspr(SPRN_PMC4, val);
+               break;
+       case 5:
+               mtspr(SPRN_PMC5, val);
+               break;
+       case 6:
+               mtspr(SPRN_PMC6, val);
+               break;
+#ifdef CONFIG_PPC64
+       case 7:
+               mtspr(SPRN_PMC7, val);
+               break;
+       case 8:
+               mtspr(SPRN_PMC8, val);
+               break;
+#endif /* CONFIG_PPC64 */
+       default:
+               printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+       }
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event_id[].
+ */
+static int power_check_constraints(struct cpu_hw_events *cpuhw,
+                                  u64 event_id[], unsigned int cflags[],
+                                  int n_ev)
+{
+       unsigned long mask, value, nv;
+       unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
+       int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
+       int i, j;
+       unsigned long addf = ppmu->add_fields;
+       unsigned long tadd = ppmu->test_adder;
+
+       if (n_ev > ppmu->n_event)
+               return -1;
+
+       /* First see if the events will go on as-is */
+       for (i = 0; i < n_ev; ++i) {
+               if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+                   && !ppmu->limited_pmc_event(event_id[i])) {
+                       ppmu->get_alternatives(event_id[i], cflags[i],
+                                              cpuhw->alternatives[i]);
+                       event_id[i] = cpuhw->alternatives[i][0];
+               }
+               if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
+                                        &cpuhw->avalues[i][0]))
+                       return -1;
+       }
+       value = mask = 0;
+       for (i = 0; i < n_ev; ++i) {
+               nv = (value | cpuhw->avalues[i][0]) +
+                       (value & cpuhw->avalues[i][0] & addf);
+               if ((((nv + tadd) ^ value) & mask) != 0 ||
+                   (((nv + tadd) ^ cpuhw->avalues[i][0]) &
+                    cpuhw->amasks[i][0]) != 0)
+                       break;
+               value = nv;
+               mask |= cpuhw->amasks[i][0];
+       }
+       if (i == n_ev)
+               return 0;       /* all OK */
+
+       /* doesn't work, gather alternatives... */
+       if (!ppmu->get_alternatives)
+               return -1;
+       for (i = 0; i < n_ev; ++i) {
+               choice[i] = 0;
+               n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
+                                                 cpuhw->alternatives[i]);
+               for (j = 1; j < n_alt[i]; ++j)
+                       ppmu->get_constraint(cpuhw->alternatives[i][j],
+                                            &cpuhw->amasks[i][j],
+                                            &cpuhw->avalues[i][j]);
+       }
+
+       /* enumerate all possibilities and see if any will work */
+       i = 0;
+       j = -1;
+       value = mask = nv = 0;
+       while (i < n_ev) {
+               if (j >= 0) {
+                       /* we're backtracking, restore context */
+                       value = svalues[i];
+                       mask = smasks[i];
+                       j = choice[i];
+               }
+               /*
+                * See if any alternative k for event i,
+                * where k > j, will satisfy the constraints.
+                */
+               while (++j < n_alt[i]) {
+                       nv = (value | cpuhw->avalues[i][j]) +
+                               (value & cpuhw->avalues[i][j] & addf);
+                       if ((((nv + tadd) ^ value) & mask) == 0 &&
+                           (((nv + tadd) ^ cpuhw->avalues[i][j])
+                            & cpuhw->amasks[i][j]) == 0)
+                               break;
+               }
+               if (j >= n_alt[i]) {
+                       /*
+                        * No feasible alternative; backtrack
+                        * to event i-1 and continue enumerating its
+                        * alternatives from where we got up to.
+                        */
+                       if (--i < 0)
+                               return -1;
+               } else {
+                       /*
+                        * Found a feasible alternative for event i:
+                        * remember where we got up to with this event,
+                        * go on to the next event, and start with
+                        * the first alternative for it.
+                        */
+                       choice[i] = j;
+                       svalues[i] = value;
+                       smasks[i] = mask;
+                       value = nv;
+                       mask |= cpuhw->amasks[i][j];
+                       ++i;
+                       j = -1;
+               }
+       }
+
+       /* OK, we have a feasible combination, tell the caller the solution */
+       for (i = 0; i < n_ev; ++i)
+               event_id[i] = cpuhw->alternatives[i][choice[i]];
+       return 0;
+}
+
+/*
+ * Check if newly-added events have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added events.
+ */
+static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
+                         int n_prev, int n_new)
+{
+       int eu = 0, ek = 0, eh = 0;
+       int i, n, first;
+       struct perf_event *event;
+
+       n = n_prev + n_new;
+       if (n <= 1)
+               return 0;
+
+       first = 1;
+       for (i = 0; i < n; ++i) {
+               if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+                       cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+                       continue;
+               }
+               event = ctrs[i];
+               if (first) {
+                       eu = event->attr.exclude_user;
+                       ek = event->attr.exclude_kernel;
+                       eh = event->attr.exclude_hv;
+                       first = 0;
+               } else if (event->attr.exclude_user != eu ||
+                          event->attr.exclude_kernel != ek ||
+                          event->attr.exclude_hv != eh) {
+                       return -EAGAIN;
+               }
+       }
+
+       if (eu || ek || eh)
+               for (i = 0; i < n; ++i)
+                       if (cflags[i] & PPMU_LIMITED_PMC_OK)
+                               cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
+       return 0;
+}
+
+static void power_pmu_read(struct perf_event *event)
+{
+       s64 val, delta, prev;
+
+       if (!event->hw.idx)
+               return;
+       /*
+        * Performance monitor interrupts come even when interrupts
+        * are soft-disabled, as long as interrupts are hard-enabled.
+        * Therefore we treat them like NMIs.
+        */
+       do {
+               prev = atomic64_read(&event->hw.prev_count);
+               barrier();
+               val = read_pmc(event->hw.idx);
+       } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
+
+       /* The counters (PMCs) are only 32 bits wide */
+       delta = (val - prev) & 0xfffffffful;
+       atomic64_add(delta, &event->count);
+       atomic64_sub(delta, &event->hw.period_left);
+}
+
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us whether PMC number `pmcnum' is such a limited PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+       return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+               && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_events(struct cpu_hw_events *cpuhw,
+                                   unsigned long pmc5, unsigned long pmc6)
+{
+       struct perf_event *event;
+       u64 val, prev, delta;
+       int i;
+
+       for (i = 0; i < cpuhw->n_limited; ++i) {
+               event = cpuhw->limited_event[i];
+               if (!event->hw.idx)
+                       continue;
+               val = (event->hw.idx == 5) ? pmc5 : pmc6;
+               prev = atomic64_read(&event->hw.prev_count);
+               event->hw.idx = 0;
+               delta = (val - prev) & 0xfffffffful;
+               atomic64_add(delta, &event->count);
+       }
+}
+
+static void thaw_limited_events(struct cpu_hw_events *cpuhw,
+                                 unsigned long pmc5, unsigned long pmc6)
+{
+       struct perf_event *event;
+       u64 val;
+       int i;
+
+       for (i = 0; i < cpuhw->n_limited; ++i) {
+               event = cpuhw->limited_event[i];
+               event->hw.idx = cpuhw->limited_hwidx[i];
+               val = (event->hw.idx == 5) ? pmc5 : pmc6;
+               atomic64_set(&event->hw.prev_count, val);
+               perf_event_update_userpage(event);
+       }
+}
+
+/*
+ * Since the limited PMCs don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other PMCs.  We try to keep the values from the limited
+ * PMCs as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited PMCs as small and consistent as possible.
+ * Therefore, if any limited events are in use, we read both
+ * PMC5 and PMC6, always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
+{
+       unsigned long pmc5, pmc6;
+
+       if (!cpuhw->n_limited) {
+               mtspr(SPRN_MMCR0, mmcr0);
+               return;
+       }
+
+       /*
+        * Write MMCR0, then read PMC5 and PMC6 immediately.
+        * To ensure we don't get a performance monitor interrupt
+        * between writing MMCR0 and freezing/thawing the limited
+        * events, we first write MMCR0 with the counter overflow
+        * interrupt enable bits (PMC1CE/PMCjCE) turned off.
+        */
+       asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+                    : "=&r" (pmc5), "=&r" (pmc6)
+                    : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+                      "i" (SPRN_MMCR0),
+                      "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+       if (mmcr0 & MMCR0_FC)
+               freeze_limited_events(cpuhw, pmc5, pmc6);
+       else
+               thaw_limited_events(cpuhw, pmc5, pmc6);
+
+       /*
+        * Write the full MMCR0 including the counter overflow interrupt
+        * enable bits (PMC1CE/PMCjCE), if necessary.
+        */
+       if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+               mtspr(SPRN_MMCR0, mmcr0);
+}
+
+/*
+ * Freeze the PMU on this cpu so that no PMU interrupts are taken
+ * and the set of events can be changed.  Does nothing if no PMU
+ * driver is registered or if the PMU is already marked disabled.
+ */
+void hw_perf_disable(void)
+{
+       struct cpu_hw_events *hw;
+       unsigned long irqflags;
+
+       if (!ppmu)
+               return;
+
+       local_irq_save(irqflags);
+       hw = &__get_cpu_var(cpu_hw_events);
+
+       /* Already disabled: leave the hardware alone. */
+       if (hw->disabled)
+               goto out;
+
+       hw->disabled = 1;
+       hw->n_added = 0;
+
+       /* First use of the PMU on this cpu: turn the PMCs on. */
+       if (!hw->pmcs_enabled) {
+               ppc_enable_pmcs();
+               hw->pmcs_enabled = 1;
+       }
+
+       /* Turn instruction sampling off if it was on. */
+       if (hw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+               mtspr(SPRN_MMCRA, hw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+               mb();
+       }
+
+       /*
+        * Set the 'freeze events' bit.  The barrier makes sure the
+        * mtspr has been executed and the PMU has frozen the events
+        * before we return.
+        */
+       write_mmcr0(hw, mfspr(SPRN_MMCR0) | MMCR0_FC);
+       mb();
+
+ out:
+       local_irq_restore(irqflags);
+}
+
+/*
+ * Re-enable the PMU on this cpu if it is currently marked disabled.
+ * If events were added while it was disabled, recompute the MMCR*
+ * values and put the new configuration on the PMU first; otherwise
+ * just restore the saved MMCR* settings and unfreeze.
+ */
+void hw_perf_enable(void)
+{
+       struct perf_event *event;
+       struct cpu_hw_events *cpuhw;
+       unsigned long flags;
+       long i;
+       unsigned long val;
+       s64 left;
+       unsigned int hwc_index[MAX_HWEVENTS];
+       int n_lim;
+       int idx;
+
+       if (!ppmu)
+               return;
+       local_irq_save(flags);
+       cpuhw = &__get_cpu_var(cpu_hw_events);
+       /* Not disabled, so there is nothing to re-enable. */
+       if (!cpuhw->disabled) {
+               local_irq_restore(flags);
+               return;
+       }
+       cpuhw->disabled = 0;
+
+       /*
+        * If we didn't change anything, or only removed events,
+        * no need to recalculate MMCR* settings and reset the PMCs.
+        * Just reenable the PMU with the current MMCR* settings
+        * (possibly updated for removal of events).
+        */
+       if (!cpuhw->n_added) {
+               mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+               mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+               if (cpuhw->n_events == 0)
+                       ppc_set_pmu_inuse(0);
+               goto out_enable;
+       }
+
+       /*
+        * Compute MMCR* values for the new set of events.
+        * On failure we jump to 'out', which skips the final MMCR0
+        * write, so the PMU is not unfrozen by this call.
+        */
+       if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
+                              cpuhw->mmcr)) {
+               /* shouldn't ever get here */
+               printk(KERN_ERR "oops compute_mmcr failed\n");
+               goto out;
+       }
+
+       /*
+        * Add in MMCR0 freeze bits corresponding to the
+        * attr.exclude_* bits for the first event.
+        * We have already checked that all events have the
+        * same values for these bits as the first event.
+        */
+       event = cpuhw->event[0];
+       if (event->attr.exclude_user)
+               cpuhw->mmcr[0] |= MMCR0_FCP;
+       if (event->attr.exclude_kernel)
+               cpuhw->mmcr[0] |= freeze_events_kernel;
+       if (event->attr.exclude_hv)
+               cpuhw->mmcr[0] |= MMCR0_FCHV;
+
+       /*
+        * Write the new configuration to MMCR* with the freeze
+        * bit set and set the hardware events to their initial values.
+        * Then unfreeze the events.
+        */
+       ppc_set_pmu_inuse(1);
+       mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+       mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+       mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+                               | MMCR0_FC);
+
+       /*
+        * Read off any pre-existing events that need to move
+        * to another PMC.  hw.idx is 1-based (0 means "no PMC
+        * assigned"), whereas hwc_index[] is 0-based, hence the +1.
+        */
+       for (i = 0; i < cpuhw->n_events; ++i) {
+               event = cpuhw->event[i];
+               if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
+                       power_pmu_read(event);
+                       write_pmc(event->hw.idx, 0);
+                       event->hw.idx = 0;
+               }
+       }
+
+       /*
+        * Initialize the PMCs for all the new and moved events.
+        * Events that land on a limited PMC are only recorded in the
+        * limited_event[]/limited_hwidx[] arrays here; their PMC is
+        * not written and their hw.idx is not set in this loop.
+        */
+       cpuhw->n_limited = n_lim = 0;
+       for (i = 0; i < cpuhw->n_events; ++i) {
+               event = cpuhw->event[i];
+               /* Still on the right PMC from before: nothing to do. */
+               if (event->hw.idx)
+                       continue;
+               idx = hwc_index[i] + 1;
+               if (is_limited_pmc(idx)) {
+                       cpuhw->limited_event[n_lim] = event;
+                       cpuhw->limited_hwidx[n_lim] = idx;
+                       ++n_lim;
+                       continue;
+               }
+               /*
+                * For a sampling event, start the PMC at
+                * 0x80000000 - left so that it reaches 0x80000000
+                * after 'left' more counts; otherwise start at 0.
+                */
+               val = 0;
+               if (event->hw.sample_period) {
+                       left = atomic64_read(&event->hw.period_left);
+                       if (left < 0x80000000L)
+                               val = 0x80000000L - left;
+               }
+               atomic64_set(&event->hw.prev_count, val);
+               event->hw.idx = idx;
+               write_pmc(idx, val);
+               perf_event_update_userpage(event);
+       }
+       cpuhw->n_limited = n_lim;
+       cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+       /* Order the PMC/MMCR setup above before the MMCR0 write below. */
+       mb();
+       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+       /*
+        * Enable instruction sampling if necessary
+        */
+       if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+               mb();
+               mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+       }
+
+ out:
+       local_irq_restore(flags);
+}
+
+/*
+ * Gather the hardware events of a group into the parallel arrays
+ * ctrs[] (event pointers), events[] (hw.config values) and flags[]
+ * (hw.event_base values).  The group leader is taken if it is not a
+ * software event; siblings are taken if they are not software events
+ * and are not in PERF_EVENT_STATE_OFF.
+ *
+ * Returns the number of entries stored, or -1 if more than max_count
+ * entries would be needed.
+ */
+static int collect_events(struct perf_event *group, int max_count,
+                         struct perf_event *ctrs[], u64 *events,
+                         unsigned int *flags)
+{
+       struct perf_event *sib;
+       int count = 0;
+
+       /* The leader, if it is a hardware event, always takes slot 0. */
+       if (!is_software_event(group)) {
+               if (max_count <= 0)
+                       return -1;
+               ctrs[0] = group;
+               flags[0] = group->hw.event_base;
+               events[0] = group->hw.config;
+               count = 1;
+       }
+
+       list_for_each_entry(sib, &group->sibling_list, list_entry) {
+               if (is_software_event(sib))
+                       continue;
+               if (sib->state == PERF_EVENT_STATE_OFF)
+                       continue;
+               if (count >= max_count)
+                       return -1;
+               ctrs[count] = sib;
+               flags[count] = sib->hw.event_base;
+               events[count] = sib->hw.config;
+               ++count;
+       }
+
+       return count;
+}
+
+/*
+ * Mark an event as active on the given cpu and update its running
+ * timestamp.  Software events are additionally started through
+ * their pmu's enable hook; hardware events are not touched here.
+ */
+static void event_sched_in(struct perf_event *event, int cpu)
+{
+       event->state = PERF_EVENT_STATE_ACTIVE;
+       event->oncpu = cpu;
+       event->tstamp_running += event->ctx->time - event->tstamp_stopped;
+
+       if (!is_software_event(event))
+               return;
+
+       event->pmu->enable(event);
+}
+
+/*
+ * Called to enable a whole group of events.
+ * Returns 1 if the group was enabled, -EAGAIN if it could not be,
+ * or 0 if no PMU driver is registered.
+ * Assumes the caller has disabled interrupts and has frozen the
+ * PMU (see hw_perf_disable).
+ */
+int hw_perf_group_sched_in(struct perf_event *group_leader,
+              struct perf_cpu_context *cpuctx,
+              struct perf_event_context *ctx, int cpu)
+{
+       struct cpu_hw_events *hw;
+       struct perf_event *sib;
+       long base, n_hw, n_sched, k;
+
+       if (!ppmu)
+               return 0;
+
+       hw = &__get_cpu_var(cpu_hw_events);
+       base = hw->n_events;
+
+       /* Append the group's hardware events after the existing ones. */
+       n_hw = collect_events(group_leader, ppmu->n_event - base,
+                             &hw->event[base], &hw->events[base],
+                             &hw->flags[base]);
+       if (n_hw < 0)
+               return -EAGAIN;
+       if (check_excludes(hw->event, hw->flags, base, n_hw))
+               return -EAGAIN;
+       if (power_check_constraints(hw, hw->events, hw->flags,
+                                   n_hw + base) < 0)
+               return -EAGAIN;
+
+       hw->n_events = base + n_hw;
+       hw->n_added += n_hw;
+
+       /*
+        * The group fits.  Copy the event codes from events[] back
+        * into each event, then update event states and enable any
+        * software events.
+        */
+       for (k = base; k < base + n_hw; ++k)
+               hw->event[k]->hw.config = hw->events[k];
+       cpuctx->active_oncpu += n_hw;
+
+       /* The leader counts as one scheduled event. */
+       n_sched = 1;
+       event_sched_in(group_leader, cpu);
+       list_for_each_entry(sib, &group_leader->sibling_list, list_entry) {
+               if (sib->state == PERF_EVENT_STATE_OFF)
+                       continue;
+               event_sched_in(sib, cpu);
+               ++n_sched;
+       }
+       ctx->nr_active += n_sched;
+
+       return 1;
+}
+
+/*
+ * Add an event to the PMU.
+ * The PMU is disabled around the update and re-enabled afterwards,
+ * so that hw_perf_enable does the actual work of reconfiguring
+ * the hardware.
+ * Returns 0 on success, or -EAGAIN if there is no free slot or the
+ * resulting set of events is not feasible.
+ */
+static int power_pmu_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *hw;
+       unsigned long irqflags;
+       int slot;
+       int ret = -EAGAIN;
+
+       local_irq_save(irqflags);
+       perf_disable();
+
+       hw = &__get_cpu_var(cpu_hw_events);
+       slot = hw->n_events;
+       if (slot >= ppmu->n_event)
+               goto out;
+
+       /*
+        * Tentatively place the event in the next free slot and see
+        * whether the whole set is still schedulable.
+        */
+       hw->event[slot] = event;
+       hw->events[slot] = event->hw.config;
+       hw->flags[slot] = event->hw.event_base;
+       if (check_excludes(hw->event, hw->flags, slot, 1) ||
+           power_check_constraints(hw, hw->events, hw->flags, slot + 1))
+               goto out;
+
+       /* Accepted: commit the slot. */
+       event->hw.config = hw->events[slot];
+       ++hw->n_events;
+       ++hw->n_added;
+       ret = 0;
+
+ out:
+       perf_enable();
+       local_irq_restore(irqflags);
+       return ret;
+}
+
+/*
+ * Remove an event from the PMU.
+ *
+ * The event's final count is read, the event is dropped from the
+ * per-cpu event list (and from the limited-event list if present),
+ * and its PMC, if it had one, is disabled and cleared.  The PMU is
+ * disabled around the update and re-enabled afterwards.
+ */
+static void power_pmu_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuhw;
+       long i;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       perf_disable();
+
+       power_pmu_read(event);
+
+       cpuhw = &__get_cpu_var(cpu_hw_events);
+       for (i = 0; i < cpuhw->n_events; ++i) {
+               if (event == cpuhw->event[i]) {
+                       /*
+                        * Close the gap.  event[], events[] and flags[]
+                        * are parallel arrays indexed by the same slot,
+                        * so all three must be shifted together or the
+                        * remaining events end up paired with the wrong
+                        * event codes and flags.
+                        */
+                       while (++i < cpuhw->n_events) {
+                               cpuhw->event[i-1] = cpuhw->event[i];
+                               cpuhw->events[i-1] = cpuhw->events[i];
+                               cpuhw->flags[i-1] = cpuhw->flags[i];
+                       }
+                       --cpuhw->n_events;
+                       /*
+                        * hw.idx is 1-based and 0 means no PMC is
+                        * assigned; only touch the hardware state if
+                        * the event actually has a PMC, so that
+                        * disable_pmc is never handed PMC number -1.
+                        */
+                       if (event->hw.idx) {
+                               ppmu->disable_pmc(event->hw.idx - 1,
+                                                 cpuhw->mmcr);
+                               write_pmc(event->hw.idx, 0);
+                               event->hw.idx = 0;
+                       }
+                       perf_event_update_userpage(event);
+                       break;
+               }
+       }
+       /* Also drop the event from the limited-event list, if present. */
+       for (i = 0; i < cpuhw->n_limited; ++i)
+               if (event == cpuhw->limited_event[i])
+                       break;
+       if (i < cpuhw->n_limited) {
+               while (++i < cpuhw->n_limited) {
+                       cpuhw->limited_event[i-1] = cpuhw->limited_event[i];
+                       cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+               }
+               --cpuhw->n_limited;
+       }
+       if (cpuhw->n_events == 0) {
+               /* disable exceptions if no events are running */
+               cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+       }
+
+       perf_enable();
+       local_irq_restore(flags);
+}
+
+/*
+ * Re-enable interrupts on an event after they were throttled
+ * because they were coming too fast.  The event is restarted
+ * with a full sample period.  Events without a PMC or without a
+ * sample period are left alone.
+ */
+static void power_pmu_unthrottle(struct perf_event *event)
+{
+       s64 start, period;
+       unsigned long irqflags;
+
+       if (!(event->hw.idx && event->hw.sample_period))
+               return;
+
+       local_irq_save(irqflags);
+       perf_disable();
+
+       /* Fold the current PMC value into the count before resetting. */
+       power_pmu_read(event);
+
+       period = event->hw.sample_period;
+       event->hw.last_period = period;
+
+       /* Start the PMC 'period' counts below 0x80000000, if that fits. */
+       start = (period < 0x80000000L) ? 0x80000000L - period : 0;
+
+       write_pmc(event->hw.idx, start);
+       atomic64_set(&event->hw.prev_count, start);
+       atomic64_set(&event->hw.period_left, period);
+       perf_event_update_userpage(event);
+
+       perf_enable();
+       local_irq_restore(irqflags);
+}
+
+/*
+ * PMU operations for hardware events on this back-end; a pointer to
+ * this structure is what hw_perf_event_init returns on success.
+ */
+struct pmu power_pmu = {
+       .enable         = power_pmu_enable,
+       .disable        = power_pmu_disable,
+       .read           = power_pmu_read,
+       .unthrottle     = power_pmu_unthrottle,
+};
+
+/*
+ * Return 1 if we might be able to put event on a limited PMC,
+ * or 0 if not.
+ * An event can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts (i.e.
+ * has no sample period), and doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
+                                unsigned int flags)
+{
+       u64 alts[MAX_EVENT_ALTERNATIVES];
+
+       /* Any exclusion, or sampling, rules out a limited PMC. */
+       if (event->attr.exclude_user)
+               return 0;
+       if (event->attr.exclude_kernel)
+               return 0;
+       if (event->attr.exclude_hv)
+               return 0;
+       if (event->attr.sample_period)
+               return 0;
+
+       /* The requested code is itself a limited-PMC event. */
+       if (ppmu->limited_pmc_event(ev))
+               return 1;
+
+       /*
+        * Otherwise see whether any alternative code for this event
+        * goes on a limited PMC.
+        */
+       if (!ppmu->get_alternatives)
+               return 0;
+
+       return ppmu->get_alternatives(ev,
+                       flags | PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD,
+                       alts) > 0;
+}
+
+/*
+ * Find an alternative event_id that goes on a normal PMC, if possible,
+ * and return the event_id code, or 0 if there is no such alternative.
+ * (Note: event_id code 0 is "don't count" on all machines.)
+ *
+ * Also returns 0 if the PMU driver provides no get_alternatives hook.
+ */
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
+{
+       u64 alt[MAX_EVENT_ALTERNATIVES];
+       int n;
+
+       /* The hook is optional, as in can_go_on_limited_pmc(). */
+       if (!ppmu->get_alternatives)
+               return 0;
+
+       flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+       n = ppmu->get_alternatives(ev, flags, alt);
+       /* alt[0] is only valid if at least one alternative was stored. */
+       if (n <= 0)
+               return 0;
+       return alt[0];
+}
+
+/*
+ * Number of perf_events counting hardware events; incremented in
+ * hw_perf_event_init and decremented in hw_perf_event_destroy.
+ */
+static atomic_t num_events;
+/*
+ * Used to avoid races in calling reserve/release_pmc_hardware;
+ * held around the 0 <-> 1 transitions of num_events.
+ */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Drop this event's reference on the PMU, releasing the PMU
+ * hardware if this was the last perf_event using it.
+ */
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+       /*
+        * Fast path: decrement without the mutex as long as the
+        * count is not 1, i.e. we are not the last user.
+        */
+       if (atomic_add_unless(&num_events, -1, 1))
+               return;
+
+       /* Possibly the last user: serialize against reserve/release. */
+       mutex_lock(&pmc_reserve_mutex);
+       if (atomic_dec_return(&num_events) == 0)
+               release_pmc_hardware();
+       mutex_unlock(&pmc_reserve_mutex);
+}
+
+/*
+ * Translate a generic cache event config to a raw event code.
+ *
+ * config packs the cache type in bits 0-7, the operation in bits
+ * 8-15 and the result in bits 16-23.  On success the raw code is
+ * stored in *eventp and 0 is returned.  Returns -EINVAL if the PMU
+ * has no cache event table, if any field is out of range, or if the
+ * table entry is -1; returns -EOPNOTSUPP if the table entry is 0.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+       unsigned long type = config & 0xff;
+       unsigned long op = (config >> 8) & 0xff;
+       unsigned long result = (config >> 16) & 0xff;
+       int code;
+
+       if (!ppmu->cache_events)
+               return -EINVAL;
+
+       if (type >= PERF_COUNT_HW_CACHE_MAX)
+               return -EINVAL;
+       if (op >= PERF_COUNT_HW_CACHE_OP_MAX)
+               return -EINVAL;
+       if (result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+               return -EINVAL;
+
+       code = (*ppmu->cache_events)[type][op][result];
+       switch (code) {
+       case 0:
+               return -EOPNOTSUPP;
+       case -1:
+               return -EINVAL;
+       default:
+               break;
+       }
+
+       *eventp = code;
+       return 0;
+}
+
+/*
+ * Validate a new hardware event and set up its hw state.
+ *
+ * Translates the attr type/config into a raw event code, checks that
+ * the event can be scheduled together with the rest of its group,
+ * and takes a reference on the PMU hardware (reserving it if this is
+ * the first hardware event).
+ *
+ * Returns &power_pmu on success, or an ERR_PTR:
+ *   -ENXIO       no PMU driver registered
+ *   -EOPNOTSUPP  generic/cache event not supported by this PMU
+ *   -EINVAL      bad type/config, or the event cannot be scheduled
+ *   -EBUSY       the PMU hardware could not be reserved
+ */
+const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+       u64 ev;
+       unsigned long flags;
+       struct perf_event *ctrs[MAX_HWEVENTS];
+       u64 events[MAX_HWEVENTS];
+       unsigned int cflags[MAX_HWEVENTS];
+       int n;
+       int err;
+       struct cpu_hw_events *cpuhw;
+
+       if (!ppmu)
+               return ERR_PTR(-ENXIO);
+       /* Map the requested event onto a raw event code in 'ev'. */
+       switch (event->attr.type) {
+       case PERF_TYPE_HARDWARE:
+               ev = event->attr.config;
+               if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
+                       return ERR_PTR(-EOPNOTSUPP);
+               ev = ppmu->generic_events[ev];
+               break;
+       case PERF_TYPE_HW_CACHE:
+               err = hw_perf_cache_event(event->attr.config, &ev);
+               if (err)
+                       return ERR_PTR(err);
+               break;
+       case PERF_TYPE_RAW:
+               ev = event->attr.config;
+               break;
+       default:
+               return ERR_PTR(-EINVAL);
+       }
+       event->hw.config_base = ev;
+       /* hw.idx == 0 means no PMC has been assigned yet. */
+       event->hw.idx = 0;
+
+       /*
+        * If we are not running on a hypervisor, force the
+        * exclude_hv bit to 0 so that we don't care what
+        * the user set it to.
+        */
+       if (!firmware_has_feature(FW_FEATURE_LPAR))
+               event->attr.exclude_hv = 0;
+
+       /*
+        * If this is a per-task event, then we can use
+        * PM_RUN_* events interchangeably with their non RUN_*
+        * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+        * XXX we should check if the task is an idle task.
+        */
+       flags = 0;
+       if (event->ctx->task)
+               flags |= PPMU_ONLY_COUNT_RUN;
+
+       /*
+        * If this machine has limited PMCs, check whether this
+        * event could go on a limited PMC.
+        */
+       if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
+               if (can_go_on_limited_pmc(event, ev, flags)) {
+                       flags |= PPMU_LIMITED_PMC_OK;
+               } else if (ppmu->limited_pmc_event(ev)) {
+                       /*
+                        * The requested event_id is on a limited PMC,
+                        * but we can't use a limited PMC; see if any
+                        * alternative goes on a normal PMC.
+                        */
+                       ev = normal_pmc_alternative(ev, flags);
+                       if (!ev)
+                               return ERR_PTR(-EINVAL);
+               }
+       }
+
+       /*
+        * If this is in a group, check if it can go on with all the
+        * other hardware events in the group.  We assume the event
+        * hasn't been linked into its leader's sibling list at this point.
+        * One slot is held back (n_event - 1) for the new event itself.
+        */
+       n = 0;
+       if (event->group_leader != event) {
+               n = collect_events(event->group_leader, ppmu->n_event - 1,
+                                  ctrs, events, cflags);
+               if (n < 0)
+                       return ERR_PTR(-EINVAL);
+       }
+       /* Append the new event after the group's existing hardware events. */
+       events[n] = ev;
+       ctrs[n] = event;
+       cflags[n] = flags;
+       if (check_excludes(ctrs, cflags, n, 1))
+               return ERR_PTR(-EINVAL);
+
+       cpuhw = &get_cpu_var(cpu_hw_events);
+       err = power_check_constraints(cpuhw, events, cflags, n + 1);
+       put_cpu_var(cpu_hw_events);
+       if (err)
+               return ERR_PTR(-EINVAL);
+
+       /*
+        * Store the event code and flags from the (possibly updated)
+        * arrays, and start the first sample period.
+        */
+       event->hw.config = events[n];
+       event->hw.event_base = cflags[n];
+       event->hw.last_period = event->hw.sample_period;
+       atomic64_set(&event->hw.period_left, event->hw.last_period);
+
+       /*
+        * See if we need to reserve the PMU.
+        * If no events are currently in use, then we have to take a
+        * mutex to ensure that we don't race with another task doing
+        * reserve_pmc_hardware or release_pmc_hardware.
+        */
+       err = 0;
+       if (!atomic_inc_not_zero(&num_events)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&num_events) == 0 &&
+                   reserve_pmc_hardware(perf_event_interrupt))
+                       err = -EBUSY;
+               else
+                       atomic_inc(&num_events);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+       /*
+        * NOTE(review): the destroy hook is installed even when the
+        * reservation failed and num_events was not incremented;
+        * presumably the caller does not invoke ->destroy on an
+        * ERR_PTR return - confirm against the generic code.
+        */
+       event->destroy = hw_perf_event_destroy;
+
+       if (err)
+               return ERR_PTR(err);
+       return &power_pmu;
+}
+
+/*
+ * An event has overflowed; update its count and record
+ * things if requested.  Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ *
+ * @event: the event whose PMC overflowed
+ * @val:   the value just read from the event's PMC
+ * @regs:  interrupted register state, passed on to the overflow code
+ * @nmi:   non-zero if the interrupt is being treated as an NMI
+ */
+static void record_and_restart(struct perf_event *event, unsigned long val,
+                              struct pt_regs *regs, int nmi)
+{
+       u64 period = event->hw.sample_period;
+       s64 prev, delta, left;
+       int record = 0;
+
+       /* we don't have to worry about interrupts here */
+       prev = atomic64_read(&event->hw.prev_count);
+       /* The difference is taken modulo 2^32. */
+       delta = (val - prev) & 0xfffffffful;
+       atomic64_add(delta, &event->count);
+
+       /*
+        * See if the total period for this event has expired,
+        * and update for the next period.
+        * 'val' is reused from here on as the value to reload
+        * into the PMC.
+        */
+       val = 0;
+       left = atomic64_read(&event->hw.period_left) - delta;
+       if (period) {
+               if (left <= 0) {
+                       left += period;
+                       /* Overshot by a whole period or more: restart. */
+                       if (left <= 0)
+                               left = period;
+                       record = 1;
+               }
+               if (left < 0x80000000LL)
+                       val = 0x80000000LL - left;
+       }
+
+       /*
+        * Finally record data if requested.
+        */
+       if (record) {
+               struct perf_sample_data data = {
+                       .addr   = 0,
+                       .period = event->hw.last_period,
+               };
+
+               if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+                       perf_get_data_addr(regs, &data.addr);
+
+               if (perf_event_overflow(event, nmi, &data, regs)) {
+                       /*
+                        * Interrupts are coming too fast - throttle them
+                        * by setting the event to 0, so it will be
+                        * at least 2^30 cycles until the next interrupt
+                        * (assuming each event counts at most 2 counts
+                        * per cycle).
+                        */
+                       val = 0;
+                       left = ~0ULL >> 1;
+               }
+       }
+
+       /* Reload the PMC and remember what it was reloaded with. */
+       write_pmc(event->hw.idx, val);
+       atomic64_set(&event->hw.prev_count, val);
+       atomic64_set(&event->hw.period_left, left);
+       perf_event_update_userpage(event);
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor
+ * mode) for a sample.  A non-zero result from perf_get_misc_flags
+ * takes precedence; otherwise the mode is derived from regs.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+       u32 misc = perf_get_misc_flags(regs);
+
+       if (misc)
+               return misc;
+
+       if (user_mode(regs))
+               return PERF_RECORD_MISC_USER;
+       return PERF_RECORD_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for a sample.  For a PMU interrupt (trap 0xf00) the address
+ * comes from SIAR plus perf_ip_adjust(); for anything else it
+ * is simply regs->nip.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+       if (TRAP(regs) == 0xf00)
+               return mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
+
+       /* not a PMU interrupt */
+       return regs->nip;
+}
+
+/*
+ * Performance monitor interrupt handler.
+ *
+ * Processes every non-limited event whose PMC has gone negative
+ * (overflowed), then rewrites MMCR0 with its normal value so that
+ * further interrupts can occur.
+ */
+static void perf_event_interrupt(struct pt_regs *regs)
+{
+       int i;
+       struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+       struct perf_event *event;
+       unsigned long val;
+       int found = 0;
+       int nmi;
+
+       /* Sample PMC5/PMC6 up front if any limited events are in use. */
+       if (cpuhw->n_limited)
+               freeze_limited_events(cpuhw, mfspr(SPRN_PMC5),
+                                       mfspr(SPRN_PMC6));
+
+       perf_read_regs(regs);
+
+       /* Use NMI or ordinary IRQ entry/exit accounting as appropriate. */
+       nmi = perf_intr_is_nmi(regs);
+       if (nmi)
+               nmi_enter();
+       else
+               irq_enter();
+
+       for (i = 0; i < cpuhw->n_events; ++i) {
+               event = cpuhw->event[i];
+               /* Skip events with no PMC and events on limited PMCs. */
+               if (!event->hw.idx || is_limited_pmc(event->hw.idx))
+                       continue;
+               val = read_pmc(event->hw.idx);
+               if ((int)val < 0) {
+                       /* event has overflowed */
+                       found = 1;
+                       record_and_restart(event, val, regs, nmi);
+               }
+       }
+
+       /*
+        * In case we didn't find and reset the event that caused
+        * the interrupt, scan all events and reset any that are
+        * negative, to avoid getting continual interrupts.
+        * Any that we processed in the previous loop will not be negative.
+        */
+       if (!found) {
+               /* PMC numbers are 1-based, hence i + 1. */
+               for (i = 0; i < ppmu->n_event; ++i) {
+                       if (is_limited_pmc(i + 1))
+                               continue;
+                       val = read_pmc(i + 1);
+                       if ((int)val < 0)
+                               write_pmc(i + 1, 0);
+               }
+       }
+
+       /*
+        * Reset MMCR0 to its normal value.  This will set PMXE and
+        * clear FC (freeze events) and PMAO (perf mon alert occurred)
+        * and thus allow interrupts to occur again.
+        * XXX might want to use MSR.PM to keep the events frozen until
+        * we get back out of this interrupt.
+        */
+       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+       if (nmi)
+               nmi_exit();
+       else
+               irq_exit();
+}
+
+/*
+ * Reset the per-cpu PMU bookkeeping for the given cpu.  The saved
+ * MMCR0 value starts out with only the freeze bit set.  Does nothing
+ * if no PMU driver is registered.
+ */
+void hw_perf_event_setup(int cpu)
+{
+       struct cpu_hw_events *hw;
+
+       if (!ppmu)
+               return;
+
+       hw = &per_cpu(cpu_hw_events, cpu);
+       memset(hw, 0, sizeof(*hw));
+       hw->mmcr[0] = MMCR0_FC;
+}
+
+/*
+ * Register the PMU driver for this machine.  Only one driver can be
+ * registered; returns -EBUSY if one already is, otherwise 0.
+ */
+int register_power_pmu(struct power_pmu *pmu)
+{
+       if (ppmu)
+               return -EBUSY;          /* something's already registered */
+
+       ppmu = pmu;
+       pr_info("%s performance monitor hardware support registered\n",
+               pmu->name);
+
+#ifdef MSR_HV
+       /*
+        * Use FCHV to ignore kernel events if MSR.HV is set.
+        */
+       if (mfmsr() & MSR_HV)
+               freeze_events_kernel = MMCR0_FCHV;
+#endif /* MSR_HV */
+
+       return 0;
+}
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c

index 3c90a3d..2a361cd 100644 (file)
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -9,7 +9,7 @@
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/string.h>
  #include <asm/reg.h>
  #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c

index 31918af..0f4c1c7 100644 (file)
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -9,7 +9,7 @@
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/string.h>
  #include <asm/reg.h>
  #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c

index 867f6f6..c351b3a 100644 (file)
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -9,7 +9,7 @@
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/string.h>
  #include <asm/reg.h>
  #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c

index fa21890..ca399ba 100644 (file)
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -9,7 +9,7 @@
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/string.h>
  #include <asm/reg.h>
  #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c

index 018d094..28a4daa 100644 (file)
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -9,7 +9,7 @@
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/kernel.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/string.h>
  #include <asm/reg.h>
  #include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c

index 75dccb7..4795744 100644 (file)
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -9,7 +9,7 @@
   * 2 of the License, or (at your option) any later version.
   */
  #include <linux/string.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <asm/reg.h>
  #include <asm/cputable.h>
  
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c

index 465e498..df45a74 100644 (file)
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
  #include <linux/posix-timers.h>
  #include <linux/irq.h>
  #include <linux/delay.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  
  #include <asm/io.h>
  #include <asm/processor.h>
@@ -527,25 +527,25 @@ void __init iSeries_time_init_early(void)
  }
  #endif /* CONFIG_PPC_ISERIES */
  
-#if defined(CONFIG_PERF_COUNTERS) && defined(CONFIG_PPC32)
-DEFINE_PER_CPU(u8, perf_counter_pending);
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32)
+DEFINE_PER_CPU(u8, perf_event_pending);
  
-void set_perf_counter_pending(void)
+void set_perf_event_pending(void)
  {
-       get_cpu_var(perf_counter_pending) = 1;
+       get_cpu_var(perf_event_pending) = 1;
         set_dec(1);
-       put_cpu_var(perf_counter_pending);
+       put_cpu_var(perf_event_pending);
  }
  
-#define test_perf_counter_pending()    __get_cpu_var(perf_counter_pending)
-#define clear_perf_counter_pending()   __get_cpu_var(perf_counter_pending) = 0
+#define test_perf_event_pending()      __get_cpu_var(perf_event_pending)
+#define clear_perf_event_pending()     __get_cpu_var(perf_event_pending) = 0
  
-#else  /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */
+#else  /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
  
-#define test_perf_counter_pending()    0
-#define clear_perf_counter_pending()
+#define test_perf_event_pending()      0
+#define clear_perf_event_pending()
  
-#endif /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */
+#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
  
  /*
   * For iSeries shared processors, we have to let the hypervisor
@@ -573,9 +573,9 @@ void timer_interrupt(struct pt_regs * regs)
         set_dec(DECREMENTER_MAX);
  
  #ifdef CONFIG_PPC32
-       if (test_perf_counter_pending()) {
-               clear_perf_counter_pending();
-               perf_counter_do_pending();
+       if (test_perf_event_pending()) {
+               clear_perf_event_pending();
+               perf_event_do_pending();
         }
         if (atomic_read(&ppc_n_lost_interrupts) != 0)
                 do_IRQ(regs);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c

index 830bef0..e7dae82 100644 (file)
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,7 +29,7 @@
  #include <linux/module.h>
  #include <linux/kprobes.h>
  #include <linux/kdebug.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  
  #include <asm/firmware.h>
  #include <asm/page.h>
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
                 die("Weird page fault", regs, SIGSEGV);
         }
  
-       perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
  
         /* When running in the kernel we expect faults to occur only to
          * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,7 @@ good_area:
         }
         if (ret & VM_FAULT_MAJOR) {
                 current->maj_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                                      regs, address);
  #ifdef CONFIG_PPC_SMLPAR
                 if (firmware_has_feature(FW_FEATURE_CMO)) {
@@ -323,7 +323,7 @@ good_area:
  #endif
         } else {
                 current->min_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                                      regs, address);
         }
         up_read(&mm->mmap_sem);
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype

index 9efc8bd..e382cae 100644 (file)
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -280,9 +280,9 @@ config PPC_HAVE_PMU_SUPPORT
  
  config PPC_PERF_CTRS
         def_bool y
-       depends on PERF_COUNTERS && PPC_HAVE_PMU_SUPPORT
+       depends on PERF_EVENTS && PPC_HAVE_PMU_SUPPORT
         help
-         This enables the powerpc-specific perf_counter back-end.
+         This enables the powerpc-specific perf_event back-end.
  
  config SMP
         depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index 1c866ef..43c0aca 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -94,7 +94,7 @@ config S390
         select HAVE_KVM if 64BIT
         select HAVE_ARCH_TRACEHOOK
         select INIT_ALL_POSSIBLE
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_EVENTS
  
  config SCHED_OMIT_FRAME_POINTER
         bool
diff --git a/arch/s390/include/asm/perf_counter.h b/arch/s390/include/asm/perf_counter.h

deleted file mode 100644 (file)

index 7015188..0000000
--- a/arch/s390/include/asm/perf_counter.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * Performance counter support - s390 specific definitions.
- *
- * Copyright 2009 Martin Schwidefsky, IBM Corporation.
- */
-
-static inline void set_perf_counter_pending(void) {}
-static inline void clear_perf_counter_pending(void) {}
-
-#define PERF_COUNTER_INDEX_OFFSET 0
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h

new file mode 100644 (file)

index 0000000..3840cbe
--- /dev/null
+++ b/arch/s390/include/asm/perf_event.h
@@ -0,0 +1,10 @@
+/*
+ * Performance event support - s390 specific definitions.
+ *
+ * Copyright 2009 Martin Schwidefsky, IBM Corporation.
+ */
+
+static inline void set_perf_event_pending(void) {}
+static inline void clear_perf_event_pending(void) {}
+
+#define PERF_EVENT_INDEX_OFFSET 0
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h

index c80602d..cb5232d 100644 (file)
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -268,7 +268,7 @@
  #define        __NR_preadv             328
  #define        __NR_pwritev            329
  #define __NR_rt_tgsigqueueinfo 330
-#define __NR_perf_counter_open 331
+#define __NR_perf_event_open   331
  #define NR_syscalls 332
  
  /* 
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S

index 88a8336..6247900 100644 (file)
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -1832,11 +1832,11 @@ compat_sys_rt_tgsigqueueinfo_wrapper:
         llgtr   %r5,%r5                 # struct compat_siginfo *
         jg      compat_sys_rt_tgsigqueueinfo_wrapper # branch to system call
  
-       .globl  sys_perf_counter_open_wrapper
-sys_perf_counter_open_wrapper:
-       llgtr   %r2,%r2                 # const struct perf_counter_attr *
+       .globl  sys_perf_event_open_wrapper
+sys_perf_event_open_wrapper:
+       llgtr   %r2,%r2                 # const struct perf_event_attr *
         lgfr    %r3,%r3                 # pid_t
         lgfr    %r4,%r4                 # int
         lgfr    %r5,%r5                 # int
         llgfr   %r6,%r6                 # unsigned long
-       jg      sys_perf_counter_open   # branch to system call
+       jg      sys_perf_event_open     # branch to system call
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S

index ad1acd2..0b50836 100644 (file)
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -339,4 +339,4 @@ SYSCALL(sys_epoll_create1,sys_epoll_create1,sys_epoll_create1_wrapper)
  SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper)
  SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper)
  SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */
-SYSCALL(sys_perf_counter_open,sys_perf_counter_open,sys_perf_counter_open_wrapper)
+SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c

index 1abbadd..6d50746 100644 (file)
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -10,7 +10,7 @@
   *    Copyright (C) 1995  Linus Torvalds
   */
  
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/signal.h>
  #include <linux/sched.h>
  #include <linux/kernel.h>
@@ -306,7 +306,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write)
          * interrupts again and then search the VMAs
          */
         local_irq_enable();
-       perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
         down_read(&mm->mmap_sem);
  
         si_code = SEGV_MAPERR;
@@ -366,11 +366,11 @@ good_area:
         }
         if (fault & VM_FAULT_MAJOR) {
                 tsk->maj_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                                      regs, address);
         } else {
                 tsk->min_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                                      regs, address);
         }
          up_read(&mm->mmap_sem);
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig

index 4df3570..b940424 100644 (file)
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -16,7 +16,7 @@ config SUPERH
         select HAVE_IOREMAP_PROT if MMU
         select HAVE_ARCH_TRACEHOOK
         select HAVE_DMA_API_DEBUG
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_EVENTS
         select HAVE_KERNEL_GZIP
         select HAVE_KERNEL_BZIP2
         select HAVE_KERNEL_LZMA
diff --git a/arch/sh/include/asm/perf_counter.h b/arch/sh/include/asm/perf_counter.h

deleted file mode 100644 (file)

index d8e6bb9..0000000
--- a/arch/sh/include/asm/perf_counter.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_SH_PERF_COUNTER_H
-#define __ASM_SH_PERF_COUNTER_H
-
-/* SH only supports software counters through this interface. */
-static inline void set_perf_counter_pending(void) {}
-
-#define PERF_COUNTER_INDEX_OFFSET      0
-
-#endif /* __ASM_SH_PERF_COUNTER_H */
diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h

new file mode 100644 (file)

index 0000000..11a3022
--- /dev/null
+++ b/arch/sh/include/asm/perf_event.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_SH_PERF_EVENT_H
+#define __ASM_SH_PERF_EVENT_H
+
+/* SH only supports software events through this interface. */
+static inline void set_perf_event_pending(void) {}
+
+#define PERF_EVENT_INDEX_OFFSET        0
+
+#endif /* __ASM_SH_PERF_EVENT_H */
diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h

index 925dd40..f3fd1b9 100644 (file)
--- a/arch/sh/include/asm/unistd_32.h
+++ b/arch/sh/include/asm/unistd_32.h
@@ -344,7 +344,7 @@
  #define __NR_preadv            333
  #define __NR_pwritev           334
  #define __NR_rt_tgsigqueueinfo 335
-#define __NR_perf_counter_open 336
+#define __NR_perf_event_open   336
  
  #define NR_syscalls 337
  
diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h

index 2b84bc9..343ce8f 100644 (file)
--- a/arch/sh/include/asm/unistd_64.h
+++ b/arch/sh/include/asm/unistd_64.h
@@ -384,7 +384,7 @@
  #define __NR_preadv            361
  #define __NR_pwritev           362
  #define __NR_rt_tgsigqueueinfo 363
-#define __NR_perf_counter_open 364
+#define __NR_perf_event_open   364
  
  #ifdef __KERNEL__
  
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S

index 16ba225..19fd11d 100644 (file)
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -352,4 +352,4 @@ ENTRY(sys_call_table)
         .long sys_preadv
         .long sys_pwritev
         .long sys_rt_tgsigqueueinfo     /* 335 */
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S

index af6fb74..5bfde6c 100644 (file)
--- a/arch/sh/kernel/syscalls_64.S
+++ b/arch/sh/kernel/syscalls_64.S
@@ -390,4 +390,4 @@ sys_call_table:
         .long sys_preadv
         .long sys_pwritev
         .long sys_rt_tgsigqueueinfo
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c

index 781b413..4753010 100644 (file)
--- a/arch/sh/mm/fault_32.c
+++ b/arch/sh/mm/fault_32.c
@@ -15,7 +15,7 @@
  #include <linux/mm.h>
  #include <linux/hardirq.h>
  #include <linux/kprobes.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <asm/io_trapped.h>
  #include <asm/system.h>
  #include <asm/mmu_context.h>
@@ -157,7 +157,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
         if ((regs->sr & SR_IMASK) != SR_IMASK)
                 local_irq_enable();
  
-       perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
  
         /*
          * If we're in an interrupt, have no user context or are running
@@ -208,11 +208,11 @@ survive:
         }
         if (fault & VM_FAULT_MAJOR) {
                 tsk->maj_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                                      regs, address);
         } else {
                 tsk->min_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                                      regs, address);
         }
  
diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c

index 2dcc485..de0b0e8 100644 (file)
--- a/arch/sh/mm/tlbflush_64.c
+++ b/arch/sh/mm/tlbflush_64.c
@@ -20,7 +20,7 @@
  #include <linux/mman.h>
  #include <linux/mm.h>
  #include <linux/smp.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/interrupt.h>
  #include <asm/system.h>
  #include <asm/io.h>
@@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
         /* Not an IO address, so reenable interrupts */
         local_irq_enable();
  
-       perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
  
         /*
          * If we're in an interrupt or have no user
@@ -201,11 +201,11 @@ survive:
  
         if (fault & VM_FAULT_MAJOR) {
                 tsk->maj_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                                      regs, address);
         } else {
                 tsk->min_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                                      regs, address);
         }
  
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig

index 86b8234..97fca46 100644 (file)
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,7 +25,7 @@ config SPARC
         select ARCH_WANT_OPTIONAL_GPIOLIB
         select RTC_CLASS
         select RTC_DRV_M48T59
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_EVENTS
         select HAVE_DMA_ATTRS
         select HAVE_DMA_API_DEBUG
  
@@ -47,7 +47,7 @@ config SPARC64
         select RTC_DRV_BQ4802
         select RTC_DRV_SUN4V
         select RTC_DRV_STARFIRE
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_EVENTS
  
  config ARCH_DEFCONFIG
         string
diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h

deleted file mode 100644 (file)

index 5d7a8ca..0000000
--- a/arch/sparc/include/asm/perf_counter.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __ASM_SPARC_PERF_COUNTER_H
-#define __ASM_SPARC_PERF_COUNTER_H
-
-extern void set_perf_counter_pending(void);
-
-#define        PERF_COUNTER_INDEX_OFFSET       0
-
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-#else
-static inline void init_hw_perf_counters(void) { }
-#endif
-
-#endif
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h

new file mode 100644 (file)

index 0000000..7e26698
--- /dev/null
+++ b/arch/sparc/include/asm/perf_event.h
@@ -0,0 +1,14 @@
+#ifndef __ASM_SPARC_PERF_EVENT_H
+#define __ASM_SPARC_PERF_EVENT_H
+
+extern void set_perf_event_pending(void);
+
+#define        PERF_EVENT_INDEX_OFFSET 0
+
+#ifdef CONFIG_PERF_EVENTS
+extern void init_hw_perf_events(void);
+#else
+static inline void init_hw_perf_events(void)   { }
+#endif
+
+#endif
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h

index 706df66..42f2316 100644 (file)
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -395,7 +395,7 @@
  #define __NR_preadv            324
  #define __NR_pwritev           325
  #define __NR_rt_tgsigqueueinfo 326
-#define __NR_perf_counter_open 327
+#define __NR_perf_event_open   327
  
  #define NR_SYSCALLS            328
  
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile

index 247cc62..3a048fa 100644 (file)
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -104,5 +104,5 @@ obj-$(CONFIG_AUDIT)     += audit.o
  audit--$(CONFIG_AUDIT)  := compat_audit.o
  obj-$(CONFIG_COMPAT)    += $(audit--y)
  
-pc--$(CONFIG_PERF_COUNTERS) := perf_counter.o
+pc--$(CONFIG_PERF_EVENTS) := perf_event.o
  obj-$(CONFIG_SPARC64)  += $(pc--y)
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c

index 378eb53..b129611 100644 (file)
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -19,7 +19,7 @@
  #include <linux/delay.h>
  #include <linux/smp.h>
  
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
  #include <asm/ptrace.h>
  #include <asm/local.h>
  #include <asm/pcr.h>
@@ -265,7 +265,7 @@ int __init nmi_init(void)
                 }
         }
         if (!err)
-               init_hw_perf_counters();
+               init_hw_perf_events();
  
         return err;
  }
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c

index 68ff001..2d94e7a 100644 (file)
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
  #include <linux/init.h>
  #include <linux/irq.h>
  
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  
  #include <asm/pil.h>
  #include <asm/pcr.h>
@@ -15,7 +15,7 @@
  
  /* This code is shared between various users of the performance
   * counters.  Users will be oprofile, pseudo-NMI watchdog, and the
- * perf_counter support layer.
+ * perf_event support layer.
   */
  
  #define PCR_SUN4U_ENABLE       (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE)
@@ -42,14 +42,14 @@ void deferred_pcr_work_irq(int irq, struct pt_regs *regs)
  
         old_regs = set_irq_regs(regs);
         irq_enter();
-#ifdef CONFIG_PERF_COUNTERS
-       perf_counter_do_pending();
+#ifdef CONFIG_PERF_EVENTS
+       perf_event_do_pending();
  #endif
         irq_exit();
         set_irq_regs(old_regs);
  }
  
-void set_perf_counter_pending(void)
+void set_perf_event_pending(void)
  {
         set_softint(1 << PIL_DEFERRED_PCR_WORK);
  }
diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c

deleted file mode 100644 (file)

index b1265ce..0000000
--- a/arch/sparc/kernel/perf_counter.c
+++ /dev/null
@@ -1,556 +0,0 @@
-/* Performance counter support for sparc64.
- *
- * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
- *
- * This code is based almost entirely upon the x86 perf counter
- * code, which is:
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- */
-
-#include <linux/perf_counter.h>
-#include <linux/kprobes.h>
-#include <linux/kernel.h>
-#include <linux/kdebug.h>
-#include <linux/mutex.h>
-
-#include <asm/cpudata.h>
-#include <asm/atomic.h>
-#include <asm/nmi.h>
-#include <asm/pcr.h>
-
-/* Sparc64 chips have two performance counters, 32-bits each, with
- * overflow interrupts generated on transition from 0xffffffff to 0.
- * The counters are accessed in one go using a 64-bit register.
- *
- * Both counters are controlled using a single control register.  The
- * only way to stop all sampling is to clear all of the context (user,
- * supervisor, hypervisor) sampling enable bits.  But these bits apply
- * to both counters, thus the two counters can't be enabled/disabled
- * individually.
- *
- * The control register has two event fields, one for each of the two
- * counters.  It's thus nearly impossible to have one counter going
- * while keeping the other one stopped.  Therefore it is possible to
- * get overflow interrupts for counters not currently "in use" and
- * that condition must be checked in the overflow interrupt handler.
- *
- * So we use a hack, in that we program inactive counters with the
- * "sw_count0" and "sw_count1" events.  These count how many times
- * the instruction "sethi %hi(0xfc000), %g0" is executed.  It's an
- * unusual way to encode a NOP and therefore will not trigger in
- * normal code.
- */
-
-#define MAX_HWCOUNTERS                 2
-#define MAX_PERIOD                     ((1UL << 32) - 1)
-
-#define PIC_UPPER_INDEX                        0
-#define PIC_LOWER_INDEX                        1
-
-struct cpu_hw_counters {
-       struct perf_counter     *counters[MAX_HWCOUNTERS];
-       unsigned long           used_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)];
-       unsigned long           active_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)];
-       int enabled;
-};
-DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, };
-
-struct perf_event_map {
-       u16     encoding;
-       u8      pic_mask;
-#define PIC_NONE       0x00
-#define PIC_UPPER      0x01
-#define PIC_LOWER      0x02
-};
-
-struct sparc_pmu {
-       const struct perf_event_map     *(*event_map)(int);
-       int                             max_events;
-       int                             upper_shift;
-       int                             lower_shift;
-       int                             event_mask;
-       int                             hv_bit;
-       int                             irq_bit;
-       int                             upper_nop;
-       int                             lower_nop;
-};
-
-static const struct perf_event_map ultra3i_perfmon_event_map[] = {
-       [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER },
-       [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER },
-       [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER },
-       [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER },
-};
-
-static const struct perf_event_map *ultra3i_event_map(int event)
-{
-       return &ultra3i_perfmon_event_map[event];
-}
-
-static const struct sparc_pmu ultra3i_pmu = {
-       .event_map      = ultra3i_event_map,
-       .max_events     = ARRAY_SIZE(ultra3i_perfmon_event_map),
-       .upper_shift    = 11,
-       .lower_shift    = 4,
-       .event_mask     = 0x3f,
-       .upper_nop      = 0x1c,
-       .lower_nop      = 0x14,
-};
-
-static const struct perf_event_map niagara2_perfmon_event_map[] = {
-       [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER },
-       [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER },
-       [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER },
-       [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER },
-       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER },
-       [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER },
-};
-
-static const struct perf_event_map *niagara2_event_map(int event)
-{
-       return &niagara2_perfmon_event_map[event];
-}
-
-static const struct sparc_pmu niagara2_pmu = {
-       .event_map      = niagara2_event_map,
-       .max_events     = ARRAY_SIZE(niagara2_perfmon_event_map),
-       .upper_shift    = 19,
-       .lower_shift    = 6,
-       .event_mask     = 0xfff,
-       .hv_bit         = 0x8,
-       .irq_bit        = 0x03,
-       .upper_nop      = 0x220,
-       .lower_nop      = 0x220,
-};
-
-static const struct sparc_pmu *sparc_pmu __read_mostly;
-
-static u64 event_encoding(u64 event, int idx)
-{
-       if (idx == PIC_UPPER_INDEX)
-               event <<= sparc_pmu->upper_shift;
-       else
-               event <<= sparc_pmu->lower_shift;
-       return event;
-}
-
-static u64 mask_for_index(int idx)
-{
-       return event_encoding(sparc_pmu->event_mask, idx);
-}
-
-static u64 nop_for_index(int idx)
-{
-       return event_encoding(idx == PIC_UPPER_INDEX ?
-                             sparc_pmu->upper_nop :
-                             sparc_pmu->lower_nop, idx);
-}
-
-static inline void sparc_pmu_enable_counter(struct hw_perf_counter *hwc,
-                                           int idx)
-{
-       u64 val, mask = mask_for_index(idx);
-
-       val = pcr_ops->read();
-       pcr_ops->write((val & ~mask) | hwc->config);
-}
-
-static inline void sparc_pmu_disable_counter(struct hw_perf_counter *hwc,
-                                            int idx)
-{
-       u64 mask = mask_for_index(idx);
-       u64 nop = nop_for_index(idx);
-       u64 val = pcr_ops->read();
-
-       pcr_ops->write((val & ~mask) | nop);
-}
-
-void hw_perf_enable(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       u64 val;
-       int i;
-
-       if (cpuc->enabled)
-               return;
-
-       cpuc->enabled = 1;
-       barrier();
-
-       val = pcr_ops->read();
-
-       for (i = 0; i < MAX_HWCOUNTERS; i++) {
-               struct perf_counter *cp = cpuc->counters[i];
-               struct hw_perf_counter *hwc;
-
-               if (!cp)
-                       continue;
-               hwc = &cp->hw;
-               val |= hwc->config_base;
-       }
-
-       pcr_ops->write(val);
-}
-
-void hw_perf_disable(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       u64 val;
-
-       if (!cpuc->enabled)
-               return;
-
-       cpuc->enabled = 0;
-
-       val = pcr_ops->read();
-       val &= ~(PCR_UTRACE | PCR_STRACE |
-                sparc_pmu->hv_bit | sparc_pmu->irq_bit);
-       pcr_ops->write(val);
-}
-
-static u32 read_pmc(int idx)
-{
-       u64 val;
-
-       read_pic(val);
-       if (idx == PIC_UPPER_INDEX)
-               val >>= 32;
-
-       return val & 0xffffffff;
-}
-
-static void write_pmc(int idx, u64 val)
-{
-       u64 shift, mask, pic;
-
-       shift = 0;
-       if (idx == PIC_UPPER_INDEX)
-               shift = 32;
-
-       mask = ((u64) 0xffffffff) << shift;
-       val <<= shift;
-
-       read_pic(pic);
-       pic &= ~mask;
-       pic |= val;
-       write_pic(pic);
-}
-
-static int sparc_perf_counter_set_period(struct perf_counter *counter,
-                                        struct hw_perf_counter *hwc, int idx)
-{
-       s64 left = atomic64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int ret = 0;
-
-       if (unlikely(left <= -period)) {
-               left = period;
-               atomic64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-
-       if (unlikely(left <= 0)) {
-               left += period;
-               atomic64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-       if (left > MAX_PERIOD)
-               left = MAX_PERIOD;
-
-       atomic64_set(&hwc->prev_count, (u64)-left);
-
-       write_pmc(idx, (u64)(-left) & 0xffffffff);
-
-       perf_counter_update_userpage(counter);
-
-       return ret;
-}
-
-static int sparc_pmu_enable(struct perf_counter *counter)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       struct hw_perf_counter *hwc = &counter->hw;
-       int idx = hwc->idx;
-
-       if (test_and_set_bit(idx, cpuc->used_mask))
-               return -EAGAIN;
-
-       sparc_pmu_disable_counter(hwc, idx);
-
-       cpuc->counters[idx] = counter;
-       set_bit(idx, cpuc->active_mask);
-
-       sparc_perf_counter_set_period(counter, hwc, idx);
-       sparc_pmu_enable_counter(hwc, idx);
-       perf_counter_update_userpage(counter);
-       return 0;
-}
-
-static u64 sparc_perf_counter_update(struct perf_counter *counter,
-                                    struct hw_perf_counter *hwc, int idx)
-{
-       int shift = 64 - 32;
-       u64 prev_raw_count, new_raw_count;
-       s64 delta;
-
-again:
-       prev_raw_count = atomic64_read(&hwc->prev_count);
-       new_raw_count = read_pmc(idx);
-
-       if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                            new_raw_count) != prev_raw_count)
-               goto again;
-
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       atomic64_add(delta, &counter->count);
-       atomic64_sub(delta, &hwc->period_left);
-
-       return new_raw_count;
-}
-
-static void sparc_pmu_disable(struct perf_counter *counter)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       struct hw_perf_counter *hwc = &counter->hw;
-       int idx = hwc->idx;
-
-       clear_bit(idx, cpuc->active_mask);
-       sparc_pmu_disable_counter(hwc, idx);
-
-       barrier();
-
-       sparc_perf_counter_update(counter, hwc, idx);
-       cpuc->counters[idx] = NULL;
-       clear_bit(idx, cpuc->used_mask);
-
-       perf_counter_update_userpage(counter);
-}
-
-static void sparc_pmu_read(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       sparc_perf_counter_update(counter, hwc, hwc->idx);
-}
-
-static void sparc_pmu_unthrottle(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       sparc_pmu_enable_counter(hwc, hwc->idx);
-}
-
-static atomic_t active_counters = ATOMIC_INIT(0);
-static DEFINE_MUTEX(pmc_grab_mutex);
-
-void perf_counter_grab_pmc(void)
-{
-       if (atomic_inc_not_zero(&active_counters))
-               return;
-
-       mutex_lock(&pmc_grab_mutex);
-       if (atomic_read(&active_counters) == 0) {
-               if (atomic_read(&nmi_active) > 0) {
-                       on_each_cpu(stop_nmi_watchdog, NULL, 1);
-                       BUG_ON(atomic_read(&nmi_active) != 0);
-               }
-               atomic_inc(&active_counters);
-       }
-       mutex_unlock(&pmc_grab_mutex);
-}
-
-void perf_counter_release_pmc(void)
-{
-       if (atomic_dec_and_mutex_lock(&active_counters, &pmc_grab_mutex)) {
-               if (atomic_read(&nmi_active) == 0)
-                       on_each_cpu(start_nmi_watchdog, NULL, 1);
-               mutex_unlock(&pmc_grab_mutex);
-       }
-}
-
-static void hw_perf_counter_destroy(struct perf_counter *counter)
-{
-       perf_counter_release_pmc();
-}
-
-static int __hw_perf_counter_init(struct perf_counter *counter)
-{
-       struct perf_counter_attr *attr = &counter->attr;
-       struct hw_perf_counter *hwc = &counter->hw;
-       const struct perf_event_map *pmap;
-       u64 enc;
-
-       if (atomic_read(&nmi_active) < 0)
-               return -ENODEV;
-
-       if (attr->type != PERF_TYPE_HARDWARE)
-               return -EOPNOTSUPP;
-
-       if (attr->config >= sparc_pmu->max_events)
-               return -EINVAL;
-
-       perf_counter_grab_pmc();
-       counter->destroy = hw_perf_counter_destroy;
-
-       /* We save the enable bits in the config_base.  So to
-        * turn off sampling just write 'config', and to enable
-        * things write 'config | config_base'.
-        */
-       hwc->config_base = sparc_pmu->irq_bit;
-       if (!attr->exclude_user)
-               hwc->config_base |= PCR_UTRACE;
-       if (!attr->exclude_kernel)
-               hwc->config_base |= PCR_STRACE;
-       if (!attr->exclude_hv)
-               hwc->config_base |= sparc_pmu->hv_bit;
-
-       if (!hwc->sample_period) {
-               hwc->sample_period = MAX_PERIOD;
-               hwc->last_period = hwc->sample_period;
-               atomic64_set(&hwc->period_left, hwc->sample_period);
-       }
-
-       pmap = sparc_pmu->event_map(attr->config);
-
-       enc = pmap->encoding;
-       if (pmap->pic_mask & PIC_UPPER) {
-               hwc->idx = PIC_UPPER_INDEX;
-               enc <<= sparc_pmu->upper_shift;
-       } else {
-               hwc->idx = PIC_LOWER_INDEX;
-               enc <<= sparc_pmu->lower_shift;
-       }
-
-       hwc->config |= enc;
-       return 0;
-}
-
-static const struct pmu pmu = {
-       .enable         = sparc_pmu_enable,
-       .disable        = sparc_pmu_disable,
-       .read           = sparc_pmu_read,
-       .unthrottle     = sparc_pmu_unthrottle,
-};
-
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-       int err = __hw_perf_counter_init(counter);
-
-       if (err)
-               return ERR_PTR(err);
-       return &pmu;
-}
-
-void perf_counter_print_debug(void)
-{
-       unsigned long flags;
-       u64 pcr, pic;
-       int cpu;
-
-       if (!sparc_pmu)
-               return;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-
-       pcr = pcr_ops->read();
-       read_pic(pic);
-
-       pr_info("\n");
-       pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n",
-               cpu, pcr, pic);
-
-       local_irq_restore(flags);
-}
-
-static int __kprobes perf_counter_nmi_handler(struct notifier_block *self,
-                                             unsigned long cmd, void *__args)
-{
-       struct die_args *args = __args;
-       struct perf_sample_data data;
-       struct cpu_hw_counters *cpuc;
-       struct pt_regs *regs;
-       int idx;
-
-       if (!atomic_read(&active_counters))
-               return NOTIFY_DONE;
-
-       switch (cmd) {
-       case DIE_NMI:
-               break;
-
-       default:
-               return NOTIFY_DONE;
-       }
-
-       regs = args->regs;
-
-       data.addr = 0;
-
-       cpuc = &__get_cpu_var(cpu_hw_counters);
-       for (idx = 0; idx < MAX_HWCOUNTERS; idx++) {
-               struct perf_counter *counter = cpuc->counters[idx];
-               struct hw_perf_counter *hwc;
-               u64 val;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               hwc = &counter->hw;
-               val = sparc_perf_counter_update(counter, hwc, idx);
-               if (val & (1ULL << 31))
-                       continue;
-
-               data.period = counter->hw.last_period;
-               if (!sparc_perf_counter_set_period(counter, hwc, idx))
-                       continue;
-
-               if (perf_counter_overflow(counter, 1, &data, regs))
-                       sparc_pmu_disable_counter(hwc, idx);
-       }
-
-       return NOTIFY_STOP;
-}
-
-static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-       .notifier_call          = perf_counter_nmi_handler,
-};
-
-static bool __init supported_pmu(void)
-{
-       if (!strcmp(sparc_pmu_type, "ultra3i")) {
-               sparc_pmu = &ultra3i_pmu;
-               return true;
-       }
-       if (!strcmp(sparc_pmu_type, "niagara2")) {
-               sparc_pmu = &niagara2_pmu;
-               return true;
-       }
-       return false;
-}
-
-void __init init_hw_perf_counters(void)
-{
-       pr_info("Performance counters: ");
-
-       if (!supported_pmu()) {
-               pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
-               return;
-       }
-
-       pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
-
-       /* All sparc64 PMUs currently have 2 counters.  But this simple
-        * driver only supports one active counter at a time.
-        */
-       perf_max_counters = 1;
-
-       register_die_notifier(&perf_counter_nmi_notifier);
-}
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c

new file mode 100644 (file)

index 0000000..2d6a1b1
--- /dev/null
+++ b/arch/sparc/kernel/perf_event.c
@@ -0,0 +1,556 @@
+/* Performance event support for sparc64.
+ *
+ * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
+ *
+ * This code is based almost entirely upon the x86 perf event
+ * code, which is:
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kprobes.h>
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+
+#include <asm/cpudata.h>
+#include <asm/atomic.h>
+#include <asm/nmi.h>
+#include <asm/pcr.h>
+
+/* Sparc64 chips have two performance counters, 32-bits each, with
+ * overflow interrupts generated on transition from 0xffffffff to 0.
+ * The counters are accessed in one go using a 64-bit register.
+ *
+ * Both counters are controlled using a single control register.  The
+ * only way to stop all sampling is to clear all of the context (user,
+ * supervisor, hypervisor) sampling enable bits.  But these bits apply
+ * to both counters, thus the two counters can't be enabled/disabled
+ * individually.
+ *
+ * The control register has two event fields, one for each of the two
+ * counters.  It's thus nearly impossible to have one counter going
+ * while keeping the other one stopped.  Therefore it is possible to
+ * get overflow interrupts for counters not currently "in use" and
+ * that condition must be checked in the overflow interrupt handler.
+ *
+ * So we use a hack, in that we program inactive counters with the
+ * "sw_count0" and "sw_count1" events.  These count how many times
+ * the instruction "sethi %hi(0xfc000), %g0" is executed.  It's an
+ * unusual way to encode a NOP and therefore will not trigger in
+ * normal code.
+ */
+
+/* Number of hardware counter slots tracked per cpu. */
+#define MAX_HWEVENTS                   2
+/* Largest period a 32-bit counter can be programmed with. */
+#define MAX_PERIOD                     ((1UL << 32) - 1)
+
+/* Slot numbers.  read_pmc()/write_pmc() place the upper counter in
+ * bits 63:32 of the PIC value and the lower counter in bits 31:0.
+ */
+#define PIC_UPPER_INDEX                        0
+#define PIC_LOWER_INDEX                        1
+
+/* Per-cpu bookkeeping for the hardware counter slots. */
+struct cpu_hw_events {
+       /* Event bound to each slot; set by sparc_pmu_enable() and
+        * cleared by sparc_pmu_disable().
+        */
+       struct perf_event       *events[MAX_HWEVENTS];
+       /* Slots claimed by an event (test_and_set in sparc_pmu_enable()). */
+       unsigned long           used_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
+       /* Slots the NMI handler should service. */
+       unsigned long           active_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
+       /* Set by hw_perf_enable(), cleared by hw_perf_disable(). */
+       int enabled;
+};
+/* Every cpu starts out in the "enabled" state. */
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, };
+
+/* One generic perf event expressed in PMU terms. */
+struct perf_event_map {
+       u16     encoding;       /* event code, not yet shifted into place */
+       u8      pic_mask;       /* PIC_* bits; selects the counter slot used */
+#define PIC_NONE       0x00
+#define PIC_UPPER      0x01
+#define PIC_LOWER      0x02
+};
+
+/* Description of one PMU flavour. */
+struct sparc_pmu {
+       /* Translate a generic event id (attr->config) into PMU terms. */
+       const struct perf_event_map     *(*event_map)(int);
+       /* Number of generic event ids event_map() may be asked for. */
+       int                             max_events;
+       /* Left shifts that position an event code for each counter. */
+       int                             upper_shift;
+       int                             lower_shift;
+       /* Unshifted mask covering one event-select field. */
+       int                             event_mask;
+       /* Control register bits; hw_perf_disable() clears both. */
+       int                             hv_bit;
+       int                             irq_bit;
+       /* Event codes programmed into a counter that is not in use. */
+       int                             upper_nop;
+       int                             lower_nop;
+};
+
+/* Generic events available on ultra3i, indexed by PERF_COUNT_HW_* id.
+ * Cache references and cache misses use the same event code but are
+ * tied to different counters.
+ */
+static const struct perf_event_map ultra3i_perfmon_event_map[] = {
+       [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER },
+       [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER },
+       [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER },
+       [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER },
+};
+
+/* Return the ultra3i table entry for a generic event id.  No bounds
+ * check is done here; __hw_perf_event_init() compares the id against
+ * max_events before calling.
+ */
+static const struct perf_event_map *ultra3i_event_map(int event_id)
+{
+       const struct perf_event_map *table = ultra3i_perfmon_event_map;
+
+       return table + event_id;
+}
+
+/* PMU description for ultra3i.  hv_bit and irq_bit are not set here
+ * and therefore stay zero.
+ */
+static const struct sparc_pmu ultra3i_pmu = {
+       .event_map      = ultra3i_event_map,
+       .max_events     = ARRAY_SIZE(ultra3i_perfmon_event_map),
+       .upper_shift    = 11,
+       .lower_shift    = 4,
+       .event_mask     = 0x3f,
+       .upper_nop      = 0x1c,
+       .lower_nop      = 0x14,
+};
+
+/* Generic events available on niagara2, indexed by PERF_COUNT_HW_* id.
+ * Every entry may use either counter.  Note that CPU cycles and
+ * instructions are given the same encoding (0x02ff).
+ */
+static const struct perf_event_map niagara2_perfmon_event_map[] = {
+       [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER },
+       [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER },
+       [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER },
+       [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER },
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER },
+       [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER },
+};
+
+/* Return the niagara2 table entry for a generic event id.  No bounds
+ * check is done here; __hw_perf_event_init() compares the id against
+ * max_events before calling.
+ */
+static const struct perf_event_map *niagara2_event_map(int event_id)
+{
+       const struct perf_event_map *table = niagara2_perfmon_event_map;
+
+       return table + event_id;
+}
+
+/* PMU description for niagara2.  Both counters use the same "nop"
+ * event code.
+ */
+static const struct sparc_pmu niagara2_pmu = {
+       .event_map      = niagara2_event_map,
+       .max_events     = ARRAY_SIZE(niagara2_perfmon_event_map),
+       .upper_shift    = 19,
+       .lower_shift    = 6,
+       .event_mask     = 0xfff,
+       .hv_bit         = 0x8,
+       .irq_bit        = 0x03,
+       .upper_nop      = 0x220,
+       .lower_nop      = 0x220,
+};
+
+/* PMU in use.  Assigned by supported_pmu(); stays NULL when the PMU
+ * type is not recognized.
+ */
+static const struct sparc_pmu *sparc_pmu __read_mostly;
+
+/* Shift an event code (or field mask) into the control register
+ * position that belongs to counter slot idx.
+ */
+static u64 event_encoding(u64 event_id, int idx)
+{
+       int shift;
+
+       shift = (idx == PIC_UPPER_INDEX) ? sparc_pmu->upper_shift
+                                        : sparc_pmu->lower_shift;
+
+       return event_id << shift;
+}
+
+/* Mask covering the event-select field of counter slot idx. */
+static u64 mask_for_index(int idx)
+{
+       u64 field = sparc_pmu->event_mask;
+
+       return event_encoding(field, idx);
+}
+
+/* The "nop" event for counter slot idx, already shifted into place. */
+static u64 nop_for_index(int idx)
+{
+       u64 nop;
+
+       if (idx == PIC_UPPER_INDEX)
+               nop = sparc_pmu->upper_nop;
+       else
+               nop = sparc_pmu->lower_nop;
+
+       return event_encoding(nop, idx);
+}
+
+/* Replace the event-select field of slot idx with the event's own
+ * encoding (hwc->config), leaving the other control bits untouched.
+ */
+static inline void sparc_pmu_enable_event(struct hw_perf_event *hwc,
+                                           int idx)
+{
+       const u64 field = mask_for_index(idx);
+       u64 pcr = pcr_ops->read();
+
+       pcr &= ~field;
+       pcr |= hwc->config;
+       pcr_ops->write(pcr);
+}
+
+/* Park slot idx by replacing its event-select field with the "nop"
+ * event.  hwc is not used.
+ */
+static inline void sparc_pmu_disable_event(struct hw_perf_event *hwc,
+                                            int idx)
+{
+       const u64 field = mask_for_index(idx);
+       const u64 parked = nop_for_index(idx);
+       u64 pcr = pcr_ops->read();
+
+       pcr &= ~field;
+       pcr |= parked;
+       pcr_ops->write(pcr);
+}
+
+/* Re-enable counting on this cpu: OR the enable bits (config_base) of
+ * every bound event into the control register.  Does nothing when the
+ * cpu is already marked enabled.
+ */
+void hw_perf_enable(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       u64 pcr;
+       int slot;
+
+       if (cpuc->enabled)
+               return;
+
+       cpuc->enabled = 1;
+       barrier();
+
+       pcr = pcr_ops->read();
+       for (slot = 0; slot < MAX_HWEVENTS; slot++) {
+               struct perf_event *bound = cpuc->events[slot];
+
+               if (bound)
+                       pcr |= bound->hw.config_base;
+       }
+       pcr_ops->write(pcr);
+}
+
+/* Stop all counting on this cpu by clearing the user, supervisor,
+ * hypervisor and interrupt enable bits in the control register.  Does
+ * nothing when the cpu is already marked disabled.
+ */
+void hw_perf_disable(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       u64 val;
+
+       /* sparc_pmu stays NULL when the PMU type was not recognized at
+        * boot.  No hardware event can exist in that case, so there is
+        * nothing to stop, and sparc_pmu must not be dereferenced.
+        */
+       if (!sparc_pmu)
+               return;
+
+       if (!cpuc->enabled)
+               return;
+
+       cpuc->enabled = 0;
+
+       val = pcr_ops->read();
+       val &= ~(PCR_UTRACE | PCR_STRACE |
+                sparc_pmu->hv_bit | sparc_pmu->irq_bit);
+       pcr_ops->write(val);
+}
+
+/* Read the current 32-bit value of counter slot idx. */
+static u32 read_pmc(int idx)
+{
+       u64 pic;
+
+       read_pic(pic);
+
+       /* The upper counter lives in the high word of the PIC value. */
+       return (idx == PIC_UPPER_INDEX) ? (u32)(pic >> 32) : (u32)pic;
+}
+
+/* Set counter slot idx to val with a read-modify-write of the PIC
+ * register, so the other counter keeps its value.  val is not masked
+ * here; callers pass a value that fits in 32 bits.
+ */
+static void write_pmc(int idx, u64 val)
+{
+       const u64 shift = (idx == PIC_UPPER_INDEX) ? 32 : 0;
+       const u64 field = ((u64) 0xffffffff) << shift;
+       u64 pic;
+
+       read_pic(pic);
+       pic = (pic & ~field) | (val << shift);
+       write_pic(pic);
+}
+
+/* Program counter slot idx so that it overflows once the events still
+ * owed in the current sample period have been counted.
+ *
+ * Returns 1 when a new period was started, 0 otherwise.
+ */
+static int sparc_perf_event_set_period(struct perf_event *event,
+                                        struct hw_perf_event *hwc, int idx)
+{
+       s64 left = atomic64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int ret = 0;
+
+       /* Overshot by a whole period or more: restart with a full one. */
+       if (unlikely(left <= -period)) {
+               left = period;
+               atomic64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+
+       /* Period used up: carry the overshoot into the next period. */
+       if (unlikely(left <= 0)) {
+               left += period;
+               atomic64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+       /* The hardware counter is only 32 bits wide. */
+       if (left > MAX_PERIOD)
+               left = MAX_PERIOD;
+
+       atomic64_set(&hwc->prev_count, (u64)-left);
+
+       /* Counting up from -left wraps to zero after 'left' events. */
+       write_pmc(idx, (u64)(-left) & 0xffffffff);
+
+       perf_event_update_userpage(event);
+
+       return ret;
+}
+
+/* pmu->enable hook: bind the event to its counter slot on this cpu and
+ * start it.  Returns -EAGAIN when the slot is already taken, else 0.
+ */
+static int sparc_pmu_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
+
+       if (test_and_set_bit(idx, cpuc->used_mask))
+               return -EAGAIN;
+
+       /* Park the slot on the nop event while it is being set up. */
+       sparc_pmu_disable_event(hwc, idx);
+
+       cpuc->events[idx] = event;
+       set_bit(idx, cpuc->active_mask);
+
+       /* Load the counter first, then select the real event. */
+       sparc_perf_event_set_period(event, hwc, idx);
+       sparc_pmu_enable_event(hwc, idx);
+       perf_event_update_userpage(event);
+       return 0;
+}
+
+/* Fold the events counted since the last update into event->count and
+ * subtract them from the remaining period.  Returns the raw counter
+ * value that was sampled.
+ */
+static u64 sparc_perf_event_update(struct perf_event *event,
+                                    struct hw_perf_event *hwc, int idx)
+{
+       const int shift = 64 - 32;
+       u64 prev, now;
+       s64 delta;
+
+       /* Sample the counter and publish it as the new prev_count;
+        * retry if prev_count changed underneath us.
+        */
+       do {
+               prev = atomic64_read(&hwc->prev_count);
+               now = read_pmc(idx);
+       } while (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+       /* Only the low 32 bits are meaningful: shift them to the top
+        * before subtracting, then shift the signed result back down.
+        */
+       delta = (now << shift) - (prev << shift);
+       delta >>= shift;
+
+       atomic64_add(delta, &event->count);
+       atomic64_sub(delta, &hwc->period_left);
+
+       return now;
+}
+
+/* pmu->disable hook: stop the event, collect its final count and
+ * release its counter slot on this cpu.
+ */
+static void sparc_pmu_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
+
+       /* Clear active_mask first so the NMI handler skips this slot. */
+       clear_bit(idx, cpuc->active_mask);
+       sparc_pmu_disable_event(hwc, idx);
+
+       barrier();
+
+       /* Pick up whatever was counted before the slot was parked. */
+       sparc_perf_event_update(event, hwc, idx);
+       cpuc->events[idx] = NULL;
+       clear_bit(idx, cpuc->used_mask);
+
+       perf_event_update_userpage(event);
+}
+
+/* pmu->read hook: bring event->count up to date with the hardware. */
+static void sparc_pmu_read(struct perf_event *event)
+{
+       sparc_perf_event_update(event, &event->hw, event->hw.idx);
+}
+
+/* pmu->unthrottle hook: select the event's encoding on its slot again. */
+static void sparc_pmu_unthrottle(struct perf_event *event)
+{
+       sparc_pmu_enable_event(&event->hw, event->hw.idx);
+}
+
+/* Number of references currently held on the performance counters. */
+static atomic_t active_events = ATOMIC_INIT(0);
+/* Serializes the transitions of active_events from and to zero. */
+static DEFINE_MUTEX(pmc_grab_mutex);
+
+/* Take a reference on the performance counters.  The first reference
+ * stops the NMI watchdog on every cpu if it is running.  Each call
+ * must be balanced by perf_event_release_pmc().
+ */
+void perf_event_grab_pmc(void)
+{
+       /* Fast path: the counters are already held by somebody. */
+       if (atomic_inc_not_zero(&active_events))
+               return;
+
+       mutex_lock(&pmc_grab_mutex);
+       if (atomic_read(&active_events) == 0 &&
+           atomic_read(&nmi_active) > 0) {
+               on_each_cpu(stop_nmi_watchdog, NULL, 1);
+               BUG_ON(atomic_read(&nmi_active) != 0);
+       }
+       /* Always count this caller.  Another grabber may have taken
+        * the first reference while we waited for the mutex; skipping
+        * the increment in that case would leave this caller without
+        * a reference and unbalance the later release.
+        */
+       atomic_inc(&active_events);
+       mutex_unlock(&pmc_grab_mutex);
+}
+
+/* Drop a reference on the performance counters.  When the last one
+ * goes away, restart the NMI watchdog on every cpu, provided
+ * nmi_active reads zero.
+ */
+void perf_event_release_pmc(void)
+{
+       /* Returns with the mutex held only when the count hit zero. */
+       if (!atomic_dec_and_mutex_lock(&active_events, &pmc_grab_mutex))
+               return;
+
+       if (atomic_read(&nmi_active) == 0)
+               on_each_cpu(start_nmi_watchdog, NULL, 1);
+       mutex_unlock(&pmc_grab_mutex);
+}
+
+/* event->destroy callback: give back the counter reference taken in
+ * __hw_perf_event_init().
+ */
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+       perf_event_release_pmc();
+}
+
+/* Validate a new hardware event and fill in its hw_perf_event state:
+ * the enable bits (config_base), the default sample period, the
+ * counter slot (idx) and the shifted event encoding (config).
+ *
+ * Returns 0 on success, -ENODEV when the counters cannot be used,
+ * -EOPNOTSUPP for anything but PERF_TYPE_HARDWARE and -EINVAL for an
+ * event id outside the PMU's table.
+ */
+static int __hw_perf_event_init(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       struct hw_perf_event *hwc = &event->hw;
+       const struct perf_event_map *pmap;
+       u64 enc;
+
+       if (atomic_read(&nmi_active) < 0)
+               return -ENODEV;
+
+       if (attr->type != PERF_TYPE_HARDWARE)
+               return -EOPNOTSUPP;
+
+       /* sparc_pmu stays NULL when the PMU type was not recognized at
+        * boot; refuse the event instead of dereferencing it below.
+        */
+       if (!sparc_pmu)
+               return -ENODEV;
+
+       if (attr->config >= sparc_pmu->max_events)
+               return -EINVAL;
+
+       /* From here on every path succeeds, so the reference taken
+        * now is always released through event->destroy.
+        */
+       perf_event_grab_pmc();
+       event->destroy = hw_perf_event_destroy;
+
+       /* We save the enable bits in the config_base.  So to
+        * turn off sampling just write 'config', and to enable
+        * things write 'config | config_base'.
+        */
+       hwc->config_base = sparc_pmu->irq_bit;
+       if (!attr->exclude_user)
+               hwc->config_base |= PCR_UTRACE;
+       if (!attr->exclude_kernel)
+               hwc->config_base |= PCR_STRACE;
+       if (!attr->exclude_hv)
+               hwc->config_base |= sparc_pmu->hv_bit;
+
+       /* No sample period requested: use the longest one the 32-bit
+        * counter can hold.
+        */
+       if (!hwc->sample_period) {
+               hwc->sample_period = MAX_PERIOD;
+               hwc->last_period = hwc->sample_period;
+               atomic64_set(&hwc->period_left, hwc->sample_period);
+       }
+
+       pmap = sparc_pmu->event_map(attr->config);
+
+       /* Use the upper counter whenever the event allows it,
+        * otherwise the lower one.
+        */
+       enc = pmap->encoding;
+       if (pmap->pic_mask & PIC_UPPER) {
+               hwc->idx = PIC_UPPER_INDEX;
+               enc <<= sparc_pmu->upper_shift;
+       } else {
+               hwc->idx = PIC_LOWER_INDEX;
+               enc <<= sparc_pmu->lower_shift;
+       }
+
+       hwc->config |= enc;
+       return 0;
+}
+
+/* Operations table handed back to the caller of hw_perf_event_init(). */
+static const struct pmu pmu = {
+       .enable         = sparc_pmu_enable,
+       .disable        = sparc_pmu_disable,
+       .read           = sparc_pmu_read,
+       .unthrottle     = sparc_pmu_unthrottle,
+};
+
+/* Set up a hardware event.  Returns the sparc pmu operations on
+ * success or an ERR_PTR() carrying the negative errno on failure.
+ */
+const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+       int err;
+
+       err = __hw_perf_event_init(event);
+       if (!err)
+               return &pmu;
+
+       return ERR_PTR(err);
+}
+
+/* Log the raw control (PCR) and counter (PIC) register values of the
+ * current cpu.  Does nothing when no PMU was selected.
+ */
+void perf_event_print_debug(void)
+{
+       unsigned long flags;
+       u64 pcr, pic;
+       int cpu;
+
+       if (!sparc_pmu)
+               return;
+
+       /* Interrupts stay off while the cpu number and both registers
+        * are read.
+        */
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+
+       pcr = pcr_ops->read();
+       read_pic(pic);
+
+       pr_info("\n");
+       pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n",
+               cpu, pcr, pic);
+
+       local_irq_restore(flags);
+}
+
+/* Die-notifier callback servicing counter overflow on DIE_NMI.
+ *
+ * Returns NOTIFY_DONE when no events are active or cmd is not DIE_NMI,
+ * and NOTIFY_STOP once the active slots have been processed.
+ */
+static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
+                                             unsigned long cmd, void *__args)
+{
+       struct die_args *args = __args;
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct pt_regs *regs;
+       int idx;
+
+       if (!atomic_read(&active_events))
+               return NOTIFY_DONE;
+
+       switch (cmd) {
+       case DIE_NMI:
+               break;
+
+       default:
+               return NOTIFY_DONE;
+       }
+
+       regs = args->regs;
+
+       /* NOTE(review): only addr and period of 'data' are assigned in
+        * this function; confirm perf_event_overflow() reads no other
+        * field of the on-stack structure.
+        */
+       data.addr = 0;
+
+       cpuc = &__get_cpu_var(cpu_hw_events);
+       for (idx = 0; idx < MAX_HWEVENTS; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               struct hw_perf_event *hwc;
+               u64 val;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               hwc = &event->hw;
+               val = sparc_perf_event_update(event, hwc, idx);
+               /* Counters are loaded with -left, so bit 31 still set
+                * means this one has not wrapped yet.
+                */
+               if (val & (1ULL << 31))
+                       continue;
+
+               data.period = event->hw.last_period;
+               /* Reload the counter; skip reporting unless a new
+                * period was started.
+                */
+               if (!sparc_perf_event_set_period(event, hwc, idx))
+                       continue;
+
+               /* A non-zero return parks the counter on the nop event. */
+               if (perf_event_overflow(event, 1, &data, regs))
+                       sparc_pmu_disable_event(hwc, idx);
+       }
+
+       return NOTIFY_STOP;
+}
+
+/* Registered with register_die_notifier() by init_hw_perf_events(). */
+static __read_mostly struct notifier_block perf_event_nmi_notifier = {
+       .notifier_call          = perf_event_nmi_handler,
+};
+
+/* Select the PMU description matching sparc_pmu_type.  Returns true
+ * and sets sparc_pmu on a match; returns false and leaves sparc_pmu
+ * untouched otherwise.
+ */
+static bool __init supported_pmu(void)
+{
+       if (strcmp(sparc_pmu_type, "ultra3i") == 0)
+               sparc_pmu = &ultra3i_pmu;
+       else if (strcmp(sparc_pmu_type, "niagara2") == 0)
+               sparc_pmu = &niagara2_pmu;
+       else
+               return false;
+
+       return true;
+}
+
+/* Boot-time setup: pick the PMU description for this cpu type, limit
+ * the number of simultaneously active events and hook the overflow
+ * handler into the die-notifier chain.  Only logs a message when the
+ * PMU type is not supported.
+ */
+void __init init_hw_perf_events(void)
+{
+       pr_info("Performance events: ");
+
+       if (!supported_pmu()) {
+               pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
+               return;
+       }
+
+       pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
+
+       /* All sparc64 PMUs currently have 2 counters.  But this simple
+        * driver only supports one active event at a time.
+        */
+       perf_max_events = 1;
+
+       register_die_notifier(&perf_event_nmi_notifier);
+}
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S

index 0418157..0f1658d 100644 (file)
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -82,5 +82,5 @@ sys_call_table:
  /*310*/        .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
  /*315*/        .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
  /*320*/        .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-/*325*/        .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open
+/*325*/        .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
  
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S

index 91b06b7..009825f 100644 (file)
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -83,7 +83,7 @@ sys_call_table32:
  /*310*/        .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
         .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1
  /*320*/        .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv
-       .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_counter_open
+       .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open
  
  #endif /* CONFIG_COMPAT */
  
@@ -158,4 +158,4 @@ sys_call_table:
  /*310*/        .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
         .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
  /*320*/        .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-       .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open
+       .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index 51c5901..e4ff5d1 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,7 +24,7 @@ config X86
         select HAVE_UNSTABLE_SCHED_CLOCK
         select HAVE_IDE
         select HAVE_OPROFILE
-       select HAVE_PERF_COUNTERS if (!M386 && !M486)
+       select HAVE_PERF_EVENTS if (!M386 && !M486)
         select HAVE_IOREMAP_PROT
         select HAVE_KPROBES
         select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S

index ba331bf..74619c4 100644 (file)
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -831,5 +831,5 @@ ia32_sys_call_table:
         .quad compat_sys_preadv
         .quad compat_sys_pwritev
         .quad compat_sys_rt_tgsigqueueinfo      /* 335 */
-       .quad sys_perf_counter_open
+       .quad sys_perf_event_open
  ia32_syscall_end:
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h

index 5e3f204..f5693c8 100644 (file)
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
  BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
  BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
  
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
  BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
  #endif
  
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h

deleted file mode 100644 (file)

index e7b7c93..0000000
--- a/arch/x86/include/asm/perf_counter.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef _ASM_X86_PERF_COUNTER_H
-#define _ASM_X86_PERF_COUNTER_H
-
-/*
- * Performance counter hw details:
- */
-
-#define X86_PMC_MAX_GENERIC                                    8
-#define X86_PMC_MAX_FIXED                                      3
-
-#define X86_PMC_IDX_GENERIC                                    0
-#define X86_PMC_IDX_FIXED                                     32
-#define X86_PMC_IDX_MAX                                               64
-
-#define MSR_ARCH_PERFMON_PERFCTR0                            0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1                            0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0                          0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1                          0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE                    (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT                        (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS                         (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR                        (1 << 16)
-
-/*
- * Includes eventsel and unit mask as well:
- */
-#define ARCH_PERFMON_EVENT_MASK                                    0xffff
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL                0x3c
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK                (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX                 0
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-               (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-#define ARCH_PERFMON_BRANCH_MISSES_RETIRED                      6
-
-/*
- * Intel "Architectural Performance Monitoring" CPUID
- * detection/enumeration details:
- */
-union cpuid10_eax {
-       struct {
-               unsigned int version_id:8;
-               unsigned int num_counters:8;
-               unsigned int bit_width:8;
-               unsigned int mask_length:8;
-       } split;
-       unsigned int full;
-};
-
-union cpuid10_edx {
-       struct {
-               unsigned int num_counters_fixed:4;
-               unsigned int reserved:28;
-       } split;
-       unsigned int full;
-};
-
-
-/*
- * Fixed-purpose performance counters:
- */
-
-/*
- * All 3 fixed-mode PMCs are configured via this single MSR:
- */
-#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL                        0x38d
-
-/*
- * The counts are available in three separate MSRs:
- */
-
-/* Instr_Retired.Any: */
-#define MSR_ARCH_PERFMON_FIXED_CTR0                    0x309
-#define X86_PMC_IDX_FIXED_INSTRUCTIONS                 (X86_PMC_IDX_FIXED + 0)
-
-/* CPU_CLK_Unhalted.Core: */
-#define MSR_ARCH_PERFMON_FIXED_CTR1                    0x30a
-#define X86_PMC_IDX_FIXED_CPU_CYCLES                   (X86_PMC_IDX_FIXED + 1)
-
-/* CPU_CLK_Unhalted.Ref: */
-#define MSR_ARCH_PERFMON_FIXED_CTR2                    0x30b
-#define X86_PMC_IDX_FIXED_BUS_CYCLES                   (X86_PMC_IDX_FIXED + 2)
-
-/*
- * We model BTS tracing as another fixed-mode PMC.
- *
- * We choose a value in the middle of the fixed counter range, since lower
- * values are used by actual fixed counters and higher values are used
- * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
- */
-#define X86_PMC_IDX_FIXED_BTS                          (X86_PMC_IDX_FIXED + 16)
-
-
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(void);
-
-#define PERF_COUNTER_INDEX_OFFSET                      0
-
-#else
-static inline void init_hw_perf_counters(void)         { }
-static inline void perf_counters_lapic_init(void)      { }
-#endif
-
-#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h

new file mode 100644 (file)

index 0000000..ad7ce3f
--- /dev/null
+++ b/arch/x86/include/asm/perf_event.h
@@ -0,0 +1,108 @@
+#ifndef _ASM_X86_PERF_EVENT_H
+#define _ASM_X86_PERF_EVENT_H
+
+/*
+ * Performance event hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC                                    8
+#define X86_PMC_MAX_FIXED                                      3
+
+#define X86_PMC_IDX_GENERIC                                    0
+#define X86_PMC_IDX_FIXED                                     32
+#define X86_PMC_IDX_MAX                                               64
+
+#define MSR_ARCH_PERFMON_PERFCTR0                            0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1                            0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0                          0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1                          0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE                    (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT                        (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS                         (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR                        (1 << 16)
+
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK                                    0xffff
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL                0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK                (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX                 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+               (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED                      6
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+/* NOTE(review): presumably the EAX value returned by CPUID leaf 0xA;
+ * confirm against the code that fills in 'full'.  'split' views the
+ * same 32 bits as four 8-bit fields.
+ */
+union cpuid10_eax {
+       struct {
+               unsigned int version_id:8;
+               unsigned int num_events:8;
+               unsigned int bit_width:8;
+               unsigned int mask_length:8;
+       } split;
+       unsigned int full;
+};
+
+/* NOTE(review): presumably the EDX value returned by CPUID leaf 0xA;
+ * confirm against the code that fills in 'full'.  Only the low 4 bits
+ * are given a meaning here.
+ */
+union cpuid10_edx {
+       struct {
+               unsigned int num_events_fixed:4;
+               unsigned int reserved:28;
+       } split;
+       unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance events:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL                        0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0                    0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS                 (X86_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1                    0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES                   (X86_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2                    0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES                   (X86_PMC_IDX_FIXED + 2)
+
+/*
+ * We model BTS tracing as another fixed-mode PMC.
+ *
+ * We choose a value in the middle of the fixed event range, since lower
+ * values are used by actual fixed events and higher values are used
+ * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
+ */
+#define X86_PMC_IDX_FIXED_BTS                          (X86_PMC_IDX_FIXED + 16)
+
+
+#ifdef CONFIG_PERF_EVENTS
+extern void init_hw_perf_events(void);
+extern void perf_events_lapic_init(void);
+
+#define PERF_EVENT_INDEX_OFFSET                        0
+
+#else
+/* Without CONFIG_PERF_EVENTS the init hooks compile to empty stubs. */
+static inline void init_hw_perf_events(void)           { }
+static inline void perf_events_lapic_init(void)        { }
+#endif
+
+#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h

index 8deaada..6fb3c20 100644 (file)
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -341,7 +341,7 @@
  #define __NR_preadv            333
  #define __NR_pwritev           334
  #define __NR_rt_tgsigqueueinfo 335
-#define __NR_perf_counter_open 336
+#define __NR_perf_event_open   336
  
  #ifdef __KERNEL__
  
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h

index b9f3c60..8d3ad0a 100644 (file)
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)
  __SYSCALL(__NR_pwritev, sys_pwritev)
  #define __NR_rt_tgsigqueueinfo                 297
  __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_counter_open                 298
-__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
+#define __NR_perf_event_open                   298
+__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
  
  #ifndef __NO_STUBS
  #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index a34601f..754174d 100644 (file)
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,7 +14,7 @@
   *     Mikael Pettersson       :       PM converted to driver model.
   */
  
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/kernel_stat.h>
  #include <linux/mc146818rtc.h>
  #include <linux/acpi_pmtmr.h>
@@ -35,7 +35,7 @@
  #include <linux/smp.h>
  #include <linux/mm.h>
  
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
  #include <asm/x86_init.h>
  #include <asm/pgalloc.h>
  #include <asm/atomic.h>
@@ -1189,7 +1189,7 @@ void __cpuinit setup_local_APIC(void)
                 apic_write(APIC_ESR, 0);
         }
  #endif
-       perf_counters_lapic_init();
+       perf_events_lapic_init();
  
         preempt_disable();
  
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile

index 8dd3063..68537e9 100644 (file)
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)         += centaur.o
  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)     += transmeta.o
  obj-$(CONFIG_CPU_SUP_UMC_32)           += umc.o
  
-obj-$(CONFIG_PERF_COUNTERS)            += perf_counter.o
+obj-$(CONFIG_PERF_EVENTS)              += perf_event.o
  
  obj-$(CONFIG_X86_MCE)                  += mcheck/
  obj-$(CONFIG_MTRR)                     += mtrr/
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index 2fea97e..cc25c2b 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,7 +13,7 @@
  #include <linux/io.h>
  
  #include <asm/stackprotector.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
  #include <asm/mmu_context.h>
  #include <asm/hypervisor.h>
  #include <asm/processor.h>
@@ -869,7 +869,7 @@ void __init identify_boot_cpu(void)
  #else
         vgetcpu_set_mode();
  #endif
-       init_hw_perf_counters();
+       init_hw_perf_events();
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c

deleted file mode 100644 (file)

index b1f1156..0000000
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ /dev/null
@@ -1,2298 +0,0 @@
-/*
- * Performance counter x86 architecture code
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_counter.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/highmem.h>
-#include <linux/cpu.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-
-static u64 perf_counter_mask __read_mostly;
-
-/* The maximal number of PEBS counters: */
-#define MAX_PEBS_COUNTERS      4
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE                24
-
-/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE                (BTS_RECORD_SIZE * 2048)
-
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH            (BTS_RECORD_SIZE * 128)
-
-
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR                        (1 << 6)
-#define X86_DEBUGCTL_BTS               (1 << 7)
-#define X86_DEBUGCTL_BTINT             (1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS                (1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR       (1 << 10)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-       u64     bts_buffer_base;
-       u64     bts_index;
-       u64     bts_absolute_maximum;
-       u64     bts_interrupt_threshold;
-       u64     pebs_buffer_base;
-       u64     pebs_index;
-       u64     pebs_absolute_maximum;
-       u64     pebs_interrupt_threshold;
-       u64     pebs_counter_reset[MAX_PEBS_COUNTERS];
-};
-
-struct cpu_hw_counters {
-       struct perf_counter     *counters[X86_PMC_IDX_MAX];
-       unsigned long           used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long           interrupts;
-       int                     enabled;
-       struct debug_store      *ds;
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
-       const char      *name;
-       int             version;
-       int             (*handle_irq)(struct pt_regs *);
-       void            (*disable_all)(void);
-       void            (*enable_all)(void);
-       void            (*enable)(struct hw_perf_counter *, int);
-       void            (*disable)(struct hw_perf_counter *, int);
-       unsigned        eventsel;
-       unsigned        perfctr;
-       u64             (*event_map)(int);
-       u64             (*raw_event)(u64);
-       int             max_events;
-       int             num_counters;
-       int             num_counters_fixed;
-       int             counter_bits;
-       u64             counter_mask;
-       int             apic;
-       u64             max_period;
-       u64             intel_ctrl;
-       void            (*enable_bts)(u64 config);
-       void            (*disable_bts)(void);
-};
-
-static struct x86_pmu x86_pmu __read_mostly;
-
-static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
-       .enabled = 1,
-};
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
-  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
-       return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Counter setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_COUNTER                 0x0000002EULL
-
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK          0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK           0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK           0x00040000ULL
-#define P6_EVNTSEL_INV_MASK            0x00800000ULL
-#define P6_EVNTSEL_COUNTER_MASK                0xFF000000ULL
-
-#define P6_EVNTSEL_MASK                        \
-       (P6_EVNTSEL_EVENT_MASK |        \
-        P6_EVNTSEL_UNIT_MASK  |        \
-        P6_EVNTSEL_EDGE_MASK  |        \
-        P6_EVNTSEL_INV_MASK   |        \
-        P6_EVNTSEL_COUNTER_MASK)
-
-       return hw_event & P6_EVNTSEL_MASK;
-}
-
-
-/*
- * Intel PerfMon v3. Used on Core2 and later.
- */
-static const u64 intel_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x003c,
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x4f2e,
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x412e,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
-  [PERF_COUNT_HW_BUS_CYCLES]           = 0x013c,
-};
-
-static u64 intel_pmu_event_map(int hw_event)
-{
-       return intel_perfmon_event_map[hw_event];
-}
-
-/*
- * Generalized hw caching related hw_event table, filled
- * in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-static u64 __read_mostly hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-static const u64 nehalem_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
-               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
-               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
-               [ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
-               [ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
-               [ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static const u64 core2_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static const u64 atom_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK                0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK         0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK         0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK          0x00800000ULL
-#define CORE_EVNTSEL_COUNTER_MASK      0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK              \
-       (CORE_EVNTSEL_EVENT_MASK |      \
-        CORE_EVNTSEL_UNIT_MASK  |      \
-        CORE_EVNTSEL_EDGE_MASK  |      \
-        CORE_EVNTSEL_INV_MASK  |       \
-        CORE_EVNTSEL_COUNTER_MASK)
-
-       return hw_event & CORE_EVNTSEL_MASK;
-}
-
-static const u64 amd_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
-               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
-               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
-               [ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
-               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0076,
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0080,
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0081,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
-       return amd_perfmon_event_map[hw_event];
-}
-
-static u64 amd_pmu_raw_event(u64 hw_event)
-{
-#define K7_EVNTSEL_EVENT_MASK  0x7000000FFULL
-#define K7_EVNTSEL_UNIT_MASK   0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK   0x000040000ULL
-#define K7_EVNTSEL_INV_MASK    0x000800000ULL
-#define K7_EVNTSEL_COUNTER_MASK        0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK                        \
-       (K7_EVNTSEL_EVENT_MASK |        \
-        K7_EVNTSEL_UNIT_MASK  |        \
-        K7_EVNTSEL_EDGE_MASK  |        \
-        K7_EVNTSEL_INV_MASK   |        \
-        K7_EVNTSEL_COUNTER_MASK)
-
-       return hw_event & K7_EVNTSEL_MASK;
-}
-
-/*
- * Propagate counter elapsed time into the generic counter.
- * Can only be executed on the CPU where the counter is active.
- * Returns the delta events processed.
- */
-static u64
-x86_perf_counter_update(struct perf_counter *counter,
-                       struct hw_perf_counter *hwc, int idx)
-{
-       int shift = 64 - x86_pmu.counter_bits;
-       u64 prev_raw_count, new_raw_count;
-       s64 delta;
-
-       if (idx == X86_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * Careful: an NMI might modify the previous counter value.
-        *
-        * Our tactic to handle this is to first atomically read and
-        * exchange a new raw count - then add that new-prev delta
-        * count to the generic counter atomically:
-        */
-again:
-       prev_raw_count = atomic64_read(&hwc->prev_count);
-       rdmsrl(hwc->counter_base + idx, new_raw_count);
-
-       if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       new_raw_count) != prev_raw_count)
-               goto again;
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (counter-)time and add that to the generic counter.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       atomic64_add(delta, &counter->count);
-       atomic64_sub(delta, &hwc->period_left);
-
-       return new_raw_count;
-}
-
-static atomic_t active_counters;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-static bool reserve_pmc_hardware(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-       int i;
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               disable_lapic_nmi_watchdog();
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
-                       goto perfctr_fail;
-       }
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
-                       goto eventsel_fail;
-       }
-#endif
-
-       return true;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-eventsel_fail:
-       for (i--; i >= 0; i--)
-               release_evntsel_nmi(x86_pmu.eventsel + i);
-
-       i = x86_pmu.num_counters;
-
-perfctr_fail:
-       for (i--; i >= 0; i--)
-               release_perfctr_nmi(x86_pmu.perfctr + i);
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               enable_lapic_nmi_watchdog();
-
-       return false;
-#endif
-}
-
-static void release_pmc_hardware(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-       int i;
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               release_perfctr_nmi(x86_pmu.perfctr + i);
-               release_evntsel_nmi(x86_pmu.eventsel + i);
-       }
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               enable_lapic_nmi_watchdog();
-#endif
-}
-
-static inline bool bts_available(void)
-{
-       return x86_pmu.enable_bts != NULL;
-}
-
-static inline void init_debug_store_on_cpu(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
-
-       if (!ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-                    (u32)((u64)(unsigned long)ds),
-                    (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static inline void fini_debug_store_on_cpu(int cpu)
-{
-       if (!per_cpu(cpu_hw_counters, cpu).ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_bts_hardware(void)
-{
-       int cpu;
-
-       if (!bts_available())
-               return;
-
-       get_online_cpus();
-
-       for_each_online_cpu(cpu)
-               fini_debug_store_on_cpu(cpu);
-
-       for_each_possible_cpu(cpu) {
-               struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
-
-               if (!ds)
-                       continue;
-
-               per_cpu(cpu_hw_counters, cpu).ds = NULL;
-
-               kfree((void *)(unsigned long)ds->bts_buffer_base);
-               kfree(ds);
-       }
-
-       put_online_cpus();
-}
-
-static int reserve_bts_hardware(void)
-{
-       int cpu, err = 0;
-
-       if (!bts_available())
-               return 0;
-
-       get_online_cpus();
-
-       for_each_possible_cpu(cpu) {
-               struct debug_store *ds;
-               void *buffer;
-
-               err = -ENOMEM;
-               buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
-               if (unlikely(!buffer))
-                       break;
-
-               ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-               if (unlikely(!ds)) {
-                       kfree(buffer);
-                       break;
-               }
-
-               ds->bts_buffer_base = (u64)(unsigned long)buffer;
-               ds->bts_index = ds->bts_buffer_base;
-               ds->bts_absolute_maximum =
-                       ds->bts_buffer_base + BTS_BUFFER_SIZE;
-               ds->bts_interrupt_threshold =
-                       ds->bts_absolute_maximum - BTS_OVFL_TH;
-
-               per_cpu(cpu_hw_counters, cpu).ds = ds;
-               err = 0;
-       }
-
-       if (err)
-               release_bts_hardware();
-       else {
-               for_each_online_cpu(cpu)
-                       init_debug_store_on_cpu(cpu);
-       }
-
-       put_online_cpus();
-
-       return err;
-}
-
-static void hw_perf_counter_destroy(struct perf_counter *counter)
-{
-       if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
-               release_pmc_hardware();
-               release_bts_hardware();
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-}
-
-static inline int x86_pmu_initialized(void)
-{
-       return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
-{
-       unsigned int cache_type, cache_op, cache_result;
-       u64 config, val;
-
-       config = attr->config;
-
-       cache_type = (config >>  0) & 0xff;
-       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
-               return -EINVAL;
-
-       cache_op = (config >>  8) & 0xff;
-       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
-               return -EINVAL;
-
-       cache_result = (config >> 16) & 0xff;
-       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-               return -EINVAL;
-
-       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
-       if (val == 0)
-               return -ENOENT;
-
-       if (val == -1)
-               return -EINVAL;
-
-       hwc->config |= val;
-
-       return 0;
-}
-
-static void intel_pmu_enable_bts(u64 config)
-{
-       unsigned long debugctlmsr;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr |= X86_DEBUGCTL_TR;
-       debugctlmsr |= X86_DEBUGCTL_BTS;
-       debugctlmsr |= X86_DEBUGCTL_BTINT;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-               debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-               debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       unsigned long debugctlmsr;
-
-       if (!cpuc->ds)
-               return;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr &=
-               ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-                 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __hw_perf_counter_init(struct perf_counter *counter)
-{
-       struct perf_counter_attr *attr = &counter->attr;
-       struct hw_perf_counter *hwc = &counter->hw;
-       u64 config;
-       int err;
-
-       if (!x86_pmu_initialized())
-               return -ENODEV;
-
-       err = 0;
-       if (!atomic_inc_not_zero(&active_counters)) {
-               mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&active_counters) == 0) {
-                       if (!reserve_pmc_hardware())
-                               err = -EBUSY;
-                       else
-                               err = reserve_bts_hardware();
-               }
-               if (!err)
-                       atomic_inc(&active_counters);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-       if (err)
-               return err;
-
-       counter->destroy = hw_perf_counter_destroy;
-
-       /*
-        * Generate PMC IRQs:
-        * (keep 'enabled' bit clear for now)
-        */
-       hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
-       /*
-        * Count user and OS events unless requested not to.
-        */
-       if (!attr->exclude_user)
-               hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-       if (!attr->exclude_kernel)
-               hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-
-       if (!hwc->sample_period) {
-               hwc->sample_period = x86_pmu.max_period;
-               hwc->last_period = hwc->sample_period;
-               atomic64_set(&hwc->period_left, hwc->sample_period);
-       } else {
-               /*
-                * If we have a PMU initialized but no APIC
-                * interrupts, we cannot sample hardware
-                * counters (user-space has to fall back and
-                * sample via a hrtimer based software counter):
-                */
-               if (!x86_pmu.apic)
-                       return -EOPNOTSUPP;
-       }
-
-       /*
-        * Raw hw_event type provide the config in the hw_event structure
-        */
-       if (attr->type == PERF_TYPE_RAW) {
-               hwc->config |= x86_pmu.raw_event(attr->config);
-               return 0;
-       }
-
-       if (attr->type == PERF_TYPE_HW_CACHE)
-               return set_ext_hw_attr(hwc, attr);
-
-       if (attr->config >= x86_pmu.max_events)
-               return -EINVAL;
-
-       /*
-        * The generic map:
-        */
-       config = x86_pmu.event_map(attr->config);
-
-       if (config == 0)
-               return -ENOENT;
-
-       if (config == -1LL)
-               return -EINVAL;
-
-       /*
-        * Branch tracing:
-        */
-       if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
-           (hwc->sample_period == 1)) {
-               /* BTS is not supported by this architecture. */
-               if (!bts_available())
-                       return -EOPNOTSUPP;
-
-               /* BTS is currently only allowed for user-mode. */
-               if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
-                       return -EOPNOTSUPP;
-       }
-
-       hwc->config |= config;
-
-       return 0;
-}
-
-static void p6_pmu_disable_all(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       u64 val;
-
-       if (!cpuc->enabled)
-               return;
-
-       cpuc->enabled = 0;
-       barrier();
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void intel_pmu_disable_all(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       if (!cpuc->enabled)
-               return;
-
-       cpuc->enabled = 0;
-       barrier();
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-       if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-               intel_pmu_disable_bts();
-}
-
-static void amd_pmu_disable_all(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       int idx;
-
-       if (!cpuc->enabled)
-               return;
-
-       cpuc->enabled = 0;
-       /*
-        * ensure we write the disable before we start disabling the
-        * counters proper, so that amd_pmu_enable_counter() does the
-        * right thing.
-        */
-       barrier();
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               u64 val;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-               if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
-                       continue;
-               val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-               wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-       }
-}
-
-void hw_perf_disable(void)
-{
-       if (!x86_pmu_initialized())
-               return;
-       return x86_pmu.disable_all();
-}
-
-static void p6_pmu_enable_all(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       unsigned long val;
-
-       if (cpuc->enabled)
-               return;
-
-       cpuc->enabled = 1;
-       barrier();
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void intel_pmu_enable_all(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       if (cpuc->enabled)
-               return;
-
-       cpuc->enabled = 1;
-       barrier();
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
-
-       if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
-               struct perf_counter *counter =
-                       cpuc->counters[X86_PMC_IDX_FIXED_BTS];
-
-               if (WARN_ON_ONCE(!counter))
-                       return;
-
-               intel_pmu_enable_bts(counter->hw.config);
-       }
-}
-
-static void amd_pmu_enable_all(void)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       int idx;
-
-       if (cpuc->enabled)
-               return;
-
-       cpuc->enabled = 1;
-       barrier();
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct perf_counter *counter = cpuc->counters[idx];
-               u64 val;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               val = counter->hw.config;
-               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-               wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-       }
-}
-
-void hw_perf_enable(void)
-{
-       if (!x86_pmu_initialized())
-               return;
-       x86_pmu.enable_all();
-}
-
-static inline u64 intel_pmu_get_status(void)
-{
-       u64 status;
-
-       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-       return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
-       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       (void)checking_wrmsrl(hwc->config_base + idx,
-                             hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
-
-static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
-}
-
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
-{
-       int idx = __idx - X86_PMC_IDX_FIXED;
-       u64 ctrl_val, mask;
-
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       (void)checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static inline void
-p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       u64 val = P6_NOP_COUNTER;
-
-       if (cpuc->enabled)
-               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-       (void)checking_wrmsrl(hwc->config_base + idx, val);
-}
-
-static inline void
-intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
-               intel_pmu_disable_bts();
-               return;
-       }
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_disable_fixed(hwc, idx);
-               return;
-       }
-
-       x86_pmu_disable_counter(hwc, idx);
-}
-
-static inline void
-amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       x86_pmu_disable_counter(hwc, idx);
-}
-
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the counter disabled in hw:
- */
-static int
-x86_perf_counter_set_period(struct perf_counter *counter,
-                            struct hw_perf_counter *hwc, int idx)
-{
-       s64 left = atomic64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int err, ret = 0;
-
-       if (idx == X86_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * If we are way outside a reasoable range then just skip forward:
-        */
-       if (unlikely(left <= -period)) {
-               left = period;
-               atomic64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-
-       if (unlikely(left <= 0)) {
-               left += period;
-               atomic64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-       /*
-        * Quirk: certain CPUs dont like it if just 1 hw_event is left:
-        */
-       if (unlikely(left < 2))
-               left = 2;
-
-       if (left > x86_pmu.max_period)
-               left = x86_pmu.max_period;
-
-       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
-
-       /*
-        * The hw counter starts counting from this counter offset,
-        * mark it to be able to extra future deltas:
-        */
-       atomic64_set(&hwc->prev_count, (u64)-left);
-
-       err = checking_wrmsrl(hwc->counter_base + idx,
-                            (u64)(-left) & x86_pmu.counter_mask);
-
-       perf_counter_update_userpage(counter);
-
-       return ret;
-}
-
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
-{
-       int idx = __idx - X86_PMC_IDX_FIXED;
-       u64 ctrl_val, bits, mask;
-       int err;
-
-       /*
-        * Enable IRQ generation (0x8),
-        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
-        * if requested:
-        */
-       bits = 0x8ULL;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
-               bits |= 0x2;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
-               bits |= 0x1;
-       bits <<= (idx * 4);
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       ctrl_val |= bits;
-       err = checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       u64 val;
-
-       val = hwc->config;
-       if (cpuc->enabled)
-               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-       (void)checking_wrmsrl(hwc->config_base + idx, val);
-}
-
-
-static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
-               if (!__get_cpu_var(cpu_hw_counters).enabled)
-                       return;
-
-               intel_pmu_enable_bts(hwc->config);
-               return;
-       }
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_enable_fixed(hwc, idx);
-               return;
-       }
-
-       x86_pmu_enable_counter(hwc, idx);
-}
-
-static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       if (cpuc->enabled)
-               x86_pmu_enable_counter(hwc, idx);
-}
-
-static int
-fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
-{
-       unsigned int hw_event;
-
-       hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
-       if (unlikely((hw_event ==
-                     x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
-                    (hwc->sample_period == 1)))
-               return X86_PMC_IDX_FIXED_BTS;
-
-       if (!x86_pmu.num_counters_fixed)
-               return -1;
-
-       if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
-               return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-       if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
-               return X86_PMC_IDX_FIXED_CPU_CYCLES;
-       if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
-               return X86_PMC_IDX_FIXED_BUS_CYCLES;
-
-       return -1;
-}
-
-/*
- * Find a PMC slot for the freshly enabled / scheduled in counter:
- */
-static int x86_pmu_enable(struct perf_counter *counter)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       struct hw_perf_counter *hwc = &counter->hw;
-       int idx;
-
-       idx = fixed_mode_idx(counter, hwc);
-       if (idx == X86_PMC_IDX_FIXED_BTS) {
-               /* BTS is already occupied. */
-               if (test_and_set_bit(idx, cpuc->used_mask))
-                       return -EAGAIN;
-
-               hwc->config_base        = 0;
-               hwc->counter_base       = 0;
-               hwc->idx                = idx;
-       } else if (idx >= 0) {
-               /*
-                * Try to get the fixed counter, if that is already taken
-                * then try to get a generic counter:
-                */
-               if (test_and_set_bit(idx, cpuc->used_mask))
-                       goto try_generic;
-
-               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-               /*
-                * We set it so that counter_base + idx in wrmsr/rdmsr maps to
-                * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
-                */
-               hwc->counter_base =
-                       MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
-               hwc->idx = idx;
-       } else {
-               idx = hwc->idx;
-               /* Try to get the previous generic counter again */
-               if (test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
-                       idx = find_first_zero_bit(cpuc->used_mask,
-                                                 x86_pmu.num_counters);
-                       if (idx == x86_pmu.num_counters)
-                               return -EAGAIN;
-
-                       set_bit(idx, cpuc->used_mask);
-                       hwc->idx = idx;
-               }
-               hwc->config_base  = x86_pmu.eventsel;
-               hwc->counter_base = x86_pmu.perfctr;
-       }
-
-       perf_counters_lapic_init();
-
-       x86_pmu.disable(hwc, idx);
-
-       cpuc->counters[idx] = counter;
-       set_bit(idx, cpuc->active_mask);
-
-       x86_perf_counter_set_period(counter, hwc, idx);
-       x86_pmu.enable(hwc, idx);
-
-       perf_counter_update_userpage(counter);
-
-       return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_counter *counter)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       struct hw_perf_counter *hwc = &counter->hw;
-
-       if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
-                               cpuc->counters[hwc->idx] != counter))
-               return;
-
-       x86_pmu.enable(hwc, hwc->idx);
-}
-
-void perf_counter_print_debug(void)
-{
-       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-       struct cpu_hw_counters *cpuc;
-       unsigned long flags;
-       int cpu, idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_counters, cpu);
-
-       if (x86_pmu.version >= 2) {
-               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-               pr_info("\n");
-               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
-               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
-               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-       }
-       pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
-               rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
-
-               prev_left = per_cpu(pmc_prev_left[idx], cpu);
-
-               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
-                       cpu, idx, pmc_ctrl);
-               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
-                       cpu, idx, prev_left);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
-               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-       }
-       local_irq_restore(flags);
-}
-
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
-{
-       struct debug_store *ds = cpuc->ds;
-       struct bts_record {
-               u64     from;
-               u64     to;
-               u64     flags;
-       };
-       struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
-       struct bts_record *at, *top;
-       struct perf_output_handle handle;
-       struct perf_event_header header;
-       struct perf_sample_data data;
-       struct pt_regs regs;
-
-       if (!counter)
-               return;
-
-       if (!ds)
-               return;
-
-       at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-       top = (struct bts_record *)(unsigned long)ds->bts_index;
-
-       if (top <= at)
-               return;
-
-       ds->bts_index = ds->bts_buffer_base;
-
-
-       data.period     = counter->hw.last_period;
-       data.addr       = 0;
-       regs.ip         = 0;
-
-       /*
-        * Prepare a generic sample, i.e. fill in the invariant fields.
-        * We will overwrite the from and to address before we output
-        * the sample.
-        */
-       perf_prepare_sample(&header, &data, counter, &regs);
-
-       if (perf_output_begin(&handle, counter,
-                             header.size * (top - at), 1, 1))
-               return;
-
-       for (; at < top; at++) {
-               data.ip         = at->from;
-               data.addr       = at->to;
-
-               perf_output_sample(&handle, &header, &data, counter);
-       }
-
-       perf_output_end(&handle);
-
-       /* There's new data available. */
-       counter->hw.interrupts++;
-       counter->pending_kill = POLL_IN;
-}
-
-static void x86_pmu_disable(struct perf_counter *counter)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       struct hw_perf_counter *hwc = &counter->hw;
-       int idx = hwc->idx;
-
-       /*
-        * Must be done before we disable, otherwise the nmi handler
-        * could reenable again:
-        */
-       clear_bit(idx, cpuc->active_mask);
-       x86_pmu.disable(hwc, idx);
-
-       /*
-        * Make sure the cleared pointer becomes visible before we
-        * (potentially) free the counter:
-        */
-       barrier();
-
-       /*
-        * Drain the remaining delta count out of a counter
-        * that we are disabling:
-        */
-       x86_perf_counter_update(counter, hwc, idx);
-
-       /* Drain the remaining BTS records. */
-       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
-               intel_pmu_drain_bts_buffer(cpuc);
-
-       cpuc->counters[idx] = NULL;
-       clear_bit(idx, cpuc->used_mask);
-
-       perf_counter_update_userpage(counter);
-}
-
-/*
- * Save and restart an expired counter. Called by NMI contexts,
- * so it has to be careful about preempting normal counter ops:
- */
-static int intel_pmu_save_and_restart(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       int idx = hwc->idx;
-       int ret;
-
-       x86_perf_counter_update(counter, hwc, idx);
-       ret = x86_perf_counter_set_period(counter, hwc, idx);
-
-       if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-               intel_pmu_enable_counter(hwc, idx);
-
-       return ret;
-}
-
-static void intel_pmu_reset(void)
-{
-       struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
-       unsigned long flags;
-       int idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       printk("clearing PMU state on CPU#%d\n", smp_processor_id());
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
-               checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-               checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-       }
-       if (ds)
-               ds->bts_index = ds->bts_buffer_base;
-
-       local_irq_restore(flags);
-}
-
-static int p6_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_counters *cpuc;
-       struct perf_counter *counter;
-       struct hw_perf_counter *hwc;
-       int idx, handled = 0;
-       u64 val;
-
-       data.addr = 0;
-
-       cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               counter = cpuc->counters[idx];
-               hwc = &counter->hw;
-
-               val = x86_perf_counter_update(counter, hwc, idx);
-               if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-                       continue;
-
-               /*
-                * counter overflow
-                */
-               handled         = 1;
-               data.period     = counter->hw.last_period;
-
-               if (!x86_perf_counter_set_period(counter, hwc, idx))
-                       continue;
-
-               if (perf_counter_overflow(counter, 1, &data, regs))
-                       p6_pmu_disable_counter(hwc, idx);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_counters *cpuc;
-       int bit, loops;
-       u64 ack, status;
-
-       data.addr = 0;
-
-       cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       perf_disable();
-       intel_pmu_drain_bts_buffer(cpuc);
-       status = intel_pmu_get_status();
-       if (!status) {
-               perf_enable();
-               return 0;
-       }
-
-       loops = 0;
-again:
-       if (++loops > 100) {
-               WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
-               perf_counter_print_debug();
-               intel_pmu_reset();
-               perf_enable();
-               return 1;
-       }
-
-       inc_irq_stat(apic_perf_irqs);
-       ack = status;
-       for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-               struct perf_counter *counter = cpuc->counters[bit];
-
-               clear_bit(bit, (unsigned long *) &status);
-               if (!test_bit(bit, cpuc->active_mask))
-                       continue;
-
-               if (!intel_pmu_save_and_restart(counter))
-                       continue;
-
-               data.period = counter->hw.last_period;
-
-               if (perf_counter_overflow(counter, 1, &data, regs))
-                       intel_pmu_disable_counter(&counter->hw, bit);
-       }
-
-       intel_pmu_ack_status(ack);
-
-       /*
-        * Repeat if there is more work to be done:
-        */
-       status = intel_pmu_get_status();
-       if (status)
-               goto again;
-
-       perf_enable();
-
-       return 1;
-}
-
-static int amd_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_counters *cpuc;
-       struct perf_counter *counter;
-       struct hw_perf_counter *hwc;
-       int idx, handled = 0;
-       u64 val;
-
-       data.addr = 0;
-
-       cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               counter = cpuc->counters[idx];
-               hwc = &counter->hw;
-
-               val = x86_perf_counter_update(counter, hwc, idx);
-               if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-                       continue;
-
-               /*
-                * counter overflow
-                */
-               handled         = 1;
-               data.period     = counter->hw.last_period;
-
-               if (!x86_perf_counter_set_period(counter, hwc, idx))
-                       continue;
-
-               if (perf_counter_overflow(counter, 1, &data, regs))
-                       amd_pmu_disable_counter(hwc, idx);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
-       irq_enter();
-       ack_APIC_irq();
-       inc_irq_stat(apic_pending_irqs);
-       perf_counter_do_pending();
-       irq_exit();
-}
-
-void set_perf_counter_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-       apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
-void perf_counters_lapic_init(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-       if (!x86_pmu.apic || !x86_pmu_initialized())
-               return;
-
-       /*
-        * Always use NMI for PMU
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
-}
-
-static int __kprobes
-perf_counter_nmi_handler(struct notifier_block *self,
-                        unsigned long cmd, void *__args)
-{
-       struct die_args *args = __args;
-       struct pt_regs *regs;
-
-       if (!atomic_read(&active_counters))
-               return NOTIFY_DONE;
-
-       switch (cmd) {
-       case DIE_NMI:
-       case DIE_NMI_IPI:
-               break;
-
-       default:
-               return NOTIFY_DONE;
-       }
-
-       regs = args->regs;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
-       /*
-        * Can't rely on the handled return value to say it was our NMI, two
-        * counters could trigger 'simultaneously' raising two back-to-back NMIs.
-        *
-        * If the first NMI handles both, the latter will be empty and daze
-        * the CPU.
-        */
-       x86_pmu.handle_irq(regs);
-
-       return NOTIFY_STOP;
-}
-
-static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-       .notifier_call          = perf_counter_nmi_handler,
-       .next                   = NULL,
-       .priority               = 1
-};
-
-static struct x86_pmu p6_pmu = {
-       .name                   = "p6",
-       .handle_irq             = p6_pmu_handle_irq,
-       .disable_all            = p6_pmu_disable_all,
-       .enable_all             = p6_pmu_enable_all,
-       .enable                 = p6_pmu_enable_counter,
-       .disable                = p6_pmu_disable_counter,
-       .eventsel               = MSR_P6_EVNTSEL0,
-       .perfctr                = MSR_P6_PERFCTR0,
-       .event_map              = p6_pmu_event_map,
-       .raw_event              = p6_pmu_raw_event,
-       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
-       .apic                   = 1,
-       .max_period             = (1ULL << 31) - 1,
-       .version                = 0,
-       .num_counters           = 2,
-       /*
-        * Counters have 40 bits implemented. However they are designed such
-        * that bits [32-39] are sign extensions of bit 31. As such the
-        * effective width of a counter for P6-like PMU is 32 bits only.
-        *
-        * See IA-32 Intel Architecture Software developer manual Vol 3B
-        */
-       .counter_bits           = 32,
-       .counter_mask           = (1ULL << 32) - 1,
-};
-
-static struct x86_pmu intel_pmu = {
-       .name                   = "Intel",
-       .handle_irq             = intel_pmu_handle_irq,
-       .disable_all            = intel_pmu_disable_all,
-       .enable_all             = intel_pmu_enable_all,
-       .enable                 = intel_pmu_enable_counter,
-       .disable                = intel_pmu_disable_counter,
-       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
-       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
-       .event_map              = intel_pmu_event_map,
-       .raw_event              = intel_pmu_raw_event,
-       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
-       .apic                   = 1,
-       /*
-        * Intel PMCs cannot be accessed sanely above 32 bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic counter period:
-        */
-       .max_period             = (1ULL << 31) - 1,
-       .enable_bts             = intel_pmu_enable_bts,
-       .disable_bts            = intel_pmu_disable_bts,
-};
-
-static struct x86_pmu amd_pmu = {
-       .name                   = "AMD",
-       .handle_irq             = amd_pmu_handle_irq,
-       .disable_all            = amd_pmu_disable_all,
-       .enable_all             = amd_pmu_enable_all,
-       .enable                 = amd_pmu_enable_counter,
-       .disable                = amd_pmu_disable_counter,
-       .eventsel               = MSR_K7_EVNTSEL0,
-       .perfctr                = MSR_K7_PERFCTR0,
-       .event_map              = amd_pmu_event_map,
-       .raw_event              = amd_pmu_raw_event,
-       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
-       .num_counters           = 4,
-       .counter_bits           = 48,
-       .counter_mask           = (1ULL << 48) - 1,
-       .apic                   = 1,
-       /* use highest bit to detect overflow */
-       .max_period             = (1ULL << 47) - 1,
-};
-
-static int p6_pmu_init(void)
-{
-       switch (boot_cpu_data.x86_model) {
-       case 1:
-       case 3:  /* Pentium Pro */
-       case 5:
-       case 6:  /* Pentium II */
-       case 7:
-       case 8:
-       case 11: /* Pentium III */
-               break;
-       case 9:
-       case 13:
-               /* Pentium M */
-               break;
-       default:
-               pr_cont("unsupported p6 CPU model %d ",
-                       boot_cpu_data.x86_model);
-               return -ENODEV;
-       }
-
-       x86_pmu = p6_pmu;
-
-       if (!cpu_has_apic) {
-               pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
-               pr_info("no hardware sampling interrupt available.\n");
-               x86_pmu.apic = 0;
-       }
-
-       return 0;
-}
-
-static int intel_pmu_init(void)
-{
-       union cpuid10_edx edx;
-       union cpuid10_eax eax;
-       unsigned int unused;
-       unsigned int ebx;
-       int version;
-
-       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-               /* check for P6 processor family */
-          if (boot_cpu_data.x86 == 6) {
-               return p6_pmu_init();
-          } else {
-               return -ENODEV;
-          }
-       }
-
-       /*
-        * Check whether the Architectural PerfMon supports
-        * Branch Misses Retired hw_event or not.
-        */
-       cpuid(10, &eax.full, &ebx, &unused, &edx.full);
-       if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-               return -ENODEV;
-
-       version = eax.split.version_id;
-       if (version < 2)
-               return -ENODEV;
-
-       x86_pmu                         = intel_pmu;
-       x86_pmu.version                 = version;
-       x86_pmu.num_counters            = eax.split.num_counters;
-       x86_pmu.counter_bits            = eax.split.bit_width;
-       x86_pmu.counter_mask            = (1ULL << eax.split.bit_width) - 1;
-
-       /*
-        * Quirk: v2 perfmon does not report fixed-purpose counters, so
-        * assume at least 3 counters:
-        */
-       x86_pmu.num_counters_fixed      = max((int)edx.split.num_counters_fixed, 3);
-
-       /*
-        * Install the hw-cache-events table:
-        */
-       switch (boot_cpu_data.x86_model) {
-       case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
-       case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
-       case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
-       case 29: /* six-core 45 nm xeon "Dunnington" */
-               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               pr_cont("Core2 events, ");
-               break;
-       default:
-       case 26:
-               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               pr_cont("Nehalem/Corei7 events, ");
-               break;
-       case 28:
-               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               pr_cont("Atom events, ");
-               break;
-       }
-       return 0;
-}
-
-static int amd_pmu_init(void)
-{
-       /* Performance-monitoring supported from K7 and later: */
-       if (boot_cpu_data.x86 < 6)
-               return -ENODEV;
-
-       x86_pmu = amd_pmu;
-
-       /* Events are common for all AMDs */
-       memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
-              sizeof(hw_cache_event_ids));
-
-       return 0;
-}
-
-void __init init_hw_perf_counters(void)
-{
-       int err;
-
-       pr_info("Performance Counters: ");
-
-       switch (boot_cpu_data.x86_vendor) {
-       case X86_VENDOR_INTEL:
-               err = intel_pmu_init();
-               break;
-       case X86_VENDOR_AMD:
-               err = amd_pmu_init();
-               break;
-       default:
-               return;
-       }
-       if (err != 0) {
-               pr_cont("no PMU driver, software counters only.\n");
-               return;
-       }
-
-       pr_cont("%s PMU driver.\n", x86_pmu.name);
-
-       if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
-               WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-                    x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
-               x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
-       }
-       perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
-       perf_max_counters = x86_pmu.num_counters;
-
-       if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
-               WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-                    x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
-               x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
-       }
-
-       perf_counter_mask |=
-               ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
-       x86_pmu.intel_ctrl = perf_counter_mask;
-
-       perf_counters_lapic_init();
-       register_die_notifier(&perf_counter_nmi_notifier);
-
-       pr_info("... version:                 %d\n",     x86_pmu.version);
-       pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
-       pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
-       pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
-       pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
-       pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
-       pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
-}
-
-static inline void x86_pmu_read(struct perf_counter *counter)
-{
-       x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-}
-
-static const struct pmu pmu = {
-       .enable         = x86_pmu_enable,
-       .disable        = x86_pmu_disable,
-       .read           = x86_pmu_read,
-       .unthrottle     = x86_pmu_unthrottle,
-};
-
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-       int err;
-
-       err = __hw_perf_counter_init(counter);
-       if (err) {
-               if (counter->destroy)
-                       counter->destroy(counter);
-               return ERR_PTR(err);
-       }
-
-       return &pmu;
-}
-
-/*
- * callchain support
- */
-
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
-static DEFINE_PER_CPU(int, in_nmi_frame);
-
-
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-       /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
-       /* Ignore warnings */
-}
-
-static int backtrace_stack(void *data, char *name)
-{
-       per_cpu(in_nmi_frame, smp_processor_id()) =
-                       x86_is_stack_id(NMI_STACK, name);
-
-       return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
-       struct perf_callchain_entry *entry = data;
-
-       if (per_cpu(in_nmi_frame, smp_processor_id()))
-               return;
-
-       if (reliable)
-               callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
-       .warning                = backtrace_warning,
-       .warning_symbol         = backtrace_warning_symbol,
-       .stack                  = backtrace_stack,
-       .address                = backtrace_address,
-};
-
-#include "../dumpstack.h"
-
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->ip);
-
-       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
-}
-
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-       unsigned long offset, addr = (unsigned long)from;
-       int type = in_nmi() ? KM_NMI : KM_IRQ0;
-       unsigned long size, len = 0;
-       struct page *page;
-       void *map;
-       int ret;
-
-       do {
-               ret = __get_user_pages_fast(addr, 1, 0, &page);
-               if (!ret)
-                       break;
-
-               offset = addr & (PAGE_SIZE - 1);
-               size = min(PAGE_SIZE - offset, n - len);
-
-               map = kmap_atomic(page, type);
-               memcpy(to, map+offset, size);
-               kunmap_atomic(map, type);
-               put_page(page);
-
-               len  += size;
-               to   += size;
-               addr += size;
-
-       } while (len < n);
-
-       return len;
-}
-
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
-{
-       unsigned long bytes;
-
-       bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
-
-       return bytes == sizeof(*frame);
-}
-
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       struct stack_frame frame;
-       const void __user *fp;
-
-       if (!user_mode(regs))
-               regs = task_pt_regs(current);
-
-       fp = (void __user *)regs->bp;
-
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->ip);
-
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
-               frame.next_frame             = NULL;
-               frame.return_address = 0;
-
-               if (!copy_stack_frame(fp, &frame))
-                       break;
-
-               if ((unsigned long)fp < regs->sp)
-                       break;
-
-               callchain_store(entry, frame.return_address);
-               fp = frame.next_frame;
-       }
-}
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (!current || current->pid == 0)
-               return;
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-
-       if (current->mm)
-               perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry;
-
-       if (in_nmi())
-               entry = &__get_cpu_var(pmc_nmi_entry);
-       else
-               entry = &__get_cpu_var(pmc_irq_entry);
-
-       entry->nr = 0;
-
-       perf_do_callchain(regs, entry);
-
-       return entry;
-}
-
-void hw_perf_counter_setup_online(int cpu)
-{
-       init_debug_store_on_cpu(cpu);
-}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

new file mode 100644 (file)

index 0000000..0d03629
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -0,0 +1,2298 @@
+/*
+ * Performance events x86 architecture code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+/*
+ * NOTE(review): presumably the bitmask of usable counter indices (generic
+ * counter bits plus fixed-purpose bits), filled in by the PMU init code -
+ * confirm against the code that assigns it.
+ */
+static u64 perf_event_mask __read_mostly;
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS        4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE                24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE                (BTS_RECORD_SIZE * 2048)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH            (BTS_RECORD_SIZE * 128)
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR                        (1 << 6)
+#define X86_DEBUGCTL_BTS               (1 << 7)
+#define X86_DEBUGCTL_BTINT             (1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS                (1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR       (1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ *
+ * The bts_* fields are filled in by reserve_bts_hardware(); the structure
+ * is allocated with kzalloc(), so the pebs_* fields start out as zero.
+ */
+struct debug_store {
+       u64     bts_buffer_base;                /* address of the BTS buffer */
+       u64     bts_index;                      /* starts at bts_buffer_base */
+       u64     bts_absolute_maximum;           /* base + BTS_BUFFER_SIZE */
+       u64     bts_interrupt_threshold;        /* maximum - BTS_OVFL_TH */
+       u64     pebs_buffer_base;
+       u64     pebs_index;
+       u64     pebs_absolute_maximum;
+       u64     pebs_interrupt_threshold;
+       u64     pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+/*
+ * Per-CPU PMU state; one instance per CPU is defined with DEFINE_PER_CPU
+ * under the same name.
+ *
+ * NOTE(review): events[], used_mask and active_mask are presumably indexed
+ * by hardware counter index - confirm against the enable/disable paths.
+ */
+struct cpu_hw_events {
+       struct perf_event       *events[X86_PMC_IDX_MAX];
+       unsigned long           used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           interrupts;
+       int                     enabled;        /* initialised to 1 */
+       struct debug_store      *ds;            /* NULL unless BTS is reserved */
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ *
+ * One vendor/model specific description is copied into the file-scope
+ * 'x86_pmu' instance. Only the fields whose use is visible nearby are
+ * annotated below.
+ */
+struct x86_pmu {
+       const char      *name;
+       int             version;
+       /* non-NULL once a PMU is set up, see x86_pmu_initialized() */
+       int             (*handle_irq)(struct pt_regs *);
+       void            (*disable_all)(void);
+       void            (*enable_all)(void);
+       void            (*enable)(struct hw_perf_event *, int);
+       void            (*disable)(struct hw_perf_event *, int);
+       unsigned        eventsel;       /* first event-select MSR; + i for counter i */
+       unsigned        perfctr;        /* first counter MSR; + i for counter i */
+       u64             (*event_map)(int);
+       u64             (*raw_event)(u64);
+       int             max_events;
+       int             num_events;     /* number of perfctr/eventsel pairs reserved */
+       int             num_events_fixed;
+       int             event_bits;     /* counter width used by x86_perf_event_update() */
+       u64             event_mask;
+       int             apic;
+       u64             max_period;
+       u64             intel_ctrl;
+       /* non-NULL means BTS is available, see bts_available() */
+       void            (*enable_bts)(u64 config);
+       void            (*disable_bts)(void);
+};
+
+/* Description of the PMU in use; x86_pmu_initialized() tests .handle_irq. */
+static struct x86_pmu x86_pmu __read_mostly;
+
+/* Per-CPU PMU state; every CPU starts out with .enabled set to 1. */
+static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+       .enabled = 1,
+};
+
+/*
+ * P6 raw event codes for the generic PERF_COUNT_HW_* events, indexed by
+ * the generic event id.
+ *
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,
+};
+
+/*
+ * Look up the P6 raw event code for a generic PERF_COUNT_HW_* id.
+ * The index is used as-is; no range check is done here.
+ */
+static u64 p6_pmu_event_map(int hw_event)
+{
+       const u64 *map = p6_perfmon_event_map;
+
+       return map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT                   0x0000002EULL
+
+/*
+ * Reduce a raw event config to the bits covered by P6_EVNTSEL_MASK: the
+ * event, unit-mask, edge and invert fields plus bits 31:24. All other
+ * bits of @hw_event are cleared in the returned value.
+ *
+ * Note that the P6_EVNTSEL_* macros are defined at file scope even though
+ * they are written inside the function body.
+ */
+static u64 p6_pmu_raw_event(u64 hw_event)
+{
+#define P6_EVNTSEL_EVENT_MASK          0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK           0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK           0x00040000ULL
+#define P6_EVNTSEL_INV_MASK            0x00800000ULL
+#define P6_EVNTSEL_REG_MASK            0xFF000000ULL
+
+#define P6_EVNTSEL_MASK                        \
+       (P6_EVNTSEL_EVENT_MASK |        \
+        P6_EVNTSEL_UNIT_MASK  |        \
+        P6_EVNTSEL_EDGE_MASK  |        \
+        P6_EVNTSEL_INV_MASK   |        \
+        P6_EVNTSEL_REG_MASK)
+
+       return hw_event & P6_EVNTSEL_MASK;
+}
+
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ *
+ * Raw event codes for the generic PERF_COUNT_HW_* events, indexed by the
+ * generic event id.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]           = 0x013c,
+};
+
+/*
+ * Look up the Intel raw event code for a generic PERF_COUNT_HW_* id.
+ * The index is used as-is; no range check is done here.
+ */
+static u64 intel_pmu_event_map(int hw_event)
+{
+       const u64 *map = intel_perfmon_event_map;
+
+       return map[hw_event];
+}
+
+/*
+ * Generalized table of hardware cache events, filled in on a per model
+ * basis and indexed as [cache][operation][result]. A value of 0 means
+ * 'not supported', -1 means 'this event makes no sense on this CPU',
+ * any other value is the raw hardware event ID.
+ */
+
+/* Shorthand for the PERF_COUNT_HW_CACHE_* index names used in the tables. */
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+/*
+ * Nehalem cache event codes, indexed [cache][operation][result] through
+ * the C() shorthand. 0 marks a combination as not supported, -1 as not
+ * meaningful.
+ */
+static const u64 nehalem_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
+               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
+               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
+               [ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
+               [ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
+               [ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * Core2 cache event codes, indexed [cache][operation][result] through
+ * the C() shorthand. 0 marks a combination as not supported, -1 as not
+ * meaningful.
+ */
+static const u64 core2_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * Atom cache event codes, indexed [cache][operation][result] through
+ * the C() shorthand. 0 marks a combination as not supported, -1 as not
+ * meaningful.
+ */
+static const u64 atom_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD       (alias) */
+               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST       (alias) */
+               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * Reduce a raw event config to the bits covered by CORE_EVNTSEL_MASK: the
+ * event, unit-mask, edge and invert fields plus bits 31:24. All other
+ * bits of @hw_event are cleared in the returned value.
+ *
+ * Note that the CORE_EVNTSEL_* macros are defined at file scope even
+ * though they are written inside the function body.
+ */
+static u64 intel_pmu_raw_event(u64 hw_event)
+{
+#define CORE_EVNTSEL_EVENT_MASK                0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK         0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK         0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK          0x00800000ULL
+#define CORE_EVNTSEL_REG_MASK  0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK              \
+       (CORE_EVNTSEL_EVENT_MASK |      \
+        CORE_EVNTSEL_UNIT_MASK  |      \
+        CORE_EVNTSEL_EDGE_MASK  |      \
+        CORE_EVNTSEL_INV_MASK  |       \
+        CORE_EVNTSEL_REG_MASK)
+
+       return hw_event & CORE_EVNTSEL_MASK;
+}
+
+/*
+ * AMD cache event codes, indexed [cache][operation][result] through
+ * the C() shorthand. 0 marks a combination as not supported, -1 as not
+ * meaningful.
+ */
+static const u64 amd_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DTLB Miss   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches        */
+               [ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ *
+ * Raw event codes for the generic PERF_COUNT_HW_* events, indexed by the
+ * generic event id. Unlike the Intel and P6 maps this one has no entry
+ * for PERF_COUNT_HW_BUS_CYCLES.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
+};
+
+/*
+ * Look up the AMD raw event code for a generic PERF_COUNT_HW_* id.
+ * The index is used as-is; no range check is done here.
+ */
+static u64 amd_pmu_event_map(int hw_event)
+{
+       const u64 *map = amd_perfmon_event_map;
+
+       return map[hw_event];
+}
+
+/*
+ * Reduce a raw event config to the bits covered by K7_EVNTSEL_MASK; all
+ * other bits of @hw_event are cleared in the returned value.
+ *
+ * The AMD event select is 12 bits wide: EventSelect[7:0] sits in bits
+ * 7:0 of the control register and EventSelect[11:8] in bits 35:32. The
+ * event mask therefore has to keep all four upper bits (0xF); a 0x7
+ * there silently drops bit 35 and turns such events into different ones.
+ *
+ * Note that the K7_EVNTSEL_* macros are defined at file scope even though
+ * they are written inside the function body.
+ */
+static u64 amd_pmu_raw_event(u64 hw_event)
+{
+#define K7_EVNTSEL_EVENT_MASK  0xF000000FFULL
+#define K7_EVNTSEL_UNIT_MASK   0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK   0x000040000ULL
+#define K7_EVNTSEL_INV_MASK    0x000800000ULL
+#define K7_EVNTSEL_REG_MASK    0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK                        \
+       (K7_EVNTSEL_EVENT_MASK |        \
+        K7_EVNTSEL_UNIT_MASK  |        \
+        K7_EVNTSEL_EDGE_MASK  |        \
+        K7_EVNTSEL_INV_MASK   |        \
+        K7_EVNTSEL_REG_MASK)
+
+       return hw_event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ *
+ * Adds the count accumulated since the last update to event->count and
+ * subtracts it from hwc->period_left.
+ *
+ * Returns the new raw counter value read from the hardware (not the
+ * delta), or 0 without touching anything when @idx is
+ * X86_PMC_IDX_FIXED_BTS.
+ */
+static u64
+x86_perf_event_update(struct perf_event *event,
+                       struct hw_perf_event *hwc, int idx)
+{
+       int shift = 64 - x86_pmu.event_bits;
+       u64 prev_raw_count, new_raw_count;
+       s64 delta;
+
+       if (idx == X86_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * Careful: an NMI might modify the previous event value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic event atomically:
+        */
+again:
+       prev_raw_count = atomic64_read(&hwc->prev_count);
+       rdmsrl(hwc->event_base + idx, new_raw_count);
+
+       /* Retry if prev_count changed between the read and the exchange. */
+       if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               goto again;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        *
+        * Shifting both values up by (64 - event_bits) discards any bits
+        * above the counter width before subtracting; the signed result
+        * is then shifted back down.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       atomic64_add(delta, &event->count);
+       atomic64_sub(delta, &hwc->period_left);
+
+       return new_raw_count;
+}
+
+/*
+ * active_events is a reference count; hw_perf_event_destroy() decrements
+ * it and releases the PMC and BTS hardware, under pmc_reserve_mutex, when
+ * atomic_dec_and_mutex_lock() reports success.
+ */
+static atomic_t active_events;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Reserve every counter MSR and every event-select MSR of the PMU.
+ *
+ * With CONFIG_X86_LOCAL_APIC the lapic NMI watchdog is disabled first if
+ * it is in use. On any reservation failure everything reserved so far is
+ * released again, the watchdog is re-enabled and false is returned.
+ * Without CONFIG_X86_LOCAL_APIC nothing is reserved and the function
+ * always returns true.
+ */
+static bool reserve_pmc_hardware(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+       int i;
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               disable_lapic_nmi_watchdog();
+
+       for (i = 0; i < x86_pmu.num_events; i++) {
+               if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+                       goto perfctr_fail;
+       }
+
+       for (i = 0; i < x86_pmu.num_events; i++) {
+               if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+                       goto eventsel_fail;
+       }
+#endif
+
+       return true;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+eventsel_fail:
+       /* i is the event select that failed; release the ones before it. */
+       for (i--; i >= 0; i--)
+               release_evntsel_nmi(x86_pmu.eventsel + i);
+
+       /* All counters were reserved by then, so release every one of them. */
+       i = x86_pmu.num_events;
+
+perfctr_fail:
+       /* Release counters [0, i): those reserved before the failure. */
+       for (i--; i >= 0; i--)
+               release_perfctr_nmi(x86_pmu.perfctr + i);
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               enable_lapic_nmi_watchdog();
+
+       return false;
+#endif
+}
+
+/*
+ * Give back every counter / event-select MSR pair of the PMU and, if the
+ * lapic NMI watchdog is the configured one, switch it back on. Does
+ * nothing without CONFIG_X86_LOCAL_APIC.
+ */
+static void release_pmc_hardware(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+       int idx = 0;
+
+       while (idx < x86_pmu.num_events) {
+               release_perfctr_nmi(x86_pmu.perfctr + idx);
+               release_evntsel_nmi(x86_pmu.eventsel + idx);
+               idx++;
+       }
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               enable_lapic_nmi_watchdog();
+#endif
+}
+
+/* BTS counts as available when the PMU provides an enable_bts callback. */
+static inline bool bts_available(void)
+{
+       if (x86_pmu.enable_bts)
+               return true;
+
+       return false;
+}
+
+/*
+ * Write the address of @cpu's debug store into MSR_IA32_DS_AREA on that
+ * CPU, split into low and high 32-bit halves. Does nothing when the CPU
+ * has no debug store.
+ */
+static inline void init_debug_store_on_cpu(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       u64 ds_addr;
+
+       if (!ds)
+               return;
+
+       ds_addr = (u64)(unsigned long)ds;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+                    (u32)ds_addr, (u32)(ds_addr >> 32));
+}
+
+/* Clear MSR_IA32_DS_AREA on @cpu, but only if that CPU has a debug store. */
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+       if (per_cpu(cpu_hw_events, cpu).ds)
+               wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+/*
+ * Undo reserve_bts_hardware(): clear the DS area MSR on every online CPU,
+ * then detach and free the debug store and its BTS buffer of every
+ * possible CPU. A no-op when BTS is not available.
+ */
+static void release_bts_hardware(void)
+{
+       struct debug_store *ds;
+       int cpu;
+
+       if (!bts_available())
+               return;
+
+       get_online_cpus();
+
+       for_each_online_cpu(cpu)
+               fini_debug_store_on_cpu(cpu);
+
+       for_each_possible_cpu(cpu) {
+               ds = per_cpu(cpu_hw_events, cpu).ds;
+               if (ds) {
+                       /* Unhook the pointer before the memory goes away. */
+                       per_cpu(cpu_hw_events, cpu).ds = NULL;
+
+                       kfree((void *)(unsigned long)ds->bts_buffer_base);
+                       kfree(ds);
+               }
+       }
+
+       put_online_cpus();
+}
+
+/*
+ * Allocate a BTS buffer plus debug_store for every possible CPU and
+ * program the DS area MSR on every online CPU.
+ *
+ * Returns 0 on success (or when BTS is not available) and -ENOMEM if an
+ * allocation fails; on failure everything allocated so far is released.
+ */
+static int reserve_bts_hardware(void)
+{
+       struct debug_store *ds;
+       void *buffer;
+       int cpu, err = 0;
+
+       if (!bts_available())
+               return 0;
+
+       get_online_cpus();
+
+       for_each_possible_cpu(cpu) {
+               buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+               if (unlikely(!buffer)) {
+                       err = -ENOMEM;
+                       break;
+               }
+
+               ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+               if (unlikely(!ds)) {
+                       kfree(buffer);
+                       err = -ENOMEM;
+                       break;
+               }
+
+               /* Describe the (empty) buffer to the hardware. */
+               ds->bts_buffer_base = (u64)(unsigned long)buffer;
+               ds->bts_index = ds->bts_buffer_base;
+               ds->bts_absolute_maximum =
+                       ds->bts_buffer_base + BTS_BUFFER_SIZE;
+               ds->bts_interrupt_threshold =
+                       ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+               per_cpu(cpu_hw_events, cpu).ds = ds;
+       }
+
+       if (!err) {
+               for_each_online_cpu(cpu)
+                       init_debug_store_on_cpu(cpu);
+       } else {
+               release_bts_hardware();
+       }
+
+       put_online_cpus();
+
+       return err;
+}
+
+/*
+ * Event destructor: drop one reference on active_events and, when
+ * atomic_dec_and_mutex_lock() reports the last reference was dropped
+ * (returning with pmc_reserve_mutex held), release the PMC and BTS
+ * hardware reservations.
+ */
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+       if (!atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex))
+               return;
+
+       release_pmc_hardware();
+       release_bts_hardware();
+       mutex_unlock(&pmc_reserve_mutex);
+}
+
+/* Returns 1 when x86_pmu has an IRQ handler installed, 0 otherwise. */
+static inline int x86_pmu_initialized(void)
+{
+       return x86_pmu.handle_irq ? 1 : 0;
+}
+
+/*
+ * Decode a PERF_TYPE_HW_CACHE config and OR the matching hardware event
+ * id into hwc->config.
+ *
+ * attr->config layout: bits 0-7 cache type, bits 8-15 cache op,
+ * bits 16-23 cache result.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range field or a table
+ * entry of -1, and -ENOENT for a table entry of 0.
+ */
+static inline int
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
+{
+       u64 config = attr->config;
+       unsigned int type   = config & 0xff;
+       unsigned int op     = (config >> 8) & 0xff;
+       unsigned int result = (config >> 16) & 0xff;
+       u64 id;
+
+       if (type >= PERF_COUNT_HW_CACHE_MAX ||
+           op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+           result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+               return -EINVAL;
+
+       id = hw_cache_event_ids[type][op][result];
+
+       if (!id)
+               return -ENOENT;
+
+       if (id == (u64)-1)
+               return -EINVAL;
+
+       hwc->config |= id;
+
+       return 0;
+}
+
+/*
+ * Turn on branch trace store via the debugctl MSR. The TR, BTS and BTINT
+ * bits are always set; the OS / USR "off" bits are set when the matching
+ * ARCH_PERFMON_EVENTSEL_{OS,USR} bit is absent from @config.
+ */
+static void intel_pmu_enable_bts(u64 config)
+{
+       unsigned long ctl = get_debugctlmsr();
+
+       ctl |= X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT;
+
+       if ((config & ARCH_PERFMON_EVENTSEL_OS) == 0)
+               ctl |= X86_DEBUGCTL_BTS_OFF_OS;
+
+       if ((config & ARCH_PERFMON_EVENTSEL_USR) == 0)
+               ctl |= X86_DEBUGCTL_BTS_OFF_USR;
+
+       update_debugctlmsr(ctl);
+}
+
+/*
+ * Clear every BTS related bit in the debugctl MSR of the current CPU.
+ * Does nothing when this CPU has no debug_store.
+ */
+static void intel_pmu_disable_bts(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       unsigned long bts_bits, ctl;
+
+       if (!cpuc->ds)
+               return;
+
+       bts_bits = X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+                  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR;
+
+       ctl = get_debugctlmsr();
+       ctl &= ~bts_bits;
+       update_debugctlmsr(ctl);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type.
+ *
+ * Takes a reference on active_events (reserving the PMC and BTS hardware
+ * when the count goes from zero) and fills in event->hw.config from
+ * event->attr.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -ENODEV      no PMU has been initialized
+ *   -EBUSY       the PMC hardware could not be reserved
+ *   -EOPNOTSUPP  sampling requested without APIC support, or an
+ *                unsupported BTS request
+ *   -EINVAL      config out of range or mapped to -1
+ *   -ENOENT      config mapped to 0
+ * plus whatever reserve_bts_hardware() or set_ext_hw_attr() return.
+ *
+ * Note: event->destroy is installed as soon as the reference is taken,
+ * so the later error returns leave the reference in place.
+ * NOTE(review): presumably the caller invokes event->destroy on failure
+ * to drop it - confirm against the caller.
+ */
+static int __hw_perf_event_init(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 config;
+       int err;
+
+       if (!x86_pmu_initialized())
+               return -ENODEV;
+
+       err = 0;
+       /*
+        * Fast path: bump the count if it is already non-zero. Otherwise
+        * take the mutex, re-check the count and reserve the hardware for
+        * the first user.
+        */
+       if (!atomic_inc_not_zero(&active_events)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&active_events) == 0) {
+                       if (!reserve_pmc_hardware())
+                               err = -EBUSY;
+                       else
+                               err = reserve_bts_hardware();
+               }
+               if (!err)
+                       atomic_inc(&active_events);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+       if (err)
+               return err;
+
+       event->destroy = hw_perf_event_destroy;
+
+       /*
+        * Generate PMC IRQs:
+        * (keep 'enabled' bit clear for now)
+        */
+       hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+       /*
+        * Count user and OS events unless requested not to.
+        */
+       if (!attr->exclude_user)
+               hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+       if (!attr->exclude_kernel)
+               hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+       /* No sample period given: fall back to the PMU's maximum period. */
+       if (!hwc->sample_period) {
+               hwc->sample_period = x86_pmu.max_period;
+               hwc->last_period = hwc->sample_period;
+               atomic64_set(&hwc->period_left, hwc->sample_period);
+       } else {
+               /*
+                * If we have a PMU initialized but no APIC
+                * interrupts, we cannot sample hardware
+                * events (user-space has to fall back and
+                * sample via a hrtimer based software event):
+                */
+               if (!x86_pmu.apic)
+                       return -EOPNOTSUPP;
+       }
+
+       /*
+        * Raw events provide the hardware config directly in attr->config;
+        * it is passed through the PMU's raw_event() callback.
+        */
+       if (attr->type == PERF_TYPE_RAW) {
+               hwc->config |= x86_pmu.raw_event(attr->config);
+               return 0;
+       }
+
+       if (attr->type == PERF_TYPE_HW_CACHE)
+               return set_ext_hw_attr(hwc, attr);
+
+       if (attr->config >= x86_pmu.max_events)
+               return -EINVAL;
+
+       /*
+        * The generic map:
+        */
+       config = x86_pmu.event_map(attr->config);
+
+       if (config == 0)
+               return -ENOENT;
+
+       if (config == -1LL)
+               return -EINVAL;
+
+       /*
+        * Branch tracing:
+        * a branch-instructions event with a sample period of 1 is
+        * handled through BTS.
+        */
+       if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+           (hwc->sample_period == 1)) {
+               /* BTS is not supported by this architecture. */
+               if (!bts_available())
+                       return -EOPNOTSUPP;
+
+               /* BTS is currently only allowed for user-mode. */
+               if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+                       return -EOPNOTSUPP;
+       }
+
+       hwc->config |= config;
+
+       return 0;
+}
+
+/*
+ * Disable counting on the current CPU for a P6-class PMU by clearing the
+ * enable bit in MSR_P6_EVNTSEL0. No-op if cpuc->enabled is already clear.
+ */
+static void p6_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       u64 val;
+
+       if (!cpuc->enabled)
+               return;
+
+       /* Record the software state before the hardware is touched. */
+       cpuc->enabled = 0;
+       barrier();
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+/*
+ * Disable counting on the current CPU by writing 0 to
+ * MSR_CORE_PERF_GLOBAL_CTRL, and turn BTS off if the BTS pseudo-event is
+ * active. No-op if cpuc->enabled is already clear.
+ */
+static void intel_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+       if (!cpuc->enabled)
+               return;
+
+       /* Record the software state before the hardware is touched. */
+       cpuc->enabled = 0;
+       barrier();
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+       if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               intel_pmu_disable_bts();
+}
+
+/*
+ * Disable counting on the current CPU for the AMD PMU by clearing the
+ * enable bit in the event-select MSR of every active event.
+ * No-op if cpuc->enabled is already clear.
+ */
+static void amd_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       int idx;
+
+       if (!cpuc->enabled)
+               return;
+
+       cpuc->enabled = 0;
+       /*
+        * ensure we write the disable before we start disabling the
+        * events proper, so that amd_pmu_enable_event() does the
+        * right thing.
+        */
+       barrier();
+
+       for (idx = 0; idx < x86_pmu.num_events; idx++) {
+               u64 val;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+               /* Skip the MSR write when the event is already disabled. */
+               if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+                       continue;
+               val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+               wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+       }
+}
+
+/*
+ * Disable all events on the current CPU through the active PMU's
+ * disable_all() callback. No-op when no PMU has been initialized.
+ */
+void hw_perf_disable(void)
+{
+       if (!x86_pmu_initialized())
+               return;
+       /*
+        * disable_all() returns void; call it as a plain statement (as
+        * hw_perf_enable() does) rather than "return"ing a void expression.
+        */
+       x86_pmu.disable_all();
+}
+
+/*
+ * Enable counting on the current CPU for a P6-class PMU by setting the
+ * enable bit in MSR_P6_EVNTSEL0. No-op if cpuc->enabled is already set.
+ */
+static void p6_pmu_enable_all(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       /*
+        * Use a full 64-bit temporary, matching p6_pmu_disable_all(): an
+        * "unsigned long" would drop the upper MSR half on 32-bit builds
+        * during the read-modify-write below.
+        */
+       u64 val;
+
+       if (cpuc->enabled)
+               return;
+
+       /* Record the software state before the hardware is touched. */
+       cpuc->enabled = 1;
+       barrier();
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+/*
+ * Enable counting on the current CPU by writing x86_pmu.intel_ctrl to
+ * MSR_CORE_PERF_GLOBAL_CTRL, and turn BTS back on if the BTS pseudo-event
+ * is active. No-op if cpuc->enabled is already set.
+ */
+static void intel_pmu_enable_all(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+       if (cpuc->enabled)
+               return;
+
+       /* Record the software state before the hardware is touched. */
+       cpuc->enabled = 1;
+       barrier();
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+       if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+               struct perf_event *event =
+                       cpuc->events[X86_PMC_IDX_FIXED_BTS];
+
+               /* An active BTS slot without an event is unexpected. */
+               if (WARN_ON_ONCE(!event))
+                       return;
+
+               intel_pmu_enable_bts(event->hw.config);
+       }
+}
+
+/*
+ * Enable counting on the current CPU for the AMD PMU by writing each
+ * active event's config, with the enable bit set, to its event-select
+ * MSR. No-op if cpuc->enabled is already set.
+ */
+static void amd_pmu_enable_all(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       int idx;
+
+       if (cpuc->enabled)
+               return;
+
+       /* Record the software state before the hardware is touched. */
+       cpuc->enabled = 1;
+       barrier();
+
+       for (idx = 0; idx < x86_pmu.num_events; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               u64 val;
+
+               /* event is only dereferenced for slots marked active. */
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               val = event->hw.config;
+               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+               wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+       }
+}
+
+/*
+ * Enable all events on the current CPU through the active PMU's
+ * enable_all() callback. No-op when no PMU has been initialized.
+ */
+void hw_perf_enable(void)
+{
+       if (x86_pmu_initialized())
+               x86_pmu.enable_all();
+}
+
+/* Read and return the current value of MSR_CORE_PERF_GLOBAL_STATUS. */
+static inline u64 intel_pmu_get_status(void)
+{
+       u64 global_status;
+
+       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
+       return global_status;
+}
+
+/* Write @ack to MSR_CORE_PERF_GLOBAL_OVF_CTRL. */
+static inline void intel_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+/*
+ * Write the event's config, with the enable bit set, to the control MSR
+ * at hwc->config_base + @idx. A failing MSR write is ignored.
+ */
+static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+       u64 evntsel = hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+       (void)checking_wrmsrl(hwc->config_base + idx, evntsel);
+}
+
+/*
+ * Write the event's config as-is (without OR-ing in the enable bit that
+ * x86_pmu_enable_event() adds) to the control MSR at
+ * hwc->config_base + @idx. A failing MSR write is ignored.
+ */
+static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+       (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
+}
+
+/*
+ * Disable a fixed-function counter by clearing its 4-bit control field
+ * in the MSR at hwc->config_base. @__idx is the global event index;
+ * the field position is derived from (__idx - X86_PMC_IDX_FIXED).
+ */
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
+{
+       int shift = (__idx - X86_PMC_IDX_FIXED) * 4;
+       u64 ctrl;
+
+       rdmsrl(hwc->config_base, ctrl);
+       ctrl &= ~(0xfULL << shift);
+       (void)checking_wrmsrl(hwc->config_base, ctrl);
+}
+
+/*
+ * Disable a P6 event by programming P6_NOP_EVENT into its control MSR.
+ * The enable bit is kept set while the PMU is globally enabled.
+ */
+static inline void
+p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       u64 nop;
+
+       if (cpuc->enabled)
+               nop = P6_NOP_EVENT | ARCH_PERFMON_EVENTSEL0_ENABLE;
+       else
+               nop = P6_NOP_EVENT;
+
+       (void)checking_wrmsrl(hwc->config_base + idx, nop);
+}
+
+/*
+ * Disable one Intel event, dispatching on its kind: the BTS pseudo-event,
+ * a fixed-function counter, or a generic counter.
+ */
+static inline void
+intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+               intel_pmu_disable_bts();
+       else if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+               intel_pmu_disable_fixed(hwc, idx);
+       else
+               x86_pmu_disable_event(hwc, idx);
+}
+
+/* AMD events are disabled through the generic x86_pmu_disable_event(). */
+static inline void
+amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+{
+       x86_pmu_disable_event(hwc, idx);
+}
+
+/*
+ * Per-CPU copy of the last "left" value programmed for each PMC index by
+ * x86_perf_event_set_period(); printed by perf_event_print_debug().
+ */
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw.
+ *
+ * Returns 1 when a new period was started (period_left had dropped to
+ * zero or below), 0 otherwise. Always returns 0 for the BTS pseudo-event,
+ * which has no counter to program.
+ */
+static int
+x86_perf_event_set_period(struct perf_event *event,
+                            struct hw_perf_event *hwc, int idx)
+{
+       s64 left = atomic64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int ret = 0;
+
+       if (idx == X86_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               atomic64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+
+       if (unlikely(left <= 0)) {
+               left += period;
+               atomic64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+       /*
+        * Quirk: certain CPUs don't like it if just 1 hw_event is left:
+        */
+       if (unlikely(left < 2))
+               left = 2;
+
+       if (left > x86_pmu.max_period)
+               left = x86_pmu.max_period;
+
+       /* Remember the programmed value for perf_event_print_debug(). */
+       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+
+       /*
+        * The hw event starts counting from this event offset,
+        * mark it to be able to extract future deltas:
+        */
+       atomic64_set(&hwc->prev_count, (u64)-left);
+
+       /*
+        * The result of the MSR write was only ever stored in an unused
+        * local; discard it explicitly, as the enable/disable helpers do.
+        */
+       (void)checking_wrmsrl(hwc->event_base + idx,
+                             (u64)(-left) & x86_pmu.event_mask);
+
+       perf_event_update_userpage(event);
+
+       return ret;
+}
+
+/*
+ * Enable a fixed-function counter by rewriting its 4-bit control field
+ * in the MSR at hwc->config_base. @__idx is the global event index;
+ * the field position is derived from (__idx - X86_PMC_IDX_FIXED).
+ */
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
+{
+       int idx = __idx - X86_PMC_IDX_FIXED;
+       u64 ctrl_val, bits, mask;
+
+       /*
+        * Enable IRQ generation (0x8),
+        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+        * if requested:
+        */
+       bits = 0x8ULL;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+               bits |= 0x2;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+               bits |= 0x1;
+       bits <<= (idx * 4);
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       ctrl_val |= bits;
+       /*
+        * The write result was only stored in an unused local; discard it
+        * explicitly, as intel_pmu_disable_fixed() does.
+        */
+       (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+/*
+ * Program a P6 event's config into its control MSR. The enable bit is
+ * only set while the PMU is globally enabled on this CPU.
+ */
+static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+       u64 evntsel = hwc->config;
+
+       if (__get_cpu_var(cpu_hw_events).enabled)
+               evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+       (void)checking_wrmsrl(hwc->config_base + idx, evntsel);
+}
+
+
+/*
+ * Enable one Intel event, dispatching on its kind: the BTS pseudo-event
+ * (only while the PMU is globally enabled), a fixed-function counter, or
+ * a generic counter.
+ */
+static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+               if (__get_cpu_var(cpu_hw_events).enabled)
+                       intel_pmu_enable_bts(hwc->config);
+       } else if (unlikely(hwc->config_base ==
+                           MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_enable_fixed(hwc, idx);
+       } else {
+               x86_pmu_enable_event(hwc, idx);
+       }
+}
+
+/*
+ * Enable one AMD event, but only while the PMU is globally enabled on
+ * this CPU; otherwise the hardware is left untouched.
+ */
+static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+{
+       if (!__get_cpu_var(cpu_hw_events).enabled)
+               return;
+
+       x86_pmu_enable_event(hwc, idx);
+}
+
+/*
+ * Map an event onto a special (non-generic) counter index.
+ *
+ * Returns X86_PMC_IDX_FIXED_BTS for a branch-instructions event with a
+ * sample period of 1, the matching X86_PMC_IDX_FIXED_* index for
+ * instructions / cpu-cycles / bus-cycles when the PMU has fixed
+ * counters, and -1 otherwise. @event is not used.
+ */
+static int
+fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
+{
+       unsigned int code = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+       if (unlikely((code ==
+                     x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+                    (hwc->sample_period == 1)))
+               return X86_PMC_IDX_FIXED_BTS;
+
+       if (x86_pmu.num_events_fixed) {
+               if (unlikely(code ==
+                            x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+                       return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+
+               if (unlikely(code ==
+                            x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+                       return X86_PMC_IDX_FIXED_CPU_CYCLES;
+
+               if (unlikely(code ==
+                            x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+                       return X86_PMC_IDX_FIXED_BUS_CYCLES;
+       }
+
+       return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in event,
+ * program its period and enable it.
+ *
+ * Returns 0 on success, or -EAGAIN when the BTS slot is already taken or
+ * no generic counter is free.
+ */
+static int x86_pmu_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx;
+
+       idx = fixed_mode_idx(event, hwc);
+       if (idx == X86_PMC_IDX_FIXED_BTS) {
+               /* BTS is already occupied. */
+               if (test_and_set_bit(idx, cpuc->used_mask))
+                       return -EAGAIN;
+
+               /* BTS has no control/counter MSR pair of its own. */
+               hwc->config_base        = 0;
+               hwc->event_base = 0;
+               hwc->idx                = idx;
+       } else if (idx >= 0) {
+               /*
+                * Try to get the fixed event, if that is already taken
+                * then try to get a generic event:
+                */
+               if (test_and_set_bit(idx, cpuc->used_mask))
+                       goto try_generic;
+
+               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               /*
+                * We set it so that event_base + idx in wrmsr/rdmsr maps to
+                * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+                */
+               hwc->event_base =
+                       MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+               hwc->idx = idx;
+       } else {
+               idx = hwc->idx;
+               /* Try to get the previous generic event again */
+               if (test_and_set_bit(idx, cpuc->used_mask)) {
+                       /* Also reached by goto from the fixed-counter branch. */
+try_generic:
+                       idx = find_first_zero_bit(cpuc->used_mask,
+                                                 x86_pmu.num_events);
+                       if (idx == x86_pmu.num_events)
+                               return -EAGAIN;
+
+                       set_bit(idx, cpuc->used_mask);
+                       hwc->idx = idx;
+               }
+               hwc->config_base  = x86_pmu.eventsel;
+               hwc->event_base = x86_pmu.perfctr;
+       }
+
+       perf_events_lapic_init();
+
+       /* Disable the slot before the period is reprogrammed below. */
+       x86_pmu.disable(hwc, idx);
+
+       cpuc->events[idx] = event;
+       set_bit(idx, cpuc->active_mask);
+
+       x86_perf_event_set_period(event, hwc, idx);
+       x86_pmu.enable(hwc, idx);
+
+       perf_event_update_userpage(event);
+
+       return 0;
+}
+
+/*
+ * Re-enable @event in hardware through the PMU's enable() callback.
+ * Warns once and does nothing if the event's index is out of range or
+ * the slot on this CPU does not currently hold @event.
+ */
+static void x86_pmu_unthrottle(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+                               cpuc->events[hwc->idx] != event))
+               return;
+
+       x86_pmu.enable(hwc, hwc->idx);
+}
+
+/*
+ * Dump the PMU state of the current CPU with pr_info(): the global
+ * control/status MSRs (PMU version >= 2 only), the used-counter mask,
+ * and the control, count and last programmed "left" value of every
+ * generic counter, followed by the fixed counter values.
+ * Runs with local interrupts disabled. No-op when the PMU has no events.
+ */
+void perf_event_print_debug(void)
+{
+       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+       struct cpu_hw_events *cpuc;
+       unsigned long flags;
+       int cpu, idx;
+
+       if (!x86_pmu.num_events)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       /* The global MSRs are only read for PMU version 2 and later. */
+       if (x86_pmu.version >= 2) {
+               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+               pr_info("\n");
+               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+       }
+       /* Only the first 64 bits of used_mask are printed. */
+       pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+       for (idx = 0; idx < x86_pmu.num_events; idx++) {
+               rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+               rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
+
+               prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+                       cpu, idx, pmc_ctrl);
+               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
+                       cpu, idx, prev_left);
+       }
+       for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+       }
+       local_irq_restore(flags);
+}
+
+/*
+ * Flush the BTS records accumulated in this CPU's debug store buffer
+ * into the perf output buffer of the BTS event, one sample per record,
+ * and reset the buffer index.
+ *
+ * Does nothing when there is no BTS event, no debug store, or no records.
+ * If perf_output_begin() fails, the records are dropped (the index has
+ * already been reset by then).
+ */
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
+{
+       struct debug_store *ds = cpuc->ds;
+       struct bts_record {
+               u64     from;
+               u64     to;
+               u64     flags;
+       };
+       struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+       struct bts_record *at, *top;
+       struct perf_output_handle handle;
+       struct perf_event_header header;
+       struct perf_sample_data data;
+       struct pt_regs regs;
+
+       if (!event)
+               return;
+
+       if (!ds)
+               return;
+
+       at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+       top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+       if (top <= at)
+               return;
+
+       ds->bts_index = ds->bts_buffer_base;
+
+       data.period     = event->hw.last_period;
+       data.addr       = 0;
+       /*
+        * There is no raw payload for BTS samples. "data" lives on the
+        * stack, so the pointer must be cleared explicitly or the generic
+        * sample code would see stack garbage for PERF_SAMPLE_RAW events.
+        */
+       data.raw        = NULL;
+       regs.ip         = 0;
+
+       /*
+        * Prepare a generic sample, i.e. fill in the invariant fields.
+        * We will overwrite the from and to address before we output
+        * the sample.
+        */
+       perf_prepare_sample(&header, &data, event, &regs);
+
+       /* Reserve room for all records in one go. */
+       if (perf_output_begin(&handle, event,
+                             header.size * (top - at), 1, 1))
+               return;
+
+       for (; at < top; at++) {
+               data.ip         = at->from;
+               data.addr       = at->to;
+
+               perf_output_sample(&handle, &header, &data, event);
+       }
+
+       perf_output_end(&handle);
+
+       /* There's new data available. */
+       event->hw.interrupts++;
+       event->pending_kill = POLL_IN;
+}
+
+/*
+ * Stop @event on the current CPU: disable it in hardware, fold the
+ * remaining count into the event, drain pending BTS records if this is
+ * the BTS event, and free its counter slot.
+ */
+static void x86_pmu_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
+
+       /*
+        * Must be done before we disable, otherwise the nmi handler
+        * could reenable again:
+        */
+       clear_bit(idx, cpuc->active_mask);
+       x86_pmu.disable(hwc, idx);
+
+       /*
+        * Make sure the cleared pointer becomes visible before we
+        * (potentially) free the event:
+        */
+       barrier();
+
+       /*
+        * Drain the remaining delta count out of a event
+        * that we are disabling:
+        */
+       x86_perf_event_update(event, hwc, idx);
+
+       /* Drain the remaining BTS records. */
+       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+               intel_pmu_drain_bts_buffer(cpuc);
+
+       /* Release the slot only after the final update and drain above. */
+       cpuc->events[idx] = NULL;
+       clear_bit(idx, cpuc->used_mask);
+
+       perf_event_update_userpage(event);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops.
+ *
+ * Folds the current count into the event, programs the next period and
+ * re-enables the event if it is still in the ACTIVE state.
+ * Returns the result of x86_perf_event_set_period().
+ */
+static int intel_pmu_save_and_restart(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
+       int ret;
+
+       x86_perf_event_update(event, hwc, idx);
+       ret = x86_perf_event_set_period(event, hwc, idx);
+
+       if (event->state == PERF_EVENT_STATE_ACTIVE)
+               intel_pmu_enable_event(hwc, idx);
+
+       return ret;
+}
+
+/*
+ * Reset the PMU state of the current CPU: zero every generic event-select
+ * and counter MSR, zero the fixed counters, and rewind the BTS buffer
+ * index if a debug store exists. Runs with local interrupts disabled.
+ * No-op when the PMU has no events.
+ */
+static void intel_pmu_reset(void)
+{
+       struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
+       unsigned long flags;
+       int idx;
+
+       if (!x86_pmu.num_events)
+               return;
+
+       local_irq_save(flags);
+
+       printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+       for (idx = 0; idx < x86_pmu.num_events; idx++) {
+               checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+               checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+       }
+       for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+               checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+       }
+       /* Discard any BTS records still sitting in the buffer. */
+       if (ds)
+               ds->bts_index = ds->bts_buffer_base;
+
+       local_irq_restore(flags);
+}
+
+/*
+ * PMU interrupt handler for P6-class CPUs.
+ *
+ * Walks all active events, treats a counter whose top implemented bit
+ * is clear after the update as overflowed, restarts its period and
+ * reports the overflow to the perf core. Returns 1 if at least one
+ * event overflowed, 0 otherwise.
+ */
+static int p6_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int idx, handled = 0;
+       u64 val;
+
+       data.addr = 0;
+       /*
+        * "data" lives on the stack: clear the raw pointer explicitly so
+        * the generic sample code never sees stack garbage for events
+        * that request PERF_SAMPLE_RAW.
+        */
+       data.raw = NULL;
+
+       cpuc = &__get_cpu_var(cpu_hw_events);
+
+       for (idx = 0; idx < x86_pmu.num_events; idx++) {
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               event = cpuc->events[idx];
+               hwc = &event->hw;
+
+               /* Top bit still set: the counter has not wrapped yet. */
+               val = x86_perf_event_update(event, hwc, idx);
+               if (val & (1ULL << (x86_pmu.event_bits - 1)))
+                       continue;
+
+               /*
+                * event overflow
+                */
+               handled         = 1;
+               data.period     = event->hw.last_period;
+
+               if (!x86_perf_event_set_period(event, hwc, idx))
+                       continue;
+
+               /* A non-zero return asks for the event to be stopped. */
+               if (perf_event_overflow(event, 1, &data, regs))
+                       p6_pmu_disable_event(hwc, idx);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply.
+ *
+ * Drains the BTS buffer, then repeatedly processes and acks the bits
+ * set in the global overflow status until it reads back as zero.
+ * Returns 0 if no overflow was pending, 1 otherwise. After more than
+ * 100 rounds the PMU is considered stuck and is reset.
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       int bit, loops;
+       u64 ack, status;
+
+       data.addr = 0;
+       /*
+        * "data" lives on the stack: clear the raw pointer explicitly so
+        * the generic sample code never sees stack garbage for events
+        * that request PERF_SAMPLE_RAW.
+        */
+       data.raw = NULL;
+
+       cpuc = &__get_cpu_var(cpu_hw_events);
+
+       perf_disable();
+       intel_pmu_drain_bts_buffer(cpuc);
+       status = intel_pmu_get_status();
+       if (!status) {
+               perf_enable();
+               return 0;
+       }
+
+       loops = 0;
+again:
+       if (++loops > 100) {
+               WARN_ONCE(1, "perfevents: irq loop stuck!\n");
+               perf_event_print_debug();
+               intel_pmu_reset();
+               perf_enable();
+               return 1;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+       /* Remember what is about to be handled; status is consumed below. */
+       ack = status;
+       for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_event *event = cpuc->events[bit];
+
+               clear_bit(bit, (unsigned long *) &status);
+               /* event is only dereferenced for slots marked active. */
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(event))
+                       continue;
+
+               data.period = event->hw.last_period;
+
+               /* A non-zero return asks for the event to be stopped. */
+               if (perf_event_overflow(event, 1, &data, regs))
+                       intel_pmu_disable_event(&event->hw, bit);
+       }
+
+       intel_pmu_ack_status(ack);
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = intel_pmu_get_status();
+       if (status)
+               goto again;
+
+       perf_enable();
+
+       return 1;
+}
+
+/*
+ * PMU interrupt handler for AMD CPUs.
+ *
+ * Walks all active events, treats a counter whose top implemented bit
+ * is clear after the update as overflowed, restarts its period and
+ * reports the overflow to the perf core. Returns 1 if at least one
+ * event overflowed, 0 otherwise.
+ */
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int idx, handled = 0;
+       u64 val;
+
+       data.addr = 0;
+       /*
+        * "data" lives on the stack: clear the raw pointer explicitly so
+        * the generic sample code never sees stack garbage for events
+        * that request PERF_SAMPLE_RAW.
+        */
+       data.raw = NULL;
+
+       cpuc = &__get_cpu_var(cpu_hw_events);
+
+       for (idx = 0; idx < x86_pmu.num_events; idx++) {
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               event = cpuc->events[idx];
+               hwc = &event->hw;
+
+               /* Top bit still set: the counter has not wrapped yet. */
+               val = x86_perf_event_update(event, hwc, idx);
+               if (val & (1ULL << (x86_pmu.event_bits - 1)))
+                       continue;
+
+               /*
+                * event overflow
+                */
+               handled         = 1;
+               data.period     = event->hw.last_period;
+
+               if (!x86_perf_event_set_period(event, hwc, idx))
+                       continue;
+
+               /* A non-zero return asks for the event to be stopped. */
+               if (perf_event_overflow(event, 1, &data, regs))
+                       amd_pmu_disable_event(hwc, idx);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+
+/*
+ * Interrupt entry point for perf's pending work: acks the APIC, bumps
+ * the apic_pending_irqs statistic and runs perf_event_do_pending(),
+ * all inside an irq_enter()/irq_exit() section. @regs is not used.
+ */
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+       irq_enter();
+       ack_APIC_irq();
+       inc_irq_stat(apic_pending_irqs);
+       perf_event_do_pending();
+       irq_exit();
+}
+
+/*
+ * Send a self-IPI on LOCAL_PENDING_VECTOR to the current CPU.
+ * Compiles to an empty function without CONFIG_X86_LOCAL_APIC.
+ */
+void set_perf_event_pending(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+       apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+#endif
+}
+
+/*
+ * Program the local APIC performance counter LVT entry for NMI delivery,
+ * provided the PMU uses the APIC and has been initialized.
+ * Compiles to an empty function without CONFIG_X86_LOCAL_APIC.
+ */
+void perf_events_lapic_init(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+       /*
+        * Always use NMI for PMU
+        */
+       if (x86_pmu.apic && x86_pmu_initialized())
+               apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
+}
+
+static int __kprobes
+perf_event_nmi_handler(struct notifier_block *self,
+                        unsigned long cmd, void *__args)
+{
+       struct die_args *args = __args;
+       struct pt_regs *regs;
+
+       if (!atomic_read(&active_events))
+               return NOTIFY_DONE;
+
+       switch (cmd) {
+       case DIE_NMI:
+       case DIE_NMI_IPI:
+               break;
+
+       default:
+               return NOTIFY_DONE;
+       }
+
+       regs = args->regs;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
+       /*
+        * Can't rely on the handled return value to say it was our NMI, two
+        * events could trigger 'simultaneously' raising two back-to-back NMIs.
+        *
+        * If the first NMI handles both, the latter will be empty and daze
+        * the CPU.
+        */
+       x86_pmu.handle_irq(regs);
+
+       return NOTIFY_STOP;
+}
+
+/* Notifier block that hooks up perf_event_nmi_handler() at priority 1. */
+static __read_mostly struct notifier_block perf_event_nmi_notifier = {
+       .notifier_call          = perf_event_nmi_handler,
+       .next                   = NULL,
+       .priority               = 1
+};
+
+/*
+ * PMU description for P6-class CPUs: two generic counters, no BTS
+ * callbacks, effective counter width of 32 bits.
+ */
+static struct x86_pmu p6_pmu = {
+       .name                   = "p6",
+       .handle_irq             = p6_pmu_handle_irq,
+       .disable_all            = p6_pmu_disable_all,
+       .enable_all             = p6_pmu_enable_all,
+       .enable                 = p6_pmu_enable_event,
+       .disable                = p6_pmu_disable_event,
+       .eventsel               = MSR_P6_EVNTSEL0,
+       .perfctr                = MSR_P6_PERFCTR0,
+       .event_map              = p6_pmu_event_map,
+       .raw_event              = p6_pmu_raw_event,
+       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 31) - 1,
+       .version                = 0,
+       .num_events             = 2,
+       /*
+        * Events have 40 bits implemented. However they are designed such
+        * that bits [32-39] are sign extensions of bit 31. As such the
+        * effective width of a event for P6-like PMU is 32 bits only.
+        *
+        * See IA-32 Intel Architecture Software developer manual Vol 3B
+        */
+       .event_bits             = 32,
+       .event_mask             = (1ULL << 32) - 1,
+};
+
+/*
+ * PMU description for Intel CPUs with architectural perfmon.
+ *
+ * intel_pmu_init() copies it into x86_pmu and then fills in version,
+ * num_events, event_bits, event_mask and num_events_fixed from CPUID.
+ */
+static struct x86_pmu intel_pmu = {
+       .name                   = "Intel",
+       .apic                   = 1,
+
+       /* Callbacks: */
+       .handle_irq             = intel_pmu_handle_irq,
+       .enable_all             = intel_pmu_enable_all,
+       .disable_all            = intel_pmu_disable_all,
+       .enable                 = intel_pmu_enable_event,
+       .disable                = intel_pmu_disable_event,
+       .enable_bts             = intel_pmu_enable_bts,
+       .disable_bts            = intel_pmu_disable_bts,
+       .event_map              = intel_pmu_event_map,
+       .raw_event              = intel_pmu_raw_event,
+
+       /* Registers and event table size: */
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+
+       /*
+        * Intel PMCs cannot be accessed sanely above 32 bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic event period:
+        */
+       .max_period             = (1ULL << 31) - 1,
+};
+
+/*
+ * PMU description for AMD CPUs; amd_pmu_init() copies it into x86_pmu.
+ */
+static struct x86_pmu amd_pmu = {
+       .name                   = "AMD",
+       .apic                   = 1,
+
+       /* Callbacks: */
+       .handle_irq             = amd_pmu_handle_irq,
+       .enable_all             = amd_pmu_enable_all,
+       .disable_all            = amd_pmu_disable_all,
+       .enable                 = amd_pmu_enable_event,
+       .disable                = amd_pmu_disable_event,
+       .event_map              = amd_pmu_event_map,
+       .raw_event              = amd_pmu_raw_event,
+
+       /* Registers and event table size: */
+       .eventsel               = MSR_K7_EVNTSEL0,
+       .perfctr                = MSR_K7_PERFCTR0,
+       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
+       .num_events             = 4,
+
+       /* 48-bit events; use highest bit to detect overflow */
+       .event_bits             = 48,
+       .event_mask             = (1ULL << 48) - 1,
+       .max_period             = (1ULL << 47) - 1,
+};
+
+/*
+ * Select the P6 PMU description for the boot CPU.
+ *
+ * Returns 0 once x86_pmu has been set up, or -ENODEV when the CPU model
+ * is not one of the models accepted below. Without a local APIC the PMU
+ * is still installed, but x86_pmu.apic is cleared.
+ */
+static int p6_pmu_init(void)
+{
+       const int model = boot_cpu_data.x86_model;
+
+       switch (model) {
+       case 1: case 3:                 /* Pentium Pro */
+       case 5: case 6:                 /* Pentium II */
+       case 7: case 8: case 11:        /* Pentium III */
+       case 9: case 13:                /* Pentium M */
+               break;
+       default:
+               pr_cont("unsupported p6 CPU model %d ", model);
+               return -ENODEV;
+       }
+
+       x86_pmu = p6_pmu;
+
+       if (cpu_has_apic)
+               return 0;
+
+       pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+       pr_info("no hardware sampling interrupt available.\n");
+       x86_pmu.apic = 0;
+
+       return 0;
+}
+
+/*
+ * Probe an Intel boot CPU and fill in x86_pmu.
+ *
+ * CPUs without X86_FEATURE_ARCH_PERFMON are handed to p6_pmu_init() if
+ * they are family 6 and rejected otherwise. With architectural perfmon,
+ * CPUID leaf 10 must report the Branch Misses Retired event and a
+ * version of at least 2.
+ *
+ * Returns 0 on success, -ENODEV when no usable PMU was found.
+ */
+static int intel_pmu_init(void)
+{
+       union cpuid10_eax eax;
+       union cpuid10_edx edx;
+       unsigned int ebx, unused;
+       int version;
+
+       /* No architectural perfmon: only the P6 family has a fallback. */
+       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+               return boot_cpu_data.x86 == 6 ? p6_pmu_init() : -ENODEV;
+
+       /*
+        * Check whether the Architectural PerfMon supports
+        * Branch Misses Retired hw_event or not.
+        */
+       cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+       if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+               return -ENODEV;
+
+       version = eax.split.version_id;
+       if (version < 2)
+               return -ENODEV;
+
+       /* Start from the static template, then apply the CPUID values. */
+       x86_pmu = intel_pmu;
+       x86_pmu.version = version;
+       x86_pmu.num_events = eax.split.num_events;
+       x86_pmu.event_bits = eax.split.bit_width;
+       x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
+
+       /*
+        * Quirk: v2 perfmon does not report fixed-purpose events, so
+        * assume at least 3 events:
+        */
+       x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+
+       /*
+        * Install the hw-cache-events table:
+        */
+       switch (boot_cpu_data.x86_model) {
+       case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+       case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+       case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+       case 29: /* six-core 45 nm xeon "Dunnington" */
+               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               pr_cont("Core2 events, ");
+               break;
+
+       case 28:
+               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               pr_cont("Atom events, ");
+               break;
+
+       case 26:
+       default:
+               /* Model 26 and every model not listed above. */
+               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               pr_cont("Nehalem/Corei7 events, ");
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * Select the AMD PMU description and cache-event table for the boot CPU.
+ *
+ * Returns 0 on success, -ENODEV for CPU families below 6.
+ */
+static int amd_pmu_init(void)
+{
+       /* Performance-monitoring supported from K7 and later: */
+       if (boot_cpu_data.x86 >= 6) {
+               x86_pmu = amd_pmu;
+
+               /* Events are common for all AMDs */
+               memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               return 0;
+       }
+
+       return -ENODEV;
+}
+
+/*
+ * Boot-time setup of the hardware PMU driver.
+ *
+ * Runs the vendor-specific probe, clips the generic and fixed event
+ * counts to X86_PMC_MAX_GENERIC / X86_PMC_MAX_FIXED, builds
+ * perf_event_mask from them, sets up the local APIC, registers the NMI
+ * die notifier and prints a summary of the resulting configuration.
+ *
+ * When the probe fails, or the CPU vendor is neither Intel nor AMD, the
+ * function reports that only software events are available and returns
+ * without touching any of the above.
+ */
+void __init init_hw_perf_events(void)
+{
+       int err;
+
+       pr_info("Performance Events: ");
+
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_INTEL:
+               err = intel_pmu_init();
+               break;
+       case X86_VENDOR_AMD:
+               err = amd_pmu_init();
+               break;
+       default:
+               /*
+                * No driver for this vendor. Take the common failure path
+                * so that the line opened by pr_info() above is completed
+                * instead of being left unterminated in the log.
+                */
+               err = -ENODEV;
+               break;
+       }
+       if (err != 0) {
+               pr_cont("no PMU driver, software events only.\n");
+               return;
+       }
+
+       pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+       /* Clip the generic event count to what the core code supports. */
+       if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+               WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+                    x86_pmu.num_events, X86_PMC_MAX_GENERIC);
+               x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+       }
+       /* One mask bit per generic event, starting at bit 0. */
+       perf_event_mask = (1 << x86_pmu.num_events) - 1;
+       perf_max_events = x86_pmu.num_events;
+
+       /* Same clipping for the fixed-purpose events. */
+       if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+               WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+                    x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
+               x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+       }
+
+       /* Fixed-purpose events occupy mask bits from X86_PMC_IDX_FIXED up. */
+       perf_event_mask |=
+               ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
+       x86_pmu.intel_ctrl = perf_event_mask;
+
+       perf_events_lapic_init();
+       register_die_notifier(&perf_event_nmi_notifier);
+
+       pr_info("... version:                 %d\n",     x86_pmu.version);
+       pr_info("... bit width:               %d\n",     x86_pmu.event_bits);
+       pr_info("... generic events:        %d\n",     x86_pmu.num_events);
+       pr_info("... value mask:              %016Lx\n", x86_pmu.event_mask);
+       pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+       pr_info("... fixed-purpose events:  %d\n",     x86_pmu.num_events_fixed);
+       pr_info("... event mask:            %016Lx\n", perf_event_mask);
+}
+
+/*
+ * .read callback of the pmu ops table below: passes the event, its hw
+ * state and its hw.idx to x86_perf_event_update().
+ */
+static inline void x86_pmu_read(struct perf_event *event)
+{
+       x86_perf_event_update(event, &event->hw, event->hw.idx);
+}
+
+/* The x86 pmu operations; hw_perf_event_init() returns a pointer to this. */
+static const struct pmu pmu = {
+       .enable         = x86_pmu_enable,
+       .disable        = x86_pmu_disable,
+       .read           = x86_pmu_read,
+       .unthrottle     = x86_pmu_unthrottle,
+};
+
+/*
+ * Initialize the hardware side of @event.
+ *
+ * Returns the x86 pmu ops on success. If __hw_perf_event_init() fails,
+ * the event's destroy callback (when set) is invoked and the error is
+ * returned encoded with ERR_PTR().
+ */
+const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+       int ret = __hw_perf_event_init(event);
+
+       if (!ret)
+               return &pmu;
+
+       if (event->destroy)
+               event->destroy(event);
+
+       return ERR_PTR(ret);
+}
+
+/*
+ * callchain support
+ */
+
+/*
+ * Append @ip to @entry; entries beyond PERF_MAX_STACK_DEPTH are
+ * silently dropped.
+ */
+static inline
+void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+       if (entry->nr >= PERF_MAX_STACK_DEPTH)
+               return;
+
+       entry->ip[entry->nr] = ip;
+       entry->nr++;
+}
+
+/*
+ * Per-CPU callchain buffers: perf_callchain() uses pmc_nmi_entry when
+ * in_nmi() is true and pmc_irq_entry otherwise.
+ */
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+/*
+ * Written by backtrace_stack(); while non-zero, backtrace_address()
+ * does not record addresses.
+ */
+static DEFINE_PER_CPU(int, in_nmi_frame);
+
+
+/* .warning_symbol callback of backtrace_ops: deliberately does nothing. */
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       /* Ignore warnings */
+}
+
+/* .warning callback of backtrace_ops: deliberately does nothing. */
+static void backtrace_warning(void *data, char *msg)
+{
+       /* Ignore warnings */
+}
+
+/*
+ * .stack callback of backtrace_ops: remember, per CPU, whether the stack
+ * called @name is the NMI stack. Always returns 0.
+ */
+static int backtrace_stack(void *data, char *name)
+{
+       int on_nmi_stack = x86_is_stack_id(NMI_STACK, name);
+
+       per_cpu(in_nmi_frame, smp_processor_id()) = on_nmi_stack;
+
+       return 0;
+}
+
+/*
+ * .address callback of backtrace_ops: store @addr in the callchain
+ * passed as @data, but only if it is reliable and was not found while
+ * in_nmi_frame is set for this CPU.
+ */
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+       struct perf_callchain_entry *chain = data;
+
+       if (reliable && !per_cpu(in_nmi_frame, smp_processor_id()))
+               callchain_store(chain, addr);
+}
+
+/* Callbacks handed to dump_trace() by perf_callchain_kernel(). */
+static const struct stacktrace_ops backtrace_ops = {
+       .warning                = backtrace_warning,
+       .warning_symbol         = backtrace_warning_symbol,
+       .stack                  = backtrace_stack,
+       .address                = backtrace_address,
+};
+
+#include "../dumpstack.h"
+
+/*
+ * Record the kernel part of a callchain in @entry: a PERF_CONTEXT_KERNEL
+ * marker, then regs->ip, then whatever dump_trace() reports through
+ * backtrace_ops.
+ */
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       callchain_store(entry, PERF_CONTEXT_KERNEL);
+       callchain_store(entry, regs->ip);
+
+       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+}
+
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ *
+ * Copies up to @n bytes from user address @from to @to, one page at a
+ * time, and stops at the first page __get_user_pages_fast() cannot
+ * provide. Returns the number of bytes actually copied.
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+       unsigned long uaddr = (unsigned long)from;
+       unsigned long copied = 0;
+       int km_slot = in_nmi() ? KM_NMI : KM_IRQ0;
+
+       do {
+               unsigned long in_page, chunk;
+               struct page *page;
+               void *kaddr;
+
+               if (!__get_user_pages_fast(uaddr, 1, 0, &page))
+                       break;
+
+               /* Never copy past the end of the current page. */
+               in_page = uaddr & (PAGE_SIZE - 1);
+               chunk = min(PAGE_SIZE - in_page, n - copied);
+
+               kaddr = kmap_atomic(page, km_slot);
+               memcpy(to, kaddr + in_page, chunk);
+               kunmap_atomic(kaddr, km_slot);
+               put_page(page);
+
+               copied += chunk;
+               uaddr  += chunk;
+               to     += chunk;
+
+       } while (copied < n);
+
+       return copied;
+}
+
+/*
+ * Read one struct stack_frame from user address @fp into @frame.
+ * Returns non-zero only if the whole frame was copied.
+ */
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+       return copy_from_user_nmi(frame, fp, sizeof(*frame)) == sizeof(*frame);
+}
+
+/*
+ * Record the user part of a callchain in @entry: a PERF_CONTEXT_USER
+ * marker, then regs->ip, then the return addresses found by following
+ * the frame-pointer chain that starts at regs->bp.
+ */
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       struct stack_frame frame;
+       const void __user *fp;
+
+       /* Not a user-mode sample: use task_pt_regs(current) instead. */
+       if (!user_mode(regs))
+               regs = task_pt_regs(current);
+
+       callchain_store(entry, PERF_CONTEXT_USER);
+       callchain_store(entry, regs->ip);
+
+       for (fp = (void __user *)regs->bp;
+            entry->nr < PERF_MAX_STACK_DEPTH;
+            fp = frame.next_frame) {
+               frame.return_address = 0;
+               frame.next_frame = NULL;
+
+               if (!copy_stack_frame(fp, &frame))
+                       break;
+
+               /* A frame pointer below the stack pointer ends the walk. */
+               if ((unsigned long)fp < regs->sp)
+                       break;
+
+               callchain_store(entry, frame.return_address);
+       }
+}
+
+/*
+ * Fill @entry for the context described by @regs.
+ *
+ * Nothing is recorded without regs, for pid 0, or for a user-mode sample
+ * of a task that is not TASK_RUNNING. A kernel-mode sample gets the
+ * kernel callchain first; the user callchain follows whenever the
+ * current task has an mm.
+ */
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       int from_user;
+
+       if (!regs)
+               return;
+
+       from_user = user_mode(regs);
+
+       if (!current || current->pid == 0)
+               return;
+
+       if (from_user) {
+               if (current->state != TASK_RUNNING)
+                       return;
+       } else {
+               perf_callchain_kernel(regs, entry);
+       }
+
+       if (current->mm)
+               perf_callchain_user(regs, entry);
+}
+
+/*
+ * Build the callchain for @regs in this CPU's buffer (the NMI buffer
+ * when in_nmi(), the IRQ buffer otherwise) and return that buffer.
+ */
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       struct perf_callchain_entry *chain;
+
+       chain = in_nmi() ? &__get_cpu_var(pmc_nmi_entry)
+                        : &__get_cpu_var(pmc_irq_entry);
+
+       /* Start from an empty chain on every call. */
+       chain->nr = 0;
+       perf_do_callchain(regs, chain);
+
+       return chain;
+}
+
+/*
+ * Forwards @cpu to init_debug_store_on_cpu().
+ * NOTE(review): presumably invoked when a CPU comes online; confirm
+ * against the callers.
+ */
+void hw_perf_event_setup_online(int cpu)
+{
+       init_debug_store_on_cpu(cpu);
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c

index 392bea4..fab786f 100644 (file)
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
  #include <linux/kprobes.h>
  
  #include <asm/apic.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
  
  struct nmi_watchdog_ctlblk {
         unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S

index d59fe32..681c3fd 100644 (file)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1021,7 +1021,7 @@ apicinterrupt ERROR_APIC_VECTOR \
  apicinterrupt SPURIOUS_APIC_VECTOR \
         spurious_interrupt smp_spurious_interrupt
  
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
  apicinterrupt LOCAL_PENDING_VECTOR \
         perf_pending_interrupt smp_perf_pending_interrupt
  #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c

index 3008831..40f3077 100644 (file)
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -208,7 +208,7 @@ static void __init apic_intr_init(void)
         alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
  
         /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_COUNTERS
+# ifdef CONFIG_PERF_EVENTS
         alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
  # endif
  
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S

index d51321d..0157cd2 100644 (file)
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,4 +335,4 @@ ENTRY(sys_call_table)
         .long sys_preadv
         .long sys_pwritev
         .long sys_rt_tgsigqueueinfo     /* 335 */
-       .long sys_perf_counter_open
+       .long sys_perf_event_open
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index 775a020..82728f2 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,7 +10,7 @@
  #include <linux/bootmem.h>             /* max_low_pfn                  */
  #include <linux/kprobes.h>             /* __kprobes, ...               */
  #include <linux/mmiotrace.h>           /* kmmio_handler, ...           */
-#include <linux/perf_counter.h>                /* perf_swcounter_event         */
+#include <linux/perf_event.h>          /* perf_sw_event                */
  
  #include <asm/traps.h>                 /* dotraplinkage, ...           */
  #include <asm/pgalloc.h>               /* pgd_*(), ...                 */
@@ -1017,7 +1017,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
         if (unlikely(error_code & PF_RSVD))
                 pgtable_bad(regs, error_code, address);
  
-       perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
  
         /*
          * If we're in an interrupt, have no user context or are running
@@ -1114,11 +1114,11 @@ good_area:
  
         if (fault & VM_FAULT_MAJOR) {
                 tsk->maj_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                                      regs, address);
         } else {
                 tsk->min_flt++;
-               perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                                      regs, address);
         }
  
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c

index 4899215..8eb0587 100644 (file)
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void)
         if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
                 current_cpu_data.x86_model == 15) {
                 eax.split.version_id = 2;
-               eax.split.num_counters = 2;
+               eax.split.num_events = 2;
                 eax.split.bit_width = 40;
         }
  
-       num_counters = eax.split.num_counters;
+       num_counters = eax.split.num_events;
  
         op_arch_perfmon_spec.num_counters = num_counters;
         op_arch_perfmon_spec.num_controls = num_counters;
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h

index b837761..7b8e75d 100644 (file)
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -13,7 +13,7 @@
  #define OP_X86_MODEL_H
  
  #include <asm/types.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
  
  struct op_msr {
         unsigned long   addr;
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c

index 50eecfe..44203ff 100644 (file)
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -26,7 +26,7 @@
  #include <linux/proc_fs.h>
  #include <linux/nmi.h>
  #include <linux/quotaops.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/kernel.h>
  #include <linux/module.h>
  #include <linux/suspend.h>
@@ -252,7 +252,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
         struct pt_regs *regs = get_irq_regs();
         if (regs)
                 show_regs(regs);
-       perf_counter_print_debug();
+       perf_event_print_debug();
  }
  static struct sysrq_key_op sysrq_showregs_op = {
         .handler        = sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c

index 172ceb6..434dba7 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
  #include <linux/string.h>
  #include <linux/init.h>
  #include <linux/pagemap.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/highmem.h>
  #include <linux/spinlock.h>
  #include <linux/key.h>
@@ -923,7 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
         task_lock(tsk);
         strlcpy(tsk->comm, buf, sizeof(tsk->comm));
         task_unlock(tsk);
-       perf_counter_comm(tsk);
+       perf_event_comm(tsk);
  }
  
  int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +997,7 @@ int flush_old_exec(struct linux_binprm * bprm)
          * security domain:
          */
         if (!get_dumpable(current->mm))
-               perf_counter_exit_task(current);
+               perf_event_exit_task(current);
  
         /* An exec changes our domain. We are no longer part of the thread
            group */
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h

index 1125e5a..d76b66a 100644 (file)
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -620,8 +620,8 @@ __SYSCALL(__NR_move_pages, sys_move_pages)
  
  #define __NR_rt_tgsigqueueinfo 240
  __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_counter_open 241
-__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
+#define __NR_perf_event_open 241
+__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
  
  #undef __NR_syscalls
  #define __NR_syscalls 242
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index 9e7f2e8..21a6f5d 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -106,13 +106,13 @@ extern struct group_info init_groups;
  
  extern struct cred init_cred;
  
-#ifdef CONFIG_PERF_COUNTERS
-# define INIT_PERF_COUNTERS(tsk)                                       \
-       .perf_counter_mutex =                                           \
-                __MUTEX_INITIALIZER(tsk.perf_counter_mutex),           \
-       .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list),
+#ifdef CONFIG_PERF_EVENTS
+# define INIT_PERF_EVENTS(tsk)                                 \
+       .perf_event_mutex =                                             \
+                __MUTEX_INITIALIZER(tsk.perf_event_mutex),             \
+       .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
  #else
-# define INIT_PERF_COUNTERS(tsk)
+# define INIT_PERF_EVENTS(tsk)
  #endif
  
  /*
@@ -178,7 +178,7 @@ extern struct cred init_cred;
         },                                                              \
         .dirties = INIT_PROP_LOCAL_SINGLE(dirties),                     \
         INIT_IDS                                                        \
-       INIT_PERF_COUNTERS(tsk)                                         \
+       INIT_PERF_EVENTS(tsk)                                           \
         INIT_TRACE_IRQFLAGS                                             \
         INIT_LOCKDEP                                                    \
         INIT_FTRACE_GRAPH                                               \
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h

deleted file mode 100644 (file)

index f648627..0000000
--- a/include/linux/perf_counter.h
+++ /dev/null
@@ -1,858 +0,0 @@
-/*
- *  Performance counters:
- *
- *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
- *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
- *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
- *
- *  Data type definitions, declarations, prototypes.
- *
- *    Started by: Thomas Gleixner and Ingo Molnar
- *
- *  For licencing details see kernel-base/COPYING
- */
-#ifndef _LINUX_PERF_COUNTER_H
-#define _LINUX_PERF_COUNTER_H
-
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include <asm/byteorder.h>
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * attr.type
- */
-enum perf_type_id {
-       PERF_TYPE_HARDWARE                      = 0,
-       PERF_TYPE_SOFTWARE                      = 1,
-       PERF_TYPE_TRACEPOINT                    = 2,
-       PERF_TYPE_HW_CACHE                      = 3,
-       PERF_TYPE_RAW                           = 4,
-
-       PERF_TYPE_MAX,                          /* non-ABI */
-};
-
-/*
- * Generalized performance counter event types, used by the
- * attr.event_id parameter of the sys_perf_counter_open()
- * syscall:
- */
-enum perf_hw_id {
-       /*
-        * Common hardware events, generalized by the kernel:
-        */
-       PERF_COUNT_HW_CPU_CYCLES                = 0,
-       PERF_COUNT_HW_INSTRUCTIONS              = 1,
-       PERF_COUNT_HW_CACHE_REFERENCES          = 2,
-       PERF_COUNT_HW_CACHE_MISSES              = 3,
-       PERF_COUNT_HW_BRANCH_INSTRUCTIONS       = 4,
-       PERF_COUNT_HW_BRANCH_MISSES             = 5,
-       PERF_COUNT_HW_BUS_CYCLES                = 6,
-
-       PERF_COUNT_HW_MAX,                      /* non-ABI */
-};
-
-/*
- * Generalized hardware cache counters:
- *
- *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
- *       { read, write, prefetch } x
- *       { accesses, misses }
- */
-enum perf_hw_cache_id {
-       PERF_COUNT_HW_CACHE_L1D                 = 0,
-       PERF_COUNT_HW_CACHE_L1I                 = 1,
-       PERF_COUNT_HW_CACHE_LL                  = 2,
-       PERF_COUNT_HW_CACHE_DTLB                = 3,
-       PERF_COUNT_HW_CACHE_ITLB                = 4,
-       PERF_COUNT_HW_CACHE_BPU                 = 5,
-
-       PERF_COUNT_HW_CACHE_MAX,                /* non-ABI */
-};
-
-enum perf_hw_cache_op_id {
-       PERF_COUNT_HW_CACHE_OP_READ             = 0,
-       PERF_COUNT_HW_CACHE_OP_WRITE            = 1,
-       PERF_COUNT_HW_CACHE_OP_PREFETCH         = 2,
-
-       PERF_COUNT_HW_CACHE_OP_MAX,             /* non-ABI */
-};
-
-enum perf_hw_cache_op_result_id {
-       PERF_COUNT_HW_CACHE_RESULT_ACCESS       = 0,
-       PERF_COUNT_HW_CACHE_RESULT_MISS         = 1,
-
-       PERF_COUNT_HW_CACHE_RESULT_MAX,         /* non-ABI */
-};
-
-/*
- * Special "software" counters provided by the kernel, even if the hardware
- * does not support performance counters. These counters measure various
- * physical and sw events of the kernel (and allow the profiling of them as
- * well):
- */
-enum perf_sw_ids {
-       PERF_COUNT_SW_CPU_CLOCK                 = 0,
-       PERF_COUNT_SW_TASK_CLOCK                = 1,
-       PERF_COUNT_SW_PAGE_FAULTS               = 2,
-       PERF_COUNT_SW_CONTEXT_SWITCHES          = 3,
-       PERF_COUNT_SW_CPU_MIGRATIONS            = 4,
-       PERF_COUNT_SW_PAGE_FAULTS_MIN           = 5,
-       PERF_COUNT_SW_PAGE_FAULTS_MAJ           = 6,
-
-       PERF_COUNT_SW_MAX,                      /* non-ABI */
-};
-
-/*
- * Bits that can be set in attr.sample_type to request information
- * in the overflow packets.
- */
-enum perf_counter_sample_format {
-       PERF_SAMPLE_IP                          = 1U << 0,
-       PERF_SAMPLE_TID                         = 1U << 1,
-       PERF_SAMPLE_TIME                        = 1U << 2,
-       PERF_SAMPLE_ADDR                        = 1U << 3,
-       PERF_SAMPLE_READ                        = 1U << 4,
-       PERF_SAMPLE_CALLCHAIN                   = 1U << 5,
-       PERF_SAMPLE_ID                          = 1U << 6,
-       PERF_SAMPLE_CPU                         = 1U << 7,
-       PERF_SAMPLE_PERIOD                      = 1U << 8,
-       PERF_SAMPLE_STREAM_ID                   = 1U << 9,
-       PERF_SAMPLE_RAW                         = 1U << 10,
-
-       PERF_SAMPLE_MAX = 1U << 11,             /* non-ABI */
-};
-
-/*
- * The format of the data returned by read() on a perf counter fd,
- * as specified by attr.read_format:
- *
- * struct read_format {
- *     { u64           value;
- *       { u64         time_enabled; } && PERF_FORMAT_ENABLED
- *       { u64         time_running; } && PERF_FORMAT_RUNNING
- *       { u64         id;           } && PERF_FORMAT_ID
- *     } && !PERF_FORMAT_GROUP
- *
- *     { u64           nr;
- *       { u64         time_enabled; } && PERF_FORMAT_ENABLED
- *       { u64         time_running; } && PERF_FORMAT_RUNNING
- *       { u64         value;
- *         { u64       id;           } && PERF_FORMAT_ID
- *       }             cntr[nr];
- *     } && PERF_FORMAT_GROUP
- * };
- */
-enum perf_counter_read_format {
-       PERF_FORMAT_TOTAL_TIME_ENABLED          = 1U << 0,
-       PERF_FORMAT_TOTAL_TIME_RUNNING          = 1U << 1,
-       PERF_FORMAT_ID                          = 1U << 2,
-       PERF_FORMAT_GROUP                       = 1U << 3,
-
-       PERF_FORMAT_MAX = 1U << 4,              /* non-ABI */
-};
-
-#define PERF_ATTR_SIZE_VER0    64      /* sizeof first published struct */
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_attr {
-
-       /*
-        * Major type: hardware/software/tracepoint/etc.
-        */
-       __u32                   type;
-
-       /*
-        * Size of the attr structure, for fwd/bwd compat.
-        */
-       __u32                   size;
-
-       /*
-        * Type specific configuration information.
-        */
-       __u64                   config;
-
-       union {
-               __u64           sample_period;
-               __u64           sample_freq;
-       };
-
-       __u64                   sample_type;
-       __u64                   read_format;
-
-       __u64                   disabled       :  1, /* off by default        */
-                               inherit        :  1, /* children inherit it   */
-                               pinned         :  1, /* must always be on PMU */
-                               exclusive      :  1, /* only group on PMU     */
-                               exclude_user   :  1, /* don't count user      */
-                               exclude_kernel :  1, /* ditto kernel          */
-                               exclude_hv     :  1, /* ditto hypervisor      */
-                               exclude_idle   :  1, /* don't count when idle */
-                               mmap           :  1, /* include mmap data     */
-                               comm           :  1, /* include comm data     */
-                               freq           :  1, /* use freq, not period  */
-                               inherit_stat   :  1, /* per task counts       */
-                               enable_on_exec :  1, /* next exec enables     */
-                               task           :  1, /* trace fork/exit       */
-                               watermark      :  1, /* wakeup_watermark      */
-
-                               __reserved_1   : 49;
-
-       union {
-               __u32           wakeup_events;    /* wakeup every n events */
-               __u32           wakeup_watermark; /* bytes before wakeup   */
-       };
-       __u32                   __reserved_2;
-
-       __u64                   __reserved_3;
-};
-
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE                _IO ('$', 0)
-#define PERF_COUNTER_IOC_DISABLE       _IO ('$', 1)
-#define PERF_COUNTER_IOC_REFRESH       _IO ('$', 2)
-#define PERF_COUNTER_IOC_RESET         _IO ('$', 3)
-#define PERF_COUNTER_IOC_PERIOD                _IOW('$', 4, u64)
-#define PERF_COUNTER_IOC_SET_OUTPUT    _IO ('$', 5)
-
-enum perf_counter_ioc_flags {
-       PERF_IOC_FLAG_GROUP             = 1U << 0,
-};
-
-/*
- * Structure of the page that can be mapped via mmap
- */
-struct perf_counter_mmap_page {
-       __u32   version;                /* version number of this structure */
-       __u32   compat_version;         /* lowest version this is compat with */
-
-       /*
-        * Bits needed to read the hw counters in user-space.
-        *
-        *   u32 seq;
-        *   s64 count;
-        *
-        *   do {
-        *     seq = pc->lock;
-        *
-        *     barrier()
-        *     if (pc->index) {
-        *       count = pmc_read(pc->index - 1);
-        *       count += pc->offset;
-        *     } else
-        *       goto regular_read;
-        *
-        *     barrier();
-        *   } while (pc->lock != seq);
-        *
-        * NOTE: for obvious reason this only works on self-monitoring
-        *       processes.
-        */
-       __u32   lock;                   /* seqlock for synchronization */
-       __u32   index;                  /* hardware counter identifier */
-       __s64   offset;                 /* add to hardware counter value */
-       __u64   time_enabled;           /* time counter active */
-       __u64   time_running;           /* time counter on cpu */
-
-               /*
-                * Hole for extension of the self monitor capabilities
-                */
-
-       __u64   __reserved[123];        /* align to 1k */
-
-       /*
-        * Control data for the mmap() data buffer.
-        *
-        * User-space reading the @data_head value should issue an rmb(), on
-        * SMP capable platforms, after reading this value -- see
-        * perf_counter_wakeup().
-        *
-        * When the mapping is PROT_WRITE the @data_tail value should be
-        * written by userspace to reflect the last read data. In this case
-        * the kernel will not over-write unread data.
-        */
-       __u64   data_head;              /* head in the data section */
-       __u64   data_tail;              /* user-space written tail */
-};
-
-#define PERF_EVENT_MISC_CPUMODE_MASK           (3 << 0)
-#define PERF_EVENT_MISC_CPUMODE_UNKNOWN                (0 << 0)
-#define PERF_EVENT_MISC_KERNEL                 (1 << 0)
-#define PERF_EVENT_MISC_USER                   (2 << 0)
-#define PERF_EVENT_MISC_HYPERVISOR             (3 << 0)
-
-struct perf_event_header {
-       __u32   type;
-       __u16   misc;
-       __u16   size;
-};
-
-enum perf_event_type {
-
-       /*
-        * The MMAP events record the PROT_EXEC mappings so that we can
-        * correlate userspace IPs to code. They have the following structure:
-        *
-        * struct {
-        *      struct perf_event_header        header;
-        *
-        *      u32                             pid, tid;
-        *      u64                             addr;
-        *      u64                             len;
-        *      u64                             pgoff;
-        *      char                            filename[];
-        * };
-        */
-       PERF_EVENT_MMAP                 = 1,
-
-       /*
-        * struct {
-        *      struct perf_event_header        header;
-        *      u64                             id;
-        *      u64                             lost;
-        * };
-        */
-       PERF_EVENT_LOST                 = 2,
-
-       /*
-        * struct {
-        *      struct perf_event_header        header;
-        *
-        *      u32                             pid, tid;
-        *      char                            comm[];
-        * };
-        */
-       PERF_EVENT_COMM                 = 3,
-
-       /*
-        * struct {
-        *      struct perf_event_header        header;
-        *      u32                             pid, ppid;
-        *      u32                             tid, ptid;
-        *      u64                             time;
-        * };
-        */
-       PERF_EVENT_EXIT                 = 4,
-
-       /*
-        * struct {
-        *      struct perf_event_header        header;
-        *      u64                             time;
-        *      u64                             id;
-        *      u64                             stream_id;
-        * };
-        */
-       PERF_EVENT_THROTTLE             = 5,
-       PERF_EVENT_UNTHROTTLE           = 6,
-
-       /*
-        * struct {
-        *      struct perf_event_header        header;
-        *      u32                             pid, ppid;
-        *      u32                             tid, ptid;
-        *      { u64                           time;     } && PERF_SAMPLE_TIME
-        * };
-        */
-       PERF_EVENT_FORK                 = 7,
-
-       /*
-        * struct {
-        *      struct perf_event_header        header;
-        *      u32                             pid, tid;
-        *
-        *      struct read_format              values;
-        * };
-        */
-       PERF_EVENT_READ                 = 8,
-
-       /*
-        * struct {
-        *      struct perf_event_header        header;
-        *
-        *      { u64                   ip;       } && PERF_SAMPLE_IP
-        *      { u32                   pid, tid; } && PERF_SAMPLE_TID
-        *      { u64                   time;     } && PERF_SAMPLE_TIME
-        *      { u64                   addr;     } && PERF_SAMPLE_ADDR
-        *      { u64                   id;       } && PERF_SAMPLE_ID
-        *      { u64                   stream_id;} && PERF_SAMPLE_STREAM_ID
-        *      { u32                   cpu, res; } && PERF_SAMPLE_CPU
-        *      { u64                   period;   } && PERF_SAMPLE_PERIOD
-        *
-        *      { struct read_format    values;   } && PERF_SAMPLE_READ
-        *
-        *      { u64                   nr,
-        *        u64                   ips[nr];  } && PERF_SAMPLE_CALLCHAIN
-        *
-        *      #
-        *      # The RAW record below is opaque data wrt the ABI
-        *      #
-        *      # That is, the ABI doesn't make any promises wrt to
-        *      # the stability of its content, it may vary depending
-        *      # on event, hardware, kernel version and phase of
-        *      # the moon.
-        *      #
-        *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.
-        *      #
-        *
-        *      { u32                   size;
-        *        char                  data[size];}&& PERF_SAMPLE_RAW
-        * };
-        */
-       PERF_EVENT_SAMPLE               = 9,
-
-       PERF_EVENT_MAX,                 /* non-ABI */
-};
-
-enum perf_callchain_context {
-       PERF_CONTEXT_HV                 = (__u64)-32,
-       PERF_CONTEXT_KERNEL             = (__u64)-128,
-       PERF_CONTEXT_USER               = (__u64)-512,
-
-       PERF_CONTEXT_GUEST              = (__u64)-2048,
-       PERF_CONTEXT_GUEST_KERNEL       = (__u64)-2176,
-       PERF_CONTEXT_GUEST_USER         = (__u64)-2560,
-
-       PERF_CONTEXT_MAX                = (__u64)-4095,
-};
-
-#define PERF_FLAG_FD_NO_GROUP  (1U << 0)
-#define PERF_FLAG_FD_OUTPUT    (1U << 1)
-
-#ifdef __KERNEL__
-/*
- * Kernel-internal data types and definitions:
- */
-
-#ifdef CONFIG_PERF_COUNTERS
-# include <asm/perf_counter.h>
-#endif
-
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/spinlock.h>
-#include <linux/hrtimer.h>
-#include <linux/fs.h>
-#include <linux/pid_namespace.h>
-#include <asm/atomic.h>
-
-#define PERF_MAX_STACK_DEPTH           255
-
-struct perf_callchain_entry {
-       __u64                           nr;
-       __u64                           ip[PERF_MAX_STACK_DEPTH];
-};
-
-struct perf_raw_record {
-       u32                             size;
-       void                            *data;
-};
-
-struct task_struct;
-
-/**
- * struct hw_perf_counter - performance counter hardware details:
- */
-struct hw_perf_counter {
-#ifdef CONFIG_PERF_COUNTERS
-       union {
-               struct { /* hardware */
-                       u64             config;
-                       unsigned long   config_base;
-                       unsigned long   counter_base;
-                       int             idx;
-               };
-               union { /* software */
-                       atomic64_t      count;
-                       struct hrtimer  hrtimer;
-               };
-       };
-       atomic64_t                      prev_count;
-       u64                             sample_period;
-       u64                             last_period;
-       atomic64_t                      period_left;
-       u64                             interrupts;
-
-       u64                             freq_count;
-       u64                             freq_interrupts;
-       u64                             freq_stamp;
-#endif
-};
-
-struct perf_counter;
-
-/**
- * struct pmu - generic performance monitoring unit
- */
-struct pmu {
-       int (*enable)                   (struct perf_counter *counter);
-       void (*disable)                 (struct perf_counter *counter);
-       void (*read)                    (struct perf_counter *counter);
-       void (*unthrottle)              (struct perf_counter *counter);
-};
-
-/**
- * enum perf_counter_active_state - the states of a counter
- */
-enum perf_counter_active_state {
-       PERF_COUNTER_STATE_ERROR        = -2,
-       PERF_COUNTER_STATE_OFF          = -1,
-       PERF_COUNTER_STATE_INACTIVE     =  0,
-       PERF_COUNTER_STATE_ACTIVE       =  1,
-};
-
-struct file;
-
-struct perf_mmap_data {
-       struct rcu_head                 rcu_head;
-       int                             nr_pages;       /* nr of data pages  */
-       int                             writable;       /* are we writable   */
-       int                             nr_locked;      /* nr pages mlocked  */
-
-       atomic_t                        poll;           /* POLL_ for wakeups */
-       atomic_t                        events;         /* event limit       */
-
-       atomic_long_t                   head;           /* write position    */
-       atomic_long_t                   done_head;      /* completed head    */
-
-       atomic_t                        lock;           /* concurrent writes */
-       atomic_t                        wakeup;         /* needs a wakeup    */
-       atomic_t                        lost;           /* nr records lost   */
-
-       long                            watermark;      /* wakeup watermark  */
-
-       struct perf_counter_mmap_page   *user_page;
-       void                            *data_pages[0];
-};
-
-struct perf_pending_entry {
-       struct perf_pending_entry *next;
-       void (*func)(struct perf_pending_entry *);
-};
-
-/**
- * struct perf_counter - performance counter kernel representation:
- */
-struct perf_counter {
-#ifdef CONFIG_PERF_COUNTERS
-       struct list_head                group_entry;
-       struct list_head                event_entry;
-       struct list_head                sibling_list;
-       int                             nr_siblings;
-       struct perf_counter             *group_leader;
-       struct perf_counter             *output;
-       const struct pmu                *pmu;
-
-       enum perf_counter_active_state  state;
-       atomic64_t                      count;
-
-       /*
-        * These are the total time in nanoseconds that the counter
-        * has been enabled (i.e. eligible to run, and the task has
-        * been scheduled in, if this is a per-task counter)
-        * and running (scheduled onto the CPU), respectively.
-        *
-        * They are computed from tstamp_enabled, tstamp_running and
-        * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
-        */
-       u64                             total_time_enabled;
-       u64                             total_time_running;
-
-       /*
-        * These are timestamps used for computing total_time_enabled
-        * and total_time_running when the counter is in INACTIVE or
-        * ACTIVE state, measured in nanoseconds from an arbitrary point
-        * in time.
-        * tstamp_enabled: the notional time when the counter was enabled
-        * tstamp_running: the notional time when the counter was scheduled on
-        * tstamp_stopped: in INACTIVE state, the notional time when the
-        *      counter was scheduled off.
-        */
-       u64                             tstamp_enabled;
-       u64                             tstamp_running;
-       u64                             tstamp_stopped;
-
-       struct perf_counter_attr        attr;
-       struct hw_perf_counter          hw;
-
-       struct perf_counter_context     *ctx;
-       struct file                     *filp;
-
-       /*
-        * These accumulate total time (in nanoseconds) that children
-        * counters have been enabled and running, respectively.
-        */
-       atomic64_t                      child_total_time_enabled;
-       atomic64_t                      child_total_time_running;
-
-       /*
-        * Protect attach/detach and child_list:
-        */
-       struct mutex                    child_mutex;
-       struct list_head                child_list;
-       struct perf_counter             *parent;
-
-       int                             oncpu;
-       int                             cpu;
-
-       struct list_head                owner_entry;
-       struct task_struct              *owner;
-
-       /* mmap bits */
-       struct mutex                    mmap_mutex;
-       atomic_t                        mmap_count;
-       struct perf_mmap_data           *data;
-
-       /* poll related */
-       wait_queue_head_t               waitq;
-       struct fasync_struct            *fasync;
-
-       /* delayed work for NMIs and such */
-       int                             pending_wakeup;
-       int                             pending_kill;
-       int                             pending_disable;
-       struct perf_pending_entry       pending;
-
-       atomic_t                        event_limit;
-
-       void (*destroy)(struct perf_counter *);
-       struct rcu_head                 rcu_head;
-
-       struct pid_namespace            *ns;
-       u64                             id;
-#endif
-};
-
-/**
- * struct perf_counter_context - counter context structure
- *
- * Used as a container for task counters and CPU counters as well:
- */
-struct perf_counter_context {
-       /*
-        * Protect the states of the counters in the list,
-        * nr_active, and the list:
-        */
-       spinlock_t                      lock;
-       /*
-        * Protect the list of counters.  Locking either mutex or lock
-        * is sufficient to ensure the list doesn't change; to change
-        * the list you need to lock both the mutex and the spinlock.
-        */
-       struct mutex                    mutex;
-
-       struct list_head                group_list;
-       struct list_head                event_list;
-       int                             nr_counters;
-       int                             nr_active;
-       int                             is_active;
-       int                             nr_stat;
-       atomic_t                        refcount;
-       struct task_struct              *task;
-
-       /*
-        * Context clock, runs when context enabled.
-        */
-       u64                             time;
-       u64                             timestamp;
-
-       /*
-        * These fields let us detect when two contexts have both
-        * been cloned (inherited) from a common ancestor.
-        */
-       struct perf_counter_context     *parent_ctx;
-       u64                             parent_gen;
-       u64                             generation;
-       int                             pin_count;
-       struct rcu_head                 rcu_head;
-};
-
-/**
- * struct perf_counter_cpu_context - per cpu counter context structure
- */
-struct perf_cpu_context {
-       struct perf_counter_context     ctx;
-       struct perf_counter_context     *task_ctx;
-       int                             active_oncpu;
-       int                             max_pertask;
-       int                             exclusive;
-
-       /*
-        * Recursion avoidance:
-        *
-        * task, softirq, irq, nmi context
-        */
-       int                             recursion[4];
-};
-
-struct perf_output_handle {
-       struct perf_counter     *counter;
-       struct perf_mmap_data   *data;
-       unsigned long           head;
-       unsigned long           offset;
-       int                     nmi;
-       int                     sample;
-       int                     locked;
-       unsigned long           flags;
-};
-
-#ifdef CONFIG_PERF_COUNTERS
-
-/*
- * Set by architecture code:
- */
-extern int perf_max_counters;
-
-extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
-
-extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
-extern void perf_counter_task_sched_out(struct task_struct *task,
-                                       struct task_struct *next, int cpu);
-extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern int perf_counter_init_task(struct task_struct *child);
-extern void perf_counter_exit_task(struct task_struct *child);
-extern void perf_counter_free_task(struct task_struct *task);
-extern void set_perf_counter_pending(void);
-extern void perf_counter_do_pending(void);
-extern void perf_counter_print_debug(void);
-extern void __perf_disable(void);
-extern bool __perf_enable(void);
-extern void perf_disable(void);
-extern void perf_enable(void);
-extern int perf_counter_task_disable(void);
-extern int perf_counter_task_enable(void);
-extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
-              struct perf_cpu_context *cpuctx,
-              struct perf_counter_context *ctx, int cpu);
-extern void perf_counter_update_userpage(struct perf_counter *counter);
-
-struct perf_sample_data {
-       u64                             type;
-
-       u64                             ip;
-       struct {
-               u32     pid;
-               u32     tid;
-       }                               tid_entry;
-       u64                             time;
-       u64                             addr;
-       u64                             id;
-       u64                             stream_id;
-       struct {
-               u32     cpu;
-               u32     reserved;
-       }                               cpu_entry;
-       u64                             period;
-       struct perf_callchain_entry     *callchain;
-       struct perf_raw_record          *raw;
-};
-
-extern void perf_output_sample(struct perf_output_handle *handle,
-                              struct perf_event_header *header,
-                              struct perf_sample_data *data,
-                              struct perf_counter *counter);
-extern void perf_prepare_sample(struct perf_event_header *header,
-                               struct perf_sample_data *data,
-                               struct perf_counter *counter,
-                               struct pt_regs *regs);
-
-extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
-                                struct perf_sample_data *data,
-                                struct pt_regs *regs);
-
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-       return (counter->attr.type != PERF_TYPE_RAW) &&
-               (counter->attr.type != PERF_TYPE_HARDWARE) &&
-               (counter->attr.type != PERF_TYPE_HW_CACHE);
-}
-
-extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
-
-extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
-
-static inline void
-perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
-{
-       if (atomic_read(&perf_swcounter_enabled[event]))
-               __perf_swcounter_event(event, nr, nmi, regs, addr);
-}
-
-extern void __perf_counter_mmap(struct vm_area_struct *vma);
-
-static inline void perf_counter_mmap(struct vm_area_struct *vma)
-{
-       if (vma->vm_flags & VM_EXEC)
-               __perf_counter_mmap(vma);
-}
-
-extern void perf_counter_comm(struct task_struct *tsk);
-extern void perf_counter_fork(struct task_struct *tsk);
-
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
-
-extern int sysctl_perf_counter_paranoid;
-extern int sysctl_perf_counter_mlock;
-extern int sysctl_perf_counter_sample_rate;
-
-extern void perf_counter_init(void);
-extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
-                                void *record, int entry_size);
-
-#ifndef perf_misc_flags
-#define perf_misc_flags(regs)  (user_mode(regs) ? PERF_EVENT_MISC_USER : \
-                                PERF_EVENT_MISC_KERNEL)
-#define perf_instruction_pointer(regs) instruction_pointer(regs)
-#endif
-
-extern int perf_output_begin(struct perf_output_handle *handle,
-                            struct perf_counter *counter, unsigned int size,
-                            int nmi, int sample);
-extern void perf_output_end(struct perf_output_handle *handle);
-extern void perf_output_copy(struct perf_output_handle *handle,
-                            const void *buf, unsigned int len);
-#else
-static inline void
-perf_counter_task_sched_in(struct task_struct *task, int cpu)          { }
-static inline void
-perf_counter_task_sched_out(struct task_struct *task,
-                           struct task_struct *next, int cpu)          { }
-static inline void
-perf_counter_task_tick(struct task_struct *task, int cpu)              { }
-static inline int perf_counter_init_task(struct task_struct *child)    { return 0; }
-static inline void perf_counter_exit_task(struct task_struct *child)   { }
-static inline void perf_counter_free_task(struct task_struct *task)    { }
-static inline void perf_counter_do_pending(void)                       { }
-static inline void perf_counter_print_debug(void)                      { }
-static inline void perf_disable(void)                                  { }
-static inline void perf_enable(void)                                   { }
-static inline int perf_counter_task_disable(void)      { return -EINVAL; }
-static inline int perf_counter_task_enable(void)       { return -EINVAL; }
-
-static inline void
-perf_swcounter_event(u32 event, u64 nr, int nmi,
-                    struct pt_regs *regs, u64 addr)                    { }
-
-static inline void perf_counter_mmap(struct vm_area_struct *vma)       { }
-static inline void perf_counter_comm(struct task_struct *tsk)          { }
-static inline void perf_counter_fork(struct task_struct *tsk)          { }
-static inline void perf_counter_init(void)                             { }
-
-#endif
-
-#define perf_output_put(handle, x) \
-       perf_output_copy((handle), &(x), sizeof(x))
-
-#endif /* __KERNEL__ */
-#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

new file mode 100644 (file)

index 0000000..ae9d9ed
--- /dev/null
+++ b/include/linux/perf_event.h
@@ -0,0 +1,858 @@
+/*
+ *  Performance events:
+ *
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *    Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_EVENT_H
+#define _LINUX_PERF_EVENT_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <asm/byteorder.h>
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * attr.type
+ */
+enum perf_type_id {
+       PERF_TYPE_HARDWARE                      = 0,
+       PERF_TYPE_SOFTWARE                      = 1,
+       PERF_TYPE_TRACEPOINT                    = 2,
+       PERF_TYPE_HW_CACHE                      = 3,
+       PERF_TYPE_RAW                           = 4,
+
+       PERF_TYPE_MAX,                          /* non-ABI */
+};
+
+/*
+ * Generalized hardware event ids, passed in the attr.config
+ * field of the sys_perf_event_open() syscall when attr.type
+ * is PERF_TYPE_HARDWARE:
+ */
+enum perf_hw_id {
+       /*
+        * Common hardware events, generalized by the kernel:
+        */
+       PERF_COUNT_HW_CPU_CYCLES                = 0,
+       PERF_COUNT_HW_INSTRUCTIONS              = 1,
+       PERF_COUNT_HW_CACHE_REFERENCES          = 2,
+       PERF_COUNT_HW_CACHE_MISSES              = 3,
+       PERF_COUNT_HW_BRANCH_INSTRUCTIONS       = 4,
+       PERF_COUNT_HW_BRANCH_MISSES             = 5,
+       PERF_COUNT_HW_BUS_CYCLES                = 6,
+
+       PERF_COUNT_HW_MAX,                      /* non-ABI */
+};
+
+/*
+ * Generalized hardware cache events:
+ *
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum perf_hw_cache_id {
+       PERF_COUNT_HW_CACHE_L1D                 = 0,
+       PERF_COUNT_HW_CACHE_L1I                 = 1,
+       PERF_COUNT_HW_CACHE_LL                  = 2,
+       PERF_COUNT_HW_CACHE_DTLB                = 3,
+       PERF_COUNT_HW_CACHE_ITLB                = 4,
+       PERF_COUNT_HW_CACHE_BPU                 = 5,
+
+       PERF_COUNT_HW_CACHE_MAX,                /* non-ABI */
+};
+
+enum perf_hw_cache_op_id {
+       PERF_COUNT_HW_CACHE_OP_READ             = 0,
+       PERF_COUNT_HW_CACHE_OP_WRITE            = 1,
+       PERF_COUNT_HW_CACHE_OP_PREFETCH         = 2,
+
+       PERF_COUNT_HW_CACHE_OP_MAX,             /* non-ABI */
+};
+
+enum perf_hw_cache_op_result_id {
+       PERF_COUNT_HW_CACHE_RESULT_ACCESS       = 0,
+       PERF_COUNT_HW_CACHE_RESULT_MISS         = 1,
+
+       PERF_COUNT_HW_CACHE_RESULT_MAX,         /* non-ABI */
+};
+
+/*
+ * Special "software" events provided by the kernel, even if the hardware
+ * does not support performance events. These events measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum perf_sw_ids {
+       PERF_COUNT_SW_CPU_CLOCK                 = 0,
+       PERF_COUNT_SW_TASK_CLOCK                = 1,
+       PERF_COUNT_SW_PAGE_FAULTS               = 2,
+       PERF_COUNT_SW_CONTEXT_SWITCHES          = 3,
+       PERF_COUNT_SW_CPU_MIGRATIONS            = 4,
+       PERF_COUNT_SW_PAGE_FAULTS_MIN           = 5,
+       PERF_COUNT_SW_PAGE_FAULTS_MAJ           = 6,
+
+       PERF_COUNT_SW_MAX,                      /* non-ABI */
+};
+
+/*
+ * Bits that can be set in attr.sample_type to request information
+ * in the overflow packets.
+ */
+enum perf_event_sample_format {
+       PERF_SAMPLE_IP                          = 1U << 0,
+       PERF_SAMPLE_TID                         = 1U << 1,
+       PERF_SAMPLE_TIME                        = 1U << 2,
+       PERF_SAMPLE_ADDR                        = 1U << 3,
+       PERF_SAMPLE_READ                        = 1U << 4,
+       PERF_SAMPLE_CALLCHAIN                   = 1U << 5,
+       PERF_SAMPLE_ID                          = 1U << 6,
+       PERF_SAMPLE_CPU                         = 1U << 7,
+       PERF_SAMPLE_PERIOD                      = 1U << 8,
+       PERF_SAMPLE_STREAM_ID                   = 1U << 9,
+       PERF_SAMPLE_RAW                         = 1U << 10,
+
+       PERF_SAMPLE_MAX = 1U << 11,             /* non-ABI */
+};
+
+/*
+ * The format of the data returned by read() on a perf event fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ *     { u64           value;
+ *       { u64         time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
+ *       { u64         time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
+ *       { u64         id;           } && PERF_FORMAT_ID
+ *     } && !PERF_FORMAT_GROUP
+ *
+ *     { u64           nr;
+ *       { u64         time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
+ *       { u64         time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
+ *       { u64         value;
+ *         { u64       id;           } && PERF_FORMAT_ID
+ *       }             cntr[nr];
+ *     } && PERF_FORMAT_GROUP
+ * };
+ */
+enum perf_event_read_format {
+       PERF_FORMAT_TOTAL_TIME_ENABLED          = 1U << 0,
+       PERF_FORMAT_TOTAL_TIME_RUNNING          = 1U << 1,
+       PERF_FORMAT_ID                          = 1U << 2,
+       PERF_FORMAT_GROUP                       = 1U << 3,
+
+       PERF_FORMAT_MAX = 1U << 4,              /* non-ABI */
+};
+
+#define PERF_ATTR_SIZE_VER0    64      /* sizeof first published struct */
+
+/*
+ * Description of the event to monitor, as passed to sys_perf_event_open():
+ */
+struct perf_event_attr {
+
+       /*
+        * Major type: hardware/software/tracepoint/etc.
+        */
+       __u32                   type;
+
+       /*
+        * Size of the attr structure, for fwd/bwd compat.
+        */
+       __u32                   size;
+
+       /*
+        * Type specific configuration information.
+        */
+       __u64                   config;
+
+       union {
+               __u64           sample_period;
+               __u64           sample_freq;
+       };
+
+       __u64                   sample_type;
+       __u64                   read_format;
+
+       __u64                   disabled       :  1, /* off by default        */
+                               inherit        :  1, /* children inherit it   */
+                               pinned         :  1, /* must always be on PMU */
+                               exclusive      :  1, /* only group on PMU     */
+                               exclude_user   :  1, /* don't count user      */
+                               exclude_kernel :  1, /* ditto kernel          */
+                               exclude_hv     :  1, /* ditto hypervisor      */
+                               exclude_idle   :  1, /* don't count when idle */
+                               mmap           :  1, /* include mmap data     */
+                               comm           :  1, /* include comm data     */
+                               freq           :  1, /* use freq, not period  */
+                               inherit_stat   :  1, /* per task counts       */
+                               enable_on_exec :  1, /* next exec enables     */
+                               task           :  1, /* trace fork/exit       */
+                               watermark      :  1, /* wakeup_watermark      */
+
+                               __reserved_1   : 49;
+
+       union {
+               __u32           wakeup_events;    /* wakeup every n events */
+               __u32           wakeup_watermark; /* bytes before wakeup   */
+       };
+       __u32                   __reserved_2;
+
+       __u64                   __reserved_3;
+};
+
+/*
+ * Ioctls that can be done on a perf event fd (user-space ABI):
+ */
+#define PERF_EVENT_IOC_ENABLE          _IO ('$', 0)
+#define PERF_EVENT_IOC_DISABLE         _IO ('$', 1)
+#define PERF_EVENT_IOC_REFRESH         _IO ('$', 2)
+#define PERF_EVENT_IOC_RESET           _IO ('$', 3)
+#define PERF_EVENT_IOC_PERIOD          _IOW('$', 4, __u64) /* not u64: kernel-only */
+#define PERF_EVENT_IOC_SET_OUTPUT      _IO ('$', 5)
+
+enum perf_event_ioc_flags {
+       PERF_IOC_FLAG_GROUP             = 1U << 0,
+};
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_event_mmap_page {
+       __u32   version;                /* version number of this structure */
+       __u32   compat_version;         /* lowest version this is compat with */
+
+       /*
+        * Bits needed to read the hw events in user-space.
+        *
+        *   u32 seq;
+        *   s64 count;
+        *
+        *   do {
+        *     seq = pc->lock;
+        *
+        *     barrier();
+        *     if (pc->index) {
+        *       count = pmc_read(pc->index - 1);
+        *       count += pc->offset;
+        *     } else
+        *       goto regular_read;
+        *
+        *     barrier();
+        *   } while (pc->lock != seq);
+        *
+        * NOTE: for obvious reasons this only works on self-monitoring
+        *       processes.
+        */
+       __u32   lock;                   /* seqlock for synchronization */
+       __u32   index;                  /* hardware event identifier */
+       __s64   offset;                 /* add to hardware event value */
+       __u64   time_enabled;           /* time event active */
+       __u64   time_running;           /* time event on cpu */
+
+               /*
+                * Hole for extension of the self monitor capabilities
+                */
+
+       __u64   __reserved[123];        /* align to 1k */
+
+       /*
+        * Control data for the mmap() data buffer.
+        *
+        * User-space reading the @data_head value should issue an rmb(), on
+        * SMP capable platforms, after reading this value -- see
+        * perf_event_wakeup().
+        *
+        * When the mapping is PROT_WRITE the @data_tail value should be
+        * written by userspace to reflect the last read data. In this case
+        * the kernel will not over-write unread data.
+        */
+       __u64   data_head;              /* head in the data section */
+       __u64   data_tail;              /* user-space written tail */
+};
+
+#define PERF_RECORD_MISC_CPUMODE_MASK          (3 << 0)
+#define PERF_RECORD_MISC_CPUMODE_UNKNOWN               (0 << 0)
+#define PERF_RECORD_MISC_KERNEL                        (1 << 0)
+#define PERF_RECORD_MISC_USER                  (2 << 0)
+#define PERF_RECORD_MISC_HYPERVISOR            (3 << 0)
+
+struct perf_event_header {
+       __u32   type;
+       __u16   misc;
+       __u16   size;
+};
+
+enum perf_event_type {
+
+       /*
+        * The MMAP events record the PROT_EXEC mappings so that we can
+        * correlate userspace IPs to code. They have the following structure:
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *
+        *      u32                             pid, tid;
+        *      u64                             addr;
+        *      u64                             len;
+        *      u64                             pgoff;
+        *      char                            filename[];
+        * };
+        */
+       PERF_RECORD_MMAP                        = 1,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             id;
+        *      u64                             lost;
+        * };
+        */
+       PERF_RECORD_LOST                        = 2,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *
+        *      u32                             pid, tid;
+        *      char                            comm[];
+        * };
+        */
+       PERF_RECORD_COMM                        = 3,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u32                             pid, ppid;
+        *      u32                             tid, ptid;
+        *      u64                             time;
+        * };
+        */
+       PERF_RECORD_EXIT                        = 4,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             time;
+        *      u64                             id;
+        *      u64                             stream_id;
+        * };
+        */
+       PERF_RECORD_THROTTLE            = 5,
+       PERF_RECORD_UNTHROTTLE          = 6,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u32                             pid, ppid;
+        *      u32                             tid, ptid;
+        *      { u64                           time;     } && PERF_SAMPLE_TIME
+        * };
+        */
+       PERF_RECORD_FORK                        = 7,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u32                             pid, tid;
+        *
+        *      struct read_format              values;
+        * };
+        */
+       PERF_RECORD_READ                        = 8,
+
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *
+        *      { u64                   ip;       } && PERF_SAMPLE_IP
+        *      { u32                   pid, tid; } && PERF_SAMPLE_TID
+        *      { u64                   time;     } && PERF_SAMPLE_TIME
+        *      { u64                   addr;     } && PERF_SAMPLE_ADDR
+        *      { u64                   id;       } && PERF_SAMPLE_ID
+        *      { u64                   stream_id;} && PERF_SAMPLE_STREAM_ID
+        *      { u32                   cpu, res; } && PERF_SAMPLE_CPU
+        *      { u64                   period;   } && PERF_SAMPLE_PERIOD
+        *
+        *      { struct read_format    values;   } && PERF_SAMPLE_READ
+        *
+        *      { u64                   nr,
+        *        u64                   ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+        *
+        *      #
+        *      # The RAW record below is opaque data wrt the ABI
+        *      #
+        *      # That is, the ABI doesn't make any promises wrt
+        *      # the stability of its content, it may vary depending
+        *      # on event_id, hardware, kernel version and phase of
+        *      # the moon.
+        *      #
+        *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.
+        *      #
+        *
+        *      { u32                   size;
+        *        char                  data[size];}&& PERF_SAMPLE_RAW
+        * };
+        */
+       PERF_RECORD_SAMPLE              = 9,
+
+       PERF_RECORD_MAX,                        /* non-ABI */
+};
+
+enum perf_callchain_context {
+       PERF_CONTEXT_HV                 = (__u64)-32,
+       PERF_CONTEXT_KERNEL             = (__u64)-128,
+       PERF_CONTEXT_USER               = (__u64)-512,
+
+       PERF_CONTEXT_GUEST              = (__u64)-2048,
+       PERF_CONTEXT_GUEST_KERNEL       = (__u64)-2176,
+       PERF_CONTEXT_GUEST_USER         = (__u64)-2560,
+
+       PERF_CONTEXT_MAX                = (__u64)-4095,
+};
+
+#define PERF_FLAG_FD_NO_GROUP  (1U << 0)
+#define PERF_FLAG_FD_OUTPUT    (1U << 1)
+
+#ifdef __KERNEL__
+/*
+ * Kernel-internal data types and definitions:
+ */
+
+#ifdef CONFIG_PERF_EVENTS
+# include <asm/perf_event.h>
+#endif
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/fs.h>
+#include <linux/pid_namespace.h>
+#include <asm/atomic.h>
+
+#define PERF_MAX_STACK_DEPTH           255
+
+struct perf_callchain_entry {
+       __u64                           nr;
+       __u64                           ip[PERF_MAX_STACK_DEPTH];
+};
+
+struct perf_raw_record {
+       u32                             size;
+       void                            *data;
+};
+
+struct task_struct;
+
+/**
+ * struct hw_perf_event - performance event hardware details:
+ */
+struct hw_perf_event {
+#ifdef CONFIG_PERF_EVENTS
+       union {
+               struct { /* hardware */
+                       u64             config;
+                       unsigned long   config_base;
+                       unsigned long   event_base;
+                       int             idx;
+               };
+               union { /* software */
+                       atomic64_t      count;
+                       struct hrtimer  hrtimer;
+               };
+       };
+       atomic64_t                      prev_count;
+       u64                             sample_period;
+       u64                             last_period;
+       atomic64_t                      period_left;
+       u64                             interrupts;
+
+       u64                             freq_count;
+       u64                             freq_interrupts;
+       u64                             freq_stamp;
+#endif
+};
+
+struct perf_event;
+
+/**
+ * struct pmu - generic performance monitoring unit
+ */
+struct pmu {
+       int (*enable)                   (struct perf_event *event);
+       void (*disable)                 (struct perf_event *event);
+       void (*read)                    (struct perf_event *event);
+       void (*unthrottle)              (struct perf_event *event);
+};
+
+/**
+ * enum perf_event_active_state - the states of an event
+ */
+enum perf_event_active_state {
+       PERF_EVENT_STATE_ERROR  = -2,
+       PERF_EVENT_STATE_OFF            = -1,
+       PERF_EVENT_STATE_INACTIVE       =  0,
+       PERF_EVENT_STATE_ACTIVE =  1,
+};
+
+struct file;
+
+struct perf_mmap_data {
+       struct rcu_head                 rcu_head;
+       int                             nr_pages;       /* nr of data pages  */
+       int                             writable;       /* are we writable   */
+       int                             nr_locked;      /* nr pages mlocked  */
+
+       atomic_t                        poll;           /* POLL_ for wakeups */
+       atomic_t                        events;         /* event_id limit       */
+
+       atomic_long_t                   head;           /* write position    */
+       atomic_long_t                   done_head;      /* completed head    */
+
+       atomic_t                        lock;           /* concurrent writes */
+       atomic_t                        wakeup;         /* needs a wakeup    */
+       atomic_t                        lost;           /* nr records lost   */
+
+       long                            watermark;      /* wakeup watermark  */
+
+       struct perf_event_mmap_page   *user_page;
+       void                            *data_pages[0];
+};
+
+struct perf_pending_entry {
+       struct perf_pending_entry *next;
+       void (*func)(struct perf_pending_entry *);
+};
+
+/**
+ * struct perf_event - performance event kernel representation:
+ */
+struct perf_event {
+#ifdef CONFIG_PERF_EVENTS
+       struct list_head                group_entry;
+       struct list_head                event_entry;
+       struct list_head                sibling_list;
+       int                             nr_siblings;
+       struct perf_event               *group_leader;
+       struct perf_event               *output;
+       const struct pmu                *pmu;
+
+       enum perf_event_active_state    state;
+       atomic64_t                      count;
+
+       /*
+        * These are the total time in nanoseconds that the event
+        * has been enabled (i.e. eligible to run, and the task has
+        * been scheduled in, if this is a per-task event)
+        * and running (scheduled onto the CPU), respectively.
+        *
+        * They are computed from tstamp_enabled, tstamp_running and
+        * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
+        */
+       u64                             total_time_enabled;
+       u64                             total_time_running;
+
+       /*
+        * These are timestamps used for computing total_time_enabled
+        * and total_time_running when the event is in INACTIVE or
+        * ACTIVE state, measured in nanoseconds from an arbitrary point
+        * in time.
+        * tstamp_enabled: the notional time when the event was enabled
+        * tstamp_running: the notional time when the event was scheduled on
+        * tstamp_stopped: in INACTIVE state, the notional time when the
+        *      event was scheduled off.
+        */
+       u64                             tstamp_enabled;
+       u64                             tstamp_running;
+       u64                             tstamp_stopped;
+
+       struct perf_event_attr  attr;
+       struct hw_perf_event            hw;
+
+       struct perf_event_context       *ctx;
+       struct file                     *filp;
+
+       /*
+        * These accumulate total time (in nanoseconds) that children
+        * events have been enabled and running, respectively.
+        */
+       atomic64_t                      child_total_time_enabled;
+       atomic64_t                      child_total_time_running;
+
+       /*
+        * Protect attach/detach and child_list:
+        */
+       struct mutex                    child_mutex;
+       struct list_head                child_list;
+       struct perf_event               *parent;
+
+       int                             oncpu;
+       int                             cpu;
+
+       struct list_head                owner_entry;
+       struct task_struct              *owner;
+
+       /* mmap bits */
+       struct mutex                    mmap_mutex;
+       atomic_t                        mmap_count;
+       struct perf_mmap_data           *data;
+
+       /* poll related */
+       wait_queue_head_t               waitq;
+       struct fasync_struct            *fasync;
+
+       /* delayed work for NMIs and such */
+       int                             pending_wakeup;
+       int                             pending_kill;
+       int                             pending_disable;
+       struct perf_pending_entry       pending;
+
+       atomic_t                        event_limit;
+
+       void (*destroy)(struct perf_event *);
+       struct rcu_head                 rcu_head;
+
+       struct pid_namespace            *ns;
+       u64                             id;
+#endif
+};
+
+/**
+ * struct perf_event_context - event context structure
+ *
+ * Used as a container for task events and CPU events as well:
+ */
+struct perf_event_context {
+       /*
+        * Protect the states of the events in the list,
+        * nr_active, and the list:
+        */
+       spinlock_t                      lock;
+       /*
+        * Protect the list of events.  Locking either mutex or lock
+        * is sufficient to ensure the list doesn't change; to change
+        * the list you need to lock both the mutex and the spinlock.
+        */
+       struct mutex                    mutex;
+
+       struct list_head                group_list;
+       struct list_head                event_list;
+       int                             nr_events;
+       int                             nr_active;
+       int                             is_active;
+       int                             nr_stat;
+       atomic_t                        refcount;
+       struct task_struct              *task;
+
+       /*
+        * Context clock, runs when context enabled.
+        */
+       u64                             time;
+       u64                             timestamp;
+
+       /*
+        * These fields let us detect when two contexts have both
+        * been cloned (inherited) from a common ancestor.
+        */
+       struct perf_event_context       *parent_ctx;
+       u64                             parent_gen;
+       u64                             generation;
+       int                             pin_count;
+       struct rcu_head                 rcu_head;
+};
+
+/**
+ * struct perf_cpu_context - per cpu event context structure
+ */
+struct perf_cpu_context {
+       struct perf_event_context       ctx;
+       struct perf_event_context       *task_ctx;
+       int                             active_oncpu;
+       int                             max_pertask;
+       int                             exclusive;
+
+       /*
+        * Recursion avoidance:
+        *
+        * task, softirq, irq, nmi context
+        */
+       int                             recursion[4];
+};
+
+struct perf_output_handle {
+       struct perf_event       *event;
+       struct perf_mmap_data   *data;
+       unsigned long           head;
+       unsigned long           offset;
+       int                     nmi;
+       int                     sample;
+       int                     locked;
+       unsigned long           flags;
+};
+
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Set by architecture code:
+ */
+extern int perf_max_events;
+
+extern const struct pmu *hw_perf_event_init(struct perf_event *event);
+
+extern void perf_event_task_sched_in(struct task_struct *task, int cpu);
+extern void perf_event_task_sched_out(struct task_struct *task,
+                                       struct task_struct *next, int cpu);
+extern void perf_event_task_tick(struct task_struct *task, int cpu);
+extern int perf_event_init_task(struct task_struct *child);
+extern void perf_event_exit_task(struct task_struct *child);
+extern void perf_event_free_task(struct task_struct *task);
+extern void set_perf_event_pending(void);
+extern void perf_event_do_pending(void);
+extern void perf_event_print_debug(void);
+extern void __perf_disable(void);
+extern bool __perf_enable(void);
+extern void perf_disable(void);
+extern void perf_enable(void);
+extern int perf_event_task_disable(void);
+extern int perf_event_task_enable(void);
+extern int hw_perf_group_sched_in(struct perf_event *group_leader,
+              struct perf_cpu_context *cpuctx,
+              struct perf_event_context *ctx, int cpu);
+extern void perf_event_update_userpage(struct perf_event *event);
+
+struct perf_sample_data {
+       u64                             type;
+
+       u64                             ip;
+       struct {
+               u32     pid;
+               u32     tid;
+       }                               tid_entry;
+       u64                             time;
+       u64                             addr;
+       u64                             id;
+       u64                             stream_id;
+       struct {
+               u32     cpu;
+               u32     reserved;
+       }                               cpu_entry;
+       u64                             period;
+       struct perf_callchain_entry     *callchain;
+       struct perf_raw_record          *raw;
+};
+
+extern void perf_output_sample(struct perf_output_handle *handle,
+                              struct perf_event_header *header,
+                              struct perf_sample_data *data,
+                              struct perf_event *event);
+extern void perf_prepare_sample(struct perf_event_header *header,
+                               struct perf_sample_data *data,
+                               struct perf_event *event,
+                               struct pt_regs *regs);
+
+extern int perf_event_overflow(struct perf_event *event, int nmi,
+                                struct perf_sample_data *data,
+                                struct pt_regs *regs);
+
+/*
+ * Return 1 for a software event, 0 for hardware (RAW, HARDWARE, HW_CACHE)
+ */
+static inline int is_software_event(struct perf_event *event)
+{
+       return (event->attr.type != PERF_TYPE_RAW) &&
+               (event->attr.type != PERF_TYPE_HARDWARE) &&
+               (event->attr.type != PERF_TYPE_HW_CACHE);
+}
+
+extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
+
+static inline void
+perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
+{      /* event_id is used unchecked as an index into perf_swevent_enabled[] */
+       if (atomic_read(&perf_swevent_enabled[event_id]))       /* zero: no call */
+               __perf_sw_event(event_id, nr, nmi, regs, addr);
+}
+
+extern void __perf_event_mmap(struct vm_area_struct *vma);
+
+static inline void perf_event_mmap(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_EXEC)    /* vmas without VM_EXEC are skipped */
+               __perf_event_mmap(vma);
+}
+
+extern void perf_event_comm(struct task_struct *tsk);
+extern void perf_event_fork(struct task_struct *tsk);
+
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+
+extern int sysctl_perf_event_paranoid;
+extern int sysctl_perf_event_mlock;
+extern int sysctl_perf_event_sample_rate;
+
+extern void perf_event_init(void);
+extern void perf_tp_event(int event_id, u64 addr, u64 count,
+                                void *record, int entry_size);
+
+#ifndef perf_misc_flags
+#define perf_misc_flags(regs)  (user_mode(regs) ? PERF_RECORD_MISC_USER : \
+                                PERF_RECORD_MISC_KERNEL)
+#define perf_instruction_pointer(regs) instruction_pointer(regs)
+#endif
+
+extern int perf_output_begin(struct perf_output_handle *handle,
+                            struct perf_event *event, unsigned int size,
+                            int nmi, int sample);
+extern void perf_output_end(struct perf_output_handle *handle);
+extern void perf_output_copy(struct perf_output_handle *handle,
+                            const void *buf, unsigned int len);
+#else
+static inline void
+perf_event_task_sched_in(struct task_struct *task, int cpu)            { }
+static inline void
+perf_event_task_sched_out(struct task_struct *task,
+                           struct task_struct *next, int cpu)          { }
+static inline void
+perf_event_task_tick(struct task_struct *task, int cpu)                { }
+static inline int perf_event_init_task(struct task_struct *child)      { return 0; }
+static inline void perf_event_exit_task(struct task_struct *child)     { }
+static inline void perf_event_free_task(struct task_struct *task)      { }
+static inline void perf_event_do_pending(void)                 { }
+static inline void perf_event_print_debug(void)                        { }
+static inline void perf_disable(void)                                  { }
+static inline void perf_enable(void)                                   { }
+static inline int perf_event_task_disable(void)        { return -EINVAL; }
+static inline int perf_event_task_enable(void) { return -EINVAL; }
+
+static inline void
+perf_sw_event(u32 event_id, u64 nr, int nmi,
+                    struct pt_regs *regs, u64 addr)                    { }
+
+static inline void perf_event_mmap(struct vm_area_struct *vma) { }
+static inline void perf_event_comm(struct task_struct *tsk)            { }
+static inline void perf_event_fork(struct task_struct *tsk)            { }
+static inline void perf_event_init(void)                               { }
+
+#endif
+
+#define perf_output_put(handle, x) \
+       perf_output_copy((handle), &(x), sizeof(x))     /* takes &(x): x must be an lvalue */
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_PERF_EVENT_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h

index b00df4c..07bff66 100644 (file)
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,7 +85,7 @@
  #define PR_SET_TIMERSLACK 29
  #define PR_GET_TIMERSLACK 30
  
-#define PR_TASK_PERF_COUNTERS_DISABLE          31
-#define PR_TASK_PERF_COUNTERS_ENABLE           32
+#define PR_TASK_PERF_EVENTS_DISABLE            31
+#define PR_TASK_PERF_EVENTS_ENABLE             32
  
  #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 8af3d24..8b265a8 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -100,7 +100,7 @@ struct robust_list_head;
  struct bio;
  struct fs_struct;
  struct bts_context;
-struct perf_counter_context;
+struct perf_event_context;
  
  /*
   * List of flags we want to share for kernel threads,
@@ -701,7 +701,7 @@ struct user_struct {
  #endif
  #endif
  
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
         atomic_long_t locked_vm;
  #endif
  };
@@ -1449,10 +1449,10 @@ struct task_struct {
         struct list_head pi_state_list;
         struct futex_pi_state *pi_state_cache;
  #endif
-#ifdef CONFIG_PERF_COUNTERS
-       struct perf_counter_context *perf_counter_ctxp;
-       struct mutex perf_counter_mutex;
-       struct list_head perf_counter_list;
+#ifdef CONFIG_PERF_EVENTS
+       struct perf_event_context *perf_event_ctxp;
+       struct mutex perf_event_mutex;
+       struct list_head perf_event_list;
  #endif
  #ifdef CONFIG_NUMA
         struct mempolicy *mempolicy;    /* Protected by alloc_lock */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h

index a8e3782..02f19f9 100644 (file)
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,7 +55,7 @@ struct compat_timeval;
  struct robust_list_head;
  struct getcpu_cache;
  struct old_linux_dirent;
-struct perf_counter_attr;
+struct perf_event_attr;
  
  #include <linux/types.h>
  #include <linux/aio_abi.h>
@@ -885,7 +885,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
  int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
  
  
-asmlinkage long sys_perf_counter_open(
-               struct perf_counter_attr __user *attr_uptr,
+asmlinkage long sys_perf_event_open(
+               struct perf_event_attr __user *attr_uptr,
                 pid_t pid, int cpu, int group_fd, unsigned long flags);
  #endif
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h

index 72a3b43..ec91e78 100644 (file)
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -378,7 +378,7 @@ static inline int ftrace_get_offsets_##call(                                \
  #ifdef CONFIG_EVENT_PROFILE
  
  /*
- * Generate the functions needed for tracepoint perf_counter support.
+ * Generate the functions needed for tracepoint perf_event support.
   *
   * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
   *
@@ -656,7 +656,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {         \
   * {
   *     struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
   *     struct ftrace_event_call *event_call = &event_<call>;
- *     extern void perf_tpcounter_event(int, u64, u64, void *, int);
+ *     extern void perf_tp_event(int, u64, u64, void *, int);
   *     struct ftrace_raw_##call *entry;
   *     u64 __addr = 0, __count = 1;
   *     unsigned long irq_flags;
@@ -691,7 +691,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {         \
   *
   *             <assign>  <- affect our values
   *
- *             perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *             perf_tp_event(event_call->id, __addr, __count, entry,
   *                          __entry_size);  <- submit them to perf counter
   *     } while (0);
   *
@@ -712,7 +712,7 @@ static void ftrace_profile_##call(proto)                            \
  {                                                                      \
         struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
         struct ftrace_event_call *event_call = &event_##call;           \
-       extern void perf_tpcounter_event(int, u64, u64, void *, int);   \
+       extern void perf_tp_event(int, u64, u64, void *, int);  \
         struct ftrace_raw_##call *entry;                                \
         u64 __addr = 0, __count = 1;                                    \
         unsigned long irq_flags;                                        \
@@ -742,7 +742,7 @@ static void ftrace_profile_##call(proto)                            \
                                                                         \
                 { assign; }                                             \
                                                                         \
-               perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+               perf_tp_event(event_call->id, __addr, __count, entry,\
                              __entry_size);                             \
         } while (0);                                                    \
                                                                         \
diff --git a/init/Kconfig b/init/Kconfig

index 8e8b76d..cfdf5c3 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -915,17 +915,17 @@ config AIO
            by some high performance threaded applications. Disabling
            this option saves about 7k.
  
-config HAVE_PERF_COUNTERS
+config HAVE_PERF_EVENTS
         bool
         help
           See tools/perf/design.txt for details.
  
  menu "Performance Counters"
  
-config PERF_COUNTERS
+config PERF_EVENTS
         bool "Kernel Performance Counters"
         default y if PROFILING
-       depends on HAVE_PERF_COUNTERS
+       depends on HAVE_PERF_EVENTS
         select ANON_INODES
         help
           Enable kernel support for performance counter hardware.
@@ -947,7 +947,7 @@ config PERF_COUNTERS
  
  config EVENT_PROFILE
         bool "Tracepoint profiling sources"
-       depends on PERF_COUNTERS && EVENT_TRACING
+       depends on PERF_EVENTS && EVENT_TRACING
         default y
         help
          Allow the use of tracepoints as software performance counters.
diff --git a/kernel/Makefile b/kernel/Makefile

index 3d9c7e2..e26a546 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -96,7 +96,7 @@ obj-$(CONFIG_X86_DS) += trace/
  obj-$(CONFIG_RING_BUFFER) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
  obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_PERF_EVENTS) += perf_event.o
  
  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c

index ae5d866..e47ee8a 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,7 +47,7 @@
  #include <linux/tracehook.h>
  #include <linux/fs_struct.h>
  #include <linux/init_task.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <trace/events/sched.h>
  
  #include <asm/uaccess.h>
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
  {
         struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  
-#ifdef CONFIG_PERF_COUNTERS
-       WARN_ON_ONCE(tsk->perf_counter_ctxp);
+#ifdef CONFIG_PERF_EVENTS
+       WARN_ON_ONCE(tsk->perf_event_ctxp);
  #endif
         trace_sched_process_free(tsk);
         put_task_struct(tsk);
@@ -981,7 +981,7 @@ NORET_TYPE void do_exit(long code)
          * Flush inherited counters to the parent - before the parent
          * gets woken up by child-exit notifications.
          */
-       perf_counter_exit_task(tsk);
+       perf_event_exit_task(tsk);
  
         exit_notify(tsk, group_dead);
  #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c

index bfee931..2cebfb2 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,7 +61,7 @@
  #include <linux/blkdev.h>
  #include <linux/fs_struct.h>
  #include <linux/magic.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
@@ -1078,7 +1078,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         /* Perform scheduler related setup. Assign this task to a CPU. */
         sched_fork(p, clone_flags);
  
-       retval = perf_counter_init_task(p);
+       retval = perf_event_init_task(p);
         if (retval)
                 goto bad_fork_cleanup_policy;
  
@@ -1253,7 +1253,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         write_unlock_irq(&tasklist_lock);
         proc_fork_connector(p);
         cgroup_post_fork(p);
-       perf_counter_fork(p);
+       perf_event_fork(p);
         return p;
  
  bad_fork_free_pid:
@@ -1280,7 +1280,7 @@ bad_fork_cleanup_semundo:
  bad_fork_cleanup_audit:
         audit_free(p);
  bad_fork_cleanup_policy:
-       perf_counter_free_task(p);
+       perf_event_free_task(p);
  #ifdef CONFIG_NUMA
         mpol_put(p->mempolicy);
  bad_fork_cleanup_cgroup:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c

deleted file mode 100644 (file)

index 62de0db..0000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,5000 +0,0 @@
-/*
- * Performance counter core code
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- *  For licensing details see kernel-base/COPYING
- */
-
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/sysfs.h>
-#include <linux/dcache.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/vmstat.h>
-#include <linux/hardirq.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/anon_inodes.h>
-#include <linux/kernel_stat.h>
-#include <linux/perf_counter.h>
-
-#include <asm/irq_regs.h>
-
-/*
- * Each CPU has a list of per CPU counters:
- */
-DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
-int perf_max_counters __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
-static atomic_t nr_counters __read_mostly;
-static atomic_t nr_mmap_counters __read_mostly;
-static atomic_t nr_comm_counters __read_mostly;
-static atomic_t nr_task_counters __read_mostly;
-
-/*
- * perf counter paranoia level:
- *  -1 - not paranoid at all
- *   0 - disallow raw tracepoint access for unpriv
- *   1 - disallow cpu counters for unpriv
- *   2 - disallow kernel profiling for unpriv
- */
-int sysctl_perf_counter_paranoid __read_mostly = 1;
-
-static inline bool perf_paranoid_tracepoint_raw(void)
-{
-       return sysctl_perf_counter_paranoid > -1;
-}
-
-static inline bool perf_paranoid_cpu(void)
-{
-       return sysctl_perf_counter_paranoid > 0;
-}
-
-static inline bool perf_paranoid_kernel(void)
-{
-       return sysctl_perf_counter_paranoid > 1;
-}
-
-int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
-
-/*
- * max perf counter sample rate
- */
-int sysctl_perf_counter_sample_rate __read_mostly = 100000;
-
-static atomic64_t perf_counter_id;
-
-/*
- * Lock for (sysadmin-configurable) counter reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
-
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
-       return NULL;
-}
-
-void __weak hw_perf_disable(void)              { barrier(); }
-void __weak hw_perf_enable(void)               { barrier(); }
-
-void __weak hw_perf_counter_setup(int cpu)     { barrier(); }
-void __weak hw_perf_counter_setup_online(int cpu)      { barrier(); }
-
-int __weak
-hw_perf_group_sched_in(struct perf_counter *group_leader,
-              struct perf_cpu_context *cpuctx,
-              struct perf_counter_context *ctx, int cpu)
-{
-       return 0;
-}
-
-void __weak perf_counter_print_debug(void)     { }
-
-static DEFINE_PER_CPU(int, perf_disable_count);
-
-void __perf_disable(void)
-{
-       __get_cpu_var(perf_disable_count)++;
-}
-
-bool __perf_enable(void)
-{
-       return !--__get_cpu_var(perf_disable_count);
-}
-
-void perf_disable(void)
-{
-       __perf_disable();
-       hw_perf_disable();
-}
-
-void perf_enable(void)
-{
-       if (__perf_enable())
-               hw_perf_enable();
-}
-
-static void get_ctx(struct perf_counter_context *ctx)
-{
-       WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
-}
-
-static void free_ctx(struct rcu_head *head)
-{
-       struct perf_counter_context *ctx;
-
-       ctx = container_of(head, struct perf_counter_context, rcu_head);
-       kfree(ctx);
-}
-
-static void put_ctx(struct perf_counter_context *ctx)
-{
-       if (atomic_dec_and_test(&ctx->refcount)) {
-               if (ctx->parent_ctx)
-                       put_ctx(ctx->parent_ctx);
-               if (ctx->task)
-                       put_task_struct(ctx->task);
-               call_rcu(&ctx->rcu_head, free_ctx);
-       }
-}
-
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
-       if (ctx->parent_ctx) {
-               put_ctx(ctx->parent_ctx);
-               ctx->parent_ctx = NULL;
-       }
-}
-
-/*
- * If we inherit counters we want to return the parent counter id
- * to userspace.
- */
-static u64 primary_counter_id(struct perf_counter *counter)
-{
-       u64 id = counter->id;
-
-       if (counter->parent)
-               id = counter->parent->id;
-
-       return id;
-}
-
-/*
- * Get the perf_counter_context for a task and lock it.
- * This has to cope with the fact that until it is locked,
- * the context could get moved to another task.
- */
-static struct perf_counter_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
-{
-       struct perf_counter_context *ctx;
-
-       rcu_read_lock();
- retry:
-       ctx = rcu_dereference(task->perf_counter_ctxp);
-       if (ctx) {
-               /*
-                * If this context is a clone of another, it might
-                * get swapped for another underneath us by
-                * perf_counter_task_sched_out, though the
-                * rcu_read_lock() protects us from any context
-                * getting freed.  Lock the context and check if it
-                * got swapped before we could get the lock, and retry
-                * if so.  If we locked the right context, then it
-                * can't get swapped on us any more.
-                */
-               spin_lock_irqsave(&ctx->lock, *flags);
-               if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
-                       spin_unlock_irqrestore(&ctx->lock, *flags);
-                       goto retry;
-               }
-
-               if (!atomic_inc_not_zero(&ctx->refcount)) {
-                       spin_unlock_irqrestore(&ctx->lock, *flags);
-                       ctx = NULL;
-               }
-       }
-       rcu_read_unlock();
-       return ctx;
-}
-
-/*
- * Get the context for a task and increment its pin_count so it
- * can't get swapped to another task.  This also increments its
- * reference count so that the context can't get freed.
- */
-static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
-{
-       struct perf_counter_context *ctx;
-       unsigned long flags;
-
-       ctx = perf_lock_task_context(task, &flags);
-       if (ctx) {
-               ++ctx->pin_count;
-               spin_unlock_irqrestore(&ctx->lock, flags);
-       }
-       return ctx;
-}
-
-static void perf_unpin_context(struct perf_counter_context *ctx)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&ctx->lock, flags);
-       --ctx->pin_count;
-       spin_unlock_irqrestore(&ctx->lock, flags);
-       put_ctx(ctx);
-}
-
-/*
- * Add a counter to the lists for its context.
- * Must be called with ctx->mutex and ctx->lock held.
- */
-static void
-list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-{
-       struct perf_counter *group_leader = counter->group_leader;
-
-       /*
-        * Depending on whether it is a standalone or sibling counter,
-        * add it straight to the context's counter list, or to the group
-        * leader's sibling list:
-        */
-       if (group_leader == counter)
-               list_add_tail(&counter->group_entry, &ctx->group_list);
-       else {
-               list_add_tail(&counter->group_entry, &group_leader->sibling_list);
-               group_leader->nr_siblings++;
-       }
-
-       list_add_rcu(&counter->event_entry, &ctx->event_list);
-       ctx->nr_counters++;
-       if (counter->attr.inherit_stat)
-               ctx->nr_stat++;
-}
-
-/*
- * Remove a counter from the lists for its context.
- * Must be called with ctx->mutex and ctx->lock held.
- */
-static void
-list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-{
-       struct perf_counter *sibling, *tmp;
-
-       if (list_empty(&counter->group_entry))
-               return;
-       ctx->nr_counters--;
-       if (counter->attr.inherit_stat)
-               ctx->nr_stat--;
-
-       list_del_init(&counter->group_entry);
-       list_del_rcu(&counter->event_entry);
-
-       if (counter->group_leader != counter)
-               counter->group_leader->nr_siblings--;
-
-       /*
-        * If this was a group counter with sibling counters then
-        * upgrade the siblings to singleton counters by adding them
-        * to the context list directly:
-        */
-       list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, group_entry) {
-
-               list_move_tail(&sibling->group_entry, &ctx->group_list);
-               sibling->group_leader = sibling;
-       }
-}
-
-static void
-counter_sched_out(struct perf_counter *counter,
-                 struct perf_cpu_context *cpuctx,
-                 struct perf_counter_context *ctx)
-{
-       if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-               return;
-
-       counter->state = PERF_COUNTER_STATE_INACTIVE;
-       if (counter->pending_disable) {
-               counter->pending_disable = 0;
-               counter->state = PERF_COUNTER_STATE_OFF;
-       }
-       counter->tstamp_stopped = ctx->time;
-       counter->pmu->disable(counter);
-       counter->oncpu = -1;
-
-       if (!is_software_counter(counter))
-               cpuctx->active_oncpu--;
-       ctx->nr_active--;
-       if (counter->attr.exclusive || !cpuctx->active_oncpu)
-               cpuctx->exclusive = 0;
-}
-
-static void
-group_sched_out(struct perf_counter *group_counter,
-               struct perf_cpu_context *cpuctx,
-               struct perf_counter_context *ctx)
-{
-       struct perf_counter *counter;
-
-       if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
-               return;
-
-       counter_sched_out(group_counter, cpuctx, ctx);
-
-       /*
-        * Schedule out siblings (if any):
-        */
-       list_for_each_entry(counter, &group_counter->sibling_list, group_entry)
-               counter_sched_out(counter, cpuctx, ctx);
-
-       if (group_counter->attr.exclusive)
-               cpuctx->exclusive = 0;
-}
-
-/*
- * Cross CPU call to remove a performance counter
- *
- * We disable the counter on the hardware level first. After that we
- * remove it from the context list.
- */
-static void __perf_counter_remove_from_context(void *info)
-{
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_counter *counter = info;
-       struct perf_counter_context *ctx = counter->ctx;
-
-       /*
-        * If this is a task context, we need to check whether it is
-        * the current task context of this cpu. If not it has been
-        * scheduled out before the smp call arrived.
-        */
-       if (ctx->task && cpuctx->task_ctx != ctx)
-               return;
-
-       spin_lock(&ctx->lock);
-       /*
-        * Protect the list operation against NMI by disabling the
-        * counters on a global level.
-        */
-       perf_disable();
-
-       counter_sched_out(counter, cpuctx, ctx);
-
-       list_del_counter(counter, ctx);
-
-       if (!ctx->task) {
-               /*
-                * Allow more per task counters with respect to the
-                * reservation:
-                */
-               cpuctx->max_pertask =
-                       min(perf_max_counters - ctx->nr_counters,
-                           perf_max_counters - perf_reserved_percpu);
-       }
-
-       perf_enable();
-       spin_unlock(&ctx->lock);
-}
-
-
-/*
- * Remove the counter from a task's (or a CPU's) list of counters.
- *
- * Must be called with ctx->mutex held.
- *
- * CPU counters are removed with a smp call. For task counters we only
- * call when the task is on a CPU.
- *
- * If counter->ctx is a cloned context, callers must make sure that
- * every task struct that counter->ctx->task could possibly point to
- * remains valid.  This is OK when called from perf_release since
- * that only calls us on the top-level context, which can't be a clone.
- * When called from perf_counter_exit_task, it's OK because the
- * context has been detached from its task.
- */
-static void perf_counter_remove_from_context(struct perf_counter *counter)
-{
-       struct perf_counter_context *ctx = counter->ctx;
-       struct task_struct *task = ctx->task;
-
-       if (!task) {
-               /*
-                * Per cpu counters are removed via an smp call and
-                * the removal is always successful.
-                */
-               smp_call_function_single(counter->cpu,
-                                        __perf_counter_remove_from_context,
-                                        counter, 1);
-               return;
-       }
-
-retry:
-       task_oncpu_function_call(task, __perf_counter_remove_from_context,
-                                counter);
-
-       spin_lock_irq(&ctx->lock);
-       /*
-        * If the context is active we need to retry the smp call.
-        */
-       if (ctx->nr_active && !list_empty(&counter->group_entry)) {
-               spin_unlock_irq(&ctx->lock);
-               goto retry;
-       }
-
-       /*
-        * The lock prevents that this context is scheduled in so we
-        * can remove the counter safely, if the call above did not
-        * succeed.
-        */
-       if (!list_empty(&counter->group_entry)) {
-               list_del_counter(counter, ctx);
-       }
-       spin_unlock_irq(&ctx->lock);
-}
-
-static inline u64 perf_clock(void)
-{
-       return cpu_clock(smp_processor_id());
-}
-
-/*
- * Update the record of the current time in a context.
- */
-static void update_context_time(struct perf_counter_context *ctx)
-{
-       u64 now = perf_clock();
-
-       ctx->time += now - ctx->timestamp;
-       ctx->timestamp = now;
-}
-
-/*
- * Update the total_time_enabled and total_time_running fields for a counter.
- */
-static void update_counter_times(struct perf_counter *counter)
-{
-       struct perf_counter_context *ctx = counter->ctx;
-       u64 run_end;
-
-       if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-           counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
-               return;
-
-       counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
-
-       if (counter->state == PERF_COUNTER_STATE_INACTIVE)
-               run_end = counter->tstamp_stopped;
-       else
-               run_end = ctx->time;
-
-       counter->total_time_running = run_end - counter->tstamp_running;
-}
-
-/*
- * Update total_time_enabled and total_time_running for all counters in a group.
- */
-static void update_group_times(struct perf_counter *leader)
-{
-       struct perf_counter *counter;
-
-       update_counter_times(leader);
-       list_for_each_entry(counter, &leader->sibling_list, group_entry)
-               update_counter_times(counter);
-}
-
-/*
- * Cross CPU call to disable a performance counter
- */
-static void __perf_counter_disable(void *info)
-{
-       struct perf_counter *counter = info;
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_counter_context *ctx = counter->ctx;
-
-       /*
-        * If this is a per-task counter, need to check whether this
-        * counter's task is the current task on this cpu.
-        */
-       if (ctx->task && cpuctx->task_ctx != ctx)
-               return;
-
-       spin_lock(&ctx->lock);
-
-       /*
-        * If the counter is on, turn it off.
-        * If it is in error state, leave it in error state.
-        */
-       if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
-               update_context_time(ctx);
-               update_group_times(counter);
-               if (counter == counter->group_leader)
-                       group_sched_out(counter, cpuctx, ctx);
-               else
-                       counter_sched_out(counter, cpuctx, ctx);
-               counter->state = PERF_COUNTER_STATE_OFF;
-       }
-
-       spin_unlock(&ctx->lock);
-}
-
-/*
- * Disable a counter.
- *
- * If counter->ctx is a cloned context, callers must make sure that
- * every task struct that counter->ctx->task could possibly point to
- * remains valid.  This condition is satisfied when called through
- * perf_counter_for_each_child or perf_counter_for_each because they
- * hold the top-level counter's child_mutex, so any descendant that
- * goes to exit will block in sync_child_counter.
- * When called from perf_pending_counter it's OK because counter->ctx
- * is the current context on this CPU and preemption is disabled,
- * hence we can't get into perf_counter_task_sched_out for this context.
- */
-static void perf_counter_disable(struct perf_counter *counter)
-{
-       struct perf_counter_context *ctx = counter->ctx;
-       struct task_struct *task = ctx->task;
-
-       if (!task) {
-               /*
-                * Disable the counter on the cpu that it's on
-                */
-               smp_call_function_single(counter->cpu, __perf_counter_disable,
-                                        counter, 1);
-               return;
-       }
-
- retry:
-       task_oncpu_function_call(task, __perf_counter_disable, counter);
-
-       spin_lock_irq(&ctx->lock);
-       /*
-        * If the counter is still active, we need to retry the cross-call.
-        */
-       if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-               spin_unlock_irq(&ctx->lock);
-               goto retry;
-       }
-
-       /*
-        * Since we have the lock this context can't be scheduled
-        * in, so we can change the state safely.
-        */
-       if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-               update_group_times(counter);
-               counter->state = PERF_COUNTER_STATE_OFF;
-       }
-
-       spin_unlock_irq(&ctx->lock);
-}
-
-static int
-counter_sched_in(struct perf_counter *counter,
-                struct perf_cpu_context *cpuctx,
-                struct perf_counter_context *ctx,
-                int cpu)
-{
-       if (counter->state <= PERF_COUNTER_STATE_OFF)
-               return 0;
-
-       counter->state = PERF_COUNTER_STATE_ACTIVE;
-       counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
-       /*
-        * The new state must be visible before we turn it on in the hardware:
-        */
-       smp_wmb();
-
-       if (counter->pmu->enable(counter)) {
-               counter->state = PERF_COUNTER_STATE_INACTIVE;
-               counter->oncpu = -1;
-               return -EAGAIN;
-       }
-
-       counter->tstamp_running += ctx->time - counter->tstamp_stopped;
-
-       if (!is_software_counter(counter))
-               cpuctx->active_oncpu++;
-       ctx->nr_active++;
-
-       if (counter->attr.exclusive)
-               cpuctx->exclusive = 1;
-
-       return 0;
-}
-
-static int
-group_sched_in(struct perf_counter *group_counter,
-              struct perf_cpu_context *cpuctx,
-              struct perf_counter_context *ctx,
-              int cpu)
-{
-       struct perf_counter *counter, *partial_group;
-       int ret;
-
-       if (group_counter->state == PERF_COUNTER_STATE_OFF)
-               return 0;
-
-       ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
-       if (ret)
-               return ret < 0 ? ret : 0;
-
-       if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
-               return -EAGAIN;
-
-       /*
-        * Schedule in siblings as one group (if any):
-        */
-       list_for_each_entry(counter, &group_counter->sibling_list, group_entry) {
-               if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
-                       partial_group = counter;
-                       goto group_error;
-               }
-       }
-
-       return 0;
-
-group_error:
-       /*
-        * Groups can be scheduled in as one unit only, so undo any
-        * partial group before returning:
-        */
-       list_for_each_entry(counter, &group_counter->sibling_list, group_entry) {
-               if (counter == partial_group)
-                       break;
-               counter_sched_out(counter, cpuctx, ctx);
-       }
-       counter_sched_out(group_counter, cpuctx, ctx);
-
-       return -EAGAIN;
-}
-
-/*
- * Return 1 for a group consisting entirely of software counters,
- * 0 if the group contains any hardware counters.
- */
-static int is_software_only_group(struct perf_counter *leader)
-{
-       struct perf_counter *counter;
-
-       if (!is_software_counter(leader))
-               return 0;
-
-       list_for_each_entry(counter, &leader->sibling_list, group_entry)
-               if (!is_software_counter(counter))
-                       return 0;
-
-       return 1;
-}
-
-/*
- * Work out whether we can put this counter group on the CPU now.
- */
-static int group_can_go_on(struct perf_counter *counter,
-                          struct perf_cpu_context *cpuctx,
-                          int can_add_hw)
-{
-       /*
-        * Groups consisting entirely of software counters can always go on.
-        */
-       if (is_software_only_group(counter))
-               return 1;
-       /*
-        * If an exclusive group is already on, no other hardware
-        * counters can go on.
-        */
-       if (cpuctx->exclusive)
-               return 0;
-       /*
-        * If this group is exclusive and there are already
-        * counters on the CPU, it can't go on.
-        */
-       if (counter->attr.exclusive && cpuctx->active_oncpu)
-               return 0;
-       /*
-        * Otherwise, try to add it if all previous groups were able
-        * to go on.
-        */
-       return can_add_hw;
-}
-
-static void add_counter_to_ctx(struct perf_counter *counter,
-                              struct perf_counter_context *ctx)
-{
-       list_add_counter(counter, ctx);
-       counter->tstamp_enabled = ctx->time;
-       counter->tstamp_running = ctx->time;
-       counter->tstamp_stopped = ctx->time;
-}
-
-/*
- * Cross CPU call to install and enable a performance counter
- *
- * Must be called with ctx->mutex held
- */
-static void __perf_install_in_context(void *info)
-{
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_counter *counter = info;
-       struct perf_counter_context *ctx = counter->ctx;
-       struct perf_counter *leader = counter->group_leader;
-       int cpu = smp_processor_id();
-       int err;
-
-       /*
-        * If this is a task context, we need to check whether it is
-        * the current task context of this cpu. If not it has been
-        * scheduled out before the smp call arrived.
-        * Or possibly this is the right context but it isn't
-        * on this cpu because it had no counters.
-        */
-       if (ctx->task && cpuctx->task_ctx != ctx) {
-               if (cpuctx->task_ctx || ctx->task != current)
-                       return;
-               cpuctx->task_ctx = ctx;
-       }
-
-       spin_lock(&ctx->lock);
-       ctx->is_active = 1;
-       update_context_time(ctx);
-
-       /*
-        * Protect the list operation against NMI by disabling the
-        * counters on a global level. NOP for non NMI based counters.
-        */
-       perf_disable();
-
-       add_counter_to_ctx(counter, ctx);
-
-       /*
-        * Don't put the counter on if it is disabled or if
-        * it is in a group and the group isn't on.
-        */
-       if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
-           (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
-               goto unlock;
-
-       /*
-        * An exclusive counter can't go on if there are already active
-        * hardware counters, and no hardware counter can go on if there
-        * is already an exclusive counter on.
-        */
-       if (!group_can_go_on(counter, cpuctx, 1))
-               err = -EEXIST;
-       else
-               err = counter_sched_in(counter, cpuctx, ctx, cpu);
-
-       if (err) {
-               /*
-                * This counter couldn't go on.  If it is in a group
-                * then we have to pull the whole group off.
-                * If the counter group is pinned then put it in error state.
-                */
-               if (leader != counter)
-                       group_sched_out(leader, cpuctx, ctx);
-               if (leader->attr.pinned) {
-                       update_group_times(leader);
-                       leader->state = PERF_COUNTER_STATE_ERROR;
-               }
-       }
-
-       if (!err && !ctx->task && cpuctx->max_pertask)
-               cpuctx->max_pertask--;
-
- unlock:
-       perf_enable();
-
-       spin_unlock(&ctx->lock);
-}
-
-/*
- * Attach a performance counter to a context
- *
- * First we add the counter to the list with the hardware enable bit
- * in counter->hw_config cleared.
- *
- * If the counter is attached to a task which is on a CPU we use a smp
- * call to enable it in the task context. The task might have been
- * scheduled away, but we check this in the smp call again.
- *
- * Must be called with ctx->mutex held.
- */
-static void
-perf_install_in_context(struct perf_counter_context *ctx,
-                       struct perf_counter *counter,
-                       int cpu)
-{
-       struct task_struct *task = ctx->task;
-
-       if (!task) {
-               /*
-                * Per cpu counters are installed via an smp call and
-                * the install is always successful.
-                */
-               smp_call_function_single(cpu, __perf_install_in_context,
-                                        counter, 1);
-               return;
-       }
-
-retry:
-       task_oncpu_function_call(task, __perf_install_in_context,
-                                counter);
-
-       spin_lock_irq(&ctx->lock);
-       /*
-        * Context active but counter not yet listed: retry the smp call.
-        */
-       if (ctx->is_active && list_empty(&counter->group_entry)) {
-               spin_unlock_irq(&ctx->lock);
-               goto retry;
-       }
-
-       /*
-        * The lock prevents that this context is scheduled in so we
-        * can add the counter safely, if the call above did not
-        * succeed.
-        */
-       if (list_empty(&counter->group_entry))
-               add_counter_to_ctx(counter, ctx);
-       spin_unlock_irq(&ctx->lock);
-}
-
-/*
- * Put a counter into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_counter_mark_enabled(struct perf_counter *counter,
-                                       struct perf_counter_context *ctx)
-{
-       struct perf_counter *sub;
-
-       counter->state = PERF_COUNTER_STATE_INACTIVE;
-       counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
-       list_for_each_entry(sub, &counter->sibling_list, group_entry)
-               if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
-                       sub->tstamp_enabled =
-                               ctx->time - sub->total_time_enabled;
-}
-
-/*
- * Cross CPU call to enable a performance counter
- */
-static void __perf_counter_enable(void *info)
-{
-       struct perf_counter *counter = info;
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_counter_context *ctx = counter->ctx;
-       struct perf_counter *leader = counter->group_leader;
-       int err;
-
-       /*
-        * If this is a per-task counter, need to check whether this
-        * counter's task is the current task on this cpu.
-        */
-       if (ctx->task && cpuctx->task_ctx != ctx) {
-               if (cpuctx->task_ctx || ctx->task != current)
-                       return;
-               cpuctx->task_ctx = ctx;
-       }
-
-       spin_lock(&ctx->lock);
-       ctx->is_active = 1;
-       update_context_time(ctx);
-
-       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-               goto unlock;
-       __perf_counter_mark_enabled(counter, ctx);
-
-       /*
-        * If the counter is in a group and isn't the group leader,
-        * then don't put it on unless the group is on.
-        */
-       if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
-               goto unlock;
-
-       if (!group_can_go_on(counter, cpuctx, 1)) {
-               err = -EEXIST;
-       } else {
-               perf_disable();
-               if (counter == leader)
-                       err = group_sched_in(counter, cpuctx, ctx,
-                                            smp_processor_id());
-               else
-                       err = counter_sched_in(counter, cpuctx, ctx,
-                                              smp_processor_id());
-               perf_enable();
-       }
-
-       if (err) {
-               /*
-                * If this counter can't go on and it's part of a
-                * group, then the whole group has to come off.
-                */
-               if (leader != counter)
-                       group_sched_out(leader, cpuctx, ctx);
-               if (leader->attr.pinned) {
-                       update_group_times(leader);
-                       leader->state = PERF_COUNTER_STATE_ERROR;
-               }
-       }
-
- unlock:
-       spin_unlock(&ctx->lock);
-}
-
-/*
- * Enable a counter.
- *
- * If counter->ctx is a cloned context, callers must make sure that
- * every task struct that counter->ctx->task could possibly point to
- * remains valid.  This condition is satisfied when called through
- * perf_counter_for_each_child or perf_counter_for_each as described
- * for perf_counter_disable.
- */
-static void perf_counter_enable(struct perf_counter *counter)
-{
-       struct perf_counter_context *ctx = counter->ctx;
-       struct task_struct *task = ctx->task;
-
-       if (!task) {
-               /*
-                * Enable the counter on the cpu that it's on
-                */
-               smp_call_function_single(counter->cpu, __perf_counter_enable,
-                                        counter, 1);
-               return;
-       }
-
-       spin_lock_irq(&ctx->lock);
-       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-               goto out;
-
-       /*
-        * If the counter is in error state, clear that first.
-        * That way, if we see the counter in error state below, we
-        * know that it has gone back into error state, as distinct
-        * from the task having been scheduled away before the
-        * cross-call arrived.
-        */
-       if (counter->state == PERF_COUNTER_STATE_ERROR)
-               counter->state = PERF_COUNTER_STATE_OFF;
-
- retry:
-       spin_unlock_irq(&ctx->lock);
-       task_oncpu_function_call(task, __perf_counter_enable, counter);
-
-       spin_lock_irq(&ctx->lock);
-
-       /*
-        * If the context is active and the counter is still off,
-        * we need to retry the cross-call.
-        */
-       if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
-               goto retry;
-
-       /*
-        * Since we have the lock this context can't be scheduled
-        * in, so we can change the state safely.
-        */
-       if (counter->state == PERF_COUNTER_STATE_OFF)
-               __perf_counter_mark_enabled(counter, ctx);
-
- out:
-       spin_unlock_irq(&ctx->lock);
-}
-
-static int perf_counter_refresh(struct perf_counter *counter, int refresh)
-{
-       /*
-        * not supported on inherited counters
-        */
-       if (counter->attr.inherit)
-               return -EINVAL;
-
-       atomic_add(refresh, &counter->event_limit);
-       perf_counter_enable(counter);
-
-       return 0;
-}
-
-void __perf_counter_sched_out(struct perf_counter_context *ctx,
-                             struct perf_cpu_context *cpuctx)
-{
-       struct perf_counter *counter;
-
-       spin_lock(&ctx->lock);
-       ctx->is_active = 0;
-       if (likely(!ctx->nr_counters))
-               goto out;
-       update_context_time(ctx);
-
-       perf_disable();
-       if (ctx->nr_active) {
-               list_for_each_entry(counter, &ctx->group_list, group_entry) {
-                       if (counter != counter->group_leader)
-                               counter_sched_out(counter, cpuctx, ctx);
-                       else
-                               group_sched_out(counter, cpuctx, ctx);
-               }
-       }
-       perf_enable();
- out:
-       spin_unlock(&ctx->lock);
-}
-
-/*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled counters.
- * If the number of enabled counters is the same, then the set
- * of enabled counters should be the same, because these are both
- * inherited contexts, therefore we can't access individual counters
- * in them directly with an fd; we can only enable/disable all
- * counters via prctl, or enable/disable all counters in a family
- * via ioctl, which will have the same effect on both contexts.
- */
-static int context_equiv(struct perf_counter_context *ctx1,
-                        struct perf_counter_context *ctx2)
-{
-       return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-               && ctx1->parent_gen == ctx2->parent_gen
-               && !ctx1->pin_count && !ctx2->pin_count;
-}
-
-static void __perf_counter_read(void *counter);
-
-static void __perf_counter_sync_stat(struct perf_counter *counter,
-                                    struct perf_counter *next_counter)
-{
-       u64 value;
-
-       if (!counter->attr.inherit_stat)
-               return;
-
-       /*
-        * Update the counter value, we cannot use perf_counter_read()
-        * because we're in the middle of a context switch and have IRQs
-        * disabled, which upsets smp_call_function_single(), however
-        * we know the counter must be on the current CPU, therefore we
-        * don't need to use it.
-        */
-       switch (counter->state) {
-       case PERF_COUNTER_STATE_ACTIVE:
-               __perf_counter_read(counter);
-               break;
-
-       case PERF_COUNTER_STATE_INACTIVE:
-               update_counter_times(counter);
-               break;
-
-       default:
-               break;
-       }
-
-       /*
-        * In order to keep per-task stats reliable we need to flip the counter
-        * values when we flip the contexts.
-        */
-       value = atomic64_read(&next_counter->count);
-       value = atomic64_xchg(&counter->count, value);
-       atomic64_set(&next_counter->count, value);
-
-       swap(counter->total_time_enabled, next_counter->total_time_enabled);
-       swap(counter->total_time_running, next_counter->total_time_running);
-
-       /*
-        * Since we swizzled the values, update the user visible data too.
-        */
-       perf_counter_update_userpage(counter);
-       perf_counter_update_userpage(next_counter);
-}
-
-#define list_next_entry(pos, member) \
-       list_entry(pos->member.next, typeof(*pos), member)
-
-static void perf_counter_sync_stat(struct perf_counter_context *ctx,
-                                  struct perf_counter_context *next_ctx)
-{
-       struct perf_counter *counter, *next_counter;
-
-       if (!ctx->nr_stat)
-               return;
-
-       counter = list_first_entry(&ctx->event_list,
-                                  struct perf_counter, event_entry);
-
-       next_counter = list_first_entry(&next_ctx->event_list,
-                                       struct perf_counter, event_entry);
-
-       while (&counter->event_entry != &ctx->event_list &&
-              &next_counter->event_entry != &next_ctx->event_list) {
-
-               __perf_counter_sync_stat(counter, next_counter);
-
-               counter = list_next_entry(counter, event_entry);
-               next_counter = list_next_entry(next_counter, event_entry);
-       }
-}
-
-/*
- * Called from scheduler to remove the counters of the current task,
- * with interrupts disabled.
- *
- * We stop each counter and update the counter value in counter->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * not restart the counter.
- */
-void perf_counter_task_sched_out(struct task_struct *task,
-                                struct task_struct *next, int cpu)
-{
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-       struct perf_counter_context *ctx = task->perf_counter_ctxp;
-       struct perf_counter_context *next_ctx;
-       struct perf_counter_context *parent;
-       struct pt_regs *regs;
-       int do_switch = 1;
-
-       regs = task_pt_regs(task);
-       perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
-
-       if (likely(!ctx || !cpuctx->task_ctx))
-               return;
-
-       update_context_time(ctx);
-
-       rcu_read_lock();
-       parent = rcu_dereference(ctx->parent_ctx);
-       next_ctx = next->perf_counter_ctxp;
-       if (parent && next_ctx &&
-           rcu_dereference(next_ctx->parent_ctx) == parent) {
-               /*
-                * Looks like the two contexts are clones, so we might be
-                * able to optimize the context switch.  We lock both
-                * contexts and check that they are clones under the
-                * lock (including re-checking that neither has been
-                * uncloned in the meantime).  It doesn't matter which
-                * order we take the locks because no other cpu could
-                * be trying to lock both of these tasks.
-                */
-               spin_lock(&ctx->lock);
-               spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
-               if (context_equiv(ctx, next_ctx)) {
-                       /*
-                        * XXX do we need a memory barrier of sorts
-                        * wrt to rcu_dereference() of perf_counter_ctxp
-                        */
-                       task->perf_counter_ctxp = next_ctx;
-                       next->perf_counter_ctxp = ctx;
-                       ctx->task = next;
-                       next_ctx->task = task;
-                       do_switch = 0;
-
-                       perf_counter_sync_stat(ctx, next_ctx);
-               }
-               spin_unlock(&next_ctx->lock);
-               spin_unlock(&ctx->lock);
-       }
-       rcu_read_unlock();
-
-       if (do_switch) {
-               __perf_counter_sched_out(ctx, cpuctx);
-               cpuctx->task_ctx = NULL;
-       }
-}
-
-/*
- * Called with IRQs disabled
- */
-static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
-{
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-
-       if (!cpuctx->task_ctx)
-               return;
-
-       if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
-               return;
-
-       __perf_counter_sched_out(ctx, cpuctx);
-       cpuctx->task_ctx = NULL;
-}
-
-/*
- * Called with IRQs disabled
- */
-static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
-{
-       __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
-}
-
-static void
-__perf_counter_sched_in(struct perf_counter_context *ctx,
-                       struct perf_cpu_context *cpuctx, int cpu)
-{
-       struct perf_counter *counter;
-       int can_add_hw = 1;
-
-       spin_lock(&ctx->lock);
-       ctx->is_active = 1;
-       if (likely(!ctx->nr_counters))
-               goto out;
-
-       ctx->timestamp = perf_clock();
-
-       perf_disable();
-
-       /*
-        * First go through the list and put on any pinned groups
-        * in order to give them the best chance of going on.
-        */
-       list_for_each_entry(counter, &ctx->group_list, group_entry) {
-               if (counter->state <= PERF_COUNTER_STATE_OFF ||
-                   !counter->attr.pinned)
-                       continue;
-               if (counter->cpu != -1 && counter->cpu != cpu)
-                       continue;
-
-               if (counter != counter->group_leader)
-                       counter_sched_in(counter, cpuctx, ctx, cpu);
-               else {
-                       if (group_can_go_on(counter, cpuctx, 1))
-                               group_sched_in(counter, cpuctx, ctx, cpu);
-               }
-
-               /*
-                * If this pinned group hasn't been scheduled,
-                * put it in error state.
-                */
-               if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-                       update_group_times(counter);
-                       counter->state = PERF_COUNTER_STATE_ERROR;
-               }
-       }
-
-       list_for_each_entry(counter, &ctx->group_list, group_entry) {
-               /*
-                * Ignore counters in OFF or ERROR state, and
-                * ignore pinned counters since we did them already.
-                */
-               if (counter->state <= PERF_COUNTER_STATE_OFF ||
-                   counter->attr.pinned)
-                       continue;
-
-               /*
-                * Listen to the 'cpu' scheduling filter constraint
-                * of counters:
-                */
-               if (counter->cpu != -1 && counter->cpu != cpu)
-                       continue;
-
-               if (counter != counter->group_leader) {
-                       if (counter_sched_in(counter, cpuctx, ctx, cpu))
-                               can_add_hw = 0;
-               } else {
-                       if (group_can_go_on(counter, cpuctx, can_add_hw)) {
-                               if (group_sched_in(counter, cpuctx, ctx, cpu))
-                                       can_add_hw = 0;
-                       }
-               }
-       }
-       perf_enable();
- out:
-       spin_unlock(&ctx->lock);
-}
-
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
-{
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-       struct perf_counter_context *ctx = task->perf_counter_ctxp;
-
-       if (likely(!ctx))
-               return;
-       if (cpuctx->task_ctx == ctx)
-               return;
-       __perf_counter_sched_in(ctx, cpuctx, cpu);
-       cpuctx->task_ctx = ctx;
-}
-
-static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
-{
-       struct perf_counter_context *ctx = &cpuctx->ctx;
-
-       __perf_counter_sched_in(ctx, cpuctx, cpu);
-}
-
-#define MAX_INTERRUPTS (~0ULL)
-
-static void perf_log_throttle(struct perf_counter *counter, int enable);
-
-static void perf_adjust_period(struct perf_counter *counter, u64 events)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       u64 period, sample_period;
-       s64 delta;
-
-       events *= hwc->sample_period;
-       period = div64_u64(events, counter->attr.sample_freq);
-
-       delta = (s64)(period - hwc->sample_period);
-       delta = (delta + 7) / 8; /* low pass filter */
-
-       sample_period = hwc->sample_period + delta;
-
-       if (!sample_period)
-               sample_period = 1;
-
-       hwc->sample_period = sample_period;
-}
-
-static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
-{
-       struct perf_counter *counter;
-       struct hw_perf_counter *hwc;
-       u64 interrupts, freq;
-
-       spin_lock(&ctx->lock);
-       list_for_each_entry(counter, &ctx->group_list, group_entry) {
-               if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-                       continue;
-
-               hwc = &counter->hw;
-
-               interrupts = hwc->interrupts;
-               hwc->interrupts = 0;
-
-               /*
-                * unthrottle counters on the tick
-                */
-               if (interrupts == MAX_INTERRUPTS) {
-                       perf_log_throttle(counter, 1);
-                       counter->pmu->unthrottle(counter);
-                       interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
-               }
-
-               if (!counter->attr.freq || !counter->attr.sample_freq)
-                       continue;
-
-               /*
-                * if the specified freq < HZ then we need to skip ticks
-                */
-               if (counter->attr.sample_freq < HZ) {
-                       freq = counter->attr.sample_freq;
-
-                       hwc->freq_count += freq;
-                       hwc->freq_interrupts += interrupts;
-
-                       if (hwc->freq_count < HZ)
-                               continue;
-
-                       interrupts = hwc->freq_interrupts;
-                       hwc->freq_interrupts = 0;
-                       hwc->freq_count -= HZ;
-               } else
-                       freq = HZ;
-
-               perf_adjust_period(counter, freq * interrupts);
-
-               /*
-                * In order to avoid being stalled by an (accidental) huge
-                * sample period, force reset the sample period if we didn't
-                * get any events in this freq period.
-                */
-               if (!interrupts) {
-                       perf_disable();
-                       counter->pmu->disable(counter);
-                       atomic64_set(&hwc->period_left, 0);
-                       counter->pmu->enable(counter);
-                       perf_enable();
-               }
-       }
-       spin_unlock(&ctx->lock);
-}
-
-/*
- * Round-robin a context's counters:
- */
-static void rotate_ctx(struct perf_counter_context *ctx)
-{
-       struct perf_counter *counter;
-
-       if (!ctx->nr_counters)
-               return;
-
-       spin_lock(&ctx->lock);
-       /*
-        * Rotate the first entry last (works just fine for group counters too):
-        */
-       perf_disable();
-       list_for_each_entry(counter, &ctx->group_list, group_entry) {
-               list_move_tail(&counter->group_entry, &ctx->group_list);
-               break;
-       }
-       perf_enable();
-
-       spin_unlock(&ctx->lock);
-}
-
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
-{
-       struct perf_cpu_context *cpuctx;
-       struct perf_counter_context *ctx;
-
-       if (!atomic_read(&nr_counters))
-               return;
-
-       cpuctx = &per_cpu(perf_cpu_context, cpu);
-       ctx = curr->perf_counter_ctxp;
-
-       perf_ctx_adjust_freq(&cpuctx->ctx);
-       if (ctx)
-               perf_ctx_adjust_freq(ctx);
-
-       perf_counter_cpu_sched_out(cpuctx);
-       if (ctx)
-               __perf_counter_task_sched_out(ctx);
-
-       rotate_ctx(&cpuctx->ctx);
-       if (ctx)
-               rotate_ctx(ctx);
-
-       perf_counter_cpu_sched_in(cpuctx, cpu);
-       if (ctx)
-               perf_counter_task_sched_in(curr, cpu);
-}
-
-/*
- * Enable all of a task's counters that have been marked enable-on-exec.
- * This expects task == current.
- */
-static void perf_counter_enable_on_exec(struct task_struct *task)
-{
-       struct perf_counter_context *ctx;
-       struct perf_counter *counter;
-       unsigned long flags;
-       int enabled = 0;
-
-       local_irq_save(flags);
-       ctx = task->perf_counter_ctxp;
-       if (!ctx || !ctx->nr_counters)
-               goto out;
-
-       __perf_counter_task_sched_out(ctx);
-
-       spin_lock(&ctx->lock);
-
-       list_for_each_entry(counter, &ctx->group_list, group_entry) {
-               if (!counter->attr.enable_on_exec)
-                       continue;
-               counter->attr.enable_on_exec = 0;
-               if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-                       continue;
-               __perf_counter_mark_enabled(counter, ctx);
-               enabled = 1;
-       }
-
-       /*
-        * Unclone this context if we enabled any counter.
-        */
-       if (enabled)
-               unclone_ctx(ctx);
-
-       spin_unlock(&ctx->lock);
-
-       perf_counter_task_sched_in(task, smp_processor_id());
- out:
-       local_irq_restore(flags);
-}
-
-/*
- * Cross CPU call to read the hardware counter
- */
-static void __perf_counter_read(void *info)
-{
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_counter *counter = info;
-       struct perf_counter_context *ctx = counter->ctx;
-       unsigned long flags;
-
-       /*
-        * If this is a task context, we need to check whether it is
-        * the current task context of this cpu.  If not it has been
-        * scheduled out before the smp call arrived.  In that case
-        * counter->count would have been updated to a recent sample
-        * when the counter was scheduled out.
-        */
-       if (ctx->task && cpuctx->task_ctx != ctx)
-               return;
-
-       local_irq_save(flags);
-       if (ctx->is_active)
-               update_context_time(ctx);
-       counter->pmu->read(counter);
-       update_counter_times(counter);
-       local_irq_restore(flags);
-}
-
-static u64 perf_counter_read(struct perf_counter *counter)
-{
-       /*
-        * If counter is enabled and currently active on a CPU, update the
-        * value in the counter structure:
-        */
-       if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-               smp_call_function_single(counter->oncpu,
-                                        __perf_counter_read, counter, 1);
-       } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-               update_counter_times(counter);
-       }
-
-       return atomic64_read(&counter->count);
-}
-
-/*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-                           struct task_struct *task)
-{
-       memset(ctx, 0, sizeof(*ctx));
-       spin_lock_init(&ctx->lock);
-       mutex_init(&ctx->mutex);
-       INIT_LIST_HEAD(&ctx->group_list);
-       INIT_LIST_HEAD(&ctx->event_list);
-       atomic_set(&ctx->refcount, 1);
-       ctx->task = task;
-}
-
-static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
-{
-       struct perf_counter_context *ctx;
-       struct perf_cpu_context *cpuctx;
-       struct task_struct *task;
-       unsigned long flags;
-       int err;
-
-       /*
-        * If cpu is not a wildcard then this is a percpu counter:
-        */
-       if (cpu != -1) {
-               /* Must be root to operate on a CPU counter: */
-               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                       return ERR_PTR(-EACCES);
-
-               if (cpu < 0 || cpu > num_possible_cpus())
-                       return ERR_PTR(-EINVAL);
-
-               /*
-                * We could be clever and allow to attach a counter to an
-                * offline CPU and activate it when the CPU comes up, but
-                * that's for later.
-                */
-               if (!cpu_isset(cpu, cpu_online_map))
-                       return ERR_PTR(-ENODEV);
-
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
-               get_ctx(ctx);
-
-               return ctx;
-       }
-
-       rcu_read_lock();
-       if (!pid)
-               task = current;
-       else
-               task = find_task_by_vpid(pid);
-       if (task)
-               get_task_struct(task);
-       rcu_read_unlock();
-
-       if (!task)
-               return ERR_PTR(-ESRCH);
-
-       /*
-        * Can't attach counters to a dying task.
-        */
-       err = -ESRCH;
-       if (task->flags & PF_EXITING)
-               goto errout;
-
-       /* Reuse ptrace permission checks for now. */
-       err = -EACCES;
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
-               goto errout;
-
- retry:
-       ctx = perf_lock_task_context(task, &flags);
-       if (ctx) {
-               unclone_ctx(ctx);
-               spin_unlock_irqrestore(&ctx->lock, flags);
-       }
-
-       if (!ctx) {
-               ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-               err = -ENOMEM;
-               if (!ctx)
-                       goto errout;
-               __perf_counter_init_context(ctx, task);
-               get_ctx(ctx);
-               if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
-                       /*
-                        * We raced with some other task; use
-                        * the context they set.
-                        */
-                       kfree(ctx);
-                       goto retry;
-               }
-               get_task_struct(task);
-       }
-
-       put_task_struct(task);
-       return ctx;
-
- errout:
-       put_task_struct(task);
-       return ERR_PTR(err);
-}
-
-static void free_counter_rcu(struct rcu_head *head)
-{
-       struct perf_counter *counter;
-
-       counter = container_of(head, struct perf_counter, rcu_head);
-       if (counter->ns)
-               put_pid_ns(counter->ns);
-       kfree(counter);
-}
-
-static void perf_pending_sync(struct perf_counter *counter);
-
-static void free_counter(struct perf_counter *counter)
-{
-       perf_pending_sync(counter);
-
-       if (!counter->parent) {
-               atomic_dec(&nr_counters);
-               if (counter->attr.mmap)
-                       atomic_dec(&nr_mmap_counters);
-               if (counter->attr.comm)
-                       atomic_dec(&nr_comm_counters);
-               if (counter->attr.task)
-                       atomic_dec(&nr_task_counters);
-       }
-
-       if (counter->output) {
-               fput(counter->output->filp);
-               counter->output = NULL;
-       }
-
-       if (counter->destroy)
-               counter->destroy(counter);
-
-       put_ctx(counter->ctx);
-       call_rcu(&counter->rcu_head, free_counter_rcu);
-}
-
-/*
- * Called when the last reference to the file is gone.
- */
-static int perf_release(struct inode *inode, struct file *file)
-{
-       struct perf_counter *counter = file->private_data;
-       struct perf_counter_context *ctx = counter->ctx;
-
-       file->private_data = NULL;
-
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
-       perf_counter_remove_from_context(counter);
-       mutex_unlock(&ctx->mutex);
-
-       mutex_lock(&counter->owner->perf_counter_mutex);
-       list_del_init(&counter->owner_entry);
-       mutex_unlock(&counter->owner->perf_counter_mutex);
-       put_task_struct(counter->owner);
-
-       free_counter(counter);
-
-       return 0;
-}
-
-static int perf_counter_read_size(struct perf_counter *counter)
-{
-       int entry = sizeof(u64); /* value */
-       int size = 0;
-       int nr = 1;
-
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-               size += sizeof(u64);
-
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-               size += sizeof(u64);
-
-       if (counter->attr.read_format & PERF_FORMAT_ID)
-               entry += sizeof(u64);
-
-       if (counter->attr.read_format & PERF_FORMAT_GROUP) {
-               nr += counter->group_leader->nr_siblings;
-               size += sizeof(u64);
-       }
-
-       size += entry * nr;
-
-       return size;
-}
-
-static u64 perf_counter_read_value(struct perf_counter *counter)
-{
-       struct perf_counter *child;
-       u64 total = 0;
-
-       total += perf_counter_read(counter);
-       list_for_each_entry(child, &counter->child_list, child_list)
-               total += perf_counter_read(child);
-
-       return total;
-}
-
-static int perf_counter_read_entry(struct perf_counter *counter,
-                                  u64 read_format, char __user *buf)
-{
-       int n = 0, count = 0;
-       u64 values[2];
-
-       values[n++] = perf_counter_read_value(counter);
-       if (read_format & PERF_FORMAT_ID)
-               values[n++] = primary_counter_id(counter);
-
-       count = n * sizeof(u64);
-
-       if (copy_to_user(buf, values, count))
-               return -EFAULT;
-
-       return count;
-}
-
-static int perf_counter_read_group(struct perf_counter *counter,
-                                  u64 read_format, char __user *buf)
-{
-       struct perf_counter *leader = counter->group_leader, *sub;
-       int n = 0, size = 0, err = -EFAULT;
-       u64 values[3];
-
-       values[n++] = 1 + leader->nr_siblings;
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-               values[n++] = leader->total_time_enabled +
-                       atomic64_read(&leader->child_total_time_enabled);
-       }
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-               values[n++] = leader->total_time_running +
-                       atomic64_read(&leader->child_total_time_running);
-       }
-
-       size = n * sizeof(u64);
-
-       if (copy_to_user(buf, values, size))
-               return -EFAULT;
-
-       err = perf_counter_read_entry(leader, read_format, buf + size);
-       if (err < 0)
-               return err;
-
-       size += err;
-
-       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-               err = perf_counter_read_entry(sub, read_format,
-                               buf + size);
-               if (err < 0)
-                       return err;
-
-               size += err;
-       }
-
-       return size;
-}
-
-static int perf_counter_read_one(struct perf_counter *counter,
-                                u64 read_format, char __user *buf)
-{
-       u64 values[4];
-       int n = 0;
-
-       values[n++] = perf_counter_read_value(counter);
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-               values[n++] = counter->total_time_enabled +
-                       atomic64_read(&counter->child_total_time_enabled);
-       }
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-               values[n++] = counter->total_time_running +
-                       atomic64_read(&counter->child_total_time_running);
-       }
-       if (read_format & PERF_FORMAT_ID)
-               values[n++] = primary_counter_id(counter);
-
-       if (copy_to_user(buf, values, n * sizeof(u64)))
-               return -EFAULT;
-
-       return n * sizeof(u64);
-}
-
-/*
- * Read the performance counter - simple non blocking version for now
- */
-static ssize_t
-perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
-{
-       u64 read_format = counter->attr.read_format;
-       int ret;
-
-       /*
-        * Return end-of-file for a read on a counter that is in
-        * error state (i.e. because it was pinned but it couldn't be
-        * scheduled on to the CPU at some point).
-        */
-       if (counter->state == PERF_COUNTER_STATE_ERROR)
-               return 0;
-
-       if (count < perf_counter_read_size(counter))
-               return -ENOSPC;
-
-       WARN_ON_ONCE(counter->ctx->parent_ctx);
-       mutex_lock(&counter->child_mutex);
-       if (read_format & PERF_FORMAT_GROUP)
-               ret = perf_counter_read_group(counter, read_format, buf);
-       else
-               ret = perf_counter_read_one(counter, read_format, buf);
-       mutex_unlock(&counter->child_mutex);
-
-       return ret;
-}
-
-static ssize_t
-perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
-       struct perf_counter *counter = file->private_data;
-
-       return perf_read_hw(counter, buf, count);
-}
-
-static unsigned int perf_poll(struct file *file, poll_table *wait)
-{
-       struct perf_counter *counter = file->private_data;
-       struct perf_mmap_data *data;
-       unsigned int events = POLL_HUP;
-
-       rcu_read_lock();
-       data = rcu_dereference(counter->data);
-       if (data)
-               events = atomic_xchg(&data->poll, 0);
-       rcu_read_unlock();
-
-       poll_wait(file, &counter->waitq, wait);
-
-       return events;
-}
-
-static void perf_counter_reset(struct perf_counter *counter)
-{
-       (void)perf_counter_read(counter);
-       atomic64_set(&counter->count, 0);
-       perf_counter_update_userpage(counter);
-}
-
-/*
- * Holding the top-level counter's child_mutex means that any
- * descendant process that has inherited this counter will block
- * in sync_child_counter if it goes to exit, thus satisfying the
- * task existence requirements of perf_counter_enable/disable.
- */
-static void perf_counter_for_each_child(struct perf_counter *counter,
-                                       void (*func)(struct perf_counter *))
-{
-       struct perf_counter *child;
-
-       WARN_ON_ONCE(counter->ctx->parent_ctx);
-       mutex_lock(&counter->child_mutex);
-       func(counter);
-       list_for_each_entry(child, &counter->child_list, child_list)
-               func(child);
-       mutex_unlock(&counter->child_mutex);
-}
-
-static void perf_counter_for_each(struct perf_counter *counter,
-                                 void (*func)(struct perf_counter *))
-{
-       struct perf_counter_context *ctx = counter->ctx;
-       struct perf_counter *sibling;
-
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
-       counter = counter->group_leader;
-
-       perf_counter_for_each_child(counter, func);
-       func(counter);
-       list_for_each_entry(sibling, &counter->sibling_list, group_entry)
-               perf_counter_for_each_child(counter, func);
-       mutex_unlock(&ctx->mutex);
-}
-
-static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
-{
-       struct perf_counter_context *ctx = counter->ctx;
-       unsigned long size;
-       int ret = 0;
-       u64 value;
-
-       if (!counter->attr.sample_period)
-               return -EINVAL;
-
-       size = copy_from_user(&value, arg, sizeof(value));
-       if (size != sizeof(value))
-               return -EFAULT;
-
-       if (!value)
-               return -EINVAL;
-
-       spin_lock_irq(&ctx->lock);
-       if (counter->attr.freq) {
-               if (value > sysctl_perf_counter_sample_rate) {
-                       ret = -EINVAL;
-                       goto unlock;
-               }
-
-               counter->attr.sample_freq = value;
-       } else {
-               counter->attr.sample_period = value;
-               counter->hw.sample_period = value;
-       }
-unlock:
-       spin_unlock_irq(&ctx->lock);
-
-       return ret;
-}
-
-int perf_counter_set_output(struct perf_counter *counter, int output_fd);
-
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-       struct perf_counter *counter = file->private_data;
-       void (*func)(struct perf_counter *);
-       u32 flags = arg;
-
-       switch (cmd) {
-       case PERF_COUNTER_IOC_ENABLE:
-               func = perf_counter_enable;
-               break;
-       case PERF_COUNTER_IOC_DISABLE:
-               func = perf_counter_disable;
-               break;
-       case PERF_COUNTER_IOC_RESET:
-               func = perf_counter_reset;
-               break;
-
-       case PERF_COUNTER_IOC_REFRESH:
-               return perf_counter_refresh(counter, arg);
-
-       case PERF_COUNTER_IOC_PERIOD:
-               return perf_counter_period(counter, (u64 __user *)arg);
-
-       case PERF_COUNTER_IOC_SET_OUTPUT:
-               return perf_counter_set_output(counter, arg);
-
-       default:
-               return -ENOTTY;
-       }
-
-       if (flags & PERF_IOC_FLAG_GROUP)
-               perf_counter_for_each(counter, func);
-       else
-               perf_counter_for_each_child(counter, func);
-
-       return 0;
-}
-
-int perf_counter_task_enable(void)
-{
-       struct perf_counter *counter;
-
-       mutex_lock(&current->perf_counter_mutex);
-       list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-               perf_counter_for_each_child(counter, perf_counter_enable);
-       mutex_unlock(&current->perf_counter_mutex);
-
-       return 0;
-}
-
-int perf_counter_task_disable(void)
-{
-       struct perf_counter *counter;
-
-       mutex_lock(&current->perf_counter_mutex);
-       list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-               perf_counter_for_each_child(counter, perf_counter_disable);
-       mutex_unlock(&current->perf_counter_mutex);
-
-       return 0;
-}
-
-#ifndef PERF_COUNTER_INDEX_OFFSET
-# define PERF_COUNTER_INDEX_OFFSET 0
-#endif
-
-static int perf_counter_index(struct perf_counter *counter)
-{
-       if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-               return 0;
-
-       return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
-}
-
-/*
- * Callers need to ensure there can be no nesting of this function, otherwise
- * the seqlock logic goes bad. We can not serialize this because the arch
- * code calls this from NMI context.
- */
-void perf_counter_update_userpage(struct perf_counter *counter)
-{
-       struct perf_counter_mmap_page *userpg;
-       struct perf_mmap_data *data;
-
-       rcu_read_lock();
-       data = rcu_dereference(counter->data);
-       if (!data)
-               goto unlock;
-
-       userpg = data->user_page;
-
-       /*
-        * Disable preemption so as to not let the corresponding user-space
-        * spin too long if we get preempted.
-        */
-       preempt_disable();
-       ++userpg->lock;
-       barrier();
-       userpg->index = perf_counter_index(counter);
-       userpg->offset = atomic64_read(&counter->count);
-       if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-               userpg->offset -= atomic64_read(&counter->hw.prev_count);
-
-       userpg->time_enabled = counter->total_time_enabled +
-                       atomic64_read(&counter->child_total_time_enabled);
-
-       userpg->time_running = counter->total_time_running +
-                       atomic64_read(&counter->child_total_time_running);
-
-       barrier();
-       ++userpg->lock;
-       preempt_enable();
-unlock:
-       rcu_read_unlock();
-}
-
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       struct perf_counter *counter = vma->vm_file->private_data;
-       struct perf_mmap_data *data;
-       int ret = VM_FAULT_SIGBUS;
-
-       if (vmf->flags & FAULT_FLAG_MKWRITE) {
-               if (vmf->pgoff == 0)
-                       ret = 0;
-               return ret;
-       }
-
-       rcu_read_lock();
-       data = rcu_dereference(counter->data);
-       if (!data)
-               goto unlock;
-
-       if (vmf->pgoff == 0) {
-               vmf->page = virt_to_page(data->user_page);
-       } else {
-               int nr = vmf->pgoff - 1;
-
-               if ((unsigned)nr > data->nr_pages)
-                       goto unlock;
-
-               if (vmf->flags & FAULT_FLAG_WRITE)
-                       goto unlock;
-
-               vmf->page = virt_to_page(data->data_pages[nr]);
-       }
-
-       get_page(vmf->page);
-       vmf->page->mapping = vma->vm_file->f_mapping;
-       vmf->page->index   = vmf->pgoff;
-
-       ret = 0;
-unlock:
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
-{
-       struct perf_mmap_data *data;
-       unsigned long size;
-       int i;
-
-       WARN_ON(atomic_read(&counter->mmap_count));
-
-       size = sizeof(struct perf_mmap_data);
-       size += nr_pages * sizeof(void *);
-
-       data = kzalloc(size, GFP_KERNEL);
-       if (!data)
-               goto fail;
-
-       data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
-       if (!data->user_page)
-               goto fail_user_page;
-
-       for (i = 0; i < nr_pages; i++) {
-               data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
-               if (!data->data_pages[i])
-                       goto fail_data_pages;
-       }
-
-       data->nr_pages = nr_pages;
-       atomic_set(&data->lock, -1);
-
-       if (counter->attr.watermark) {
-               data->watermark = min_t(long, PAGE_SIZE * nr_pages,
-                                     counter->attr.wakeup_watermark);
-       }
-       if (!data->watermark)
-               data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
-
-       rcu_assign_pointer(counter->data, data);
-
-       return 0;
-
-fail_data_pages:
-       for (i--; i >= 0; i--)
-               free_page((unsigned long)data->data_pages[i]);
-
-       free_page((unsigned long)data->user_page);
-
-fail_user_page:
-       kfree(data);
-
-fail:
-       return -ENOMEM;
-}
-
-static void perf_mmap_free_page(unsigned long addr)
-{
-       struct page *page = virt_to_page((void *)addr);
-
-       page->mapping = NULL;
-       __free_page(page);
-}
-
-static void __perf_mmap_data_free(struct rcu_head *rcu_head)
-{
-       struct perf_mmap_data *data;
-       int i;
-
-       data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
-
-       perf_mmap_free_page((unsigned long)data->user_page);
-       for (i = 0; i < data->nr_pages; i++)
-               perf_mmap_free_page((unsigned long)data->data_pages[i]);
-
-       kfree(data);
-}
-
-static void perf_mmap_data_free(struct perf_counter *counter)
-{
-       struct perf_mmap_data *data = counter->data;
-
-       WARN_ON(atomic_read(&counter->mmap_count));
-
-       rcu_assign_pointer(counter->data, NULL);
-       call_rcu(&data->rcu_head, __perf_mmap_data_free);
-}
-
-static void perf_mmap_open(struct vm_area_struct *vma)
-{
-       struct perf_counter *counter = vma->vm_file->private_data;
-
-       atomic_inc(&counter->mmap_count);
-}
-
-static void perf_mmap_close(struct vm_area_struct *vma)
-{
-       struct perf_counter *counter = vma->vm_file->private_data;
-
-       WARN_ON_ONCE(counter->ctx->parent_ctx);
-       if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
-               struct user_struct *user = current_user();
-
-               atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
-               vma->vm_mm->locked_vm -= counter->data->nr_locked;
-               perf_mmap_data_free(counter);
-               mutex_unlock(&counter->mmap_mutex);
-       }
-}
-
-static struct vm_operations_struct perf_mmap_vmops = {
-       .open           = perf_mmap_open,
-       .close          = perf_mmap_close,
-       .fault          = perf_mmap_fault,
-       .page_mkwrite   = perf_mmap_fault,
-};
-
-static int perf_mmap(struct file *file, struct vm_area_struct *vma)
-{
-       struct perf_counter *counter = file->private_data;
-       unsigned long user_locked, user_lock_limit;
-       struct user_struct *user = current_user();
-       unsigned long locked, lock_limit;
-       unsigned long vma_size;
-       unsigned long nr_pages;
-       long user_extra, extra;
-       int ret = 0;
-
-       if (!(vma->vm_flags & VM_SHARED))
-               return -EINVAL;
-
-       vma_size = vma->vm_end - vma->vm_start;
-       nr_pages = (vma_size / PAGE_SIZE) - 1;
-
-       /*
-        * If we have data pages ensure they're a power-of-two number, so we
-        * can do bitmasks instead of modulo.
-        */
-       if (nr_pages != 0 && !is_power_of_2(nr_pages))
-               return -EINVAL;
-
-       if (vma_size != PAGE_SIZE * (1 + nr_pages))
-               return -EINVAL;
-
-       if (vma->vm_pgoff != 0)
-               return -EINVAL;
-
-       WARN_ON_ONCE(counter->ctx->parent_ctx);
-       mutex_lock(&counter->mmap_mutex);
-       if (counter->output) {
-               ret = -EINVAL;
-               goto unlock;
-       }
-
-       if (atomic_inc_not_zero(&counter->mmap_count)) {
-               if (nr_pages != counter->data->nr_pages)
-                       ret = -EINVAL;
-               goto unlock;
-       }
-
-       user_extra = nr_pages + 1;
-       user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
-
-       /*
-        * Increase the limit linearly with more CPUs:
-        */
-       user_lock_limit *= num_online_cpus();
-
-       user_locked = atomic_long_read(&user->locked_vm) + user_extra;
-
-       extra = 0;
-       if (user_locked > user_lock_limit)
-               extra = user_locked - user_lock_limit;
-
-       lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-       lock_limit >>= PAGE_SHIFT;
-       locked = vma->vm_mm->locked_vm + extra;
-
-       if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
-               !capable(CAP_IPC_LOCK)) {
-               ret = -EPERM;
-               goto unlock;
-       }
-
-       WARN_ON(counter->data);
-       ret = perf_mmap_data_alloc(counter, nr_pages);
-       if (ret)
-               goto unlock;
-
-       atomic_set(&counter->mmap_count, 1);
-       atomic_long_add(user_extra, &user->locked_vm);
-       vma->vm_mm->locked_vm += extra;
-       counter->data->nr_locked = extra;
-       if (vma->vm_flags & VM_WRITE)
-               counter->data->writable = 1;
-
-unlock:
-       mutex_unlock(&counter->mmap_mutex);
-
-       vma->vm_flags |= VM_RESERVED;
-       vma->vm_ops = &perf_mmap_vmops;
-
-       return ret;
-}
-
-static int perf_fasync(int fd, struct file *filp, int on)
-{
-       struct inode *inode = filp->f_path.dentry->d_inode;
-       struct perf_counter *counter = filp->private_data;
-       int retval;
-
-       mutex_lock(&inode->i_mutex);
-       retval = fasync_helper(fd, filp, on, &counter->fasync);
-       mutex_unlock(&inode->i_mutex);
-
-       if (retval < 0)
-               return retval;
-
-       return 0;
-}
-
-static const struct file_operations perf_fops = {
-       .release                = perf_release,
-       .read                   = perf_read,
-       .poll                   = perf_poll,
-       .unlocked_ioctl         = perf_ioctl,
-       .compat_ioctl           = perf_ioctl,
-       .mmap                   = perf_mmap,
-       .fasync                 = perf_fasync,
-};
-
-/*
- * Perf counter wakeup
- *
- * If there's data, ensure we set the poll() state and publish everything
- * to user-space before waking everybody up.
- */
-
-void perf_counter_wakeup(struct perf_counter *counter)
-{
-       wake_up_all(&counter->waitq);
-
-       if (counter->pending_kill) {
-               kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
-               counter->pending_kill = 0;
-       }
-}
-
-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
-static void perf_pending_counter(struct perf_pending_entry *entry)
-{
-       struct perf_counter *counter = container_of(entry,
-                       struct perf_counter, pending);
-
-       if (counter->pending_disable) {
-               counter->pending_disable = 0;
-               __perf_counter_disable(counter);
-       }
-
-       if (counter->pending_wakeup) {
-               counter->pending_wakeup = 0;
-               perf_counter_wakeup(counter);
-       }
-}
-
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-       PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
-                              void (*func)(struct perf_pending_entry *))
-{
-       struct perf_pending_entry **head;
-
-       if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-               return;
-
-       entry->func = func;
-
-       head = &get_cpu_var(perf_pending_head);
-
-       do {
-               entry->next = *head;
-       } while (cmpxchg(head, entry->next, entry) != entry->next);
-
-       set_perf_counter_pending();
-
-       put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
-       struct perf_pending_entry *list;
-       int nr = 0;
-
-       list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-       while (list != PENDING_TAIL) {
-               void (*func)(struct perf_pending_entry *);
-               struct perf_pending_entry *entry = list;
-
-               list = list->next;
-
-               func = entry->func;
-               entry->next = NULL;
-               /*
-                * Ensure we observe the unqueue before we issue the wakeup,
-                * so that we won't be waiting forever.
-                * -- see perf_not_pending().
-                */
-               smp_wmb();
-
-               func(entry);
-               nr++;
-       }
-
-       return nr;
-}
-
-static inline int perf_not_pending(struct perf_counter *counter)
-{
-       /*
-        * If we flush on whatever cpu we run, there is a chance we don't
-        * need to wait.
-        */
-       get_cpu();
-       __perf_pending_run();
-       put_cpu();
-
-       /*
-        * Ensure we see the proper queue state before going to sleep
-        * so that we do not miss the wakeup. -- see perf_pending_handle()
-        */
-       smp_rmb();
-       return counter->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_counter *counter)
-{
-       wait_event(counter->waitq, perf_not_pending(counter));
-}
-
-void perf_counter_do_pending(void)
-{
-       __perf_pending_run();
-}
-
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       return NULL;
-}
-
-/*
- * Output
- */
-static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
-                             unsigned long offset, unsigned long head)
-{
-       unsigned long mask;
-
-       if (!data->writable)
-               return true;
-
-       mask = (data->nr_pages << PAGE_SHIFT) - 1;
-
-       offset = (offset - tail) & mask;
-       head   = (head   - tail) & mask;
-
-       if ((int)(head - offset) < 0)
-               return false;
-
-       return true;
-}
-
-static void perf_output_wakeup(struct perf_output_handle *handle)
-{
-       atomic_set(&handle->data->poll, POLL_IN);
-
-       if (handle->nmi) {
-               handle->counter->pending_wakeup = 1;
-               perf_pending_queue(&handle->counter->pending,
-                                  perf_pending_counter);
-       } else
-               perf_counter_wakeup(handle->counter);
-}
-
-/*
- * Curious locking construct.
- *
- * We need to ensure a later event doesn't publish a head when a former
- * event isn't done writing. However since we need to deal with NMIs we
- * cannot fully serialize things.
- *
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
- * We only publish the head (and generate a wakeup) when the outer-most
- * event completes.
- */
-static void perf_output_lock(struct perf_output_handle *handle)
-{
-       struct perf_mmap_data *data = handle->data;
-       int cpu;
-
-       handle->locked = 0;
-
-       local_irq_save(handle->flags);
-       cpu = smp_processor_id();
-
-       if (in_nmi() && atomic_read(&data->lock) == cpu)
-               return;
-
-       while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-               cpu_relax();
-
-       handle->locked = 1;
-}
-
-static void perf_output_unlock(struct perf_output_handle *handle)
-{
-       struct perf_mmap_data *data = handle->data;
-       unsigned long head;
-       int cpu;
-
-       data->done_head = data->head;
-
-       if (!handle->locked)
-               goto out;
-
-again:
-       /*
-        * The xchg implies a full barrier that ensures all writes are done
-        * before we publish the new head, matched by a rmb() in userspace when
-        * reading this position.
-        */
-       while ((head = atomic_long_xchg(&data->done_head, 0)))
-               data->user_page->data_head = head;
-
-       /*
-        * NMI can happen here, which means we can miss a done_head update.
-        */
-
-       cpu = atomic_xchg(&data->lock, -1);
-       WARN_ON_ONCE(cpu != smp_processor_id());
-
-       /*
-        * Therefore we have to validate we did not indeed do so.
-        */
-       if (unlikely(atomic_long_read(&data->done_head))) {
-               /*
-                * Since we had it locked, we can lock it again.
-                */
-               while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
-                       cpu_relax();
-
-               goto again;
-       }
-
-       if (atomic_xchg(&data->wakeup, 0))
-               perf_output_wakeup(handle);
-out:
-       local_irq_restore(handle->flags);
-}
-
-void perf_output_copy(struct perf_output_handle *handle,
-                     const void *buf, unsigned int len)
-{
-       unsigned int pages_mask;
-       unsigned int offset;
-       unsigned int size;
-       void **pages;
-
-       offset          = handle->offset;
-       pages_mask      = handle->data->nr_pages - 1;
-       pages           = handle->data->data_pages;
-
-       do {
-               unsigned int page_offset;
-               int nr;
-
-               nr          = (offset >> PAGE_SHIFT) & pages_mask;
-               page_offset = offset & (PAGE_SIZE - 1);
-               size        = min_t(unsigned int, PAGE_SIZE - page_offset, len);
-
-               memcpy(pages[nr] + page_offset, buf, size);
-
-               len         -= size;
-               buf         += size;
-               offset      += size;
-       } while (len);
-
-       handle->offset = offset;
-
-       /*
-        * Check we didn't copy past our reservation window, taking the
-        * possible unsigned int wrap into account.
-        */
-       WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
-}
-
-int perf_output_begin(struct perf_output_handle *handle,
-                     struct perf_counter *counter, unsigned int size,
-                     int nmi, int sample)
-{
-       struct perf_counter *output_counter;
-       struct perf_mmap_data *data;
-       unsigned long tail, offset, head;
-       int have_lost;
-       struct {
-               struct perf_event_header header;
-               u64                      id;
-               u64                      lost;
-       } lost_event;
-
-       rcu_read_lock();
-       /*
-        * For inherited counters we send all the output towards the parent.
-        */
-       if (counter->parent)
-               counter = counter->parent;
-
-       output_counter = rcu_dereference(counter->output);
-       if (output_counter)
-               counter = output_counter;
-
-       data = rcu_dereference(counter->data);
-       if (!data)
-               goto out;
-
-       handle->data    = data;
-       handle->counter = counter;
-       handle->nmi     = nmi;
-       handle->sample  = sample;
-
-       if (!data->nr_pages)
-               goto fail;
-
-       have_lost = atomic_read(&data->lost);
-       if (have_lost)
-               size += sizeof(lost_event);
-
-       perf_output_lock(handle);
-
-       do {
-               /*
-                * Userspace could choose to issue a mb() before updating the
-                * tail pointer. So that all reads will be completed before the
-                * write is issued.
-                */
-               tail = ACCESS_ONCE(data->user_page->data_tail);
-               smp_rmb();
-               offset = head = atomic_long_read(&data->head);
-               head += size;
-               if (unlikely(!perf_output_space(data, tail, offset, head)))
-                       goto fail;
-       } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
-
-       handle->offset  = offset;
-       handle->head    = head;
-
-       if (head - tail > data->watermark)
-               atomic_set(&data->wakeup, 1);
-
-       if (have_lost) {
-               lost_event.header.type = PERF_EVENT_LOST;
-               lost_event.header.misc = 0;
-               lost_event.header.size = sizeof(lost_event);
-               lost_event.id          = counter->id;
-               lost_event.lost        = atomic_xchg(&data->lost, 0);
-
-               perf_output_put(handle, lost_event);
-       }
-
-       return 0;
-
-fail:
-       atomic_inc(&data->lost);
-       perf_output_unlock(handle);
-out:
-       rcu_read_unlock();
-
-       return -ENOSPC;
-}
-
-void perf_output_end(struct perf_output_handle *handle)
-{
-       struct perf_counter *counter = handle->counter;
-       struct perf_mmap_data *data = handle->data;
-
-       int wakeup_events = counter->attr.wakeup_events;
-
-       if (handle->sample && wakeup_events) {
-               int events = atomic_inc_return(&data->events);
-               if (events >= wakeup_events) {
-                       atomic_sub(wakeup_events, &data->events);
-                       atomic_set(&data->wakeup, 1);
-               }
-       }
-
-       perf_output_unlock(handle);
-       rcu_read_unlock();
-}
-
-static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
-{
-       /*
-        * only top level counters have the pid namespace they were created in
-        */
-       if (counter->parent)
-               counter = counter->parent;
-
-       return task_tgid_nr_ns(p, counter->ns);
-}
-
-static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
-{
-       /*
-        * only top level counters have the pid namespace they were created in
-        */
-       if (counter->parent)
-               counter = counter->parent;
-
-       return task_pid_nr_ns(p, counter->ns);
-}
-
-static void perf_output_read_one(struct perf_output_handle *handle,
-                                struct perf_counter *counter)
-{
-       u64 read_format = counter->attr.read_format;
-       u64 values[4];
-       int n = 0;
-
-       values[n++] = atomic64_read(&counter->count);
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-               values[n++] = counter->total_time_enabled +
-                       atomic64_read(&counter->child_total_time_enabled);
-       }
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-               values[n++] = counter->total_time_running +
-                       atomic64_read(&counter->child_total_time_running);
-       }
-       if (read_format & PERF_FORMAT_ID)
-               values[n++] = primary_counter_id(counter);
-
-       perf_output_copy(handle, values, n * sizeof(u64));
-}
-
-/*
- * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
- */
-static void perf_output_read_group(struct perf_output_handle *handle,
-                           struct perf_counter *counter)
-{
-       struct perf_counter *leader = counter->group_leader, *sub;
-       u64 read_format = counter->attr.read_format;
-       u64 values[5];
-       int n = 0;
-
-       values[n++] = 1 + leader->nr_siblings;
-
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-               values[n++] = leader->total_time_enabled;
-
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-               values[n++] = leader->total_time_running;
-
-       if (leader != counter)
-               leader->pmu->read(leader);
-
-       values[n++] = atomic64_read(&leader->count);
-       if (read_format & PERF_FORMAT_ID)
-               values[n++] = primary_counter_id(leader);
-
-       perf_output_copy(handle, values, n * sizeof(u64));
-
-       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-               n = 0;
-
-               if (sub != counter)
-                       sub->pmu->read(sub);
-
-               values[n++] = atomic64_read(&sub->count);
-               if (read_format & PERF_FORMAT_ID)
-                       values[n++] = primary_counter_id(sub);
-
-               perf_output_copy(handle, values, n * sizeof(u64));
-       }
-}
-
-static void perf_output_read(struct perf_output_handle *handle,
-                            struct perf_counter *counter)
-{
-       if (counter->attr.read_format & PERF_FORMAT_GROUP)
-               perf_output_read_group(handle, counter);
-       else
-               perf_output_read_one(handle, counter);
-}
-
-void perf_output_sample(struct perf_output_handle *handle,
-                       struct perf_event_header *header,
-                       struct perf_sample_data *data,
-                       struct perf_counter *counter)
-{
-       u64 sample_type = data->type;
-
-       perf_output_put(handle, *header);
-
-       if (sample_type & PERF_SAMPLE_IP)
-               perf_output_put(handle, data->ip);
-
-       if (sample_type & PERF_SAMPLE_TID)
-               perf_output_put(handle, data->tid_entry);
-
-       if (sample_type & PERF_SAMPLE_TIME)
-               perf_output_put(handle, data->time);
-
-       if (sample_type & PERF_SAMPLE_ADDR)
-               perf_output_put(handle, data->addr);
-
-       if (sample_type & PERF_SAMPLE_ID)
-               perf_output_put(handle, data->id);
-
-       if (sample_type & PERF_SAMPLE_STREAM_ID)
-               perf_output_put(handle, data->stream_id);
-
-       if (sample_type & PERF_SAMPLE_CPU)
-               perf_output_put(handle, data->cpu_entry);
-
-       if (sample_type & PERF_SAMPLE_PERIOD)
-               perf_output_put(handle, data->period);
-
-       if (sample_type & PERF_SAMPLE_READ)
-               perf_output_read(handle, counter);
-
-       if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-               if (data->callchain) {
-                       int size = 1;
-
-                       if (data->callchain)
-                               size += data->callchain->nr;
-
-                       size *= sizeof(u64);
-
-                       perf_output_copy(handle, data->callchain, size);
-               } else {
-                       u64 nr = 0;
-                       perf_output_put(handle, nr);
-               }
-       }
-
-       if (sample_type & PERF_SAMPLE_RAW) {
-               if (data->raw) {
-                       perf_output_put(handle, data->raw->size);
-                       perf_output_copy(handle, data->raw->data,
-                                        data->raw->size);
-               } else {
-                       struct {
-                               u32     size;
-                               u32     data;
-                       } raw = {
-                               .size = sizeof(u32),
-                               .data = 0,
-                       };
-                       perf_output_put(handle, raw);
-               }
-       }
-}
-
-void perf_prepare_sample(struct perf_event_header *header,
-                        struct perf_sample_data *data,
-                        struct perf_counter *counter,
-                        struct pt_regs *regs)
-{
-       u64 sample_type = counter->attr.sample_type;
-
-       data->type = sample_type;
-
-       header->type = PERF_EVENT_SAMPLE;
-       header->size = sizeof(*header);
-
-       header->misc = 0;
-       header->misc |= perf_misc_flags(regs);
-
-       if (sample_type & PERF_SAMPLE_IP) {
-               data->ip = perf_instruction_pointer(regs);
-
-               header->size += sizeof(data->ip);
-       }
-
-       if (sample_type & PERF_SAMPLE_TID) {
-               /* namespace issues */
-               data->tid_entry.pid = perf_counter_pid(counter, current);
-               data->tid_entry.tid = perf_counter_tid(counter, current);
-
-               header->size += sizeof(data->tid_entry);
-       }
-
-       if (sample_type & PERF_SAMPLE_TIME) {
-               data->time = perf_clock();
-
-               header->size += sizeof(data->time);
-       }
-
-       if (sample_type & PERF_SAMPLE_ADDR)
-               header->size += sizeof(data->addr);
-
-       if (sample_type & PERF_SAMPLE_ID) {
-               data->id = primary_counter_id(counter);
-
-               header->size += sizeof(data->id);
-       }
-
-       if (sample_type & PERF_SAMPLE_STREAM_ID) {
-               data->stream_id = counter->id;
-
-               header->size += sizeof(data->stream_id);
-       }
-
-       if (sample_type & PERF_SAMPLE_CPU) {
-               data->cpu_entry.cpu             = raw_smp_processor_id();
-               data->cpu_entry.reserved        = 0;
-
-               header->size += sizeof(data->cpu_entry);
-       }
-
-       if (sample_type & PERF_SAMPLE_PERIOD)
-               header->size += sizeof(data->period);
-
-       if (sample_type & PERF_SAMPLE_READ)
-               header->size += perf_counter_read_size(counter);
-
-       if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-               int size = 1;
-
-               data->callchain = perf_callchain(regs);
-
-               if (data->callchain)
-                       size += data->callchain->nr;
-
-               header->size += size * sizeof(u64);
-       }
-
-       if (sample_type & PERF_SAMPLE_RAW) {
-               int size = sizeof(u32);
-
-               if (data->raw)
-                       size += data->raw->size;
-               else
-                       size += sizeof(u32);
-
-               WARN_ON_ONCE(size & (sizeof(u64)-1));
-               header->size += size;
-       }
-}
-
-static void perf_counter_output(struct perf_counter *counter, int nmi,
-                               struct perf_sample_data *data,
-                               struct pt_regs *regs)
-{
-       struct perf_output_handle handle;
-       struct perf_event_header header;
-
-       perf_prepare_sample(&header, data, counter, regs);
-
-       if (perf_output_begin(&handle, counter, header.size, nmi, 1))
-               return;
-
-       perf_output_sample(&handle, &header, data, counter);
-
-       perf_output_end(&handle);
-}
-
-/*
- * read event
- */
-
-struct perf_read_event {
-       struct perf_event_header        header;
-
-       u32                             pid;
-       u32                             tid;
-};
-
-static void
-perf_counter_read_event(struct perf_counter *counter,
-                       struct task_struct *task)
-{
-       struct perf_output_handle handle;
-       struct perf_read_event read_event = {
-               .header = {
-                       .type = PERF_EVENT_READ,
-                       .misc = 0,
-                       .size = sizeof(read_event) + perf_counter_read_size(counter),
-               },
-               .pid = perf_counter_pid(counter, task),
-               .tid = perf_counter_tid(counter, task),
-       };
-       int ret;
-
-       ret = perf_output_begin(&handle, counter, read_event.header.size, 0, 0);
-       if (ret)
-               return;
-
-       perf_output_put(&handle, read_event);
-       perf_output_read(&handle, counter);
-
-       perf_output_end(&handle);
-}
-
-/*
- * task tracking -- fork/exit
- *
- * enabled by: attr.comm | attr.mmap | attr.task
- */
-
-struct perf_task_event {
-       struct task_struct              *task;
-       struct perf_counter_context     *task_ctx;
-
-       struct {
-               struct perf_event_header        header;
-
-               u32                             pid;
-               u32                             ppid;
-               u32                             tid;
-               u32                             ptid;
-               u64                             time;
-       } event;
-};
-
-static void perf_counter_task_output(struct perf_counter *counter,
-                                    struct perf_task_event *task_event)
-{
-       struct perf_output_handle handle;
-       int size;
-       struct task_struct *task = task_event->task;
-       int ret;
-
-       size  = task_event->event.header.size;
-       ret = perf_output_begin(&handle, counter, size, 0, 0);
-
-       if (ret)
-               return;
-
-       task_event->event.pid = perf_counter_pid(counter, task);
-       task_event->event.ppid = perf_counter_pid(counter, current);
-
-       task_event->event.tid = perf_counter_tid(counter, task);
-       task_event->event.ptid = perf_counter_tid(counter, current);
-
-       task_event->event.time = perf_clock();
-
-       perf_output_put(&handle, task_event->event);
-
-       perf_output_end(&handle);
-}
-
-static int perf_counter_task_match(struct perf_counter *counter)
-{
-       if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
-               return 1;
-
-       return 0;
-}
-
-static void perf_counter_task_ctx(struct perf_counter_context *ctx,
-                                 struct perf_task_event *task_event)
-{
-       struct perf_counter *counter;
-
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_counter_task_match(counter))
-                       perf_counter_task_output(counter, task_event);
-       }
-       rcu_read_unlock();
-}
-
-static void perf_counter_task_event(struct perf_task_event *task_event)
-{
-       struct perf_cpu_context *cpuctx;
-       struct perf_counter_context *ctx = task_event->task_ctx;
-
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_counter_task_ctx(&cpuctx->ctx, task_event);
-       put_cpu_var(perf_cpu_context);
-
-       rcu_read_lock();
-       if (!ctx)
-               ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
-       if (ctx)
-               perf_counter_task_ctx(ctx, task_event);
-       rcu_read_unlock();
-}
-
-static void perf_counter_task(struct task_struct *task,
-                             struct perf_counter_context *task_ctx,
-                             int new)
-{
-       struct perf_task_event task_event;
-
-       if (!atomic_read(&nr_comm_counters) &&
-           !atomic_read(&nr_mmap_counters) &&
-           !atomic_read(&nr_task_counters))
-               return;
-
-       task_event = (struct perf_task_event){
-               .task     = task,
-               .task_ctx = task_ctx,
-               .event    = {
-                       .header = {
-                               .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
-                               .misc = 0,
-                               .size = sizeof(task_event.event),
-                       },
-                       /* .pid  */
-                       /* .ppid */
-                       /* .tid  */
-                       /* .ptid */
-               },
-       };
-
-       perf_counter_task_event(&task_event);
-}
-
-void perf_counter_fork(struct task_struct *task)
-{
-       perf_counter_task(task, NULL, 1);
-}
-
-/*
- * comm tracking
- */
-
-struct perf_comm_event {
-       struct task_struct      *task;
-       char                    *comm;
-       int                     comm_size;
-
-       struct {
-               struct perf_event_header        header;
-
-               u32                             pid;
-               u32                             tid;
-       } event;
-};
-
-static void perf_counter_comm_output(struct perf_counter *counter,
-                                    struct perf_comm_event *comm_event)
-{
-       struct perf_output_handle handle;
-       int size = comm_event->event.header.size;
-       int ret = perf_output_begin(&handle, counter, size, 0, 0);
-
-       if (ret)
-               return;
-
-       comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
-       comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
-
-       perf_output_put(&handle, comm_event->event);
-       perf_output_copy(&handle, comm_event->comm,
-                                  comm_event->comm_size);
-       perf_output_end(&handle);
-}
-
-static int perf_counter_comm_match(struct perf_counter *counter)
-{
-       if (counter->attr.comm)
-               return 1;
-
-       return 0;
-}
-
-static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
-                                 struct perf_comm_event *comm_event)
-{
-       struct perf_counter *counter;
-
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_counter_comm_match(counter))
-                       perf_counter_comm_output(counter, comm_event);
-       }
-       rcu_read_unlock();
-}
-
-static void perf_counter_comm_event(struct perf_comm_event *comm_event)
-{
-       struct perf_cpu_context *cpuctx;
-       struct perf_counter_context *ctx;
-       unsigned int size;
-       char comm[TASK_COMM_LEN];
-
-       memset(comm, 0, sizeof(comm));
-       strncpy(comm, comm_event->task->comm, sizeof(comm));
-       size = ALIGN(strlen(comm)+1, sizeof(u64));
-
-       comm_event->comm = comm;
-       comm_event->comm_size = size;
-
-       comm_event->event.header.size = sizeof(comm_event->event) + size;
-
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
-       put_cpu_var(perf_cpu_context);
-
-       rcu_read_lock();
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_counter_ctxp);
-       if (ctx)
-               perf_counter_comm_ctx(ctx, comm_event);
-       rcu_read_unlock();
-}
-
-void perf_counter_comm(struct task_struct *task)
-{
-       struct perf_comm_event comm_event;
-
-       if (task->perf_counter_ctxp)
-               perf_counter_enable_on_exec(task);
-
-       if (!atomic_read(&nr_comm_counters))
-               return;
-
-       comm_event = (struct perf_comm_event){
-               .task   = task,
-               /* .comm      */
-               /* .comm_size */
-               .event  = {
-                       .header = {
-                               .type = PERF_EVENT_COMM,
-                               .misc = 0,
-                               /* .size */
-                       },
-                       /* .pid */
-                       /* .tid */
-               },
-       };
-
-       perf_counter_comm_event(&comm_event);
-}
-
-/*
- * mmap tracking
- */
-
-struct perf_mmap_event {
-       struct vm_area_struct   *vma;
-
-       const char              *file_name;
-       int                     file_size;
-
-       struct {
-               struct perf_event_header        header;
-
-               u32                             pid;
-               u32                             tid;
-               u64                             start;
-               u64                             len;
-               u64                             pgoff;
-       } event;
-};
-
-static void perf_counter_mmap_output(struct perf_counter *counter,
-                                    struct perf_mmap_event *mmap_event)
-{
-       struct perf_output_handle handle;
-       int size = mmap_event->event.header.size;
-       int ret = perf_output_begin(&handle, counter, size, 0, 0);
-
-       if (ret)
-               return;
-
-       mmap_event->event.pid = perf_counter_pid(counter, current);
-       mmap_event->event.tid = perf_counter_tid(counter, current);
-
-       perf_output_put(&handle, mmap_event->event);
-       perf_output_copy(&handle, mmap_event->file_name,
-                                  mmap_event->file_size);
-       perf_output_end(&handle);
-}
-
-static int perf_counter_mmap_match(struct perf_counter *counter,
-                                  struct perf_mmap_event *mmap_event)
-{
-       if (counter->attr.mmap)
-               return 1;
-
-       return 0;
-}
-
-static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
-                                 struct perf_mmap_event *mmap_event)
-{
-       struct perf_counter *counter;
-
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_counter_mmap_match(counter, mmap_event))
-                       perf_counter_mmap_output(counter, mmap_event);
-       }
-       rcu_read_unlock();
-}
-
-static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
-{
-       struct perf_cpu_context *cpuctx;
-       struct perf_counter_context *ctx;
-       struct vm_area_struct *vma = mmap_event->vma;
-       struct file *file = vma->vm_file;
-       unsigned int size;
-       char tmp[16];
-       char *buf = NULL;
-       const char *name;
-
-       memset(tmp, 0, sizeof(tmp));
-
-       if (file) {
-               /*
-                * d_path works from the end of the buffer backwards, so we
-                * need to add enough zero bytes after the string to handle
-                * the 64bit alignment we do later.
-                */
-               buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-               if (!buf) {
-                       name = strncpy(tmp, "//enomem", sizeof(tmp));
-                       goto got_name;
-               }
-               name = d_path(&file->f_path, buf, PATH_MAX);
-               if (IS_ERR(name)) {
-                       name = strncpy(tmp, "//toolong", sizeof(tmp));
-                       goto got_name;
-               }
-       } else {
-               if (arch_vma_name(mmap_event->vma)) {
-                       name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-                                      sizeof(tmp));
-                       goto got_name;
-               }
-
-               if (!vma->vm_mm) {
-                       name = strncpy(tmp, "[vdso]", sizeof(tmp));
-                       goto got_name;
-               }
-
-               name = strncpy(tmp, "//anon", sizeof(tmp));
-               goto got_name;
-       }
-
-got_name:
-       size = ALIGN(strlen(name)+1, sizeof(u64));
-
-       mmap_event->file_name = name;
-       mmap_event->file_size = size;
-
-       mmap_event->event.header.size = sizeof(mmap_event->event) + size;
-
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
-       put_cpu_var(perf_cpu_context);
-
-       rcu_read_lock();
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_counter_ctxp);
-       if (ctx)
-               perf_counter_mmap_ctx(ctx, mmap_event);
-       rcu_read_unlock();
-
-       kfree(buf);
-}
-
-void __perf_counter_mmap(struct vm_area_struct *vma)
-{
-       struct perf_mmap_event mmap_event;
-
-       if (!atomic_read(&nr_mmap_counters))
-               return;
-
-       mmap_event = (struct perf_mmap_event){
-               .vma    = vma,
-               /* .file_name */
-               /* .file_size */
-               .event  = {
-                       .header = {
-                               .type = PERF_EVENT_MMAP,
-                               .misc = 0,
-                               /* .size */
-                       },
-                       /* .pid */
-                       /* .tid */
-                       .start  = vma->vm_start,
-                       .len    = vma->vm_end - vma->vm_start,
-                       .pgoff  = vma->vm_pgoff,
-               },
-       };
-
-       perf_counter_mmap_event(&mmap_event);
-}
-
-/*
- * IRQ throttle logging
- */
-
-static void perf_log_throttle(struct perf_counter *counter, int enable)
-{
-       struct perf_output_handle handle;
-       int ret;
-
-       struct {
-               struct perf_event_header        header;
-               u64                             time;
-               u64                             id;
-               u64                             stream_id;
-       } throttle_event = {
-               .header = {
-                       .type = PERF_EVENT_THROTTLE,
-                       .misc = 0,
-                       .size = sizeof(throttle_event),
-               },
-               .time           = perf_clock(),
-               .id             = primary_counter_id(counter),
-               .stream_id      = counter->id,
-       };
-
-       if (enable)
-               throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
-
-       ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
-       if (ret)
-               return;
-
-       perf_output_put(&handle, throttle_event);
-       perf_output_end(&handle);
-}
-
-/*
- * Generic counter overflow handling, sampling.
- */
-
-static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
-                                  int throttle, struct perf_sample_data *data,
-                                  struct pt_regs *regs)
-{
-       int events = atomic_read(&counter->event_limit);
-       struct hw_perf_counter *hwc = &counter->hw;
-       int ret = 0;
-
-       throttle = (throttle && counter->pmu->unthrottle != NULL);
-
-       if (!throttle) {
-               hwc->interrupts++;
-       } else {
-               if (hwc->interrupts != MAX_INTERRUPTS) {
-                       hwc->interrupts++;
-                       if (HZ * hwc->interrupts >
-                                       (u64)sysctl_perf_counter_sample_rate) {
-                               hwc->interrupts = MAX_INTERRUPTS;
-                               perf_log_throttle(counter, 0);
-                               ret = 1;
-                       }
-               } else {
-                       /*
-                        * Keep re-disabling counters even though on the previous
-                        * pass we disabled it - just in case we raced with a
-                        * sched-in and the counter got enabled again:
-                        */
-                       ret = 1;
-               }
-       }
-
-       if (counter->attr.freq) {
-               u64 now = perf_clock();
-               s64 delta = now - hwc->freq_stamp;
-
-               hwc->freq_stamp = now;
-
-               if (delta > 0 && delta < TICK_NSEC)
-                       perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
-       }
-
-       /*
-        * XXX event_limit might not quite work as expected on inherited
-        * counters
-        */
-
-       counter->pending_kill = POLL_IN;
-       if (events && atomic_dec_and_test(&counter->event_limit)) {
-               ret = 1;
-               counter->pending_kill = POLL_HUP;
-               if (nmi) {
-                       counter->pending_disable = 1;
-                       perf_pending_queue(&counter->pending,
-                                          perf_pending_counter);
-               } else
-                       perf_counter_disable(counter);
-       }
-
-       perf_counter_output(counter, nmi, data, regs);
-       return ret;
-}
-
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
-                         struct perf_sample_data *data,
-                         struct pt_regs *regs)
-{
-       return __perf_counter_overflow(counter, nmi, 1, data, regs);
-}
-
-/*
- * Generic software counter infrastructure
- */
-
-/*
- * We directly increment counter->count and keep a second value in
- * counter->hw.period_left to count intervals. This period counter
- * is kept in the range [-sample_period, 0] so that we can use the
- * sign as trigger.
- */
-
-static u64 perf_swcounter_set_period(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       u64 period = hwc->last_period;
-       u64 nr, offset;
-       s64 old, val;
-
-       hwc->last_period = hwc->sample_period;
-
-again:
-       old = val = atomic64_read(&hwc->period_left);
-       if (val < 0)
-               return 0;
-
-       nr = div64_u64(period + val, period);
-       offset = nr * period;
-       val -= offset;
-       if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
-               goto again;
-
-       return nr;
-}
-
-static void perf_swcounter_overflow(struct perf_counter *counter,
-                                   int nmi, struct perf_sample_data *data,
-                                   struct pt_regs *regs)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       int throttle = 0;
-       u64 overflow;
-
-       data->period = counter->hw.last_period;
-       overflow = perf_swcounter_set_period(counter);
-
-       if (hwc->interrupts == MAX_INTERRUPTS)
-               return;
-
-       for (; overflow; overflow--) {
-               if (__perf_counter_overflow(counter, nmi, throttle,
-                                           data, regs)) {
-                       /*
-                        * We inhibit the overflow from happening when
-                        * hwc->interrupts == MAX_INTERRUPTS.
-                        */
-                       break;
-               }
-               throttle = 1;
-       }
-}
-
-static void perf_swcounter_unthrottle(struct perf_counter *counter)
-{
-       /*
-        * Nothing to do, we already reset hwc->interrupts.
-        */
-}
-
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-                              int nmi, struct perf_sample_data *data,
-                              struct pt_regs *regs)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-
-       atomic64_add(nr, &counter->count);
-
-       if (!hwc->sample_period)
-               return;
-
-       if (!regs)
-               return;
-
-       if (!atomic64_add_negative(nr, &hwc->period_left))
-               perf_swcounter_overflow(counter, nmi, data, regs);
-}
-
-static int perf_swcounter_is_counting(struct perf_counter *counter)
-{
-       /*
-        * The counter is active, we're good!
-        */
-       if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-               return 1;
-
-       /*
-        * The counter is off/error, not counting.
-        */
-       if (counter->state != PERF_COUNTER_STATE_INACTIVE)
-               return 0;
-
-       /*
-        * The counter is inactive, if the context is active
-        * we're part of a group that didn't make it on the 'pmu',
-        * not counting.
-        */
-       if (counter->ctx->is_active)
-               return 0;
-
-       /*
-        * We're inactive and the context is too, this means the
-        * task is scheduled out, we're counting events that happen
-        * to us, like migration events.
-        */
-       return 1;
-}
-
-static int perf_swcounter_match(struct perf_counter *counter,
-                               enum perf_type_id type,
-                               u32 event_id, struct pt_regs *regs)
-{
-       if (!perf_swcounter_is_counting(counter))
-               return 0;
-
-       if (counter->attr.type != type)
-               return 0;
-       if (counter->attr.config != event_id)
-               return 0;
-
-       if (regs) {
-               if (counter->attr.exclude_user && user_mode(regs))
-                       return 0;
-
-               if (counter->attr.exclude_kernel && !user_mode(regs))
-                       return 0;
-       }
-
-       return 1;
-}
-
-static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-                                    enum perf_type_id type,
-                                    u32 event_id, u64 nr, int nmi,
-                                    struct perf_sample_data *data,
-                                    struct pt_regs *regs)
-{
-       struct perf_counter *counter;
-
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_swcounter_match(counter, type, event_id, regs))
-                       perf_swcounter_add(counter, nr, nmi, data, regs);
-       }
-       rcu_read_unlock();
-}
-
-static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
-{
-       if (in_nmi())
-               return &cpuctx->recursion[3];
-
-       if (in_irq())
-               return &cpuctx->recursion[2];
-
-       if (in_softirq())
-               return &cpuctx->recursion[1];
-
-       return &cpuctx->recursion[0];
-}
-
-static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
-                                   u64 nr, int nmi,
-                                   struct perf_sample_data *data,
-                                   struct pt_regs *regs)
-{
-       struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
-       int *recursion = perf_swcounter_recursion_context(cpuctx);
-       struct perf_counter_context *ctx;
-
-       if (*recursion)
-               goto out;
-
-       (*recursion)++;
-       barrier();
-
-       perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-                                nr, nmi, data, regs);
-       rcu_read_lock();
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_counter_ctxp);
-       if (ctx)
-               perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
-       rcu_read_unlock();
-
-       barrier();
-       (*recursion)--;
-
-out:
-       put_cpu_var(perf_cpu_context);
-}
-
-void __perf_swcounter_event(u32 event, u64 nr, int nmi,
-                           struct pt_regs *regs, u64 addr)
-{
-       struct perf_sample_data data = {
-               .addr = addr,
-       };
-
-       do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
-                               &data, regs);
-}
-
-static void perf_swcounter_read(struct perf_counter *counter)
-{
-}
-
-static int perf_swcounter_enable(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-
-       if (hwc->sample_period) {
-               hwc->last_period = hwc->sample_period;
-               perf_swcounter_set_period(counter);
-       }
-       return 0;
-}
-
-static void perf_swcounter_disable(struct perf_counter *counter)
-{
-}
-
-static const struct pmu perf_ops_generic = {
-       .enable         = perf_swcounter_enable,
-       .disable        = perf_swcounter_disable,
-       .read           = perf_swcounter_read,
-       .unthrottle     = perf_swcounter_unthrottle,
-};
-
-/*
- * hrtimer based swcounter callback
- */
-
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
-{
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct pt_regs *regs;
-       struct perf_counter *counter;
-       u64 period;
-
-       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
-       counter->pmu->read(counter);
-
-       data.addr = 0;
-       regs = get_irq_regs();
-       /*
-        * In case we exclude kernel IPs or are somehow not in interrupt
-        * context, provide the next best thing, the user IP.
-        */
-       if ((counter->attr.exclude_kernel || !regs) &&
-                       !counter->attr.exclude_user)
-               regs = task_pt_regs(current);
-
-       if (regs) {
-               if (perf_counter_overflow(counter, 0, &data, regs))
-                       ret = HRTIMER_NORESTART;
-       }
-
-       period = max_t(u64, 10000, counter->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-       return ret;
-}
-
-/*
- * Software counter: cpu wall time clock
- */
-
-static void cpu_clock_perf_counter_update(struct perf_counter *counter)
-{
-       int cpu = raw_smp_processor_id();
-       s64 prev;
-       u64 now;
-
-       now = cpu_clock(cpu);
-       prev = atomic64_read(&counter->hw.prev_count);
-       atomic64_set(&counter->hw.prev_count, now);
-       atomic64_add(now - prev, &counter->count);
-}
-
-static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       int cpu = raw_smp_processor_id();
-
-       atomic64_set(&hwc->prev_count, cpu_clock(cpu));
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swcounter_hrtimer;
-       if (hwc->sample_period) {
-               u64 period = max_t(u64, 10000, hwc->sample_period);
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
-
-       return 0;
-}
-
-static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
-{
-       if (counter->hw.sample_period)
-               hrtimer_cancel(&counter->hw.hrtimer);
-       cpu_clock_perf_counter_update(counter);
-}
-
-static void cpu_clock_perf_counter_read(struct perf_counter *counter)
-{
-       cpu_clock_perf_counter_update(counter);
-}
-
-static const struct pmu perf_ops_cpu_clock = {
-       .enable         = cpu_clock_perf_counter_enable,
-       .disable        = cpu_clock_perf_counter_disable,
-       .read           = cpu_clock_perf_counter_read,
-};
-
-/*
- * Software counter: task time clock
- */
-
-static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
-{
-       u64 prev;
-       s64 delta;
-
-       prev = atomic64_xchg(&counter->hw.prev_count, now);
-       delta = now - prev;
-       atomic64_add(delta, &counter->count);
-}
-
-static int task_clock_perf_counter_enable(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       u64 now;
-
-       now = counter->ctx->time;
-
-       atomic64_set(&hwc->prev_count, now);
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swcounter_hrtimer;
-       if (hwc->sample_period) {
-               u64 period = max_t(u64, 10000, hwc->sample_period);
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
-
-       return 0;
-}
-
-static void task_clock_perf_counter_disable(struct perf_counter *counter)
-{
-       if (counter->hw.sample_period)
-               hrtimer_cancel(&counter->hw.hrtimer);
-       task_clock_perf_counter_update(counter, counter->ctx->time);
-
-}
-
-static void task_clock_perf_counter_read(struct perf_counter *counter)
-{
-       u64 time;
-
-       if (!in_nmi()) {
-               update_context_time(counter->ctx);
-               time = counter->ctx->time;
-       } else {
-               u64 now = perf_clock();
-               u64 delta = now - counter->ctx->timestamp;
-               time = counter->ctx->time + delta;
-       }
-
-       task_clock_perf_counter_update(counter, time);
-}
-
-static const struct pmu perf_ops_task_clock = {
-       .enable         = task_clock_perf_counter_enable,
-       .disable        = task_clock_perf_counter_disable,
-       .read           = task_clock_perf_counter_read,
-};
-
-#ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
-                         int entry_size)
-{
-       struct perf_raw_record raw = {
-               .size = entry_size,
-               .data = record,
-       };
-
-       struct perf_sample_data data = {
-               .addr = addr,
-               .raw = &raw,
-       };
-
-       struct pt_regs *regs = get_irq_regs();
-
-       if (!regs)
-               regs = task_pt_regs(current);
-
-       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-                               &data, regs);
-}
-EXPORT_SYMBOL_GPL(perf_tpcounter_event);
-
-extern int ftrace_profile_enable(int);
-extern void ftrace_profile_disable(int);
-
-static void tp_perf_counter_destroy(struct perf_counter *counter)
-{
-       ftrace_profile_disable(counter->attr.config);
-}
-
-static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
-{
-       /*
-        * Raw tracepoint data is a severe data leak, only allow root to
-        * have these.
-        */
-       if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
-                       perf_paranoid_tracepoint_raw() &&
-                       !capable(CAP_SYS_ADMIN))
-               return ERR_PTR(-EPERM);
-
-       if (ftrace_profile_enable(counter->attr.config))
-               return NULL;
-
-       counter->destroy = tp_perf_counter_destroy;
-
-       return &perf_ops_generic;
-}
-#else
-static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
-{
-       return NULL;
-}
-#endif
-
-atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
-
-static void sw_perf_counter_destroy(struct perf_counter *counter)
-{
-       u64 event_id = counter->attr.config;
-
-       WARN_ON(counter->parent);
-
-       atomic_dec(&perf_swcounter_enabled[event_id]);
-}
-
-static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
-{
-       const struct pmu *pmu = NULL;
-       u64 event_id = counter->attr.config;
-
-       /*
-        * Software counters (currently) can't in general distinguish
-        * between user, kernel and hypervisor events.
-        * However, context switches and cpu migrations are considered
-        * to be kernel events, and page faults are never hypervisor
-        * events.
-        */
-       switch (event_id) {
-       case PERF_COUNT_SW_CPU_CLOCK:
-               pmu = &perf_ops_cpu_clock;
-
-               break;
-       case PERF_COUNT_SW_TASK_CLOCK:
-               /*
-                * If the user instantiates this as a per-cpu counter,
-                * use the cpu_clock counter instead.
-                */
-               if (counter->ctx->task)
-                       pmu = &perf_ops_task_clock;
-               else
-                       pmu = &perf_ops_cpu_clock;
-
-               break;
-       case PERF_COUNT_SW_PAGE_FAULTS:
-       case PERF_COUNT_SW_PAGE_FAULTS_MIN:
-       case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-       case PERF_COUNT_SW_CONTEXT_SWITCHES:
-       case PERF_COUNT_SW_CPU_MIGRATIONS:
-               if (!counter->parent) {
-                       atomic_inc(&perf_swcounter_enabled[event_id]);
-                       counter->destroy = sw_perf_counter_destroy;
-               }
-               pmu = &perf_ops_generic;
-               break;
-       }
-
-       return pmu;
-}
-
-/*
- * Allocate and initialize a counter structure
- */
-static struct perf_counter *
-perf_counter_alloc(struct perf_counter_attr *attr,
-                  int cpu,
-                  struct perf_counter_context *ctx,
-                  struct perf_counter *group_leader,
-                  struct perf_counter *parent_counter,
-                  gfp_t gfpflags)
-{
-       const struct pmu *pmu;
-       struct perf_counter *counter;
-       struct hw_perf_counter *hwc;
-       long err;
-
-       counter = kzalloc(sizeof(*counter), gfpflags);
-       if (!counter)
-               return ERR_PTR(-ENOMEM);
-
-       /*
-        * Single counters are their own group leaders, with an
-        * empty sibling list:
-        */
-       if (!group_leader)
-               group_leader = counter;
-
-       mutex_init(&counter->child_mutex);
-       INIT_LIST_HEAD(&counter->child_list);
-
-       INIT_LIST_HEAD(&counter->group_entry);
-       INIT_LIST_HEAD(&counter->event_entry);
-       INIT_LIST_HEAD(&counter->sibling_list);
-       init_waitqueue_head(&counter->waitq);
-
-       mutex_init(&counter->mmap_mutex);
-
-       counter->cpu            = cpu;
-       counter->attr           = *attr;
-       counter->group_leader   = group_leader;
-       counter->pmu            = NULL;
-       counter->ctx            = ctx;
-       counter->oncpu          = -1;
-
-       counter->parent         = parent_counter;
-
-       counter->ns             = get_pid_ns(current->nsproxy->pid_ns);
-       counter->id             = atomic64_inc_return(&perf_counter_id);
-
-       counter->state          = PERF_COUNTER_STATE_INACTIVE;
-
-       if (attr->disabled)
-               counter->state = PERF_COUNTER_STATE_OFF;
-
-       pmu = NULL;
-
-       hwc = &counter->hw;
-       hwc->sample_period = attr->sample_period;
-       if (attr->freq && attr->sample_freq)
-               hwc->sample_period = 1;
-       hwc->last_period = hwc->sample_period;
-
-       atomic64_set(&hwc->period_left, hwc->sample_period);
-
-       /*
-        * we currently do not support PERF_FORMAT_GROUP on inherited counters
-        */
-       if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
-               goto done;
-
-       switch (attr->type) {
-       case PERF_TYPE_RAW:
-       case PERF_TYPE_HARDWARE:
-       case PERF_TYPE_HW_CACHE:
-               pmu = hw_perf_counter_init(counter);
-               break;
-
-       case PERF_TYPE_SOFTWARE:
-               pmu = sw_perf_counter_init(counter);
-               break;
-
-       case PERF_TYPE_TRACEPOINT:
-               pmu = tp_perf_counter_init(counter);
-               break;
-
-       default:
-               break;
-       }
-done:
-       err = 0;
-       if (!pmu)
-               err = -EINVAL;
-       else if (IS_ERR(pmu))
-               err = PTR_ERR(pmu);
-
-       if (err) {
-               if (counter->ns)
-                       put_pid_ns(counter->ns);
-               kfree(counter);
-               return ERR_PTR(err);
-       }
-
-       counter->pmu = pmu;
-
-       if (!counter->parent) {
-               atomic_inc(&nr_counters);
-               if (counter->attr.mmap)
-                       atomic_inc(&nr_mmap_counters);
-               if (counter->attr.comm)
-                       atomic_inc(&nr_comm_counters);
-               if (counter->attr.task)
-                       atomic_inc(&nr_task_counters);
-       }
-
-       return counter;
-}
-
-static int perf_copy_attr(struct perf_counter_attr __user *uattr,
-                         struct perf_counter_attr *attr)
-{
-       u32 size;
-       int ret;
-
-       if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
-               return -EFAULT;
-
-       /*
-        * zero the full structure, so that a short copy will be nice.
-        */
-       memset(attr, 0, sizeof(*attr));
-
-       ret = get_user(size, &uattr->size);
-       if (ret)
-               return ret;
-
-       if (size > PAGE_SIZE)   /* silly large */
-               goto err_size;
-
-       if (!size)              /* abi compat */
-               size = PERF_ATTR_SIZE_VER0;
-
-       if (size < PERF_ATTR_SIZE_VER0)
-               goto err_size;
-
-       /*
-        * If we're handed a bigger struct than we know of,
-        * ensure all the unknown bits are 0 - i.e. new
-        * user-space does not rely on any kernel feature
-        * extensions we dont know about yet.
-        */
-       if (size > sizeof(*attr)) {
-               unsigned char __user *addr;
-               unsigned char __user *end;
-               unsigned char val;
-
-               addr = (void __user *)uattr + sizeof(*attr);
-               end  = (void __user *)uattr + size;
-
-               for (; addr < end; addr++) {
-                       ret = get_user(val, addr);
-                       if (ret)
-                               return ret;
-                       if (val)
-                               goto err_size;
-               }
-               size = sizeof(*attr);
-       }
-
-       ret = copy_from_user(attr, uattr, size);
-       if (ret)
-               return -EFAULT;
-
-       /*
-        * If the type exists, the corresponding creation will verify
-        * the attr->config.
-        */
-       if (attr->type >= PERF_TYPE_MAX)
-               return -EINVAL;
-
-       if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
-               return -EINVAL;
-
-       if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
-               return -EINVAL;
-
-       if (attr->read_format & ~(PERF_FORMAT_MAX-1))
-               return -EINVAL;
-
-out:
-       return ret;
-
-err_size:
-       put_user(sizeof(*attr), &uattr->size);
-       ret = -E2BIG;
-       goto out;
-}
-
-int perf_counter_set_output(struct perf_counter *counter, int output_fd)
-{
-       struct perf_counter *output_counter = NULL;
-       struct file *output_file = NULL;
-       struct perf_counter *old_output;
-       int fput_needed = 0;
-       int ret = -EINVAL;
-
-       if (!output_fd)
-               goto set;
-
-       output_file = fget_light(output_fd, &fput_needed);
-       if (!output_file)
-               return -EBADF;
-
-       if (output_file->f_op != &perf_fops)
-               goto out;
-
-       output_counter = output_file->private_data;
-
-       /* Don't chain output fds */
-       if (output_counter->output)
-               goto out;
-
-       /* Don't set an output fd when we already have an output channel */
-       if (counter->data)
-               goto out;
-
-       atomic_long_inc(&output_file->f_count);
-
-set:
-       mutex_lock(&counter->mmap_mutex);
-       old_output = counter->output;
-       rcu_assign_pointer(counter->output, output_counter);
-       mutex_unlock(&counter->mmap_mutex);
-
-       if (old_output) {
-               /*
-                * we need to make sure no existing perf_output_*()
-                * is still referencing this counter.
-                */
-               synchronize_rcu();
-               fput(old_output->filp);
-       }
-
-       ret = 0;
-out:
-       fput_light(output_file, fput_needed);
-       return ret;
-}
-
-/**
- * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
- *
- * @attr_uptr: event type attributes for monitoring/sampling
- * @pid:               target pid
- * @cpu:               target cpu
- * @group_fd:          group leader counter fd
- */
-SYSCALL_DEFINE5(perf_counter_open,
-               struct perf_counter_attr __user *, attr_uptr,
-               pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
-{
-       struct perf_counter *counter, *group_leader;
-       struct perf_counter_attr attr;
-       struct perf_counter_context *ctx;
-       struct file *counter_file = NULL;
-       struct file *group_file = NULL;
-       int fput_needed = 0;
-       int fput_needed2 = 0;
-       int err;
-
-       /* for future expandability... */
-       if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
-               return -EINVAL;
-
-       err = perf_copy_attr(attr_uptr, &attr);
-       if (err)
-               return err;
-
-       if (!attr.exclude_kernel) {
-               if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-                       return -EACCES;
-       }
-
-       if (attr.freq) {
-               if (attr.sample_freq > sysctl_perf_counter_sample_rate)
-                       return -EINVAL;
-       }
-
-       /*
-        * Get the target context (task or percpu):
-        */
-       ctx = find_get_context(pid, cpu);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       /*
-        * Look up the group leader (we will attach this counter to it):
-        */
-       group_leader = NULL;
-       if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
-               err = -EINVAL;
-               group_file = fget_light(group_fd, &fput_needed);
-               if (!group_file)
-                       goto err_put_context;
-               if (group_file->f_op != &perf_fops)
-                       goto err_put_context;
-
-               group_leader = group_file->private_data;
-               /*
-                * Do not allow a recursive hierarchy (this new sibling
-                * becoming part of another group-sibling):
-                */
-               if (group_leader->group_leader != group_leader)
-                       goto err_put_context;
-               /*
-                * Do not allow to attach to a group in a different
-                * task or CPU context:
-                */
-               if (group_leader->ctx != ctx)
-                       goto err_put_context;
-               /*
-                * Only a group leader can be exclusive or pinned
-                */
-               if (attr.exclusive || attr.pinned)
-                       goto err_put_context;
-       }
-
-       counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
-                                    NULL, GFP_KERNEL);
-       err = PTR_ERR(counter);
-       if (IS_ERR(counter))
-               goto err_put_context;
-
-       err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-       if (err < 0)
-               goto err_free_put_context;
-
-       counter_file = fget_light(err, &fput_needed2);
-       if (!counter_file)
-               goto err_free_put_context;
-
-       if (flags & PERF_FLAG_FD_OUTPUT) {
-               err = perf_counter_set_output(counter, group_fd);
-               if (err)
-                       goto err_fput_free_put_context;
-       }
-
-       counter->filp = counter_file;
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
-       perf_install_in_context(ctx, counter, cpu);
-       ++ctx->generation;
-       mutex_unlock(&ctx->mutex);
-
-       counter->owner = current;
-       get_task_struct(current);
-       mutex_lock(&current->perf_counter_mutex);
-       list_add_tail(&counter->owner_entry, &current->perf_counter_list);
-       mutex_unlock(&current->perf_counter_mutex);
-
-err_fput_free_put_context:
-       fput_light(counter_file, fput_needed2);
-
-err_free_put_context:
-       if (err < 0)
-               kfree(counter);
-
-err_put_context:
-       if (err < 0)
-               put_ctx(ctx);
-
-       fput_light(group_file, fput_needed);
-
-       return err;
-}
-
-/*
- * inherit a counter from parent task to child task:
- */
-static struct perf_counter *
-inherit_counter(struct perf_counter *parent_counter,
-             struct task_struct *parent,
-             struct perf_counter_context *parent_ctx,
-             struct task_struct *child,
-             struct perf_counter *group_leader,
-             struct perf_counter_context *child_ctx)
-{
-       struct perf_counter *child_counter;
-
-       /*
-        * Instead of creating recursive hierarchies of counters,
-        * we link inherited counters back to the original parent,
-        * which has a filp for sure, which we use as the reference
-        * count:
-        */
-       if (parent_counter->parent)
-               parent_counter = parent_counter->parent;
-
-       child_counter = perf_counter_alloc(&parent_counter->attr,
-                                          parent_counter->cpu, child_ctx,
-                                          group_leader, parent_counter,
-                                          GFP_KERNEL);
-       if (IS_ERR(child_counter))
-               return child_counter;
-       get_ctx(child_ctx);
-
-       /*
-        * Make the child state follow the state of the parent counter,
-        * not its attr.disabled bit.  We hold the parent's mutex,
-        * so we won't race with perf_counter_{en, dis}able_family.
-        */
-       if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
-               child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-       else
-               child_counter->state = PERF_COUNTER_STATE_OFF;
-
-       if (parent_counter->attr.freq)
-               child_counter->hw.sample_period = parent_counter->hw.sample_period;
-
-       /*
-        * Link it up in the child's context:
-        */
-       add_counter_to_ctx(child_counter, child_ctx);
-
-       /*
-        * Get a reference to the parent filp - we will fput it
-        * when the child counter exits. This is safe to do because
-        * we are in the parent and we know that the filp still
-        * exists and has a nonzero count:
-        */
-       atomic_long_inc(&parent_counter->filp->f_count);
-
-       /*
-        * Link this into the parent counter's child list
-        */
-       WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
-       mutex_lock(&parent_counter->child_mutex);
-       list_add_tail(&child_counter->child_list, &parent_counter->child_list);
-       mutex_unlock(&parent_counter->child_mutex);
-
-       return child_counter;
-}
-
-static int inherit_group(struct perf_counter *parent_counter,
-             struct task_struct *parent,
-             struct perf_counter_context *parent_ctx,
-             struct task_struct *child,
-             struct perf_counter_context *child_ctx)
-{
-       struct perf_counter *leader;
-       struct perf_counter *sub;
-       struct perf_counter *child_ctr;
-
-       leader = inherit_counter(parent_counter, parent, parent_ctx,
-                                child, NULL, child_ctx);
-       if (IS_ERR(leader))
-               return PTR_ERR(leader);
-       list_for_each_entry(sub, &parent_counter->sibling_list, group_entry) {
-               child_ctr = inherit_counter(sub, parent, parent_ctx,
-                                           child, leader, child_ctx);
-               if (IS_ERR(child_ctr))
-                       return PTR_ERR(child_ctr);
-       }
-       return 0;
-}
-
-static void sync_child_counter(struct perf_counter *child_counter,
-                              struct task_struct *child)
-{
-       struct perf_counter *parent_counter = child_counter->parent;
-       u64 child_val;
-
-       if (child_counter->attr.inherit_stat)
-               perf_counter_read_event(child_counter, child);
-
-       child_val = atomic64_read(&child_counter->count);
-
-       /*
-        * Add back the child's count to the parent's count:
-        */
-       atomic64_add(child_val, &parent_counter->count);
-       atomic64_add(child_counter->total_time_enabled,
-                    &parent_counter->child_total_time_enabled);
-       atomic64_add(child_counter->total_time_running,
-                    &parent_counter->child_total_time_running);
-
-       /*
-        * Remove this counter from the parent's list
-        */
-       WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
-       mutex_lock(&parent_counter->child_mutex);
-       list_del_init(&child_counter->child_list);
-       mutex_unlock(&parent_counter->child_mutex);
-
-       /*
-        * Release the parent counter, if this was the last
-        * reference to it.
-        */
-       fput(parent_counter->filp);
-}
-
-static void
-__perf_counter_exit_task(struct perf_counter *child_counter,
-                        struct perf_counter_context *child_ctx,
-                        struct task_struct *child)
-{
-       struct perf_counter *parent_counter;
-
-       update_counter_times(child_counter);
-       perf_counter_remove_from_context(child_counter);
-
-       parent_counter = child_counter->parent;
-       /*
-        * It can happen that parent exits first, and has counters
-        * that are still around due to the child reference. These
-        * counters need to be zapped - but otherwise linger.
-        */
-       if (parent_counter) {
-               sync_child_counter(child_counter, child);
-               free_counter(child_counter);
-       }
-}
-
-/*
- * When a child task exits, feed back counter values to parent counters.
- */
-void perf_counter_exit_task(struct task_struct *child)
-{
-       struct perf_counter *child_counter, *tmp;
-       struct perf_counter_context *child_ctx;
-       unsigned long flags;
-
-       if (likely(!child->perf_counter_ctxp)) {
-               perf_counter_task(child, NULL, 0);
-               return;
-       }
-
-       local_irq_save(flags);
-       /*
-        * We can't reschedule here because interrupts are disabled,
-        * and either child is current or it is a task that can't be
-        * scheduled, so we are now safe from rescheduling changing
-        * our context.
-        */
-       child_ctx = child->perf_counter_ctxp;
-       __perf_counter_task_sched_out(child_ctx);
-
-       /*
-        * Take the context lock here so that if find_get_context is
-        * reading child->perf_counter_ctxp, we wait until it has
-        * incremented the context's refcount before we do put_ctx below.
-        */
-       spin_lock(&child_ctx->lock);
-       child->perf_counter_ctxp = NULL;
-       /*
-        * If this context is a clone; unclone it so it can't get
-        * swapped to another process while we're removing all
-        * the counters from it.
-        */
-       unclone_ctx(child_ctx);
-       spin_unlock_irqrestore(&child_ctx->lock, flags);
-
-       /*
-        * Report the task dead after unscheduling the counters so that we
-        * won't get any samples after PERF_EVENT_EXIT. We can however still
-        * get a few PERF_EVENT_READ events.
-        */
-       perf_counter_task(child, child_ctx, 0);
-
-       /*
-        * We can recurse on the same lock type through:
-        *
-        *   __perf_counter_exit_task()
-        *     sync_child_counter()
-        *       fput(parent_counter->filp)
-        *         perf_release()
-        *           mutex_lock(&ctx->mutex)
-        *
-        * But since its the parent context it won't be the same instance.
-        */
-       mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
-
-again:
-       list_for_each_entry_safe(child_counter, tmp, &child_ctx->group_list,
-                                group_entry)
-               __perf_counter_exit_task(child_counter, child_ctx, child);
-
-       /*
-        * If the last counter was a group counter, it will have appended all
-        * its siblings to the list, but we obtained 'tmp' before that which
-        * will still point to the list head terminating the iteration.
-        */
-       if (!list_empty(&child_ctx->group_list))
-               goto again;
-
-       mutex_unlock(&child_ctx->mutex);
-
-       put_ctx(child_ctx);
-}
-
-/*
- * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
- */
-void perf_counter_free_task(struct task_struct *task)
-{
-       struct perf_counter_context *ctx = task->perf_counter_ctxp;
-       struct perf_counter *counter, *tmp;
-
-       if (!ctx)
-               return;
-
-       mutex_lock(&ctx->mutex);
-again:
-       list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) {
-               struct perf_counter *parent = counter->parent;
-
-               if (WARN_ON_ONCE(!parent))
-                       continue;
-
-               mutex_lock(&parent->child_mutex);
-               list_del_init(&counter->child_list);
-               mutex_unlock(&parent->child_mutex);
-
-               fput(parent->filp);
-
-               list_del_counter(counter, ctx);
-               free_counter(counter);
-       }
-
-       if (!list_empty(&ctx->group_list))
-               goto again;
-
-       mutex_unlock(&ctx->mutex);
-
-       put_ctx(ctx);
-}
-
-/*
- * Initialize the perf_counter context in task_struct
- */
-int perf_counter_init_task(struct task_struct *child)
-{
-       struct perf_counter_context *child_ctx, *parent_ctx;
-       struct perf_counter_context *cloned_ctx;
-       struct perf_counter *counter;
-       struct task_struct *parent = current;
-       int inherited_all = 1;
-       int ret = 0;
-
-       child->perf_counter_ctxp = NULL;
-
-       mutex_init(&child->perf_counter_mutex);
-       INIT_LIST_HEAD(&child->perf_counter_list);
-
-       if (likely(!parent->perf_counter_ctxp))
-               return 0;
-
-       /*
-        * This is executed from the parent task context, so inherit
-        * counters that have been marked for cloning.
-        * First allocate and initialize a context for the child.
-        */
-
-       child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-       if (!child_ctx)
-               return -ENOMEM;
-
-       __perf_counter_init_context(child_ctx, child);
-       child->perf_counter_ctxp = child_ctx;
-       get_task_struct(child);
-
-       /*
-        * If the parent's context is a clone, pin it so it won't get
-        * swapped under us.
-        */
-       parent_ctx = perf_pin_task_context(parent);
-
-       /*
-        * No need to check if parent_ctx != NULL here; since we saw
-        * it non-NULL earlier, the only reason for it to become NULL
-        * is if we exit, and since we're currently in the middle of
-        * a fork we can't be exiting at the same time.
-        */
-
-       /*
-        * Lock the parent list. No need to lock the child - not PID
-        * hashed yet and not running, so nobody can access it.
-        */
-       mutex_lock(&parent_ctx->mutex);
-
-       /*
-        * We dont have to disable NMIs - we are only looking at
-        * the list, not manipulating it:
-        */
-       list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
-               if (counter != counter->group_leader)
-                       continue;
-
-               if (!counter->attr.inherit) {
-                       inherited_all = 0;
-                       continue;
-               }
-
-               ret = inherit_group(counter, parent, parent_ctx,
-                                            child, child_ctx);
-               if (ret) {
-                       inherited_all = 0;
-                       break;
-               }
-       }
-
-       if (inherited_all) {
-               /*
-                * Mark the child context as a clone of the parent
-                * context, or of whatever the parent is a clone of.
-                * Note that if the parent is a clone, it could get
-                * uncloned at any point, but that doesn't matter
-                * because the list of counters and the generation
-                * count can't have changed since we took the mutex.
-                */
-               cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
-               if (cloned_ctx) {
-                       child_ctx->parent_ctx = cloned_ctx;
-                       child_ctx->parent_gen = parent_ctx->parent_gen;
-               } else {
-                       child_ctx->parent_ctx = parent_ctx;
-                       child_ctx->parent_gen = parent_ctx->generation;
-               }
-               get_ctx(child_ctx->parent_ctx);
-       }
-
-       mutex_unlock(&parent_ctx->mutex);
-
-       perf_unpin_context(parent_ctx);
-
-       return ret;
-}
-
-static void __cpuinit perf_counter_init_cpu(int cpu)
-{
-       struct perf_cpu_context *cpuctx;
-
-       cpuctx = &per_cpu(perf_cpu_context, cpu);
-       __perf_counter_init_context(&cpuctx->ctx, NULL);
-
-       spin_lock(&perf_resource_lock);
-       cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
-       spin_unlock(&perf_resource_lock);
-
-       hw_perf_counter_setup(cpu);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static void __perf_counter_exit_cpu(void *info)
-{
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_counter_context *ctx = &cpuctx->ctx;
-       struct perf_counter *counter, *tmp;
-
-       list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry)
-               __perf_counter_remove_from_context(counter);
-}
-static void perf_counter_exit_cpu(int cpu)
-{
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-       struct perf_counter_context *ctx = &cpuctx->ctx;
-
-       mutex_lock(&ctx->mutex);
-       smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
-       mutex_unlock(&ctx->mutex);
-}
-#else
-static inline void perf_counter_exit_cpu(int cpu) { }
-#endif
-
-static int __cpuinit
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action) {
-
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               perf_counter_init_cpu(cpu);
-               break;
-
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               hw_perf_counter_setup_online(cpu);
-               break;
-
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               perf_counter_exit_cpu(cpu);
-               break;
-
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-       .notifier_call          = perf_cpu_notify,
-       .priority               = 20,
-};
-
-void __init perf_counter_init(void)
-{
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
-                       (void *)(long)smp_processor_id());
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-                       (void *)(long)smp_processor_id());
-       register_cpu_notifier(&perf_cpu_nb);
-}
-
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
-{
-       return sprintf(buf, "%d\n", perf_reserved_percpu);
-}
-
-static ssize_t
-perf_set_reserve_percpu(struct sysdev_class *class,
-                       const char *buf,
-                       size_t count)
-{
-       struct perf_cpu_context *cpuctx;
-       unsigned long val;
-       int err, cpu, mpt;
-
-       err = strict_strtoul(buf, 10, &val);
-       if (err)
-               return err;
-       if (val > perf_max_counters)
-               return -EINVAL;
-
-       spin_lock(&perf_resource_lock);
-       perf_reserved_percpu = val;
-       for_each_online_cpu(cpu) {
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               spin_lock_irq(&cpuctx->ctx.lock);
-               mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
-                         perf_max_counters - perf_reserved_percpu);
-               cpuctx->max_pertask = mpt;
-               spin_unlock_irq(&cpuctx->ctx.lock);
-       }
-       spin_unlock(&perf_resource_lock);
-
-       return count;
-}
-
-static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
-{
-       return sprintf(buf, "%d\n", perf_overcommit);
-}
-
-static ssize_t
-perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
-{
-       unsigned long val;
-       int err;
-
-       err = strict_strtoul(buf, 10, &val);
-       if (err)
-               return err;
-       if (val > 1)
-               return -EINVAL;
-
-       spin_lock(&perf_resource_lock);
-       perf_overcommit = val;
-       spin_unlock(&perf_resource_lock);
-
-       return count;
-}
-
-static SYSDEV_CLASS_ATTR(
-                               reserve_percpu,
-                               0644,
-                               perf_show_reserve_percpu,
-                               perf_set_reserve_percpu
-                       );
-
-static SYSDEV_CLASS_ATTR(
-                               overcommit,
-                               0644,
-                               perf_show_overcommit,
-                               perf_set_overcommit
-                       );
-
-static struct attribute *perfclass_attrs[] = {
-       &attr_reserve_percpu.attr,
-       &attr_overcommit.attr,
-       NULL
-};
-
-static struct attribute_group perfclass_attr_group = {
-       .attrs                  = perfclass_attrs,
-       .name                   = "perf_counters",
-};
-
-static int __init perf_counter_sysfs_init(void)
-{
-       return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
-                                 &perfclass_attr_group);
-}
-device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c

new file mode 100644 (file)

index 0000000..6e8b99a
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5000 @@
+/*
+ * Performance event core code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ *  For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_event.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU events:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_events __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_events __read_mostly;
+static atomic_t nr_mmap_events __read_mostly;
+static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_task_events __read_mostly;
+
+/*
+ * perf event paranoia level:
+ *  -1 - not paranoid at all
+ *   0 - disallow raw tracepoint access for unpriv
+ *   1 - disallow cpu events for unpriv
+ *   2 - disallow kernel profiling for unpriv
+ */
+int sysctl_perf_event_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+       return sysctl_perf_event_paranoid > -1;
+}
+
+static inline bool perf_paranoid_cpu(void)
+{
+       return sysctl_perf_event_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+       return sysctl_perf_event_paranoid > 1;
+}
+
+int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf event sample rate
+ */
+int sysctl_perf_event_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_event_id;
+
+/*
+ * Lock for (sysadmin-configurable) event reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+       return NULL;
+}
+
+void __weak hw_perf_disable(void)              { barrier(); }
+void __weak hw_perf_enable(void)               { barrier(); }
+
+void __weak hw_perf_event_setup(int cpu)       { barrier(); }
+void __weak hw_perf_event_setup_online(int cpu)        { barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_event *group_leader,
+              struct perf_cpu_context *cpuctx,
+              struct perf_event_context *ctx, int cpu)
+{
+       return 0;
+}
+
+void __weak perf_event_print_debug(void)       { }
+
+static DEFINE_PER_CPU(int, perf_disable_count);
+
+void __perf_disable(void)
+{
+       __get_cpu_var(perf_disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+       return !--__get_cpu_var(perf_disable_count);
+}
+
+void perf_disable(void)
+{
+       __perf_disable();
+       hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+       if (__perf_enable())
+               hw_perf_enable();
+}
+
+static void get_ctx(struct perf_event_context *ctx)
+{
+       WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+       struct perf_event_context *ctx;
+
+       ctx = container_of(head, struct perf_event_context, rcu_head);
+       kfree(ctx);
+}
+
+static void put_ctx(struct perf_event_context *ctx)
+{
+       if (atomic_dec_and_test(&ctx->refcount)) {
+               if (ctx->parent_ctx)
+                       put_ctx(ctx->parent_ctx);
+               if (ctx->task)
+                       put_task_struct(ctx->task);
+               call_rcu(&ctx->rcu_head, free_ctx);
+       }
+}
+
+static void unclone_ctx(struct perf_event_context *ctx)
+{
+       if (ctx->parent_ctx) {
+               put_ctx(ctx->parent_ctx);
+               ctx->parent_ctx = NULL;
+       }
+}
+
+/*
+ * If we inherit events we want to return the parent event id
+ * to userspace.
+ */
+static u64 primary_event_id(struct perf_event *event)
+{
+       u64 id = event->id;
+
+       if (event->parent)
+               id = event->parent->id;
+
+       return id;
+}
+
+/*
+ * Get the perf_event_context for a task and lock it.
+ * This has to cope with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_event_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+       struct perf_event_context *ctx;
+
+       rcu_read_lock();
+ retry:
+       ctx = rcu_dereference(task->perf_event_ctxp);
+       if (ctx) {
+               /*
+                * If this context is a clone of another, it might
+                * get swapped for another underneath us by
+                * perf_event_task_sched_out, though the
+                * rcu_read_lock() protects us from any context
+                * getting freed.  Lock the context and check if it
+                * got swapped before we could get the lock, and retry
+                * if so.  If we locked the right context, then it
+                * can't get swapped on us any more.
+                */
+               spin_lock_irqsave(&ctx->lock, *flags);
+               if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+                       spin_unlock_irqrestore(&ctx->lock, *flags);
+                       goto retry;
+               }
+
+               if (!atomic_inc_not_zero(&ctx->refcount)) {
+                       spin_unlock_irqrestore(&ctx->lock, *flags);
+                       ctx = NULL;
+               }
+       }
+       rcu_read_unlock();
+       return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task.  This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+{
+       struct perf_event_context *ctx;
+       unsigned long flags;
+
+       ctx = perf_lock_task_context(task, &flags);
+       if (ctx) {
+               ++ctx->pin_count;
+               spin_unlock_irqrestore(&ctx->lock, flags);
+       }
+       return ctx;
+}
+
+static void perf_unpin_context(struct perf_event_context *ctx)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ctx->lock, flags);
+       --ctx->pin_count;
+       spin_unlock_irqrestore(&ctx->lock, flags);
+       put_ctx(ctx);
+}
+
+/*
+ * Add an event to the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+       struct perf_event *group_leader = event->group_leader;
+
+       /*
+        * Depending on whether it is a standalone or sibling event,
+        * add it straight to the context's event list, or to the group
+        * leader's sibling list:
+        */
+       if (group_leader == event)
+               list_add_tail(&event->group_entry, &ctx->group_list);
+       else {
+               list_add_tail(&event->group_entry, &group_leader->sibling_list);
+               group_leader->nr_siblings++;
+       }
+
+       list_add_rcu(&event->event_entry, &ctx->event_list);
+       ctx->nr_events++;
+       if (event->attr.inherit_stat)
+               ctx->nr_stat++;
+}
+
+/*
+ * Remove an event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+       struct perf_event *sibling, *tmp;
+
+       if (list_empty(&event->group_entry))
+               return;
+       ctx->nr_events--;
+       if (event->attr.inherit_stat)
+               ctx->nr_stat--;
+
+       list_del_init(&event->group_entry);
+       list_del_rcu(&event->event_entry);
+
+       if (event->group_leader != event)
+               event->group_leader->nr_siblings--;
+
+       /*
+        * If this was a group event with sibling events then
+        * upgrade the siblings to singleton events by adding them
+        * to the context list directly:
+        */
+       list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+
+               list_move_tail(&sibling->group_entry, &ctx->group_list);
+               sibling->group_leader = sibling;
+       }
+}
+
+static void
+event_sched_out(struct perf_event *event,
+                 struct perf_cpu_context *cpuctx,
+                 struct perf_event_context *ctx)
+{
+       if (event->state != PERF_EVENT_STATE_ACTIVE)
+               return;
+
+       event->state = PERF_EVENT_STATE_INACTIVE;
+       if (event->pending_disable) {
+               event->pending_disable = 0;
+               event->state = PERF_EVENT_STATE_OFF;
+       }
+       event->tstamp_stopped = ctx->time;
+       event->pmu->disable(event);
+       event->oncpu = -1;
+
+       if (!is_software_event(event))
+               cpuctx->active_oncpu--;
+       ctx->nr_active--;
+       if (event->attr.exclusive || !cpuctx->active_oncpu)
+               cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_event *group_event,
+               struct perf_cpu_context *cpuctx,
+               struct perf_event_context *ctx)
+{
+       struct perf_event *event;
+
+       if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+               return;
+
+       event_sched_out(group_event, cpuctx, ctx);
+
+       /*
+        * Schedule out siblings (if any):
+        */
+       list_for_each_entry(event, &group_event->sibling_list, group_entry)
+               event_sched_out(event, cpuctx, ctx);
+
+       if (group_event->attr.exclusive)
+               cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance event
+ *
+ * We disable the event on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_event_remove_from_context(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_event *event = info;
+       struct perf_event_context *ctx = event->ctx;
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu. If not it has been
+        * scheduled out before the smp call arrived.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       spin_lock(&ctx->lock);
+       /*
+        * Protect the list operation against NMI by disabling the
+        * events on a global level.
+        */
+       perf_disable();
+
+       event_sched_out(event, cpuctx, ctx);
+
+       list_del_event(event, ctx);
+
+       if (!ctx->task) {
+               /*
+                * Allow more per task events with respect to the
+                * reservation:
+                */
+               cpuctx->max_pertask =
+                       min(perf_max_events - ctx->nr_events,
+                           perf_max_events - perf_reserved_percpu);
+       }
+
+       perf_enable();
+       spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the event from a task's (or a CPU's) list of events.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU events are removed with a smp call. For task events we only
+ * call when the task is on a CPU.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid.  This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_event_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_event_remove_from_context(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Per cpu events are removed via an smp call and
+                * the removal is always successful.
+                */
+               smp_call_function_single(event->cpu,
+                                        __perf_event_remove_from_context,
+                                        event, 1);
+               return;
+       }
+
+retry:
+       task_oncpu_function_call(task, __perf_event_remove_from_context,
+                                event);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * If the context is active we need to retry the smp call.
+        */
+       if (ctx->nr_active && !list_empty(&event->group_entry)) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * The lock prevents this context from being scheduled in, so
+        * we can remove the event safely if the call above did not
+        * succeed.
+        */
+       if (!list_empty(&event->group_entry)) {
+               list_del_event(event, ctx);
+       }
+       spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+       return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+       u64 now = perf_clock();
+
+       ctx->time += now - ctx->timestamp;
+       ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for an event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       u64 run_end;
+
+       if (event->state < PERF_EVENT_STATE_INACTIVE ||
+           event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+               return;
+
+       event->total_time_enabled = ctx->time - event->tstamp_enabled;
+
+       if (event->state == PERF_EVENT_STATE_INACTIVE)
+               run_end = event->tstamp_stopped;
+       else
+               run_end = ctx->time;
+
+       event->total_time_running = run_end - event->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+       struct perf_event *event;
+
+       update_event_times(leader);
+       list_for_each_entry(event, &leader->sibling_list, group_entry)
+               update_event_times(event);
+}
+
+/*
+ * Cross CPU call to disable a performance event
+ */
+static void __perf_event_disable(void *info)
+{
+       struct perf_event *event = info;
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_event_context *ctx = event->ctx;
+
+       /*
+        * If this is a per-task event, need to check whether this
+        * event's task is the current task on this cpu.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       spin_lock(&ctx->lock);
+
+       /*
+        * If the event is on, turn it off.
+        * If it is in error state, leave it in error state.
+        */
+       if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+               update_context_time(ctx);
+               update_group_times(event);
+               if (event == event->group_leader)
+                       group_sched_out(event, cpuctx, ctx);
+               else
+                       event_sched_out(event, cpuctx, ctx);
+               event->state = PERF_EVENT_STATE_OFF;
+       }
+
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable an event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_event_for_each_child or perf_event_for_each because they
+ * hold the top-level event's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_event.
+ * When called from perf_pending_event it's OK because event->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_event_task_sched_out for this context.
+ */
+static void perf_event_disable(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Disable the event on the cpu that it's on
+                */
+               smp_call_function_single(event->cpu, __perf_event_disable,
+                                        event, 1);
+               return;
+       }
+
+ retry:
+       task_oncpu_function_call(task, __perf_event_disable, event);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * If the event is still active, we need to retry the cross-call.
+        */
+       if (event->state == PERF_EVENT_STATE_ACTIVE) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * Since we have the lock this context can't be scheduled
+        * in, so we can change the state safely.
+        */
+       if (event->state == PERF_EVENT_STATE_INACTIVE) {
+               update_group_times(event);
+               event->state = PERF_EVENT_STATE_OFF;
+       }
+
+       spin_unlock_irq(&ctx->lock);
+}
+
+static int
+event_sched_in(struct perf_event *event,
+                struct perf_cpu_context *cpuctx,
+                struct perf_event_context *ctx,
+                int cpu)
+{
+       if (event->state <= PERF_EVENT_STATE_OFF)
+               return 0;
+
+       event->state = PERF_EVENT_STATE_ACTIVE;
+       event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
+       /*
+        * The new state must be visible before we turn it on in the hardware:
+        */
+       smp_wmb();
+
+       if (event->pmu->enable(event)) {
+               event->state = PERF_EVENT_STATE_INACTIVE;
+               event->oncpu = -1;
+               return -EAGAIN;
+       }
+
+       event->tstamp_running += ctx->time - event->tstamp_stopped;
+
+       if (!is_software_event(event))
+               cpuctx->active_oncpu++;
+       ctx->nr_active++;
+
+       if (event->attr.exclusive)
+               cpuctx->exclusive = 1;
+
+       return 0;
+}
+
+static int
+group_sched_in(struct perf_event *group_event,
+              struct perf_cpu_context *cpuctx,
+              struct perf_event_context *ctx,
+              int cpu)
+{
+       struct perf_event *event, *partial_group;
+       int ret;
+
+       if (group_event->state == PERF_EVENT_STATE_OFF)
+               return 0;
+
+       ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+       if (ret)
+               return ret < 0 ? ret : 0;
+
+       if (event_sched_in(group_event, cpuctx, ctx, cpu))
+               return -EAGAIN;
+
+       /*
+        * Schedule in siblings as one group (if any):
+        */
+       list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+               if (event_sched_in(event, cpuctx, ctx, cpu)) {
+                       partial_group = event;
+                       goto group_error;
+               }
+       }
+
+       return 0;
+
+group_error:
+       /*
+        * Groups can be scheduled in as one unit only, so undo any
+        * partial group before returning:
+        */
+       list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+               if (event == partial_group)
+                       break;
+               event_sched_out(event, cpuctx, ctx);
+       }
+       event_sched_out(group_event, cpuctx, ctx);
+
+       return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software events,
+ * 0 if the group contains any hardware events.
+ */
+static int is_software_only_group(struct perf_event *leader)
+{
+       struct perf_event *event;
+
+       if (!is_software_event(leader))
+               return 0;
+
+       list_for_each_entry(event, &leader->sibling_list, group_entry)
+               if (!is_software_event(event))
+                       return 0;
+
+       return 1;
+}
+
+/*
+ * Work out whether we can put this event group on the CPU now.
+ */
+static int group_can_go_on(struct perf_event *event,
+                          struct perf_cpu_context *cpuctx,
+                          int can_add_hw)
+{
+       /*
+        * Groups consisting entirely of software events can always go on.
+        */
+       if (is_software_only_group(event))
+               return 1;
+       /*
+        * If an exclusive group is already on, no other hardware
+        * events can go on.
+        */
+       if (cpuctx->exclusive)
+               return 0;
+       /*
+        * If this group is exclusive and there are already
+        * events on the CPU, it can't go on.
+        */
+       if (event->attr.exclusive && cpuctx->active_oncpu)
+               return 0;
+       /*
+        * Otherwise, try to add it if all previous groups were able
+        * to go on.
+        */
+       return can_add_hw;
+}
+
+static void add_event_to_ctx(struct perf_event *event,
+                              struct perf_event_context *ctx)
+{
+       list_add_event(event, ctx);
+       event->tstamp_enabled = ctx->time;
+       event->tstamp_running = ctx->time;
+       event->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance event
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_event *event = info;
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_event *leader = event->group_leader;
+       int cpu = smp_processor_id();
+       int err;
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu. If not it has been
+        * scheduled out before the smp call arrived.
+        * Or possibly this is the right context but it isn't
+        * on this cpu because it had no events.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx) {
+               if (cpuctx->task_ctx || ctx->task != current)
+                       return;
+               cpuctx->task_ctx = ctx;
+       }
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 1;
+       update_context_time(ctx);
+
+       /*
+        * Protect the list operation against NMI by disabling the
+        * events on a global level. NOP for non NMI based events.
+        */
+       perf_disable();
+
+       add_event_to_ctx(event, ctx);
+
+       /*
+        * Don't put the event on if it is disabled or if
+        * it is in a group and the group isn't on.
+        */
+       if (event->state != PERF_EVENT_STATE_INACTIVE ||
+           (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
+               goto unlock;
+
+       /*
+        * An exclusive event can't go on if there are already active
+        * hardware events, and no hardware event can go on if there
+        * is already an exclusive event on.
+        */
+       if (!group_can_go_on(event, cpuctx, 1))
+               err = -EEXIST;
+       else
+               err = event_sched_in(event, cpuctx, ctx, cpu);
+
+       if (err) {
+               /*
+                * This event couldn't go on.  If it is in a group
+                * then we have to pull the whole group off.
+                * If the event group is pinned then put it in error state.
+                */
+               if (leader != event)
+                       group_sched_out(leader, cpuctx, ctx);
+               if (leader->attr.pinned) {
+                       update_group_times(leader);
+                       leader->state = PERF_EVENT_STATE_ERROR;
+               }
+       }
+
+       if (!err && !ctx->task && cpuctx->max_pertask)
+               cpuctx->max_pertask--;
+
+ unlock:
+       perf_enable();
+
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance event to a context
+ *
+ * First we add the event to the list with the hardware enable bit
+ * in event->hw_config cleared.
+ *
+ * If the event is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_event_context *ctx,
+                       struct perf_event *event,
+                       int cpu)
+{
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Per cpu events are installed via an smp call and
+                * the install is always successful.
+                */
+               smp_call_function_single(cpu, __perf_install_in_context,
+                                        event, 1);
+               return;
+       }
+
+retry:
+       task_oncpu_function_call(task, __perf_install_in_context,
+                                event);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * Context is active but the event isn't listed yet: retry the smp call.
+        */
+       if (ctx->is_active && list_empty(&event->group_entry)) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * The lock prevents this context from being scheduled in, so
+        * we can add the event safely if the call above did not
+        * succeed.
+        */
+       if (list_empty(&event->group_entry))
+               add_event_to_ctx(event, ctx);
+       spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Put an event into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_event_mark_enabled(struct perf_event *event,
+                                       struct perf_event_context *ctx)
+{
+       struct perf_event *sub;
+
+       event->state = PERF_EVENT_STATE_INACTIVE;
+       event->tstamp_enabled = ctx->time - event->total_time_enabled;
+       list_for_each_entry(sub, &event->sibling_list, group_entry)
+               if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+                       sub->tstamp_enabled =
+                               ctx->time - sub->total_time_enabled;
+}
+
+/*
+ * Cross CPU call to enable a performance event
+ */
+static void __perf_event_enable(void *info)
+{
+       struct perf_event *event = info;
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_event *leader = event->group_leader;
+       int err;
+
+       /*
+        * If this is a per-task event, need to check whether this
+        * event's task is the current task on this cpu.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx) {
+               if (cpuctx->task_ctx || ctx->task != current)
+                       return;
+               cpuctx->task_ctx = ctx;
+       }
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 1;
+       update_context_time(ctx);
+
+       if (event->state >= PERF_EVENT_STATE_INACTIVE)
+               goto unlock;
+       __perf_event_mark_enabled(event, ctx);
+
+       /*
+        * If the event is in a group and isn't the group leader,
+        * then don't put it on unless the group is on.
+        */
+       if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+               goto unlock;
+
+       if (!group_can_go_on(event, cpuctx, 1)) {
+               err = -EEXIST;
+       } else {
+               perf_disable();
+               if (event == leader)
+                       err = group_sched_in(event, cpuctx, ctx,
+                                            smp_processor_id());
+               else
+                       err = event_sched_in(event, cpuctx, ctx,
+                                              smp_processor_id());
+               perf_enable();
+       }
+
+       if (err) {
+               /*
+                * If this event can't go on and it's part of a
+                * group, then the whole group has to come off.
+                */
+               if (leader != event)
+                       group_sched_out(leader, cpuctx, ctx);
+               if (leader->attr.pinned) {
+                       update_group_times(leader);
+                       leader->state = PERF_EVENT_STATE_ERROR;
+               }
+       }
+
+ unlock:
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable an event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_event_for_each_child or perf_event_for_each as described
+ * for perf_event_disable.
+ */
+static void perf_event_enable(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Enable the event on the cpu that it's on
+                */
+               smp_call_function_single(event->cpu, __perf_event_enable,
+                                        event, 1);
+               return;
+       }
+
+       spin_lock_irq(&ctx->lock);
+       if (event->state >= PERF_EVENT_STATE_INACTIVE)
+               goto out;
+
+       /*
+        * If the event is in error state, clear that first.
+        * That way, if we see the event in error state below, we
+        * know that it has gone back into error state, as distinct
+        * from the task having been scheduled away before the
+        * cross-call arrived.
+        */
+       if (event->state == PERF_EVENT_STATE_ERROR)
+               event->state = PERF_EVENT_STATE_OFF;
+
+ retry:
+       spin_unlock_irq(&ctx->lock);
+       task_oncpu_function_call(task, __perf_event_enable, event);
+
+       spin_lock_irq(&ctx->lock);
+
+       /*
+        * If the context is active and the event is still off,
+        * we need to retry the cross-call.
+        */
+       if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+               goto retry;
+
+       /*
+        * Since we have the lock this context can't be scheduled
+        * in, so we can change the state safely.
+        */
+       if (event->state == PERF_EVENT_STATE_OFF)
+               __perf_event_mark_enabled(event, ctx);
+
+ out:
+       spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_event_refresh(struct perf_event *event, int refresh)
+{
+       /*
+        * not supported on inherited events
+        */
+       if (event->attr.inherit)
+               return -EINVAL;
+
+       atomic_add(refresh, &event->event_limit);
+       perf_event_enable(event);
+
+       return 0;
+}
+
+void __perf_event_sched_out(struct perf_event_context *ctx,
+                             struct perf_cpu_context *cpuctx)
+{
+       struct perf_event *event;
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 0;
+       if (likely(!ctx->nr_events))
+               goto out;
+       update_context_time(ctx);
+
+       perf_disable();
+       if (ctx->nr_active) {
+               list_for_each_entry(event, &ctx->group_list, group_entry) {
+                       if (event != event->group_leader)
+                               event_sched_out(event, cpuctx, ctx);
+                       else
+                               group_sched_out(event, cpuctx, ctx);
+               }
+       }
+       perf_enable();
+ out:
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled events.
+ * If the number of enabled events is the same, then the set
+ * of enabled events should be the same, because these are both
+ * inherited contexts, therefore we can't access individual events
+ * in them directly with an fd; we can only enable/disable all
+ * events via prctl, or enable/disable all events in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_event_context *ctx1,
+                        struct perf_event_context *ctx2)
+{
+       /* Both contexts must be clones of one and the same parent ... */
+       if (!ctx1->parent_ctx || ctx1->parent_ctx != ctx2->parent_ctx)
+               return 0;
+
+       /* ... taken from the same generation of that parent ... */
+       if (ctx1->parent_gen != ctx2->parent_gen)
+               return 0;
+
+       /* ... and neither of them may have a non-zero pin_count. */
+       return !ctx1->pin_count && !ctx2->pin_count;
+}
+
+static void __perf_event_read(void *event);
+
+/*
+ * Exchange the accumulated count and the enabled/running times between
+ * @event and @next_event.  Does nothing unless @event was created with
+ * attr.inherit_stat set.
+ */
+static void __perf_event_sync_stat(struct perf_event *event,
+                                    struct perf_event *next_event)
+{
+       u64 mine, theirs;
+
+       if (!event->attr.inherit_stat)
+               return;
+
+       /*
+        * Bring the event up to date first.  perf_event_read() is not
+        * usable here: we are in the middle of a context switch with
+        * IRQs disabled, which upsets smp_call_function_single().  The
+        * event must be on the current CPU anyway, so the local read
+        * helper can be called directly.
+        */
+       if (event->state == PERF_EVENT_STATE_ACTIVE)
+               __perf_event_read(event);
+       else if (event->state == PERF_EVENT_STATE_INACTIVE)
+               update_event_times(event);
+
+       /*
+        * Per-task stats stay reliable only if the values move together
+        * with the contexts, so swap the two counts ...
+        */
+       theirs = atomic64_read(&next_event->count);
+       mine = atomic64_xchg(&event->count, theirs);
+       atomic64_set(&next_event->count, mine);
+
+       /* ... and the two pairs of time totals. */
+       swap(event->total_time_enabled, next_event->total_time_enabled);
+       swap(event->total_time_running, next_event->total_time_running);
+
+       /*
+        * The values were swizzled, so refresh the user visible data too.
+        */
+       perf_event_update_userpage(event);
+       perf_event_update_userpage(next_event);
+}
+
+#define list_next_entry(pos, member) \
+       list_entry(pos->member.next, typeof(*pos), member)
+
+/*
+ * Walk the event_list of @ctx and @next_ctx in lock-step and sync the
+ * stats of each pair of events.  The walk ends as soon as either list
+ * is exhausted.  Nothing is done when @ctx has no nr_stat events.
+ */
+static void perf_event_sync_stat(struct perf_event_context *ctx,
+                                  struct perf_event_context *next_ctx)
+{
+       struct perf_event *cur, *nxt;
+
+       if (!ctx->nr_stat)
+               return;
+
+       cur = list_first_entry(&ctx->event_list,
+                              struct perf_event, event_entry);
+       nxt = list_first_entry(&next_ctx->event_list,
+                              struct perf_event, event_entry);
+
+       for (;;) {
+               /* Reaching either list head again means we are done. */
+               if (&cur->event_entry == &ctx->event_list)
+                       break;
+               if (&nxt->event_entry == &next_ctx->event_list)
+                       break;
+
+               __perf_event_sync_stat(cur, nxt);
+
+               cur = list_next_entry(cur, event_entry);
+               nxt = list_next_entry(nxt, event_entry);
+       }
+}
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+                                struct task_struct *next, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_event_context *next_ctx;
+       struct perf_event_context *parent;
+       struct pt_regs *regs;
+       int do_switch = 1;      /* cleared when the contexts are swapped instead */
+
+       /* Report the context switch as a software event, using @task's regs. */
+       regs = task_pt_regs(task);
+       perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+       /* No task context, or none installed on @cpu: nothing to take off. */
+       if (likely(!ctx || !cpuctx->task_ctx))
+               return;
+
+       update_context_time(ctx);
+
+       rcu_read_lock();
+       parent = rcu_dereference(ctx->parent_ctx);
+       next_ctx = next->perf_event_ctxp;
+       if (parent && next_ctx &&
+           rcu_dereference(next_ctx->parent_ctx) == parent) {
+               /*
+                * Looks like the two contexts are clones, so we might be
+                * able to optimize the context switch.  We lock both
+                * contexts and check that they are clones under the
+                * lock (including re-checking that neither has been
+                * uncloned in the meantime).  It doesn't matter which
+                * order we take the locks because no other cpu could
+                * be trying to lock both of these tasks.
+                */
+               spin_lock(&ctx->lock);
+               spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+               if (context_equiv(ctx, next_ctx)) {
+                       /*
+                        * XXX do we need a memory barrier of sorts
+                        * wrt to rcu_dereference() of perf_event_ctxp
+                        */
+                       /* Exchange the two contexts between the two tasks. */
+                       task->perf_event_ctxp = next_ctx;
+                       next->perf_event_ctxp = ctx;
+                       ctx->task = next;
+                       next_ctx->task = task;
+                       do_switch = 0;
+
+                       perf_event_sync_stat(ctx, next_ctx);
+               }
+               spin_unlock(&next_ctx->lock);
+               spin_unlock(&ctx->lock);
+       }
+       rcu_read_unlock();
+
+       /* The swap did not happen: schedule the outgoing context out. */
+       if (do_switch) {
+               __perf_event_sched_out(ctx, cpuctx);
+               cpuctx->task_ctx = NULL;
+       }
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+       /*
+        * Only act when this cpu has a task context installed, and warn
+        * (once) and bail out if it is not the one we were handed.
+        */
+       if (cpuctx->task_ctx && !WARN_ON_ONCE(ctx != cpuctx->task_ctx)) {
+               __perf_event_sched_out(ctx, cpuctx);
+               cpuctx->task_ctx = NULL;
+       }
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+{
+       /* The per-cpu context is embedded in the cpu context itself. */
+       struct perf_event_context *ctx = &cpuctx->ctx;
+
+       __perf_event_sched_out(ctx, cpuctx);
+}
+
+/*
+ * Mark @ctx active and try to schedule in the entries of its group_list
+ * that are allowed on @cpu: pinned entries in a first pass, all others
+ * in a second pass.  Runs under ctx->lock; both passes are bracketed by
+ * perf_disable()/perf_enable().
+ */
+static void
+__perf_event_sched_in(struct perf_event_context *ctx,
+                       struct perf_cpu_context *cpuctx, int cpu)
+{
+       struct perf_event *event;
+       int can_add_hw = 1;     /* cleared once a non-pinned sched-in fails */
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 1;
+       if (likely(!ctx->nr_events))
+               goto out;
+
+       ctx->timestamp = perf_clock();
+
+       perf_disable();
+
+       /*
+        * First go through the list and put on any pinned groups
+        * in order to give them the best chance of going on.
+        */
+       list_for_each_entry(event, &ctx->group_list, group_entry) {
+               if (event->state <= PERF_EVENT_STATE_OFF ||
+                   !event->attr.pinned)
+                       continue;
+               /* Honour the event's cpu filter; -1 means any cpu. */
+               if (event->cpu != -1 && event->cpu != cpu)
+                       continue;
+
+               if (event != event->group_leader)
+                       event_sched_in(event, cpuctx, ctx, cpu);
+               else {
+                       if (group_can_go_on(event, cpuctx, 1))
+                               group_sched_in(event, cpuctx, ctx, cpu);
+               }
+
+               /*
+                * If this pinned group hasn't been scheduled,
+                * put it in error state.
+                */
+               if (event->state == PERF_EVENT_STATE_INACTIVE) {
+                       update_group_times(event);
+                       event->state = PERF_EVENT_STATE_ERROR;
+               }
+       }
+
+       list_for_each_entry(event, &ctx->group_list, group_entry) {
+               /*
+                * Ignore events in OFF or ERROR state, and
+                * ignore pinned events since we did them already.
+                */
+               if (event->state <= PERF_EVENT_STATE_OFF ||
+                   event->attr.pinned)
+                       continue;
+
+               /*
+                * Listen to the 'cpu' scheduling filter constraint
+                * of events:
+                */
+               if (event->cpu != -1 && event->cpu != cpu)
+                       continue;
+
+               /* A non-zero return from either sched-in clears can_add_hw. */
+               if (event != event->group_leader) {
+                       if (event_sched_in(event, cpuctx, ctx, cpu))
+                               can_add_hw = 0;
+               } else {
+                       if (group_can_go_on(event, cpuctx, can_add_hw)) {
+                               if (group_sched_in(event, cpuctx, ctx, cpu))
+                                       can_add_hw = 0;
+                       }
+               }
+       }
+       perf_enable();
+ out:
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void perf_event_task_sched_in(struct task_struct *task, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_event_context *ctx = task->perf_event_ctxp;
+
+       /*
+        * Work is only needed when the task has a context and that
+        * context is not already the one installed on this cpu.
+        */
+       if (unlikely(ctx) && cpuctx->task_ctx != ctx) {
+               __perf_event_sched_in(ctx, cpuctx, cpu);
+               cpuctx->task_ctx = ctx;
+       }
+}
+
+/* Schedule in the per-cpu context embedded in @cpuctx on @cpu. */
+static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+       __perf_event_sched_in(&cpuctx->ctx, cpuctx, cpu);
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
+/*
+ * Move hw.sample_period one eighth of the way (rounded via +7) towards
+ * events * sample_period / attr.sample_freq, never letting it reach 0.
+ */
+static void perf_adjust_period(struct perf_event *event, u64 events)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 target, next;
+       s64 step;
+
+       target = div64_u64(events * hwc->sample_period,
+                          event->attr.sample_freq);
+
+       step = (s64)(target - hwc->sample_period);
+       step = (step + 7) / 8; /* low pass filter */
+
+       next = hwc->sample_period + step;
+
+       /* A period of zero is replaced by the smallest usable one. */
+       hwc->sample_period = next ? next : 1;
+}
+
+/*
+ * Per-tick pass over the ACTIVE entries of ctx->group_list, done under
+ * ctx->lock: consume and reset hw.interrupts, unthrottle events whose
+ * count was MAX_INTERRUPTS, and re-tune the sample period of events
+ * that use attr.freq with a non-zero attr.sample_freq.
+ */
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+{
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       u64 interrupts, freq;
+
+       spin_lock(&ctx->lock);
+       list_for_each_entry(event, &ctx->group_list, group_entry) {
+               if (event->state != PERF_EVENT_STATE_ACTIVE)
+                       continue;
+
+               hwc = &event->hw;
+
+               /* Take the interrupt count of this tick and start afresh. */
+               interrupts = hwc->interrupts;
+               hwc->interrupts = 0;
+
+               /*
+                * unthrottle events on the tick
+                */
+               if (interrupts == MAX_INTERRUPTS) {
+                       perf_log_throttle(event, 1);
+                       event->pmu->unthrottle(event);
+                       interrupts = 2*sysctl_perf_event_sample_rate/HZ;
+               }
+
+               if (!event->attr.freq || !event->attr.sample_freq)
+                       continue;
+
+               /*
+                * if the specified freq < HZ then we need to skip ticks
+                */
+               if (event->attr.sample_freq < HZ) {
+                       freq = event->attr.sample_freq;
+
+                       /* Accumulate until freq_count has reached HZ. */
+                       hwc->freq_count += freq;
+                       hwc->freq_interrupts += interrupts;
+
+                       if (hwc->freq_count < HZ)
+                               continue;
+
+                       interrupts = hwc->freq_interrupts;
+                       hwc->freq_interrupts = 0;
+                       hwc->freq_count -= HZ;
+               } else
+                       freq = HZ;
+
+               perf_adjust_period(event, freq * interrupts);
+
+               /*
+                * In order to avoid being stalled by an (accidental) huge
+                * sample period, force reset the sample period if we didn't
+                * get any events in this freq period.
+                */
+               if (!interrupts) {
+                       perf_disable();
+                       event->pmu->disable(event);
+                       atomic64_set(&hwc->period_left, 0);
+                       event->pmu->enable(event);
+                       perf_enable();
+               }
+       }
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's events:
+ */
+static void rotate_ctx(struct perf_event_context *ctx)
+{
+       struct perf_event *first;
+
+       if (!ctx->nr_events)
+               return;
+
+       spin_lock(&ctx->lock);
+       perf_disable();
+
+       /*
+        * Move the head of group_list to its tail (works just fine for
+        * group events too).  An empty list is left alone.
+        */
+       if (!list_empty(&ctx->group_list)) {
+               first = list_first_entry(&ctx->group_list,
+                                        struct perf_event, group_entry);
+               list_move_tail(&first->group_entry, &ctx->group_list);
+       }
+
+       perf_enable();
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Tick handling for @cpu and the task @curr: adjust sampling
+ * frequencies, schedule out the cpu context and (if present) the task
+ * context, rotate both group lists and schedule everything back in.
+ * Returns immediately while the global nr_events count is zero.
+ */
+void perf_event_task_tick(struct task_struct *curr, int cpu)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_event_context *ctx;
+
+       if (!atomic_read(&nr_events))
+               return;
+
+       cpuctx = &per_cpu(perf_cpu_context, cpu);
+       ctx = curr->perf_event_ctxp;    /* NULL when @curr has no context */
+
+       perf_ctx_adjust_freq(&cpuctx->ctx);
+       if (ctx)
+               perf_ctx_adjust_freq(ctx);
+
+       perf_event_cpu_sched_out(cpuctx);
+       if (ctx)
+               __perf_event_task_sched_out(ctx);
+
+       rotate_ctx(&cpuctx->ctx);
+       if (ctx)
+               rotate_ctx(ctx);
+
+       perf_event_cpu_sched_in(cpuctx, cpu);
+       if (ctx)
+               perf_event_task_sched_in(curr, cpu);
+}
+
+/*
+ * Enable all of a task's events that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_event_enable_on_exec(struct task_struct *task)
+{
+       struct perf_event_context *ctx;
+       struct perf_event *event;
+       unsigned long flags;
+       int enabled = 0;        /* set once at least one event was marked enabled */
+
+       local_irq_save(flags);
+       ctx = task->perf_event_ctxp;
+       if (!ctx || !ctx->nr_events)
+               goto out;
+
+       /* Schedule the context out; it is scheduled back in further down. */
+       __perf_event_task_sched_out(ctx);
+
+       spin_lock(&ctx->lock);
+
+       list_for_each_entry(event, &ctx->group_list, group_entry) {
+               if (!event->attr.enable_on_exec)
+                       continue;
+               /* Clear the request flag before acting on it. */
+               event->attr.enable_on_exec = 0;
+               /* States of INACTIVE and above need no enabling. */
+               if (event->state >= PERF_EVENT_STATE_INACTIVE)
+                       continue;
+               __perf_event_mark_enabled(event, ctx);
+               enabled = 1;
+       }
+
+       /*
+        * Unclone this context if we enabled any event.
+        */
+       if (enabled)
+               unclone_ctx(ctx);
+
+       spin_unlock(&ctx->lock);
+
+       perf_event_task_sched_in(task, smp_processor_id());
+ out:
+       local_irq_restore(flags);
+}
+
+/*
+ * Cross CPU call to read the hardware event
+ */
+static void __perf_event_read(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_event *event = info;        /* @info is the event to read */
+       struct perf_event_context *ctx = event->ctx;
+       unsigned long flags;
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu.  If not it has been
+        * scheduled out before the smp call arrived.  In that case
+        * event->count would have been updated to a recent sample
+        * when the event was scheduled out.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       /*
+        * With local IRQs off: refresh the context time if the context
+        * is active, call the pmu's read hook, then update the event's
+        * times.
+        */
+       local_irq_save(flags);
+       if (ctx->is_active)
+               update_context_time(ctx);
+       event->pmu->read(event);
+       update_event_times(event);
+       local_irq_restore(flags);
+}
+
+/*
+ * Bring @event up to date as far as its state allows and return the
+ * current value of event->count.
+ */
+static u64 perf_event_read(struct perf_event *event)
+{
+       switch (event->state) {
+       case PERF_EVENT_STATE_ACTIVE:
+               /*
+                * Enabled and currently active on a CPU: have that CPU
+                * update the value in the event structure and wait for
+                * it to finish.
+                */
+               smp_call_function_single(event->oncpu,
+                                        __perf_event_read, event, 1);
+               break;
+
+       case PERF_EVENT_STATE_INACTIVE:
+               update_event_times(event);
+               break;
+
+       default:
+               break;
+       }
+
+       return atomic64_read(&event->count);
+}
+
+/*
+ * Initialize the perf_event context in a task_struct:
+ */
+static void
+__perf_event_init_context(struct perf_event_context *ctx,
+                           struct task_struct *task)
+{
+       /* Start from an all-zero context, then fill in the non-zero parts. */
+       memset(ctx, 0, sizeof(*ctx));
+
+       ctx->task = task;
+       atomic_set(&ctx->refcount, 1);
+
+       INIT_LIST_HEAD(&ctx->event_list);
+       INIT_LIST_HEAD(&ctx->group_list);
+
+       mutex_init(&ctx->mutex);
+       spin_lock_init(&ctx->lock);
+}
+
+/*
+ * Look up the context an event should be attached to and take a
+ * reference on it.
+ *
+ * @pid: task to attach to when @cpu is -1; 0 means the current task
+ * @cpu: cpu number for a per-cpu event, or -1 for a per-task event
+ *
+ * Returns the context, or an ERR_PTR():
+ *   -EACCES  permission check failed
+ *   -EINVAL  @cpu is not a valid cpu id
+ *   -ENODEV  @cpu is not online
+ *   -ESRCH   no such task, or the task is exiting
+ *   -ENOMEM  a new task context could not be allocated
+ */
+static struct perf_event_context *find_get_context(pid_t pid, int cpu)
+{
+       struct perf_event_context *ctx;
+       struct perf_cpu_context *cpuctx;
+       struct task_struct *task;
+       unsigned long flags;
+       int err;
+
+       /*
+        * If cpu is not a wildcard then this is a percpu event:
+        */
+       if (cpu != -1) {
+               /* Must be root to operate on a CPU event: */
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return ERR_PTR(-EACCES);
+
+               /*
+                * Valid cpu ids are 0 .. nr_cpu_ids - 1.  The number of
+                * possible cpus is a count, not an upper bound on ids,
+                * and "cpu > count" additionally let cpu == count
+                * through to cpu_isset() and per_cpu().
+                */
+               if (cpu < 0 || cpu >= nr_cpu_ids)
+                       return ERR_PTR(-EINVAL);
+
+               /*
+                * We could be clever and allow to attach a event to an
+                * offline CPU and activate it when the CPU comes up, but
+                * that's for later.
+                */
+               if (!cpu_isset(cpu, cpu_online_map))
+                       return ERR_PTR(-ENODEV);
+
+               cpuctx = &per_cpu(perf_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
+               get_ctx(ctx);
+
+               return ctx;
+       }
+
+       /* Per-task event: pin the task_struct while we work on it. */
+       rcu_read_lock();
+       if (!pid)
+               task = current;
+       else
+               task = find_task_by_vpid(pid);
+       if (task)
+               get_task_struct(task);
+       rcu_read_unlock();
+
+       if (!task)
+               return ERR_PTR(-ESRCH);
+
+       /*
+        * Can't attach events to a dying task.
+        */
+       err = -ESRCH;
+       if (task->flags & PF_EXITING)
+               goto errout;
+
+       /* Reuse ptrace permission checks for now. */
+       err = -EACCES;
+       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+               goto errout;
+
+ retry:
+       ctx = perf_lock_task_context(task, &flags);
+       if (ctx) {
+               unclone_ctx(ctx);
+               spin_unlock_irqrestore(&ctx->lock, flags);
+       }
+
+       if (!ctx) {
+               ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+               err = -ENOMEM;
+               if (!ctx)
+                       goto errout;
+               __perf_event_init_context(ctx, task);
+               get_ctx(ctx);
+               if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
+                       /*
+                        * We raced with some other task; use
+                        * the context they set.
+                        */
+                       kfree(ctx);
+                       goto retry;
+               }
+               /* The installed context now refers to the task. */
+               get_task_struct(task);
+       }
+
+       put_task_struct(task);
+       return ctx;
+
+ errout:
+       put_task_struct(task);
+       return ERR_PTR(err);
+}
+
+/*
+ * RCU callback: drop the event's pid namespace reference, if it holds
+ * one, and free the event that embeds @head.
+ */
+static void free_event_rcu(struct rcu_head *head)
+{
+       struct perf_event *victim =
+               container_of(head, struct perf_event, rcu_head);
+
+       if (victim->ns)
+               put_pid_ns(victim->ns);
+       kfree(victim);
+}
+
+static void perf_pending_sync(struct perf_event *event);
+
+/*
+ * Tear down @event: sync pending work, undo the global accounting of a
+ * parentless event, release a redirected output file, run the destroy
+ * callback, drop the context reference and free the event through RCU.
+ */
+static void free_event(struct perf_event *event)
+{
+       perf_pending_sync(event);
+
+       /* The global counters are only adjusted for events without a parent. */
+       if (!event->parent) {
+               atomic_dec(&nr_events);
+               if (event->attr.mmap)
+                       atomic_dec(&nr_mmap_events);
+               if (event->attr.comm)
+                       atomic_dec(&nr_comm_events);
+               if (event->attr.task)
+                       atomic_dec(&nr_task_events);
+       }
+
+       /* Drop the file reference held on the output event, if any. */
+       if (event->output) {
+               fput(event->output->filp);
+               event->output = NULL;
+       }
+
+       if (event->destroy)
+               event->destroy(event);
+
+       put_ctx(event->ctx);
+       call_rcu(&event->rcu_head, free_event_rcu);
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+       struct perf_event *event = file->private_data;
+       struct perf_event_context *ctx = event->ctx;
+
+       file->private_data = NULL;
+
+       /* Detach the event from its context under ctx->mutex. */
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_event_remove_from_context(event);
+       mutex_unlock(&ctx->mutex);
+
+       /* Unlink it from the owner's list, then drop the owner reference. */
+       mutex_lock(&event->owner->perf_event_mutex);
+       list_del_init(&event->owner_entry);
+       mutex_unlock(&event->owner->perf_event_mutex);
+       put_task_struct(event->owner);
+
+       free_event(event);
+
+       return 0;
+}
+
+/*
+ * Number of bytes a read() on @event produces for its read_format:
+ * a header (optional nr / time_enabled / time_running words) followed
+ * by one entry per reported event (value, plus id if requested).
+ */
+static int perf_event_read_size(struct perf_event *event)
+{
+       u64 fmt = event->attr.read_format;
+       int per_event = sizeof(u64);    /* value */
+       int header = 0;
+       int count = 1;
+
+       if (fmt & PERF_FORMAT_ID)
+               per_event += sizeof(u64);
+
+       if (fmt & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               header += sizeof(u64);
+
+       if (fmt & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               header += sizeof(u64);
+
+       if (fmt & PERF_FORMAT_GROUP) {
+               /* One extra header word, and every sibling is reported. */
+               header += sizeof(u64);
+               count += event->group_leader->nr_siblings;
+       }
+
+       return header + per_event * count;
+}
+
+/* Sum of the event's own count and the counts of all its children. */
+static u64 perf_event_read_value(struct perf_event *event)
+{
+       u64 sum = perf_event_read(event);
+       struct perf_event *kid;
+
+       list_for_each_entry(kid, &event->child_list, child_list)
+               sum += perf_event_read(kid);
+
+       return sum;
+}
+
+/*
+ * Copy one entry (value, then the id if PERF_FORMAT_ID is set) to @buf.
+ * Returns the number of bytes written or -EFAULT.
+ */
+static int perf_event_read_entry(struct perf_event *event,
+                                  u64 read_format, char __user *buf)
+{
+       u64 values[2];
+       int n = 0, bytes;
+
+       values[n++] = perf_event_read_value(event);
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_event_id(event);
+
+       bytes = n * sizeof(u64);
+
+       return copy_to_user(buf, values, bytes) ? -EFAULT : bytes;
+}
+
+/*
+ * Group read: write a header (number of events, optional enabled and
+ * running times of the leader) followed by one entry for the leader
+ * and one for each sibling.  Returns the bytes written or a negative
+ * error from the first copy that failed.
+ */
+static int perf_event_read_group(struct perf_event *event,
+                                  u64 read_format, char __user *buf)
+{
+       struct perf_event *leader = event->group_leader, *sub;
+       u64 hdr[3];
+       int n = 0, done, ret;
+
+       hdr[n++] = 1 + leader->nr_siblings;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               hdr[n++] = leader->total_time_enabled +
+                       atomic64_read(&leader->child_total_time_enabled);
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               hdr[n++] = leader->total_time_running +
+                       atomic64_read(&leader->child_total_time_running);
+
+       done = n * sizeof(u64);
+       if (copy_to_user(buf, hdr, done))
+               return -EFAULT;
+
+       /* The leader's entry comes first ... */
+       ret = perf_event_read_entry(leader, read_format, buf + done);
+       if (ret < 0)
+               return ret;
+       done += ret;
+
+       /* ... then the siblings in list order. */
+       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+               ret = perf_event_read_entry(sub, read_format, buf + done);
+               if (ret < 0)
+                       return ret;
+               done += ret;
+       }
+
+       return done;
+}
+
+/*
+ * Single-event read: value, then the optional enabled time, running
+ * time and id, in that order.  Returns the bytes written or -EFAULT.
+ */
+static int perf_event_read_one(struct perf_event *event,
+                                u64 read_format, char __user *buf)
+{
+       u64 out[4];
+       int used = 0, bytes;
+
+       out[used++] = perf_event_read_value(event);
+
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               out[used++] = event->total_time_enabled +
+                       atomic64_read(&event->child_total_time_enabled);
+
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               out[used++] = event->total_time_running +
+                       atomic64_read(&event->child_total_time_running);
+
+       if (read_format & PERF_FORMAT_ID)
+               out[used++] = primary_event_id(event);
+
+       bytes = used * sizeof(u64);
+       if (copy_to_user(buf, out, bytes))
+               return -EFAULT;
+
+       return bytes;
+}
+
+/*
+ * Read the performance event - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+{
+       u64 read_format = event->attr.read_format;
+       int ret;
+
+       /*
+        * An event in error state (i.e. it was pinned but could not be
+        * scheduled on to the CPU at some point) reads as end-of-file.
+        */
+       if (event->state == PERF_EVENT_STATE_ERROR)
+               return 0;
+
+       /* The caller's buffer must hold the complete record. */
+       if (count < perf_event_read_size(event))
+               return -ENOSPC;
+
+       WARN_ON_ONCE(event->ctx->parent_ctx);
+
+       mutex_lock(&event->child_mutex);
+       ret = (read_format & PERF_FORMAT_GROUP) ?
+               perf_event_read_group(event, read_format, buf) :
+               perf_event_read_one(event, read_format, buf);
+       mutex_unlock(&event->child_mutex);
+
+       return ret;
+}
+
+/* read() file operation: the event hangs off file->private_data. */
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+       return perf_read_hw(file->private_data, buf, count);
+}
+
+/*
+ * poll() file operation.
+ *
+ * Returns and clears the pending poll bits stored in the event's mmap
+ * data.  When no mmap data is attached the result is POLLHUP.
+ */
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+       struct perf_event *event = file->private_data;
+       struct perf_mmap_data *data;
+       /*
+        * This must be the poll mask bit POLLHUP.  POLL_HUP is a
+        * siginfo si_code value and has a different numeric value, so
+        * using it here hands unrelated bits to poll()/select().
+        */
+       unsigned int events = POLLHUP;
+
+       rcu_read_lock();
+       data = rcu_dereference(event->data);
+       if (data)
+               events = atomic_xchg(&data->poll, 0);
+       rcu_read_unlock();
+
+       poll_wait(file, &event->waitq, wait);
+
+       return events;
+}
+
+/* Zero the event's count and publish the change to the user page. */
+static void perf_event_reset(struct perf_event *event)
+{
+       /* Read first for its update side effect; the value is not needed. */
+       perf_event_read(event);
+
+       atomic64_set(&event->count, 0);
+       perf_event_update_userpage(event);
+}
+
+/*
+ * Holding the top-level event's child_mutex means that any
+ * descendant process that has inherited this event will block
+ * in sync_child_event if it goes to exit, thus satisfying the
+ * task existence requirements of perf_event_enable/disable.
+ */
+static void perf_event_for_each_child(struct perf_event *event,
+                                       void (*func)(struct perf_event *))
+{
+       struct perf_event *child;
+
+       WARN_ON_ONCE(event->ctx->parent_ctx);
+       mutex_lock(&event->child_mutex);
+       func(event);    /* the event itself first ... */
+       list_for_each_entry(child, &event->child_list, child_list)
+               func(child);    /* ... then every entry of its child_list */
+       mutex_unlock(&event->child_mutex);
+}
+
+/*
+ * Apply @func to every member of @event's group (the leader and each
+ * of its siblings) and to all of their children.  ctx->mutex is held
+ * across the walk.
+ */
+static void perf_event_for_each(struct perf_event *event,
+                                 void (*func)(struct perf_event *))
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_event *sibling;
+
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       event = event->group_leader;
+
+       /*
+        * perf_event_for_each_child() already calls @func on the event
+        * it is given, so the leader needs no separate call, and each
+        * sibling (not the leader again) must be passed in the loop.
+        */
+       perf_event_for_each_child(event, func);
+       list_for_each_entry(sibling, &event->sibling_list, group_entry)
+               perf_event_for_each_child(sibling, func);
+       mutex_unlock(&ctx->mutex);
+}
+
+/*
+ * PERF_EVENT_IOC_PERIOD: set a new sample period, or a new sample
+ * frequency for events that use attr.freq, from the u64 at @arg.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be read, and -EINVAL if
+ * the event has no sample_period, the new value is 0, or a frequency
+ * exceeds sysctl_perf_event_sample_rate.
+ */
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+       struct perf_event_context *ctx = event->ctx;
+       int ret = 0;
+       u64 value;
+
+       if (!event->attr.sample_period)
+               return -EINVAL;
+
+       /*
+        * copy_from_user() returns the number of bytes that could NOT
+        * be copied, so any non-zero result is a fault.
+        */
+       if (copy_from_user(&value, arg, sizeof(value)))
+               return -EFAULT;
+
+       if (!value)
+               return -EINVAL;
+
+       spin_lock_irq(&ctx->lock);
+       if (event->attr.freq) {
+               if (value > sysctl_perf_event_sample_rate) {
+                       ret = -EINVAL;
+                       goto unlock;
+               }
+
+               event->attr.sample_freq = value;
+       } else {
+               event->attr.sample_period = value;
+               event->hw.sample_period = value;
+       }
+unlock:
+       spin_unlock_irq(&ctx->lock);
+
+       return ret;
+}
+
+int perf_event_set_output(struct perf_event *event, int output_fd);
+
+/*
+ * ioctl() file operation.  REFRESH, PERIOD and SET_OUTPUT are handled
+ * directly; ENABLE, DISABLE and RESET pick a per-event operation that
+ * is applied to the whole group when PERF_IOC_FLAG_GROUP is set in
+ * @arg, otherwise to the event and its children.  Unknown commands
+ * return -ENOTTY.
+ */
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct perf_event *event = file->private_data;
+       void (*op)(struct perf_event *);
+
+       switch (cmd) {
+       case PERF_EVENT_IOC_REFRESH:
+               return perf_event_refresh(event, arg);
+
+       case PERF_EVENT_IOC_PERIOD:
+               return perf_event_period(event, (u64 __user *)arg);
+
+       case PERF_EVENT_IOC_SET_OUTPUT:
+               return perf_event_set_output(event, arg);
+
+       case PERF_EVENT_IOC_ENABLE:
+               op = perf_event_enable;
+               break;
+
+       case PERF_EVENT_IOC_DISABLE:
+               op = perf_event_disable;
+               break;
+
+       case PERF_EVENT_IOC_RESET:
+               op = perf_event_reset;
+               break;
+
+       default:
+               return -ENOTTY;
+       }
+
+       /* Only the low 32 bits of @arg carry flags. */
+       if ((u32)arg & PERF_IOC_FLAG_GROUP)
+               perf_event_for_each(event, op);
+       else
+               perf_event_for_each_child(event, op);
+
+       return 0;
+}
+
+/*
+ * Enable every event on the current task's perf_event_list, together
+ * with the children of each.  Always returns 0.
+ */
+int perf_event_task_enable(void)
+{
+       struct task_struct *me = current;
+       struct perf_event *pos;
+
+       mutex_lock(&me->perf_event_mutex);
+       list_for_each_entry(pos, &me->perf_event_list, owner_entry)
+               perf_event_for_each_child(pos, perf_event_enable);
+       mutex_unlock(&me->perf_event_mutex);
+
+       return 0;
+}
+
+/*
+ * Disable every event on the current task's perf_event_list, together
+ * with the children of each.  Always returns 0.
+ */
+int perf_event_task_disable(void)
+{
+       struct task_struct *me = current;
+       struct perf_event *pos;
+
+       mutex_lock(&me->perf_event_mutex);
+       list_for_each_entry(pos, &me->perf_event_list, owner_entry)
+               perf_event_for_each_child(pos, perf_event_disable);
+       mutex_unlock(&me->perf_event_mutex);
+
+       return 0;
+}
+
+#ifndef PERF_EVENT_INDEX_OFFSET
+# define PERF_EVENT_INDEX_OFFSET 0
+#endif
+
+/*
+ * Index value published in the user page: 0 unless the event is
+ * ACTIVE, otherwise hw.idx + 1 adjusted by PERF_EVENT_INDEX_OFFSET.
+ */
+static int perf_event_index(struct perf_event *event)
+{
+       return event->state == PERF_EVENT_STATE_ACTIVE ?
+               event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET : 0;
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_event_update_userpage(struct perf_event *event)
+{
+       struct perf_event_mmap_page *userpg;
+       struct perf_mmap_data *data;
+
+       /* Nothing to publish when no mmap data is attached to the event. */
+       rcu_read_lock();
+       data = rcu_dereference(event->data);
+       if (!data)
+               goto unlock;
+
+       userpg = data->user_page;
+
+       /*
+        * Disable preemption so as to not let the corresponding user-space
+        * spin too long if we get preempted.
+        */
+       preempt_disable();
+       ++userpg->lock; /* first of two increments that bracket the update */
+       barrier();
+       userpg->index = perf_event_index(event);
+       userpg->offset = atomic64_read(&event->count);
+       /* For an ACTIVE event, hw.prev_count is subtracted from the offset. */
+       if (event->state == PERF_EVENT_STATE_ACTIVE)
+               userpg->offset -= atomic64_read(&event->hw.prev_count);
+
+       userpg->time_enabled = event->total_time_enabled +
+                       atomic64_read(&event->child_total_time_enabled);
+
+       userpg->time_running = event->total_time_running +
+                       atomic64_read(&event->child_total_time_running);
+
+       barrier();
+       ++userpg->lock; /* second increment: update complete */
+       preempt_enable();
+unlock:
+       rcu_read_unlock();
+}
+
+/*
+ * Fault handler for a perf mmap() area; perf_mmap_vmops also installs it as
+ * the ->page_mkwrite handler.
+ *
+ * Page offset 0 resolves to the control page (data->user_page); page offset
+ * N >= 1 resolves to data_pages[N - 1]. On success vmf->page is set, an
+ * extra page reference is taken and 0 is returned. Otherwise
+ * VM_FAULT_SIGBUS is returned.
+ */
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct perf_event *event = vma->vm_file->private_data;
+       struct perf_mmap_data *data;
+       int ret = VM_FAULT_SIGBUS;
+
+       /* Making a page writable is only accepted for the control page. */
+       if (vmf->flags & FAULT_FLAG_MKWRITE) {
+               if (vmf->pgoff == 0)
+                       ret = 0;
+               return ret;
+       }
+
+       rcu_read_lock();
+       data = rcu_dereference(event->data);
+       if (!data)
+               goto unlock;
+
+       if (vmf->pgoff == 0) {
+               vmf->page = virt_to_page(data->user_page);
+       } else {
+               int nr = vmf->pgoff - 1;
+
+               /*
+                * data_pages[] holds exactly nr_pages entries, so the valid
+                * indices are 0 .. nr_pages - 1. The comparison has to be
+                * '>=': with '>' an index equal to nr_pages would read one
+                * slot past the end of the array. The unsigned cast makes a
+                * negative nr fail the check as well.
+                */
+               if ((unsigned)nr >= data->nr_pages)
+                       goto unlock;
+
+               /* Data pages are refused for write faults. */
+               if (vmf->flags & FAULT_FLAG_WRITE)
+                       goto unlock;
+
+               vmf->page = virt_to_page(data->data_pages[nr]);
+       }
+
+       get_page(vmf->page);
+       vmf->page->mapping = vma->vm_file->f_mapping;
+       vmf->page->index   = vmf->pgoff;
+
+       ret = 0;
+unlock:
+       rcu_read_unlock();
+
+       return ret;
+}
+
+/*
+ * Allocate the mmap buffer for @event: one perf_mmap_data descriptor with
+ * room for @nr_pages page pointers behind it, one zeroed control page and
+ * @nr_pages zeroed data pages. The result is published via event->data.
+ *
+ * Returns 0 on success, or -ENOMEM after releasing whatever had been
+ * allocated up to the point of failure.
+ */
+static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+{
+       struct perf_mmap_data *data;
+       unsigned long size;
+       int i;
+
+       WARN_ON(atomic_read(&event->mmap_count));
+
+       size = sizeof(struct perf_mmap_data);
+       size += nr_pages * sizeof(void *);
+
+       data = kzalloc(size, GFP_KERNEL);
+       if (!data)
+               goto fail;
+
+       data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!data->user_page)
+               goto fail_user_page;
+
+       for (i = 0; i < nr_pages; i++) {
+               data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+               if (!data->data_pages[i])
+                       goto fail_data_pages;
+       }
+
+       data->nr_pages = nr_pages;
+       /* -1 is the value perf_output_lock() expects when nobody holds it. */
+       atomic_set(&data->lock, -1);
+
+       /* A requested wakeup watermark is capped at the buffer size. */
+       if (event->attr.watermark) {
+               data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+                                     event->attr.wakeup_watermark);
+       }
+       /* No (or zero) watermark: use max(one page, a quarter of the buffer). */
+       if (!data->watermark)
+               data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
+       rcu_assign_pointer(event->data, data);
+
+       return 0;
+
+fail_data_pages:
+       /* Slot i failed to allocate; release slots i-1 .. 0 that succeeded. */
+       for (i--; i >= 0; i--)
+               free_page((unsigned long)data->data_pages[i]);
+
+       free_page((unsigned long)data->user_page);
+
+fail_user_page:
+       kfree(data);
+
+fail:
+       return -ENOMEM;
+}
+
+/*
+ * Release one buffer page given its kernel virtual address: clear the
+ * ->mapping that perf_mmap_fault() stored on the struct page, then free it.
+ */
+static void perf_mmap_free_page(unsigned long addr)
+{
+       struct page *pg = virt_to_page((void *)addr);
+
+       pg->mapping = NULL;
+       __free_page(pg);
+}
+
+/*
+ * RCU callback (queued by perf_mmap_data_free()): free the control page,
+ * every data page and finally the perf_mmap_data descriptor itself.
+ */
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+       struct perf_mmap_data *buf =
+               container_of(rcu_head, struct perf_mmap_data, rcu_head);
+       int nr;
+
+       perf_mmap_free_page((unsigned long)buf->user_page);
+       for (nr = 0; nr < buf->nr_pages; nr++)
+               perf_mmap_free_page((unsigned long)buf->data_pages[nr]);
+
+       kfree(buf);
+}
+
+/*
+ * Detach the mmap buffer from @event and queue it for freeing through
+ * call_rcu(). Warns if mmap_count is still non-zero.
+ */
+static void perf_mmap_data_free(struct perf_event *event)
+{
+       /* Fetch the pointer first: event->data is cleared just below. */
+       struct perf_mmap_data *data = event->data;
+
+       WARN_ON(atomic_read(&event->mmap_count));
+
+       rcu_assign_pointer(event->data, NULL);
+       call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+/*
+ * vm_ops->open handler: take one more mmap_count reference on the event
+ * behind the mapped file.
+ */
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+       struct perf_event *event = vma->vm_file->private_data;
+
+       atomic_inc(&event->mmap_count);
+}
+
+/*
+ * vm_ops->close handler: drop one mmap_count reference. When the count
+ * reaches zero (mmap_mutex is then held), undo the locked-memory accounting
+ * done by perf_mmap() and free the buffer.
+ */
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+       struct perf_event *event = vma->vm_file->private_data;
+       struct user_struct *user;
+
+       WARN_ON_ONCE(event->ctx->parent_ctx);
+       if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+               return;
+
+       user = current_user();
+
+       /* nr_pages data pages plus the control page were charged to the user. */
+       atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
+       vma->vm_mm->locked_vm -= event->data->nr_locked;
+       perf_mmap_data_free(event);
+       mutex_unlock(&event->mmap_mutex);
+}
+
+/*
+ * VMA operations installed by perf_mmap(). The same handler serves both
+ * ->fault and ->page_mkwrite.
+ */
+static struct vm_operations_struct perf_mmap_vmops = {
+       .open           = perf_mmap_open,
+       .close          = perf_mmap_close,
+       .fault          = perf_mmap_fault,
+       .page_mkwrite   = perf_mmap_fault,
+};
+
+/*
+ * mmap() handler for a perf event file.
+ *
+ * The mapping must be shared, start at page offset 0 and span exactly
+ * 1 + nr_pages pages, where nr_pages is 0 or a power of two. The first
+ * mapping allocates the buffer and charges it against the locked-memory
+ * limits; later mappings of the same event share that buffer and must have
+ * the same size.
+ *
+ * Returns 0 on success, -EINVAL for a bad layout, a size mismatch or an
+ * event with ->output set, -EPERM when the locked-memory limit is exceeded,
+ * or the error from perf_mmap_data_alloc().
+ */
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct perf_event *event = file->private_data;
+       unsigned long user_locked, user_lock_limit;
+       struct user_struct *user = current_user();
+       unsigned long locked, lock_limit;
+       unsigned long vma_size;
+       unsigned long nr_pages;
+       long user_extra, extra;
+       int ret = 0;
+
+       if (!(vma->vm_flags & VM_SHARED))
+               return -EINVAL;
+
+       vma_size = vma->vm_end - vma->vm_start;
+       nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+       /*
+        * If we have data pages ensure they're a power-of-two number, so we
+        * can do bitmasks instead of modulo.
+        */
+       if (nr_pages != 0 && !is_power_of_2(nr_pages))
+               return -EINVAL;
+
+       if (vma_size != PAGE_SIZE * (1 + nr_pages))
+               return -EINVAL;
+
+       if (vma->vm_pgoff != 0)
+               return -EINVAL;
+
+       WARN_ON_ONCE(event->ctx->parent_ctx);
+       mutex_lock(&event->mmap_mutex);
+       if (event->output) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
+       /*
+        * An existing buffer can only be shared by a mapping of identical
+        * size. Check that BEFORE taking a reference: the failure path has
+        * nothing that would drop mmap_count again, so incrementing first
+        * and then returning -EINVAL would leak the reference and keep the
+        * buffer alive forever.
+        */
+       if (atomic_read(&event->mmap_count) &&
+           nr_pages != event->data->nr_pages) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
+       /* Buffer already present and of matching size: just share it. */
+       if (atomic_inc_not_zero(&event->mmap_count))
+               goto unlock;
+
+       /* First mapping: nr_pages data pages plus the control page. */
+       user_extra = nr_pages + 1;
+       /* NOTE(review): the shift suggests the sysctl is in KiB -- confirm. */
+       user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+       /*
+        * Increase the limit linearly with more CPUs:
+        */
+       user_lock_limit *= num_online_cpus();
+
+       user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+       /* Pages above the per-user limit are charged to the mm instead. */
+       extra = 0;
+       if (user_locked > user_lock_limit)
+               extra = user_locked - user_lock_limit;
+
+       lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+       lock_limit >>= PAGE_SHIFT;
+       locked = vma->vm_mm->locked_vm + extra;
+
+       if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+               !capable(CAP_IPC_LOCK)) {
+               ret = -EPERM;
+               goto unlock;
+       }
+
+       WARN_ON(event->data);
+       ret = perf_mmap_data_alloc(event, nr_pages);
+       if (ret)
+               goto unlock;
+
+       atomic_set(&event->mmap_count, 1);
+       atomic_long_add(user_extra, &user->locked_vm);
+       vma->vm_mm->locked_vm += extra;
+       /* Remembered so perf_mmap_close() can subtract the same amount. */
+       event->data->nr_locked = extra;
+       if (vma->vm_flags & VM_WRITE)
+               event->data->writable = 1;
+
+unlock:
+       mutex_unlock(&event->mmap_mutex);
+
+       /* Reached on the success and the failure paths alike. */
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &perf_mmap_vmops;
+
+       return ret;
+}
+
+/*
+ * fasync handler: forward to fasync_helper() on event->fasync while holding
+ * the inode's i_mutex. Negative results are passed through; anything else
+ * is reported as 0.
+ */
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+       struct perf_event *event = filp->private_data;
+       struct inode *inode = filp->f_path.dentry->d_inode;
+       int err;
+
+       mutex_lock(&inode->i_mutex);
+       err = fasync_helper(fd, filp, on, &event->fasync);
+       mutex_unlock(&inode->i_mutex);
+
+       return err < 0 ? err : 0;
+}
+
+/*
+ * File operations for a perf event file. perf_ioctl is used for both the
+ * native and the compat ioctl entry points.
+ */
+static const struct file_operations perf_fops = {
+       .release                = perf_release,
+       .read                   = perf_read,
+       .poll                   = perf_poll,
+       .unlocked_ioctl         = perf_ioctl,
+       .compat_ioctl           = perf_ioctl,
+       .mmap                   = perf_mmap,
+       .fasync                 = perf_fasync,
+};
+
+/*
+ * Perf event wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_event_wakeup(struct perf_event *event)
+{
+       /* Wake everything sleeping on the event's wait queue. */
+       wake_up_all(&event->waitq);
+
+       if (!event->pending_kill)
+               return;
+
+       /* Deliver the pending SIGIO band to fasync listeners, then clear it. */
+       kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+       event->pending_kill = 0;
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wake up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+/*
+ * Deferred-work callback for an event's ->pending entry: carry out the
+ * disable and/or wakeup that was flagged on the event, clearing each flag
+ * before acting on it.
+ */
+static void perf_pending_event(struct perf_pending_entry *entry)
+{
+       struct perf_event *ev = container_of(entry, struct perf_event, pending);
+
+       if (ev->pending_disable) {
+               ev->pending_disable = 0;
+               __perf_event_disable(ev);
+       }
+
+       if (ev->pending_wakeup) {
+               ev->pending_wakeup = 0;
+               perf_event_wakeup(ev);
+       }
+}
+
+/*
+ * Sentinel that terminates a pending list. It is deliberately not NULL:
+ * perf_pending_queue() uses entry->next == NULL to mean "not queued".
+ */
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
+
+/* Per-CPU head of the singly linked pending list; starts out empty. */
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
+       PENDING_TAIL,
+};
+
+/*
+ * Queue @entry on this CPU's pending list so that @func gets called for it
+ * from __perf_pending_run(). An entry that is already queued is left alone.
+ * Only cmpxchg() is used, no locks.
+ */
+static void perf_pending_queue(struct perf_pending_entry *entry,
+                              void (*func)(struct perf_pending_entry *))
+{
+       struct perf_pending_entry **head;
+
+       /* Claim the entry: next moves NULL -> PENDING_TAIL exactly once. */
+       if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
+               return;
+
+       entry->func = func;
+
+       head = &get_cpu_var(perf_pending_head);
+
+       /* Push onto the head of the list, retrying if the head changed. */
+       do {
+               entry->next = *head;
+       } while (cmpxchg(head, entry->next, entry) != entry->next);
+
+       set_perf_event_pending();
+
+       put_cpu_var(perf_pending_head);
+}
+
+/*
+ * Detach this CPU's whole pending list and invoke each entry's callback.
+ * Returns the number of entries processed.
+ */
+static int __perf_pending_run(void)
+{
+       struct perf_pending_entry *list;
+       int nr = 0;
+
+       /* Take the entire list in one step, leaving an empty list behind. */
+       list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
+       while (list != PENDING_TAIL) {
+               void (*func)(struct perf_pending_entry *);
+               struct perf_pending_entry *entry = list;
+
+               list = list->next;
+
+               /* Read func before next = NULL allows the entry to be requeued. */
+               func = entry->func;
+               entry->next = NULL;
+               /*
+                * Ensure we observe the unqueue before we issue the wakeup,
+                * so that we won't be waiting forever.
+                * -- see perf_not_pending().
+                */
+               smp_wmb();
+
+               func(entry);
+               nr++;
+       }
+
+       return nr;
+}
+
+/*
+ * Wait condition used by perf_pending_sync(): flush this CPU's pending
+ * list, then report whether @event's pending entry is no longer queued
+ * (pending.next == NULL).
+ */
+static inline int perf_not_pending(struct perf_event *event)
+{
+       /*
+        * If we flush on whatever cpu we run, there is a chance we don't
+        * need to wait.
+        */
+       get_cpu();
+       __perf_pending_run();
+       put_cpu();
+
+       /*
+        * Ensure we see the proper queue state before going to sleep
+        * so that we do not miss the wakeup. -- pairs with the smp_wmb()
+        * in __perf_pending_run()
+        */
+       smp_rmb();
+       return event->pending.next == NULL;
+}
+
+/*
+ * Sleep on the event's wait queue until perf_not_pending() reports that
+ * @event's pending entry is no longer queued.
+ */
+static void perf_pending_sync(struct perf_event *event)
+{
+       wait_event(event->waitq, perf_not_pending(event));
+}
+
+/*
+ * Non-static entry point that runs the current CPU's pending list; the
+ * count returned by __perf_pending_run() is discarded.
+ */
+void perf_event_do_pending(void)
+{
+       __perf_pending_run();
+}
+
+/*
+ * Callchain support -- arch specific
+ */
+
+/*
+ * Weak default that returns no callchain (NULL); being __weak, a non-weak
+ * definition elsewhere takes precedence at link time.
+ */
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       return NULL;
+}
+
+/*
+ * Output
+ */
+/*
+ * Decide whether advancing the write position from @offset to @head is
+ * acceptable given the reader position @tail. A buffer that is not marked
+ * writable always accepts. Otherwise both positions are taken relative to
+ * @tail modulo the buffer size, and the request is refused when the new
+ * head lands behind the old offset.
+ */
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+                             unsigned long offset, unsigned long head)
+{
+       unsigned long mask;
+
+       if (!data->writable)
+               return true;
+
+       mask = (data->nr_pages << PAGE_SHIFT) - 1;
+
+       offset = (offset - tail) & mask;
+       head   = (head   - tail) & mask;
+
+       return (int)(head - offset) >= 0;
+}
+
+/*
+ * Mark the buffer readable (POLL_IN) and wake the event's waiters: directly
+ * when not in NMI context, otherwise by flagging pending_wakeup and
+ * deferring through the pending queue.
+ */
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+       atomic_set(&handle->data->poll, POLL_IN);
+
+       if (!handle->nmi) {
+               perf_event_wakeup(handle->event);
+               return;
+       }
+
+       handle->event->pending_wakeup = 1;
+       perf_pending_queue(&handle->event->pending,
+                          perf_pending_event);
+}
+
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event_id doesn't publish a head when a former
+ * event_id isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event_id completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+       struct perf_mmap_data *data = handle->data;
+       int cpu;
+
+       handle->locked = 0;
+
+       /* Interrupts stay off until perf_output_unlock() restores them. */
+       local_irq_save(handle->flags);
+       cpu = smp_processor_id();
+
+       /*
+        * Nested in NMI on a CPU that already owns the lock: proceed without
+        * taking it, leaving handle->locked at 0.
+        */
+       if (in_nmi() && atomic_read(&data->lock) == cpu)
+               return;
+
+       /* Spin until the lock moves from -1 (free) to this CPU's id. */
+       while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+               cpu_relax();
+
+       handle->locked = 1;
+}
+
+/*
+ * Counterpart of perf_output_lock(). Records the current head in done_head.
+ * If this handle actually took the lock, it also publishes done_head to the
+ * user page, releases the lock and issues a wakeup when one was requested.
+ * Interrupts are restored from handle->flags in every case.
+ */
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+       struct perf_mmap_data *data = handle->data;
+       unsigned long head;
+       int cpu;
+
+       data->done_head = data->head;
+
+       /* A nested (NMI) user leaves publishing to the lock holder. */
+       if (!handle->locked)
+               goto out;
+
+again:
+       /*
+        * The xchg implies a full barrier that ensures all writes are done
+        * before we publish the new head, matched by a rmb() in userspace when
+        * reading this position.
+        */
+       while ((head = atomic_long_xchg(&data->done_head, 0)))
+               data->user_page->data_head = head;
+
+       /*
+        * NMI can happen here, which means we can miss a done_head update.
+        */
+
+       cpu = atomic_xchg(&data->lock, -1);
+       WARN_ON_ONCE(cpu != smp_processor_id());
+
+       /*
+        * Therefore we have to validate we did not indeed do so.
+        */
+       if (unlikely(atomic_long_read(&data->done_head))) {
+               /*
+                * Since we had it locked, we can lock it again.
+                */
+               while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+                       cpu_relax();
+
+               goto again;
+       }
+
+       /* Consume the wakeup request, if any, now that the lock is released. */
+       if (atomic_xchg(&data->wakeup, 0))
+               perf_output_wakeup(handle);
+out:
+       local_irq_restore(handle->flags);
+}
+
+/*
+ * Copy @len bytes from @buf into the data pages, starting at handle->offset
+ * and wrapping around the page array. handle->offset is advanced past the
+ * copied bytes. Warns (once) if the copy ran beyond handle->head.
+ */
+void perf_output_copy(struct perf_output_handle *handle,
+                     const void *buf, unsigned int len)
+{
+       unsigned int pages_mask;
+       unsigned int offset;
+       unsigned int size;
+       void **pages;
+
+       offset          = handle->offset;
+       /* Used as a bitmask below, so nr_pages has to be a power of two. */
+       pages_mask      = handle->data->nr_pages - 1;
+       pages           = handle->data->data_pages;
+
+       do {
+               unsigned int page_offset;
+               int nr;
+
+               /* Page index and in-page offset of the current position. */
+               nr          = (offset >> PAGE_SHIFT) & pages_mask;
+               page_offset = offset & (PAGE_SIZE - 1);
+               /* Never copy across a page boundary in a single memcpy(). */
+               size        = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+               memcpy(pages[nr] + page_offset, buf, size);
+
+               len         -= size;
+               buf         += size;
+               offset      += size;
+       } while (len);
+
+       handle->offset = offset;
+
+       /*
+        * Check we didn't copy past our reservation window, taking the
+        * possible unsigned int wrap into account.
+        */
+       WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+/*
+ * Reserve @size bytes in the output buffer that @event writes to and
+ * initialise @handle for the subsequent perf_output_copy()/put calls.
+ *
+ * Output of an inherited event goes to its parent, and from there to
+ * event->output when one is set. If records were lost earlier, room for a
+ * PERF_RECORD_LOST record is reserved as well and that record is written
+ * first.
+ *
+ * Returns 0 on success; the RCU read lock and the output lock are then
+ * held and are released by perf_output_end(). Returns -ENOSPC when there
+ * is no buffer or no room; nothing is held in that case.
+ */
+int perf_output_begin(struct perf_output_handle *handle,
+                     struct perf_event *event, unsigned int size,
+                     int nmi, int sample)
+{
+       struct perf_event *output_event;
+       struct perf_mmap_data *data;
+       unsigned long tail, offset, head;
+       int have_lost;
+       struct {
+               struct perf_event_header header;
+               u64                      id;
+               u64                      lost;
+       } lost_event;
+
+       rcu_read_lock();
+       /*
+        * For inherited events we send all the output towards the parent.
+        */
+       if (event->parent)
+               event = event->parent;
+
+       output_event = rcu_dereference(event->output);
+       if (output_event)
+               event = output_event;
+
+       data = rcu_dereference(event->data);
+       if (!data)
+               goto out;
+
+       handle->data    = data;
+       handle->event   = event;
+       handle->nmi     = nmi;
+       handle->sample  = sample;
+
+       if (!data->nr_pages) {
+               /*
+                * The output lock has not been taken yet, so handle->locked
+                * and handle->flags are still uninitialised. Going through
+                * perf_output_unlock() here would branch on and restore
+                * interrupt state from garbage. Account the lost record
+                * and leave without unlocking.
+                */
+               atomic_inc(&data->lost);
+               goto out;
+       }
+
+       have_lost = atomic_read(&data->lost);
+       if (have_lost)
+               size += sizeof(lost_event);
+
+       perf_output_lock(handle);
+
+       do {
+               /*
+                * Userspace could choose to issue a mb() before updating the
+                * tail pointer. So that all reads will be completed before the
+                * write is issued.
+                */
+               tail = ACCESS_ONCE(data->user_page->data_tail);
+               smp_rmb();
+               offset = head = atomic_long_read(&data->head);
+               head += size;
+               if (unlikely(!perf_output_space(data, tail, offset, head)))
+                       goto fail;
+       } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+       /* [offset, head) is now reserved for this record. */
+       handle->offset  = offset;
+       handle->head    = head;
+
+       if (head - tail > data->watermark)
+               atomic_set(&data->wakeup, 1);
+
+       if (have_lost) {
+               lost_event.header.type = PERF_RECORD_LOST;
+               lost_event.header.misc = 0;
+               lost_event.header.size = sizeof(lost_event);
+               lost_event.id          = event->id;
+               lost_event.lost        = atomic_xchg(&data->lost, 0);
+
+               perf_output_put(handle, lost_event);
+       }
+
+       return 0;
+
+fail:
+       /* Only reached with the output lock held. */
+       atomic_inc(&data->lost);
+       perf_output_unlock(handle);
+out:
+       rcu_read_unlock();
+
+       return -ENOSPC;
+}
+
+/*
+ * Finish a record started with perf_output_begin(). For sample records on
+ * an event with attr.wakeup_events set, count the sample and request a
+ * wakeup once wakeup_events of them have accumulated. Then drop the output
+ * lock and the RCU read lock.
+ */
+void perf_output_end(struct perf_output_handle *handle)
+{
+       struct perf_mmap_data *data = handle->data;
+       int wakeup_events = handle->event->attr.wakeup_events;
+
+       if (handle->sample && wakeup_events &&
+           atomic_inc_return(&data->events) >= wakeup_events) {
+               atomic_sub(wakeup_events, &data->events);
+               atomic_set(&data->wakeup, 1);
+       }
+
+       perf_output_unlock(handle);
+       rcu_read_unlock();
+}
+
+/*
+ * Thread-group id of @p as seen from the pid namespace of @event's
+ * top-level event.
+ */
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+       /* Only top-level events have the pid namespace they were created in. */
+       struct perf_event *top = event->parent ? event->parent : event;
+
+       return task_tgid_nr_ns(p, top->ns);
+}
+
+/*
+ * Pid of @p as seen from the pid namespace of @event's top-level event.
+ */
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+       /* Only top-level events have the pid namespace they were created in. */
+       struct perf_event *top = event->parent ? event->parent : event;
+
+       return task_pid_nr_ns(p, top->ns);
+}
+
+/*
+ * Emit the read values of a single event: the count, then - as selected by
+ * attr.read_format - total time enabled, total time running and the id.
+ */
+static void perf_output_read_one(struct perf_output_handle *handle,
+                                struct perf_event *event)
+{
+       u64 read_format = event->attr.read_format;
+       /* At most four values: count, time_enabled, time_running, id. */
+       u64 values[4];
+       int n = 0;
+
+       values[n++] = atomic64_read(&event->count);
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] = event->total_time_enabled +
+                       atomic64_read(&event->child_total_time_enabled);
+       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] = event->total_time_running +
+                       atomic64_read(&event->child_total_time_running);
+       }
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_event_id(event);
+
+       perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+                           struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader, *sub;
+       u64 read_format = event->attr.read_format;
+       /* Worst case for the leader: nr, time_enabled, time_running, count, id. */
+       u64 values[5];
+       int n = 0;
+
+       /* Number of entries that follow: the leader plus its siblings. */
+       values[n++] = 1 + leader->nr_siblings;
+
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               values[n++] = leader->total_time_enabled;
+
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               values[n++] = leader->total_time_running;
+
+       /* pmu->read() is called for every group member other than @event. */
+       if (leader != event)
+               leader->pmu->read(leader);
+
+       values[n++] = atomic64_read(&leader->count);
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_event_id(leader);
+
+       perf_output_copy(handle, values, n * sizeof(u64));
+
+       /* One { count [, id] } entry per sibling, reusing values[]. */
+       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+               n = 0;
+
+               if (sub != event)
+                       sub->pmu->read(sub);
+
+               values[n++] = atomic64_read(&sub->count);
+               if (read_format & PERF_FORMAT_ID)
+                       values[n++] = primary_event_id(sub);
+
+               perf_output_copy(handle, values, n * sizeof(u64));
+       }
+}
+
+/*
+ * Emit the read values for @event, in group layout when PERF_FORMAT_GROUP
+ * is set in attr.read_format and in single-event layout otherwise.
+ */
+static void perf_output_read(struct perf_output_handle *handle,
+                            struct perf_event *event)
+{
+       if (!(event->attr.read_format & PERF_FORMAT_GROUP)) {
+               perf_output_read_one(handle, event);
+               return;
+       }
+
+       perf_output_read_group(handle, event);
+}
+
+/*
+ * Write one sample record: @header first, then every field selected in
+ * data->type, in the fixed order below.
+ */
+void perf_output_sample(struct perf_output_handle *handle,
+                       struct perf_event_header *header,
+                       struct perf_sample_data *data,
+                       struct perf_event *event)
+{
+       u64 sample_type = data->type;
+
+       perf_output_put(handle, *header);
+
+       if (sample_type & PERF_SAMPLE_IP)
+               perf_output_put(handle, data->ip);
+
+       if (sample_type & PERF_SAMPLE_TID)
+               perf_output_put(handle, data->tid_entry);
+
+       if (sample_type & PERF_SAMPLE_TIME)
+               perf_output_put(handle, data->time);
+
+       if (sample_type & PERF_SAMPLE_ADDR)
+               perf_output_put(handle, data->addr);
+
+       if (sample_type & PERF_SAMPLE_ID)
+               perf_output_put(handle, data->id);
+
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               perf_output_put(handle, data->stream_id);
+
+       if (sample_type & PERF_SAMPLE_CPU)
+               perf_output_put(handle, data->cpu_entry);
+
+       if (sample_type & PERF_SAMPLE_PERIOD)
+               perf_output_put(handle, data->period);
+
+       if (sample_type & PERF_SAMPLE_READ)
+               perf_output_read(handle, event);
+
+       if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+               if (data->callchain) {
+                       /* (1 + nr) u64 words are copied from the entry. */
+                       int size = 1 + data->callchain->nr;
+
+                       size *= sizeof(u64);
+
+                       perf_output_copy(handle, data->callchain, size);
+               } else {
+                       /* No callchain: emit a lone zero count. */
+                       u64 nr = 0;
+                       perf_output_put(handle, nr);
+               }
+       }
+
+       if (sample_type & PERF_SAMPLE_RAW) {
+               if (data->raw) {
+                       perf_output_put(handle, data->raw->size);
+                       perf_output_copy(handle, data->raw->data,
+                                        data->raw->size);
+               } else {
+                       /* No raw data: a u32 size followed by one zero u32. */
+                       struct {
+                               u32     size;
+                               u32     data;
+                       } raw = {
+                               .size = sizeof(u32),
+                               .data = 0,
+                       };
+                       perf_output_put(handle, raw);
+               }
+       }
+}
+
+/*
+ * Prepare a sample record for @event: initialise @header as
+ * PERF_RECORD_SAMPLE, fill in the @data fields that are computed here, and
+ * add the size of every field selected in attr.sample_type to header->size.
+ */
+void perf_prepare_sample(struct perf_event_header *header,
+                        struct perf_sample_data *data,
+                        struct perf_event *event,
+                        struct pt_regs *regs)
+{
+       u64 sample_type = event->attr.sample_type;
+
+       /* perf_output_sample() reads the field selection back from data->type. */
+       data->type = sample_type;
+
+       header->type = PERF_RECORD_SAMPLE;
+       header->size = sizeof(*header);
+
+       header->misc = 0;
+       header->misc |= perf_misc_flags(regs);
+
+       if (sample_type & PERF_SAMPLE_IP) {
+               data->ip = perf_instruction_pointer(regs);
+
+               header->size += sizeof(data->ip);
+       }
+
+       if (sample_type & PERF_SAMPLE_TID) {
+               /* namespace issues */
+               data->tid_entry.pid = perf_event_pid(event, current);
+               data->tid_entry.tid = perf_event_tid(event, current);
+
+               header->size += sizeof(data->tid_entry);
+       }
+
+       if (sample_type & PERF_SAMPLE_TIME) {
+               data->time = perf_clock();
+
+               header->size += sizeof(data->time);
+       }
+
+       /* data->addr is not written here -- presumably set by the caller. */
+       if (sample_type & PERF_SAMPLE_ADDR)
+               header->size += sizeof(data->addr);
+
+       if (sample_type & PERF_SAMPLE_ID) {
+               data->id = primary_event_id(event);
+
+               header->size += sizeof(data->id);
+       }
+
+       if (sample_type & PERF_SAMPLE_STREAM_ID) {
+               data->stream_id = event->id;
+
+               header->size += sizeof(data->stream_id);
+       }
+
+       if (sample_type & PERF_SAMPLE_CPU) {
+               data->cpu_entry.cpu             = raw_smp_processor_id();
+               data->cpu_entry.reserved        = 0;
+
+               header->size += sizeof(data->cpu_entry);
+       }
+
+       /* data->period is not written here -- presumably set by the caller. */
+       if (sample_type & PERF_SAMPLE_PERIOD)
+               header->size += sizeof(data->period);
+
+       if (sample_type & PERF_SAMPLE_READ)
+               header->size += perf_event_read_size(event);
+
+       if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+               /* One u64 is always emitted, plus nr more when there is a chain. */
+               int size = 1;
+
+               data->callchain = perf_callchain(regs);
+
+               if (data->callchain)
+                       size += data->callchain->nr;
+
+               header->size += size * sizeof(u64);
+       }
+
+       if (sample_type & PERF_SAMPLE_RAW) {
+               /* u32 size field, then the raw payload or one zero u32. */
+               int size = sizeof(u32);
+
+               if (data->raw)
+                       size += data->raw->size;
+               else
+                       size += sizeof(u32);
+
+               /* The total is expected to be a multiple of sizeof(u64). */
+               WARN_ON_ONCE(size & (sizeof(u64)-1));
+               header->size += size;
+       }
+}
+
+/*
+ * Build and write one sample record for @event. The sample is dropped
+ * silently when no buffer space can be reserved.
+ */
+static void perf_event_output(struct perf_event *event, int nmi,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       struct perf_event_header header;
+       struct perf_output_handle handle;
+
+       perf_prepare_sample(&header, data, event, regs);
+
+       if (perf_output_begin(&handle, event, header.size, nmi, 1) == 0) {
+               perf_output_sample(&handle, &header, data, event);
+               perf_output_end(&handle);
+       }
+}
+
+/*
+ * read event_id
+ */
+
+/*
+ * Fixed leading part of a PERF_RECORD_READ record; the read values are
+ * written after it by perf_event_read_event().
+ */
+struct perf_read_event {
+       struct perf_event_header        header;
+
+       u32                             pid;
+       u32                             tid;
+};
+
+/*
+ * Write a PERF_RECORD_READ record for @event, carrying the pid/tid of @task
+ * followed by the event's read values. Nothing is written when no buffer
+ * space can be reserved.
+ */
+static void
+perf_event_read_event(struct perf_event *event,
+                       struct task_struct *task)
+{
+       struct perf_output_handle handle;
+       struct perf_read_event read_event = {
+               .header = {
+                       .type = PERF_RECORD_READ,
+                       .misc = 0,
+                       /* Fixed part plus the variable-size read values. */
+                       .size = sizeof(read_event) + perf_event_read_size(event),
+               },
+               .pid = perf_event_pid(event, task),
+               .tid = perf_event_tid(event, task),
+       };
+       int ret;
+
+       /* Reserved with nmi = 0 and sample = 0. */
+       ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
+       if (ret)
+               return;
+
+       perf_output_put(&handle, read_event);
+       perf_output_read(&handle, event);
+
+       perf_output_end(&handle);
+}
+
+/*
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
+ */
+
+/*
+ * Working state for one fork/exit notification. Only the embedded event_id
+ * struct is copied to the output buffer; task and task_ctx are used to
+ * route and fill it.
+ */
+struct perf_task_event {
+       struct task_struct              *task;
+       /* Explicit context to deliver to; NULL means look it up from ->task. */
+       struct perf_event_context       *task_ctx;
+
+       struct {
+               struct perf_event_header        header;
+
+               u32                             pid;
+               u32                             ppid;
+               u32                             tid;
+               u32                             ptid;
+               u64                             time;
+       } event_id;
+};
+
+/*
+ * Write the fork/exit record described by @task_event to @event's buffer.
+ * The pid/tid fields are filled in per event: pid/tid come from
+ * task_event->task, ppid/ptid from current. Nothing is written when no
+ * buffer space can be reserved.
+ */
+static void perf_event_task_output(struct perf_event *event,
+                                    struct perf_task_event *task_event)
+{
+       struct task_struct *task = task_event->task;
+       struct perf_output_handle handle;
+       int size = task_event->event_id.header.size;
+
+       if (perf_output_begin(&handle, event, size, 0, 0))
+               return;
+
+       task_event->event_id.pid = perf_event_pid(event, task);
+       task_event->event_id.ppid = perf_event_pid(event, current);
+
+       task_event->event_id.tid = perf_event_tid(event, task);
+       task_event->event_id.ptid = perf_event_tid(event, current);
+
+       task_event->event_id.time = perf_clock();
+
+       perf_output_put(&handle, task_event->event_id);
+
+       perf_output_end(&handle);
+}
+
+/*
+ * Returns 1 when @event asked for any of comm, mmap or task tracking in its
+ * attr, 0 otherwise.
+ */
+static int perf_event_task_match(struct perf_event *event)
+{
+       return event->attr.comm || event->attr.mmap || event->attr.task;
+}
+
+/*
+ * Deliver @task_event to every event in @ctx that matches
+ * perf_event_task_match(). Does nothing unless the system is in
+ * SYSTEM_RUNNING state and the context has events.
+ */
+static void perf_event_task_ctx(struct perf_event_context *ctx,
+                                 struct perf_task_event *task_event)
+{
+       struct perf_event *event;
+
+       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+               return;
+
+       /* The event list is walked under the RCU read lock. */
+       rcu_read_lock();
+       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+               if (perf_event_task_match(event))
+                       perf_event_task_output(event, task_event);
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Deliver @task_event first to the current CPU's context and then to a
+ * task context: task_event->task_ctx when it is set, otherwise the
+ * perf_event_ctxp of task_event->task.
+ */
+static void perf_event_task_event(struct perf_task_event *task_event)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_event_context *ctx = task_event->task_ctx;
+
+       cpuctx = &get_cpu_var(perf_cpu_context);
+       perf_event_task_ctx(&cpuctx->ctx, task_event);
+       put_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+       /* No explicit context given: read the task's own under RCU. */
+       if (!ctx)
+               ctx = rcu_dereference(task_event->task->perf_event_ctxp);
+       if (ctx)
+               perf_event_task_ctx(ctx, task_event);
+       rcu_read_unlock();
+}
+
+/*
+ * Generate a fork (@new != 0) or exit (@new == 0) record for @task and
+ * deliver it. @task_ctx is handed through as the explicit task context and
+ * may be NULL. Returns early when the comm, mmap and task event counters
+ * are all zero.
+ */
+static void perf_event_task(struct task_struct *task,
+                             struct perf_event_context *task_ctx,
+                             int new)
+{
+       struct perf_task_event task_event;
+
+       if (!atomic_read(&nr_comm_events) &&
+           !atomic_read(&nr_mmap_events) &&
+           !atomic_read(&nr_task_events))
+               return;
+
+       /* pid/ppid/tid/ptid/time are filled in later, per receiving event. */
+       task_event = (struct perf_task_event){
+               .task     = task,
+               .task_ctx = task_ctx,
+               .event_id    = {
+                       .header = {
+                               .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
+                               .misc = 0,
+                               .size = sizeof(task_event.event_id),
+                       },
+                       /* .pid  */
+                       /* .ppid */
+                       /* .tid  */
+                       /* .ptid */
+               },
+       };
+
+       perf_event_task_event(&task_event);
+}
+
+/*
+ * Report @task as newly created: emits a PERF_RECORD_FORK record via
+ * perf_event_task() with no explicit task context.
+ */
+void perf_event_fork(struct task_struct *task)
+{
+       perf_event_task(task, NULL, 1);
+}
+
+/*
+ * comm tracking
+ */
+
+/*
+ * Scratch state for emitting one PERF_RECORD_COMM record.
+ *
+ * @task:      task whose name is reported
+ * @comm:      name buffer written after @event_id
+ * @comm_size: number of bytes of @comm written to the output
+ * @event_id:  fixed-size part of the record as written to the output
+ */
+struct perf_comm_event {
+       struct task_struct      *task;
+       char                    *comm;
+       int                     comm_size;
+
+       struct {
+               struct perf_event_header        header;
+
+               u32                             pid;
+               u32                             tid;
+       } event_id;
+};
+
+/*
+ * Write one comm record into @event's output: the fixed event_id part
+ * (with pid/tid resolved for @event) followed by the name buffer.
+ * Silently drops the record if output space cannot be reserved.
+ */
+static void perf_event_comm_output(struct perf_event *event,
+                                    struct perf_comm_event *comm_event)
+{
+       struct perf_output_handle handle;
+       struct task_struct *task = comm_event->task;
+
+       if (perf_output_begin(&handle, event,
+                             comm_event->event_id.header.size, 0, 0))
+               return;
+
+       comm_event->event_id.pid = perf_event_pid(event, task);
+       comm_event->event_id.tid = perf_event_tid(event, task);
+
+       perf_output_put(&handle, comm_event->event_id);
+       perf_output_copy(&handle, comm_event->comm, comm_event->comm_size);
+       perf_output_end(&handle);
+}
+
+/* Only events that set attr.comm receive comm records. */
+static int perf_event_comm_match(struct perf_event *event)
+{
+       return event->attr.comm ? 1 : 0;
+}
+
+/*
+ * Write the comm record to every matching event on @ctx's event list.
+ * Does nothing unless system_state is SYSTEM_RUNNING and the list is
+ * non-empty.
+ */
+static void perf_event_comm_ctx(struct perf_event_context *ctx,
+                                 struct perf_comm_event *comm_event)
+{
+       struct perf_event *pos;
+
+       if (system_state != SYSTEM_RUNNING)
+               return;
+
+       if (list_empty(&ctx->event_list))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &ctx->event_list, event_entry) {
+               if (!perf_event_comm_match(pos))
+                       continue;
+
+               perf_event_comm_output(pos, comm_event);
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Finish a comm record and deliver it.
+ *
+ * Snapshots the task name into a zero-filled on-stack buffer, sizes the
+ * record (name length including the NUL, rounded up to a multiple of
+ * sizeof(u64)) and sends it to this CPU's context and then to current's
+ * task context, if it has one.
+ */
+static void perf_event_comm_event(struct perf_comm_event *comm_event)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_event_context *ctx;
+       unsigned int size;
+       char comm[TASK_COMM_LEN];
+
+       /*
+        * Copy at most sizeof(comm) - 1 bytes into the zeroed buffer so the
+        * snapshot is always NUL-terminated; strncpy() with the full size
+        * would leave it unterminated for a maximal-length source and the
+        * strlen() below would then read past the array.
+        */
+       memset(comm, 0, sizeof(comm));
+       strncpy(comm, comm_event->task->comm, sizeof(comm) - 1);
+       size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+       comm_event->comm = comm;
+       comm_event->comm_size = size;
+
+       comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
+
+       cpuctx = &get_cpu_var(perf_cpu_context);
+       perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+       put_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * events ends up in.
+        */
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_event_comm_ctx(ctx, comm_event);
+       rcu_read_unlock();
+}
+
+/*
+ * Called with the task whose name changed.  Runs
+ * perf_event_enable_on_exec() when the task has a perf context, then
+ * emits a PERF_RECORD_COMM record if any comm events exist.
+ */
+void perf_event_comm(struct task_struct *task)
+{
+       struct perf_comm_event comm_event;
+
+       if (task->perf_event_ctxp)
+               perf_event_enable_on_exec(task);
+
+       if (!atomic_read(&nr_comm_events))
+               return;
+
+       /*
+        * Start from an all-zero record; comm, comm_size, header.size,
+        * pid and tid are filled in further down the call chain.
+        */
+       memset(&comm_event, 0, sizeof(comm_event));
+       comm_event.task                 = task;
+       comm_event.event_id.header.type = PERF_RECORD_COMM;
+
+       perf_event_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+/*
+ * Scratch state for emitting one PERF_RECORD_MMAP record.
+ *
+ * @vma:       mapping being reported
+ * @file_name: name string written after @event_id
+ * @file_size: number of bytes of @file_name written to the output
+ * @event_id:  fixed-size part of the record as written to the output
+ */
+struct perf_mmap_event {
+       struct vm_area_struct   *vma;
+
+       const char              *file_name;
+       int                     file_size;
+
+       struct {
+               struct perf_event_header        header;
+
+               u32                             pid;
+               u32                             tid;
+               u64                             start;
+               u64                             len;
+               u64                             pgoff;
+       } event_id;
+};
+
+/*
+ * Write one mmap record into @event's output: the fixed event_id part
+ * (with pid/tid of current resolved for @event) followed by the name.
+ * Silently drops the record if output space cannot be reserved.
+ */
+static void perf_event_mmap_output(struct perf_event *event,
+                                    struct perf_mmap_event *mmap_event)
+{
+       struct perf_output_handle handle;
+
+       if (perf_output_begin(&handle, event,
+                             mmap_event->event_id.header.size, 0, 0))
+               return;
+
+       mmap_event->event_id.pid = perf_event_pid(event, current);
+       mmap_event->event_id.tid = perf_event_tid(event, current);
+
+       perf_output_put(&handle, mmap_event->event_id);
+       perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size);
+       perf_output_end(&handle);
+}
+
+/*
+ * Only events that set attr.mmap receive mmap records; @mmap_event is
+ * currently not consulted.
+ */
+static int perf_event_mmap_match(struct perf_event *event,
+                                  struct perf_mmap_event *mmap_event)
+{
+       return event->attr.mmap ? 1 : 0;
+}
+
+/*
+ * Write the mmap record to every matching event on @ctx's event list.
+ * Does nothing unless system_state is SYSTEM_RUNNING and the list is
+ * non-empty.
+ */
+static void perf_event_mmap_ctx(struct perf_event_context *ctx,
+                                 struct perf_mmap_event *mmap_event)
+{
+       struct perf_event *pos;
+
+       if (system_state != SYSTEM_RUNNING)
+               return;
+
+       if (list_empty(&ctx->event_list))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &ctx->event_list, event_entry) {
+               if (!perf_event_mmap_match(pos, mmap_event))
+                       continue;
+
+               perf_event_mmap_output(pos, mmap_event);
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Resolve a name for the mapping, finish the mmap record and deliver it.
+ *
+ * File-backed mappings are named with d_path(); if the path buffer cannot
+ * be allocated or d_path() fails, the placeholders "//enomem" and
+ * "//toolong" are used.  Mappings without a file use arch_vma_name() if it
+ * returns a name, "[vdso]" when the vma has no mm, and "//anon" otherwise.
+ * The record is sent to this CPU's context and then to current's task
+ * context, if it has one.
+ */
+static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_event_context *ctx;
+       struct vm_area_struct *vma = mmap_event->vma;
+       struct file *file = vma->vm_file;
+       unsigned int size;
+       char tmp[16];
+       char *buf = NULL;
+       const char *name;
+
+       /*
+        * tmp starts out all-zero and every copy below is limited to
+        * sizeof(tmp) - 1 bytes, so the short name is always NUL-terminated
+        * no matter how long the source string is.  This keeps strlen() and
+        * the padded copy to the output inside the array.
+        */
+       memset(tmp, 0, sizeof(tmp));
+
+       if (file) {
+               /*
+                * d_path works from the end of the buffer backwards, so we
+                * need to add enough zero bytes after the string to handle
+                * the 64bit alignment we do later.
+                */
+               buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
+               if (!buf) {
+                       name = strncpy(tmp, "//enomem", sizeof(tmp) - 1);
+                       goto got_name;
+               }
+               name = d_path(&file->f_path, buf, PATH_MAX);
+               if (IS_ERR(name)) {
+                       name = strncpy(tmp, "//toolong", sizeof(tmp) - 1);
+                       goto got_name;
+               }
+       } else {
+               const char *arch_name = arch_vma_name(vma);
+
+               if (arch_name)
+                       name = strncpy(tmp, arch_name, sizeof(tmp) - 1);
+               else if (!vma->vm_mm)
+                       name = strncpy(tmp, "[vdso]", sizeof(tmp) - 1);
+               else
+                       name = strncpy(tmp, "//anon", sizeof(tmp) - 1);
+       }
+
+got_name:
+       /* name plus its NUL, rounded up to a multiple of sizeof(u64) */
+       size = ALIGN(strlen(name)+1, sizeof(u64));
+
+       mmap_event->file_name = name;
+       mmap_event->file_size = size;
+
+       mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+
+       cpuctx = &get_cpu_var(perf_cpu_context);
+       perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
+       put_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * events ends up in.
+        */
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_event_mmap_ctx(ctx, mmap_event);
+       rcu_read_unlock();
+
+       /* buf is NULL unless the d_path() buffer was allocated */
+       kfree(buf);
+}
+
+/*
+ * Emit a PERF_RECORD_MMAP record describing @vma, provided at least one
+ * mmap event exists.
+ */
+void __perf_event_mmap(struct vm_area_struct *vma)
+{
+       struct perf_mmap_event mmap_event;
+
+       if (!atomic_read(&nr_mmap_events))
+               return;
+
+       /*
+        * Start from an all-zero record; file_name, file_size, header.size,
+        * pid and tid are filled in further down the call chain.
+        */
+       memset(&mmap_event, 0, sizeof(mmap_event));
+       mmap_event.vma                  = vma;
+       mmap_event.event_id.header.type = PERF_RECORD_MMAP;
+       mmap_event.event_id.start       = vma->vm_start;
+       mmap_event.event_id.len         = vma->vm_end - vma->vm_start;
+       mmap_event.event_id.pgoff       = vma->vm_pgoff;
+
+       perf_event_mmap_event(&mmap_event);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+/*
+ * Write a PERF_RECORD_THROTTLE (@enable == 0) or PERF_RECORD_UNTHROTTLE
+ * (@enable != 0) record, stamped with perf_clock() and the event ids,
+ * into @event's output.
+ */
+static void perf_log_throttle(struct perf_event *event, int enable)
+{
+       struct perf_output_handle handle;
+
+       struct {
+               struct perf_event_header        header;
+               u64                             time;
+               u64                             id;
+               u64                             stream_id;
+       } rec = {
+               .header = {
+                       .type = enable ? PERF_RECORD_UNTHROTTLE
+                                      : PERF_RECORD_THROTTLE,
+                       .misc = 0,
+                       .size = sizeof(rec),
+               },
+               .time           = perf_clock(),
+               .id             = primary_event_id(event),
+               .stream_id      = event->id,
+       };
+
+       if (perf_output_begin(&handle, event, sizeof(rec), 1, 0))
+               return;
+
+       perf_output_put(&handle, rec);
+       perf_output_end(&handle);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+/*
+ * Common overflow path: account the interrupt, optionally throttle,
+ * re-tune the period for frequency-based events, enforce event_limit and
+ * finally emit the sample through perf_event_output().
+ *
+ * @throttle is only honoured when the event's pmu provides an unthrottle
+ * callback.  Returns 1 when the interrupt count exceeded the limit derived
+ * from sysctl_perf_event_sample_rate (or the event was already marked
+ * MAX_INTERRUPTS), or when event_limit dropped to zero; 0 otherwise.
+ */
+static int __perf_event_overflow(struct perf_event *event, int nmi,
+                                  int throttle, struct perf_sample_data *data,
+                                  struct pt_regs *regs)
+{
+       int events = atomic_read(&event->event_limit);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = 0;
+
+       throttle = (throttle && event->pmu->unthrottle != NULL);
+
+       if (!throttle) {
+               hwc->interrupts++;
+       } else {
+               if (hwc->interrupts != MAX_INTERRUPTS) {
+                       hwc->interrupts++;
+                       if (HZ * hwc->interrupts >
+                                       (u64)sysctl_perf_event_sample_rate) {
+                               /* mark as throttled and log it once */
+                               hwc->interrupts = MAX_INTERRUPTS;
+                               perf_log_throttle(event, 0);
+                               ret = 1;
+                       }
+               } else {
+                       /*
+                        * Keep re-disabling events even though on the previous
+                        * pass we disabled it - just in case we raced with a
+                        * sched-in and the event got enabled again:
+                        */
+                       ret = 1;
+               }
+       }
+
+       if (event->attr.freq) {
+               u64 now = perf_clock();
+               s64 delta = now - hwc->freq_stamp;
+
+               hwc->freq_stamp = now;
+
+               /* only adjust for intervals shorter than one tick */
+               if (delta > 0 && delta < TICK_NSEC)
+                       perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+       }
+
+       /*
+        * XXX event_limit might not quite work as expected on inherited
+        * events
+        */
+
+       event->pending_kill = POLL_IN;
+       if (events && atomic_dec_and_test(&event->event_limit)) {
+               ret = 1;
+               event->pending_kill = POLL_HUP;
+               if (nmi) {
+                       /* defer the disable through the pending queue */
+                       event->pending_disable = 1;
+                       perf_pending_queue(&event->pending,
+                                          perf_pending_event);
+               } else
+                       perf_event_disable(event);
+       }
+
+       perf_event_output(event, nmi, data, regs);
+       return ret;
+}
+
+/*
+ * Overflow entry point: forwards to __perf_event_overflow() with
+ * throttling requested and returns its result.
+ */
+int perf_event_overflow(struct perf_event *event, int nmi,
+                         struct perf_sample_data *data,
+                         struct pt_regs *regs)
+{
+       return __perf_event_overflow(event, nmi, 1, data, regs);
+}
+
+/*
+ * Generic software event infrastructure
+ */
+
+/*
+ * We directly increment event->count and keep a second value in
+ * event->hw.period_left to count intervals. This period event
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+/*
+ * Rewind hw.period_left by a whole number of periods and report how many.
+ *
+ * Uses the previous last_period as the step and resets last_period to
+ * sample_period.  Returns 0 without touching period_left when it is
+ * negative; otherwise subtracts nr * period (nr = (period + val) / period)
+ * with a cmpxchg retry loop and returns nr.
+ *
+ * NOTE(review): divides by hw.last_period; the callers visible here only
+ * reach this with a non-zero sample_period - confirm for any new caller.
+ */
+static u64 perf_swevent_set_period(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 period = hwc->last_period;
+       u64 nr, offset;
+       s64 old, val;
+
+       hwc->last_period = hwc->sample_period;
+
+again:
+       old = val = atomic64_read(&hwc->period_left);
+       if (val < 0)
+               return 0;
+
+       nr = div64_u64(period + val, period);
+       offset = nr * period;
+       val -= offset;
+       /* retry if period_left changed underneath us */
+       if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+               goto again;
+
+       return nr;
+}
+
+/*
+ * Handle a software event whose period_left became non-negative: record
+ * the period in @data, rewind period_left and run the generic overflow
+ * path once per elapsed period.  Throttling is requested from the second
+ * pass on.
+ */
+static void perf_swevent_overflow(struct perf_event *event,
+                                   int nmi, struct perf_sample_data *data,
+                                   struct pt_regs *regs)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 pending;
+       int throttle;
+
+       data->period = hwc->last_period;
+       pending = perf_swevent_set_period(event);
+
+       if (hwc->interrupts == MAX_INTERRUPTS)
+               return;
+
+       for (throttle = 0; pending; pending--, throttle = 1) {
+               /*
+                * We inhibit the overflow from happening when
+                * hwc->interrupts == MAX_INTERRUPTS.
+                */
+               if (__perf_event_overflow(event, nmi, throttle, data, regs))
+                       break;
+       }
+}
+
+/* pmu->unthrottle callback for generic software events; intentionally empty. */
+static void perf_swevent_unthrottle(struct perf_event *event)
+{
+       /*
+        * Nothing to do, we already reset hwc->interrupts.
+        */
+}
+
+/*
+ * Add @nr to the event count.  For sampling events with register state
+ * available, also advance period_left and run the overflow path once it
+ * is no longer negative.
+ */
+static void perf_swevent_add(struct perf_event *event, u64 nr,
+                              int nmi, struct perf_sample_data *data,
+                              struct pt_regs *regs)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       atomic64_add(nr, &event->count);
+
+       /* pure counting, or nothing to sample from */
+       if (!hwc->sample_period || !regs)
+               return;
+
+       if (atomic64_add_negative(nr, &hwc->period_left))
+               return;
+
+       perf_swevent_overflow(event, nmi, data, regs);
+}
+
+/*
+ * Decide whether a software event should currently accumulate counts.
+ *
+ *   ACTIVE                    -> counting
+ *   anything but INACTIVE     -> off/error, not counting
+ *   INACTIVE, context active  -> part of a group that didn't make it on
+ *                                the 'pmu', not counting
+ *   INACTIVE, context idle    -> task is scheduled out; still count
+ *                                events that happen to us, like
+ *                                migration events
+ */
+static int perf_swevent_is_counting(struct perf_event *event)
+{
+       if (event->state == PERF_EVENT_STATE_ACTIVE)
+               return 1;
+
+       if (event->state != PERF_EVENT_STATE_INACTIVE)
+               return 0;
+
+       return !event->ctx->is_active;
+}
+
+/*
+ * Return 1 if @event is counting, has the given @type and config
+ * @event_id, and (when @regs is available) does not exclude the
+ * privilege level @regs was taken in; 0 otherwise.
+ */
+static int perf_swevent_match(struct perf_event *event,
+                               enum perf_type_id type,
+                               u32 event_id, struct pt_regs *regs)
+{
+       if (!perf_swevent_is_counting(event))
+               return 0;
+
+       if (event->attr.type != type || event->attr.config != event_id)
+               return 0;
+
+       if (!regs)
+               return 1;
+
+       /* pick the exclude bit that applies to this privilege level */
+       if (user_mode(regs) ? event->attr.exclude_user
+                           : event->attr.exclude_kernel)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Feed a software event occurrence to every matching event on @ctx's
+ * event list.  Does nothing unless system_state is SYSTEM_RUNNING and the
+ * list is non-empty.
+ */
+static void perf_swevent_ctx_event(struct perf_event_context *ctx,
+                                    enum perf_type_id type,
+                                    u32 event_id, u64 nr, int nmi,
+                                    struct perf_sample_data *data,
+                                    struct pt_regs *regs)
+{
+       struct perf_event *pos;
+
+       if (system_state != SYSTEM_RUNNING)
+               return;
+
+       if (list_empty(&ctx->event_list))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &ctx->event_list, event_entry) {
+               if (!perf_swevent_match(pos, type, event_id, regs))
+                       continue;
+
+               perf_swevent_add(pos, nr, nmi, data, regs);
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Select the recursion counter for the current execution context:
+ * slot 3 for NMI, 2 for hard irq, 1 for softirq and 0 otherwise.
+ */
+static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+{
+       int idx = 0;
+
+       if (in_nmi())
+               idx = 3;
+       else if (in_irq())
+               idx = 2;
+       else if (in_softirq())
+               idx = 1;
+
+       return &cpuctx->recursion[idx];
+}
+
+/*
+ * Dispatch one software/tracepoint event occurrence to this CPU's context
+ * and to current's task context, if it has one.
+ *
+ * A per-context-level recursion counter (see
+ * perf_swevent_recursion_context()) guards the dispatch: if it is already
+ * non-zero the occurrence is dropped.
+ */
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+                                   u64 nr, int nmi,
+                                   struct perf_sample_data *data,
+                                   struct pt_regs *regs)
+{
+       struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+       int *recursion = perf_swevent_recursion_context(cpuctx);
+       struct perf_event_context *ctx;
+
+       if (*recursion)
+               goto out;
+
+       /* compiler barriers keep the counter updates around the dispatch */
+       (*recursion)++;
+       barrier();
+
+       perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
+                                nr, nmi, data, regs);
+       rcu_read_lock();
+       /*
+        * doesn't really matter which of the child contexts the
+        * events ends up in.
+        */
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
+       rcu_read_unlock();
+
+       barrier();
+       (*recursion)--;
+
+out:
+       put_cpu_var(perf_cpu_context);
+}
+
+/*
+ * Report @nr occurrences of software event @event_id.  @addr is stored in
+ * the sample data; the remaining sample fields start out zeroed by the
+ * designated initializer.
+ */
+void __perf_sw_event(u32 event_id, u64 nr, int nmi,
+                           struct pt_regs *regs, u64 addr)
+{
+       struct perf_sample_data data = {
+               .addr = addr,
+       };
+
+       do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
+                               &data, regs);
+}
+
+/* pmu->read callback for generic software events; intentionally empty. */
+static void perf_swevent_read(struct perf_event *event)
+{
+}
+
+/*
+ * pmu->enable callback for generic software events: for sampling events,
+ * restart the period bookkeeping.  Always returns 0.
+ */
+static int perf_swevent_enable(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!hwc->sample_period)
+               return 0;
+
+       hwc->last_period = hwc->sample_period;
+       perf_swevent_set_period(event);
+
+       return 0;
+}
+
+/* pmu->disable callback for generic software events; intentionally empty. */
+static void perf_swevent_disable(struct perf_event *event)
+{
+}
+
+/* pmu operations shared by generic software events and tracepoint events. */
+static const struct pmu perf_ops_generic = {
+       .enable         = perf_swevent_enable,
+       .disable        = perf_swevent_disable,
+       .read           = perf_swevent_read,
+       .unthrottle     = perf_swevent_unthrottle,
+};
+
+/*
+ * hrtimer based swevent callback
+ */
+
+/*
+ * hrtimer callback driving the clock-based software events.
+ *
+ * Refreshes the event count through pmu->read(), picks a register set to
+ * sample from and runs the overflow path.  The timer is forwarded by the
+ * sample period (at least 10000ns) and restarted unless the overflow path
+ * asked for the event to be stopped.
+ */
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       /*
+        * Zero the whole sample so .raw is NULL rather than stack garbage;
+        * the output path must not see an uninitialized raw pointer.
+        */
+       struct perf_sample_data data = {
+               .addr = 0,
+       };
+       struct pt_regs *regs;
+       struct perf_event *event;
+       u64 period;
+
+       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+       event->pmu->read(event);
+
+       /* report the period in effect, as perf_swevent_overflow() does */
+       data.period = event->hw.last_period;
+
+       regs = get_irq_regs();
+       /*
+        * In case we exclude kernel IPs or are somehow not in interrupt
+        * context, provide the next best thing, the user IP.
+        */
+       if ((event->attr.exclude_kernel || !regs) &&
+                       !event->attr.exclude_user)
+               regs = task_pt_regs(current);
+
+       if (regs) {
+               if (perf_event_overflow(event, 0, &data, regs))
+                       ret = HRTIMER_NORESTART;
+       }
+
+       period = max_t(u64, 10000, event->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+       return ret;
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+/*
+ * Fold the cpu_clock() time elapsed since the last update into the event
+ * count and remember the new timestamp in hw.prev_count.
+ */
+static void cpu_clock_perf_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 now = cpu_clock(raw_smp_processor_id());
+       s64 prev = atomic64_read(&hwc->prev_count);
+
+       atomic64_set(&hwc->prev_count, now);
+       atomic64_add(now - prev, &event->count);
+}
+
+/*
+ * pmu->enable for the cpu clock event: record the starting timestamp, set
+ * up the sampling hrtimer and, for sampling events, start it with a
+ * period of at least 10000ns.  Always returns 0.
+ */
+static int cpu_clock_perf_event_enable(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 period;
+
+       atomic64_set(&hwc->prev_count, cpu_clock(raw_smp_processor_id()));
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+
+       if (!hwc->sample_period)
+               return 0;
+
+       period = max_t(u64, 10000, hwc->sample_period);
+       __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0,
+                                HRTIMER_MODE_REL, 0);
+
+       return 0;
+}
+
+/*
+ * pmu->disable for the cpu clock event: cancel the sampling hrtimer (it is
+ * only started for sampling events) and fold in the final time delta.
+ */
+static void cpu_clock_perf_event_disable(struct perf_event *event)
+{
+       if (event->hw.sample_period)
+               hrtimer_cancel(&event->hw.hrtimer);
+       cpu_clock_perf_event_update(event);
+}
+
+/* pmu->read for the cpu clock event: bring event->count up to date. */
+static void cpu_clock_perf_event_read(struct perf_event *event)
+{
+       cpu_clock_perf_event_update(event);
+}
+
+/* pmu operations for the cpu clock software event (no unthrottle hook). */
+static const struct pmu perf_ops_cpu_clock = {
+       .enable         = cpu_clock_perf_event_enable,
+       .disable        = cpu_clock_perf_event_disable,
+       .read           = cpu_clock_perf_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+/*
+ * Atomically swap @now into hw.prev_count and add the time elapsed since
+ * the previous value to the event count.
+ */
+static void task_clock_perf_event_update(struct perf_event *event, u64 now)
+{
+       u64 last = atomic64_xchg(&event->hw.prev_count, now);
+       s64 elapsed = now - last;
+
+       atomic64_add(elapsed, &event->count);
+}
+
+/*
+ * pmu->enable for the task clock event: start counting from the context
+ * time, set up the sampling hrtimer and, for sampling events, start it
+ * with a period of at least 10000ns.  Always returns 0.
+ */
+static int task_clock_perf_event_enable(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 period;
+
+       atomic64_set(&hwc->prev_count, event->ctx->time);
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+
+       if (!hwc->sample_period)
+               return 0;
+
+       period = max_t(u64, 10000, hwc->sample_period);
+       __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0,
+                                HRTIMER_MODE_REL, 0);
+
+       return 0;
+}
+
+/*
+ * pmu->disable for the task clock event: cancel the sampling hrtimer (it
+ * is only started for sampling events) and fold in the time up to the
+ * context's current time.
+ */
+static void task_clock_perf_event_disable(struct perf_event *event)
+{
+       if (event->hw.sample_period)
+               hrtimer_cancel(&event->hw.hrtimer);
+       task_clock_perf_event_update(event, event->ctx->time);
+
+}
+
+/*
+ * pmu->read for the task clock event.  Outside NMI the context time is
+ * refreshed first; in NMI context the current time is instead derived
+ * from perf_clock() and the context's last timestamp without updating
+ * the context.
+ */
+static void task_clock_perf_event_read(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       u64 time;
+
+       if (in_nmi()) {
+               u64 now = perf_clock();
+               u64 delta = now - ctx->timestamp;
+
+               time = ctx->time + delta;
+       } else {
+               update_context_time(ctx);
+               time = ctx->time;
+       }
+
+       task_clock_perf_event_update(event, time);
+}
+
+/* pmu operations for the task clock software event (no unthrottle hook). */
+static const struct pmu perf_ops_task_clock = {
+       .enable         = task_clock_perf_event_enable,
+       .disable        = task_clock_perf_event_disable,
+       .read           = task_clock_perf_event_read,
+};
+
+#ifdef CONFIG_EVENT_PROFILE
+/*
+ * Report @count occurrences of tracepoint @event_id.  @record/@entry_size
+ * describe the raw tracepoint payload attached to the sample and @addr is
+ * stored as the sample address.  Registers come from get_irq_regs(),
+ * falling back to current's user registers when that returns NULL.
+ */
+void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
+                         int entry_size)
+{
+       struct perf_raw_record raw = {
+               .size = entry_size,
+               .data = record,
+       };
+
+       struct perf_sample_data data = {
+               .addr = addr,
+               .raw = &raw,
+       };
+
+       struct pt_regs *regs = get_irq_regs();
+
+       if (!regs)
+               regs = task_pt_regs(current);
+
+       /* dispatched with nmi == 1 */
+       do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+                               &data, regs);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+/*
+ * event->destroy hook for tracepoint events: undo the
+ * ftrace_profile_enable() done in tp_perf_event_init().
+ */
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+       ftrace_profile_disable(event->attr.config);
+}
+
+/*
+ * Set up a tracepoint event.
+ *
+ * Returns &perf_ops_generic on success, ERR_PTR(-EPERM) when raw samples
+ * are requested without the required privilege, or NULL when
+ * ftrace_profile_enable() fails for attr.config.
+ */
+static const struct pmu *tp_perf_event_init(struct perf_event *event)
+{
+       /*
+        * Raw tracepoint data is a severe data leak, only allow root to
+        * have these.
+        */
+       if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
+                       perf_paranoid_tracepoint_raw() &&
+                       !capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       if (ftrace_profile_enable(event->attr.config))
+               return NULL;
+
+       /* pair the enable above with a disable at destroy time */
+       event->destroy = tp_perf_event_destroy;
+
+       return &perf_ops_generic;
+}
+#else
+/* Without CONFIG_EVENT_PROFILE tracepoint events cannot be created. */
+static const struct pmu *tp_perf_event_init(struct perf_event *event)
+{
+       return NULL;
+}
+#endif
+
+/* Per software event id: number of live (non-inherited) events using it. */
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+/*
+ * event->destroy hook for generic software events: drop the usage count
+ * taken in sw_perf_event_init().  Only installed on events without a
+ * parent, hence the warning.
+ */
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+       u64 event_id = event->attr.config;
+
+       WARN_ON(event->parent);
+
+       atomic_dec(&perf_swevent_enabled[event_id]);
+}
+
+/*
+ * Pick the pmu operations for a PERF_TYPE_SOFTWARE event from its config.
+ * Returns NULL for an unknown config.
+ */
+static const struct pmu *sw_perf_event_init(struct perf_event *event)
+{
+       u64 event_id = event->attr.config;
+
+       /*
+        * Software events (currently) can't in general distinguish
+        * between user, kernel and hypervisor events.
+        * However, context switches and cpu migrations are considered
+        * to be kernel events, and page faults are never hypervisor
+        * events.
+        */
+       switch (event_id) {
+       case PERF_COUNT_SW_CPU_CLOCK:
+               return &perf_ops_cpu_clock;
+
+       case PERF_COUNT_SW_TASK_CLOCK:
+               /*
+                * If the user instantiates this as a per-cpu event,
+                * use the cpu_clock event instead.
+                */
+               if (event->ctx->task)
+                       return &perf_ops_task_clock;
+
+               return &perf_ops_cpu_clock;
+
+       case PERF_COUNT_SW_PAGE_FAULTS:
+       case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+       case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+       case PERF_COUNT_SW_CONTEXT_SWITCHES:
+       case PERF_COUNT_SW_CPU_MIGRATIONS:
+               /* only top-level events take a usage count */
+               if (!event->parent) {
+                       atomic_inc(&perf_swevent_enabled[event_id]);
+                       event->destroy = sw_perf_event_destroy;
+               }
+               return &perf_ops_generic;
+       }
+
+       return NULL;
+}
+
+/*
+ * Allocate and initialize an event structure.
+ *
+ * The new event is its own group leader when @group_leader is NULL.  The
+ * pmu is chosen from attr->type by the hw/sw/tracepoint init helpers.
+ *
+ * Returns the event, or an ERR_PTR(): -ENOMEM if allocation fails, -EINVAL
+ * if no pmu accepted the event (including inherited events that request
+ * PERF_FORMAT_GROUP), or the error encoded in the pmu pointer.
+ */
+static struct perf_event *
+perf_event_alloc(struct perf_event_attr *attr,
+                  int cpu,
+                  struct perf_event_context *ctx,
+                  struct perf_event *group_leader,
+                  struct perf_event *parent_event,
+                  gfp_t gfpflags)
+{
+       const struct pmu *pmu;
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       long err;
+
+       event = kzalloc(sizeof(*event), gfpflags);
+       if (!event)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Single events are their own group leaders, with an
+        * empty sibling list:
+        */
+       if (!group_leader)
+               group_leader = event;
+
+       mutex_init(&event->child_mutex);
+       INIT_LIST_HEAD(&event->child_list);
+
+       INIT_LIST_HEAD(&event->group_entry);
+       INIT_LIST_HEAD(&event->event_entry);
+       INIT_LIST_HEAD(&event->sibling_list);
+       init_waitqueue_head(&event->waitq);
+
+       mutex_init(&event->mmap_mutex);
+
+       event->cpu              = cpu;
+       event->attr             = *attr;
+       event->group_leader     = group_leader;
+       event->pmu              = NULL;
+       event->ctx              = ctx;
+       event->oncpu            = -1;
+
+       event->parent           = parent_event;
+
+       /* reference on the creator's pid namespace; dropped on error below */
+       event->ns               = get_pid_ns(current->nsproxy->pid_ns);
+       event->id               = atomic64_inc_return(&perf_event_id);
+
+       event->state            = PERF_EVENT_STATE_INACTIVE;
+
+       if (attr->disabled)
+               event->state = PERF_EVENT_STATE_OFF;
+
+       pmu = NULL;
+
+       hwc = &event->hw;
+       hwc->sample_period = attr->sample_period;
+       /* frequency mode starts from a period of 1 */
+       if (attr->freq && attr->sample_freq)
+               hwc->sample_period = 1;
+       hwc->last_period = hwc->sample_period;
+
+       atomic64_set(&hwc->period_left, hwc->sample_period);
+
+       /*
+        * we currently do not support PERF_FORMAT_GROUP on inherited events
+        */
+       if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+               goto done;
+
+       switch (attr->type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               pmu = hw_perf_event_init(event);
+               break;
+
+       case PERF_TYPE_SOFTWARE:
+               pmu = sw_perf_event_init(event);
+               break;
+
+       case PERF_TYPE_TRACEPOINT:
+               pmu = tp_perf_event_init(event);
+               break;
+
+       default:
+               break;
+       }
+done:
+       /* a NULL pmu means "not supported", an ERR_PTR carries its own code */
+       err = 0;
+       if (!pmu)
+               err = -EINVAL;
+       else if (IS_ERR(pmu))
+               err = PTR_ERR(pmu);
+
+       if (err) {
+               if (event->ns)
+                       put_pid_ns(event->ns);
+               kfree(event);
+               return ERR_PTR(err);
+       }
+
+       event->pmu = pmu;
+
+       /* global bookkeeping only counts top-level (non-inherited) events */
+       if (!event->parent) {
+               atomic_inc(&nr_events);
+               if (event->attr.mmap)
+                       atomic_inc(&nr_mmap_events);
+               if (event->attr.comm)
+                       atomic_inc(&nr_comm_events);
+               if (event->attr.task)
+                       atomic_inc(&nr_task_events);
+       }
+
+       return event;
+}
+
+/*
+ * Copy a perf_event_attr from user space into @attr, coping with
+ * user-space structures that are smaller or larger than the kernel's.
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed,
+ * -E2BIG if the size is unacceptable or a larger structure carries
+ * non-zero bytes beyond what the kernel knows (the kernel's size is then
+ * written back to uattr->size), and -EINVAL for an unknown type, non-zero
+ * reserved fields or unknown sample_type/read_format bits.
+ */
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+                         struct perf_event_attr *attr)
+{
+       u32 size;
+       int ret;
+
+       if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+               return -EFAULT;
+
+       /*
+        * zero the full structure, so that a short copy will be nice.
+        */
+       memset(attr, 0, sizeof(*attr));
+
+       ret = get_user(size, &uattr->size);
+       if (ret)
+               return ret;
+
+       if (size > PAGE_SIZE)   /* silly large */
+               goto err_size;
+
+       if (!size)              /* abi compat */
+               size = PERF_ATTR_SIZE_VER0;
+
+       if (size < PERF_ATTR_SIZE_VER0)
+               goto err_size;
+
+       /*
+        * If we're handed a bigger struct than we know of,
+        * ensure all the unknown bits are 0 - i.e. new
+        * user-space does not rely on any kernel feature
+        * extensions we dont know about yet.
+        */
+       if (size > sizeof(*attr)) {
+               unsigned char __user *addr;
+               unsigned char __user *end;
+               unsigned char val;
+
+               addr = (void __user *)uattr + sizeof(*attr);
+               end  = (void __user *)uattr + size;
+
+               for (; addr < end; addr++) {
+                       ret = get_user(val, addr);
+                       if (ret)
+                               return ret;
+                       if (val)
+                               goto err_size;
+               }
+               /* the tail is all zero: only copy the part we understand */
+               size = sizeof(*attr);
+       }
+
+       ret = copy_from_user(attr, uattr, size);
+       if (ret)
+               return -EFAULT;
+
+       /*
+        * If the type exists, the corresponding creation will verify
+        * the attr->config.
+        */
+       if (attr->type >= PERF_TYPE_MAX)
+               return -EINVAL;
+
+       if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+               return -EINVAL;
+
+       if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+               return -EINVAL;
+
+       if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+               return -EINVAL;
+
+out:
+       /* ret is 0 here on the success path */
+       return ret;
+
+err_size:
+       /* tell user space which size this kernel expects */
+       put_user(sizeof(*attr), &uattr->size);
+       ret = -E2BIG;
+       goto out;
+}
+
+/*
+ * Redirect the output of @event to the perf event behind @output_fd.
+ *
+ * An @output_fd of 0 clears the redirection (event->output becomes
+ * NULL). Otherwise @output_fd must refer to a perf event file whose
+ * event is not itself redirected, and @event must not have event->data
+ * set. A reference is taken on the new target's file; the reference
+ * held on a previous target's filp is dropped after an RCU grace period.
+ *
+ * Returns 0 on success, -EBADF if @output_fd is not an open file, or
+ * -EINVAL if one of the checks above fails.
+ */
+int perf_event_set_output(struct perf_event *event, int output_fd)
+{
+       struct perf_event *output_event = NULL;
+       struct file *output_file = NULL;
+       struct perf_event *old_output;
+       int fput_needed = 0;
+       int ret = -EINVAL;
+
+       /* fd 0 means "no output event": install NULL below */
+       if (!output_fd)
+               goto set;
+
+       output_file = fget_light(output_fd, &fput_needed);
+       if (!output_file)
+               return -EBADF;
+
+       /* the target must be a perf event file */
+       if (output_file->f_op != &perf_fops)
+               goto out;
+
+       output_event = output_file->private_data;
+
+       /* Don't chain output fds */
+       if (output_event->output)
+               goto out;
+
+       /* Don't set an output fd when we already have an output channel */
+       if (event->data)
+               goto out;
+
+       /* long-lived reference on the target, dropped when it is replaced */
+       atomic_long_inc(&output_file->f_count);
+
+set:
+       mutex_lock(&event->mmap_mutex);
+       old_output = event->output;
+       rcu_assign_pointer(event->output, output_event);
+       mutex_unlock(&event->mmap_mutex);
+
+       if (old_output) {
+               /*
+                * we need to make sure no existing perf_output_*()
+                * is still referencing this event.
+                */
+               synchronize_rcu();
+               fput(old_output->filp);
+       }
+
+       ret = 0;
+out:
+       /*
+        * On the "clear" path output_file is NULL and fput_needed is 0;
+        * presumably fput_light() does nothing then - TODO confirm.
+        */
+       fput_light(output_file, fput_needed);
+       return ret;
+}
+
+/**
+ * sys_perf_event_open - open a performance event, associate it to a task/cpu
+ *
+ * @attr_uptr: event_id type attributes for monitoring/sampling
+ * @pid:               target pid
+ * @cpu:               target cpu
+ * @group_fd:          group leader event fd
+ * @flags:             PERF_FLAG_FD_NO_GROUP and/or PERF_FLAG_FD_OUTPUT;
+ *                     any other bit is rejected with -EINVAL
+ *
+ * With PERF_FLAG_FD_OUTPUT, @group_fd is also handed to
+ * perf_event_set_output() as the output target of the new event.
+ *
+ * Returns the file descriptor of the new event on success, or a
+ * negative error code.
+ */
+SYSCALL_DEFINE5(perf_event_open,
+               struct perf_event_attr __user *, attr_uptr,
+               pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+       struct perf_event *event, *group_leader;
+       struct perf_event_attr attr;
+       struct perf_event_context *ctx;
+       struct file *event_file = NULL;
+       struct file *group_file = NULL;
+       int fput_needed = 0;
+       int fput_needed2 = 0;
+       int fd;
+       int err;
+
+       /* for future expandability... */
+       if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+               return -EINVAL;
+
+       err = perf_copy_attr(attr_uptr, &attr);
+       if (err)
+               return err;
+
+       /* counting kernel activity may need CAP_SYS_ADMIN */
+       if (!attr.exclude_kernel) {
+               if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+       }
+
+       /* in frequency mode the requested rate is capped by the sysctl */
+       if (attr.freq) {
+               if (attr.sample_freq > sysctl_perf_event_sample_rate)
+                       return -EINVAL;
+       }
+
+       /*
+        * Get the target context (task or percpu):
+        */
+       ctx = find_get_context(pid, cpu);
+       if (IS_ERR(ctx))
+               return PTR_ERR(ctx);
+
+       /*
+        * Look up the group leader (we will attach this event to it):
+        */
+       group_leader = NULL;
+       if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+               err = -EINVAL;
+               group_file = fget_light(group_fd, &fput_needed);
+               if (!group_file)
+                       goto err_put_context;
+               if (group_file->f_op != &perf_fops)
+                       goto err_put_context;
+
+               group_leader = group_file->private_data;
+               /*
+                * Do not allow a recursive hierarchy (this new sibling
+                * becoming part of another group-sibling):
+                */
+               if (group_leader->group_leader != group_leader)
+                       goto err_put_context;
+               /*
+                * Do not allow to attach to a group in a different
+                * task or CPU context:
+                */
+               if (group_leader->ctx != ctx)
+                       goto err_put_context;
+               /*
+                * Only a group leader can be exclusive or pinned
+                */
+               if (attr.exclusive || attr.pinned)
+                       goto err_put_context;
+       }
+
+       event = perf_event_alloc(&attr, cpu, ctx, group_leader,
+                                    NULL, GFP_KERNEL);
+       err = PTR_ERR(event);
+       if (IS_ERR(event))
+               goto err_put_context;
+
+       err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
+       if (err < 0)
+               goto err_free_put_context;
+       /* remember the descriptor; err is reused for other calls below */
+       fd = err;
+
+       /*
+        * NOTE(review): if this lookup fails, err still holds the
+        * (non-negative) fd, so nothing is released below and the fd is
+        * returned. Behaviour kept as-is - confirm whether intended.
+        */
+       event_file = fget_light(fd, &fput_needed2);
+       if (!event_file)
+               goto err_free_put_context;
+
+       if (flags & PERF_FLAG_FD_OUTPUT) {
+               /*
+                * NOTE(review): on failure the event is kfree()d below
+                * while the fd created above still refers to it - looks
+                * unsafe, but fixing it needs helpers not visible here.
+                */
+               err = perf_event_set_output(event, group_fd);
+               if (err)
+                       goto err_fput_free_put_context;
+               /*
+                * perf_event_set_output() returned 0 into err; restore
+                * the fd so the syscall returns the new descriptor
+                * rather than 0.
+                */
+               err = fd;
+       }
+
+       event->filp = event_file;
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_install_in_context(ctx, event, cpu);
+       ++ctx->generation;
+       mutex_unlock(&ctx->mutex);
+
+       /* record the creating task as owner and put the event on its list */
+       event->owner = current;
+       get_task_struct(current);
+       mutex_lock(&current->perf_event_mutex);
+       list_add_tail(&event->owner_entry, &current->perf_event_list);
+       mutex_unlock(&current->perf_event_mutex);
+
+       /*
+        * The success path falls through the labels below; with
+        * err >= 0 only the two fput_light() calls have any effect.
+        */
+err_fput_free_put_context:
+       fput_light(event_file, fput_needed2);
+
+err_free_put_context:
+       if (err < 0)
+               kfree(event);
+
+err_put_context:
+       if (err < 0)
+               put_ctx(ctx);
+
+       fput_light(group_file, fput_needed);
+
+       return err;
+}
+
+/*
+ * Inherit an event from parent task to child task.
+ *
+ * Allocates a new event in @child_ctx from the attributes of
+ * @parent_event (or of its own parent, if @parent_event is itself an
+ * inherited event), passing @group_leader through to
+ * perf_event_alloc(). The new event is added to @child_ctx, linked onto
+ * the original event's child_list, and pins the original event's filp.
+ * @parent, @parent_ctx and @child are not used in this function.
+ *
+ * Returns the new child event, or the ERR_PTR() value returned by
+ * perf_event_alloc().
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+             struct task_struct *parent,
+             struct perf_event_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_event *group_leader,
+             struct perf_event_context *child_ctx)
+{
+       struct perf_event *child_event;
+
+       /*
+        * Instead of creating recursive hierarchies of events,
+        * we link inherited events back to the original parent,
+        * which has a filp for sure, which we use as the reference
+        * count:
+        */
+       if (parent_event->parent)
+               parent_event = parent_event->parent;
+
+       child_event = perf_event_alloc(&parent_event->attr,
+                                          parent_event->cpu, child_ctx,
+                                          group_leader, parent_event,
+                                          GFP_KERNEL);
+       if (IS_ERR(child_event))
+               return child_event;
+       /* the new event holds a reference on the child context */
+       get_ctx(child_ctx);
+
+       /*
+        * Make the child state follow the state of the parent event,
+        * not its attr.disabled bit.  We hold the parent's mutex,
+        * so we won't race with perf_event_{en, dis}able_family.
+        */
+       if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+               child_event->state = PERF_EVENT_STATE_INACTIVE;
+       else
+               child_event->state = PERF_EVENT_STATE_OFF;
+
+       /* in frequency mode, start from the parent's current period */
+       if (parent_event->attr.freq)
+               child_event->hw.sample_period = parent_event->hw.sample_period;
+
+       /*
+        * Link it up in the child's context:
+        */
+       add_event_to_ctx(child_event, child_ctx);
+
+       /*
+        * Get a reference to the parent filp - we will fput it
+        * when the child event exits. This is safe to do because
+        * we are in the parent and we know that the filp still
+        * exists and has a nonzero count:
+        */
+       atomic_long_inc(&parent_event->filp->f_count);
+
+       /*
+        * Link this into the parent event's child list
+        */
+       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+       mutex_lock(&parent_event->child_mutex);
+       list_add_tail(&child_event->child_list, &parent_event->child_list);
+       mutex_unlock(&parent_event->child_mutex);
+
+       return child_event;
+}
+
+/*
+ * Inherit a whole event group into @child_ctx.
+ *
+ * The group leader @parent_event is inherited first, then every event
+ * on its sibling_list is inherited with the new child leader as its
+ * group leader.
+ *
+ * Returns 0 on success, or the error of the first inherit_event() call
+ * that fails. Events that were already inherited are not undone here.
+ */
+static int inherit_group(struct perf_event *parent_event,
+             struct task_struct *parent,
+             struct perf_event_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_event_context *child_ctx)
+{
+       struct perf_event *leader;
+       struct perf_event *sub;
+       struct perf_event *child_ctr;
+
+       /* NULL group leader: the inherited leader leads its own group */
+       leader = inherit_event(parent_event, parent, parent_ctx,
+                                child, NULL, child_ctx);
+       if (IS_ERR(leader))
+               return PTR_ERR(leader);
+       list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+               child_ctr = inherit_event(sub, parent, parent_ctx,
+                                           child, leader, child_ctx);
+               if (IS_ERR(child_ctr))
+                       return PTR_ERR(child_ctr);
+       }
+       return 0;
+}
+
+/*
+ * Fold the results of an inherited event back into its parent event.
+ *
+ * Adds the child's count and enabled/running times to the parent,
+ * unlinks the child from the parent's child_list and drops the filp
+ * reference on the parent. @child_event->parent is dereferenced
+ * unconditionally, so it must be non-NULL.
+ */
+static void sync_child_event(struct perf_event *child_event,
+                              struct task_struct *child)
+{
+       struct perf_event *parent_event = child_event->parent;
+       u64 child_val;
+
+       /* with inherit_stat, emit a read event for the child first */
+       if (child_event->attr.inherit_stat)
+               perf_event_read_event(child_event, child);
+
+       child_val = atomic64_read(&child_event->count);
+
+       /*
+        * Add back the child's count to the parent's count:
+        */
+       atomic64_add(child_val, &parent_event->count);
+       atomic64_add(child_event->total_time_enabled,
+                    &parent_event->child_total_time_enabled);
+       atomic64_add(child_event->total_time_running,
+                    &parent_event->child_total_time_running);
+
+       /*
+        * Remove this event from the parent's list
+        */
+       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+       mutex_lock(&parent_event->child_mutex);
+       list_del_init(&child_event->child_list);
+       mutex_unlock(&parent_event->child_mutex);
+
+       /*
+        * Release the parent event, if this was the last
+        * reference to it.
+        */
+       fput(parent_event->filp);
+}
+
+/*
+ * Detach one event from the context of an exiting task.
+ *
+ * The event's times are updated and it is removed from its context.
+ * If it is an inherited event (it has a parent), its values are synced
+ * back to the parent and the event is freed; otherwise it is only
+ * removed. @child_ctx is not used in this function.
+ */
+static void
+__perf_event_exit_task(struct perf_event *child_event,
+                        struct perf_event_context *child_ctx,
+                        struct task_struct *child)
+{
+       struct perf_event *parent_event;
+
+       update_event_times(child_event);
+       perf_event_remove_from_context(child_event);
+
+       parent_event = child_event->parent;
+       /*
+        * It can happen that parent exits first, and has events
+        * that are still around due to the child reference. These
+        * events need to be zapped - but otherwise linger.
+        */
+       if (parent_event) {
+               sync_child_event(child_event, child);
+               free_event(child_event);
+       }
+}
+
+/*
+ * When a child task exits, feed back event values to parent events.
+ *
+ * Schedules the task's events out, detaches the context from the task,
+ * reports the task exit via perf_event_task(), then removes every
+ * event from the context and drops the task's reference on it.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+       struct perf_event *child_event, *tmp;
+       struct perf_event_context *child_ctx;
+       unsigned long flags;
+
+       /* no context: only the task event is reported, with a NULL ctx */
+       if (likely(!child->perf_event_ctxp)) {
+               perf_event_task(child, NULL, 0);
+               return;
+       }
+
+       local_irq_save(flags);
+       /*
+        * We can't reschedule here because interrupts are disabled,
+        * and either child is current or it is a task that can't be
+        * scheduled, so we are now safe from rescheduling changing
+        * our context.
+        */
+       child_ctx = child->perf_event_ctxp;
+       __perf_event_task_sched_out(child_ctx);
+
+       /*
+        * Take the context lock here so that if find_get_context is
+        * reading child->perf_event_ctxp, we wait until it has
+        * incremented the context's refcount before we do put_ctx below.
+        */
+       spin_lock(&child_ctx->lock);
+       child->perf_event_ctxp = NULL;
+       /*
+        * If this context is a clone; unclone it so it can't get
+        * swapped to another process while we're removing all
+        * the events from it.
+        */
+       unclone_ctx(child_ctx);
+       /* also restores the interrupt state saved by local_irq_save() */
+       spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+       /*
+        * Report the task dead after unscheduling the events so that we
+        * won't get any samples after PERF_RECORD_EXIT. We can however still
+        * get a few PERF_RECORD_READ events.
+        */
+       perf_event_task(child, child_ctx, 0);
+
+       /*
+        * We can recurse on the same lock type through:
+        *
+        *   __perf_event_exit_task()
+        *     sync_child_event()
+        *       fput(parent_event->filp)
+        *         perf_release()
+        *           mutex_lock(&ctx->mutex)
+        *
+        * But since it's the parent context it won't be the same instance.
+        */
+       mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+
+again:
+       list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+                                group_entry)
+               __perf_event_exit_task(child_event, child_ctx, child);
+
+       /*
+        * If the last event was a group event, it will have appended all
+        * its siblings to the list, but we obtained 'tmp' before that which
+        * will still point to the list head terminating the iteration.
+        */
+       if (!list_empty(&child_ctx->group_list))
+               goto again;
+
+       mutex_unlock(&child_ctx->mutex);
+
+       put_ctx(child_ctx);
+}
+
+/*
+ * free an unexposed, unused context as created by inheritance by
+ * perf_event_init_task() below, used by fork() in case of fail.
+ *
+ * Every event on the context's group_list is unlinked from its
+ * parent's child_list, the filp reference it held on the parent is
+ * dropped, and the event is removed from the context and freed.
+ * Does nothing if @task has no context.
+ */
+void perf_event_free_task(struct task_struct *task)
+{
+       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_event *event, *tmp;
+
+       if (!ctx)
+               return;
+
+       mutex_lock(&ctx->mutex);
+again:
+       list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
+               struct perf_event *parent = event->parent;
+
+               /* only inherited events are expected here; skip others */
+               if (WARN_ON_ONCE(!parent))
+                       continue;
+
+               mutex_lock(&parent->child_mutex);
+               list_del_init(&event->child_list);
+               mutex_unlock(&parent->child_mutex);
+
+               /* drop the parent filp reference held by this event */
+               fput(parent->filp);
+
+               list_del_event(event, ctx);
+               free_event(event);
+       }
+
+       /* rescan until the group list is empty */
+       if (!list_empty(&ctx->group_list))
+               goto again;
+
+       mutex_unlock(&ctx->mutex);
+
+       put_ctx(ctx);
+}
+
+/*
+ * Initialize the perf_event context in task_struct
+ *
+ * Resets the perf fields of @child. If the parent (current) has a perf
+ * context, a context is allocated for @child and every group of the
+ * parent whose leader has attr.inherit set is inherited into it. If
+ * all groups were inherited, the child context is marked as a clone of
+ * the parent context (or of whatever the parent is a clone of).
+ *
+ * Returns 0 on success or when there is nothing to inherit, -ENOMEM if
+ * the child context cannot be allocated, or the error returned by
+ * inherit_group(). On an inherit_group() failure the partially filled
+ * child context is left attached to @child.
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+       struct perf_event_context *child_ctx, *parent_ctx;
+       struct perf_event_context *cloned_ctx;
+       struct perf_event *event;
+       struct task_struct *parent = current;
+       int inherited_all = 1;
+       int ret = 0;
+
+       child->perf_event_ctxp = NULL;
+
+       mutex_init(&child->perf_event_mutex);
+       INIT_LIST_HEAD(&child->perf_event_list);
+
+       if (likely(!parent->perf_event_ctxp))
+               return 0;
+
+       /*
+        * This is executed from the parent task context, so inherit
+        * events that have been marked for cloning.
+        * First allocate and initialize a context for the child.
+        */
+
+       child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+       if (!child_ctx)
+               return -ENOMEM;
+
+       __perf_event_init_context(child_ctx, child);
+       child->perf_event_ctxp = child_ctx;
+       get_task_struct(child);
+
+       /*
+        * If the parent's context is a clone, pin it so it won't get
+        * swapped under us.
+        */
+       parent_ctx = perf_pin_task_context(parent);
+
+       /*
+        * No need to check if parent_ctx != NULL here; since we saw
+        * it non-NULL earlier, the only reason for it to become NULL
+        * is if we exit, and since we're currently in the middle of
+        * a fork we can't be exiting at the same time.
+        */
+
+       /*
+        * Lock the parent list. No need to lock the child - not PID
+        * hashed yet and not running, so nobody can access it.
+        */
+       mutex_lock(&parent_ctx->mutex);
+
+       /*
+        * We don't have to disable NMIs - we are only looking at
+        * the list, not manipulating it:
+        */
+       list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) {
+               /* siblings are handled through their leader's group */
+               if (event != event->group_leader)
+                       continue;
+
+               if (!event->attr.inherit) {
+                       inherited_all = 0;
+                       continue;
+               }
+
+               ret = inherit_group(event, parent, parent_ctx,
+                                            child, child_ctx);
+               if (ret) {
+                       inherited_all = 0;
+                       break;
+               }
+       }
+
+       if (inherited_all) {
+               /*
+                * Mark the child context as a clone of the parent
+                * context, or of whatever the parent is a clone of.
+                * Note that if the parent is a clone, it could get
+                * uncloned at any point, but that doesn't matter
+                * because the list of events and the generation
+                * count can't have changed since we took the mutex.
+                */
+               cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+               if (cloned_ctx) {
+                       child_ctx->parent_ctx = cloned_ctx;
+                       child_ctx->parent_gen = parent_ctx->parent_gen;
+               } else {
+                       child_ctx->parent_ctx = parent_ctx;
+                       child_ctx->parent_gen = parent_ctx->generation;
+               }
+               /* the child context holds a reference on what it clones */
+               get_ctx(child_ctx->parent_ctx);
+       }
+
+       mutex_unlock(&parent_ctx->mutex);
+
+       perf_unpin_context(parent_ctx);
+
+       return ret;
+}
+
+/*
+ * Set up the per-CPU perf context of @cpu: initialize its context with
+ * no owning task, compute max_pertask from the global limits under
+ * perf_resource_lock, then call hw_perf_event_setup() for that CPU.
+ */
+static void __cpuinit perf_event_init_cpu(int cpu)
+{
+       struct perf_cpu_context *cpuctx;
+
+       cpuctx = &per_cpu(perf_cpu_context, cpu);
+       __perf_event_init_context(&cpuctx->ctx, NULL);
+
+       spin_lock(&perf_resource_lock);
+       cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
+       spin_unlock(&perf_resource_lock);
+
+       hw_perf_event_setup(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Remove every event on the group_list of the executing CPU's context.
+ * Invoked through smp_call_function_single() from perf_event_exit_cpu();
+ * @info is not used.
+ */
+static void __perf_event_exit_cpu(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_event_context *ctx = &cpuctx->ctx;
+       struct perf_event *event, *tmp;
+
+       list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+               __perf_event_remove_from_context(event);
+}
+/*
+ * Strip all events from the per-CPU context of @cpu by running
+ * __perf_event_exit_cpu() on that CPU while holding the context mutex.
+ */
+static void perf_event_exit_cpu(int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_event_context *ctx = &cpuctx->ctx;
+
+       mutex_lock(&ctx->mutex);
+       /* final argument 1: presumably "wait for completion" - confirm */
+       smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
+       mutex_unlock(&ctx->mutex);
+}
+#else
+/* without CPU hotplug there is nothing to tear down */
+static inline void perf_event_exit_cpu(int cpu) { }
+#endif
+
+/*
+ * CPU notifier callback for perf events.
+ *
+ * @hcpu carries the CPU number. UP_PREPARE initializes the CPU's perf
+ * context, ONLINE calls hw_perf_event_setup_online(), DOWN_PREPARE
+ * removes the CPU's events; the _FROZEN variants are handled the same
+ * way. All other actions are ignored. Always returns NOTIFY_OK.
+ */
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action) {
+
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               perf_event_init_cpu(cpu);
+               break;
+
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               hw_perf_event_setup_online(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               perf_event_exit_cpu(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+/*
+ * CPU notifier block that routes CPU notifications to perf_cpu_notify().
+ *
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+       .notifier_call          = perf_cpu_notify,
+       .priority               = 20,
+};
+
+/*
+ * Boot-time initialization of the perf events core.
+ *
+ * The UP_PREPARE and ONLINE steps are run by hand for the CPU this is
+ * executing on - presumably because that CPU came up before the
+ * notifier existed - and then the CPU notifier is registered.
+ */
+void __init perf_event_init(void)
+{
+       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+                       (void *)(long)smp_processor_id());
+       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+                       (void *)(long)smp_processor_id());
+       register_cpu_notifier(&perf_cpu_nb);
+}
+
+/*
+ * sysfs show handler: prints perf_reserved_percpu as a decimal number
+ * followed by a newline and returns the number of characters written.
+ */
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+       return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+/*
+ * sysfs store handler for reserve_percpu.
+ *
+ * Parses @buf as a base-10 number, rejects values above
+ * perf_max_events, stores the value in perf_reserved_percpu and
+ * recomputes max_pertask for every online CPU.
+ *
+ * Returns @count on success, the strict_strtoul() error if parsing
+ * fails, or -EINVAL if the value is too large.
+ */
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+                       const char *buf,
+                       size_t count)
+{
+       struct perf_cpu_context *cpuctx;
+       unsigned long val;
+       int err, cpu, mpt;
+
+       err = strict_strtoul(buf, 10, &val);
+       if (err)
+               return err;
+       if (val > perf_max_events)
+               return -EINVAL;
+
+       spin_lock(&perf_resource_lock);
+       perf_reserved_percpu = val;
+       for_each_online_cpu(cpu) {
+               cpuctx = &per_cpu(perf_cpu_context, cpu);
+               spin_lock_irq(&cpuctx->ctx.lock);
+               /*
+                * per-task budget: the smaller of what this CPU's
+                * context leaves free and what the reservation leaves
+                */
+               mpt = min(perf_max_events - cpuctx->ctx.nr_events,
+                         perf_max_events - perf_reserved_percpu);
+               cpuctx->max_pertask = mpt;
+               spin_unlock_irq(&cpuctx->ctx.lock);
+       }
+       spin_unlock(&perf_resource_lock);
+
+       return count;
+}
+
+/*
+ * sysfs show handler: prints perf_overcommit as a decimal number
+ * followed by a newline and returns the number of characters written.
+ */
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+       return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+/*
+ * sysfs store handler for overcommit.
+ *
+ * Parses @buf as a base-10 number and accepts only 0 or 1, which is
+ * stored in perf_overcommit under perf_resource_lock.
+ *
+ * Returns @count on success, the strict_strtoul() error if parsing
+ * fails, or -EINVAL for any value above 1.
+ */
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+       unsigned long val;
+       int err;
+
+       err = strict_strtoul(buf, 10, &val);
+       if (err)
+               return err;
+       if (val > 1)
+               return -EINVAL;
+
+       spin_lock(&perf_resource_lock);
+       perf_overcommit = val;
+       spin_unlock(&perf_resource_lock);
+
+       return count;
+}
+
+/* "reserve_percpu" attribute, mode 0644, backed by the handlers above */
+static SYSDEV_CLASS_ATTR(
+                               reserve_percpu,
+                               0644,
+                               perf_show_reserve_percpu,
+                               perf_set_reserve_percpu
+                       );
+
+/* "overcommit" attribute, mode 0644, backed by the handlers above */
+static SYSDEV_CLASS_ATTR(
+                               overcommit,
+                               0644,
+                               perf_show_overcommit,
+                               perf_set_overcommit
+                       );
+
+/* NULL-terminated list of the attributes in the group below */
+static struct attribute *perfclass_attrs[] = {
+       &attr_reserve_percpu.attr,
+       &attr_overcommit.attr,
+       NULL
+};
+
+/* attribute group named "perf_events" holding both attributes */
+static struct attribute_group perfclass_attr_group = {
+       .attrs                  = perfclass_attrs,
+       .name                   = "perf_events",
+};
+
+/*
+ * Create the "perf_events" attribute group on the cpu sysdev class
+ * kobject. Registered as a device initcall; returns the result of
+ * sysfs_create_group().
+ */
+static int __init perf_event_sysfs_init(void)
+{
+       return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+                                 &perfclass_attr_group);
+}
+device_initcall(perf_event_sysfs_init);
diff --git a/kernel/sched.c b/kernel/sched.c

index faf4d46..291c8d2 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
  #include <linux/completion.h>
  #include <linux/kernel_stat.h>
  #include <linux/debug_locks.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/security.h>
  #include <linux/notifier.h>
  #include <linux/profile.h>
@@ -2059,7 +2059,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
  #endif
-               perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
                                      1, 1, NULL, 0);
         }
         p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2724,7 +2724,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
-       perf_counter_task_sched_in(current, cpu_of(rq));
+       perf_event_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
  
         fire_sched_in_preempt_notifiers(current);
@@ -5199,7 +5199,7 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         spin_unlock(&rq->lock);
  
-       perf_counter_task_tick(curr, cpu);
+       perf_event_task_tick(curr, cpu);
  
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@ -5415,7 +5415,7 @@ need_resched_nonpreemptible:
  
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
-               perf_counter_task_sched_out(prev, next, cpu);
+               perf_event_task_sched_out(prev, next, cpu);
  
                 rq->nr_switches++;
                 rq->curr = next;
@@ -7692,7 +7692,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  /*
   * Register at high priority so that task migration (migrate_all_tasks)
   * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_counter subsystem, though.
+ * the notifier in the perf_event subsystem, though.
   */
  static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
@@ -9549,7 +9549,7 @@ void __init sched_init(void)
         alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
  #endif /* SMP */
  
-       perf_counter_init();
+       perf_event_init();
  
         scheduler_running = 1;
  }
diff --git a/kernel/sys.c b/kernel/sys.c

index b3f1097..ea5c3bc 100644 (file)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
  #include <linux/prctl.h>
  #include <linux/highuid.h>
  #include <linux/fs.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/resource.h>
  #include <linux/kernel.h>
  #include <linux/kexec.h>
@@ -1511,11 +1511,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                 case PR_SET_TSC:
                         error = SET_TSC_CTL(arg2);
                         break;
-               case PR_TASK_PERF_COUNTERS_DISABLE:
-                       error = perf_counter_task_disable();
+               case PR_TASK_PERF_EVENTS_DISABLE:
+                       error = perf_event_task_disable();
                         break;
-               case PR_TASK_PERF_COUNTERS_ENABLE:
-                       error = perf_counter_task_enable();
+               case PR_TASK_PERF_EVENTS_ENABLE:
+                       error = perf_event_task_enable();
                         break;
                 case PR_GET_TIMERSLACK:
                         error = current->timer_slack_ns;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c

index 68320f6..515bc23 100644 (file)
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -177,4 +177,4 @@ cond_syscall(sys_eventfd);
  cond_syscall(sys_eventfd2);
  
  /* performance counters: */
-cond_syscall(sys_perf_counter_open);
+cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 1a631ba..6ba49c7 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,7 +50,7 @@
  #include <linux/reboot.h>
  #include <linux/ftrace.h>
  #include <linux/slow-work.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  
  #include <asm/uaccess.h>
  #include <asm/processor.h>
@@ -964,28 +964,28 @@ static struct ctl_table kern_table[] = {
                 .child          = slow_work_sysctls,
         },
  #endif
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
         {
                 .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "perf_counter_paranoid",
-               .data           = &sysctl_perf_counter_paranoid,
-               .maxlen         = sizeof(sysctl_perf_counter_paranoid),
+               .procname       = "perf_event_paranoid",
+               .data           = &sysctl_perf_event_paranoid,
+               .maxlen         = sizeof(sysctl_perf_event_paranoid),
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
         {
                 .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "perf_counter_mlock_kb",
-               .data           = &sysctl_perf_counter_mlock,
-               .maxlen         = sizeof(sysctl_perf_counter_mlock),
+               .procname       = "perf_event_mlock_kb",
+               .data           = &sysctl_perf_event_mlock,
+               .maxlen         = sizeof(sysctl_perf_event_mlock),
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
         {
                 .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "perf_counter_max_sample_rate",
-               .data           = &sysctl_perf_counter_sample_rate,
-               .maxlen         = sizeof(sysctl_perf_counter_sample_rate),
+               .procname       = "perf_event_max_sample_rate",
+               .data           = &sysctl_perf_event_sample_rate,
+               .maxlen         = sizeof(sysctl_perf_event_sample_rate),
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
diff --git a/kernel/timer.c b/kernel/timer.c

index bbb5107..811e5c3 100644 (file)
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
  #include <linux/delay.h>
  #include <linux/tick.h>
  #include <linux/kallsyms.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <linux/sched.h>
  
  #include <asm/uaccess.h>
@@ -1187,7 +1187,7 @@ static void run_timer_softirq(struct softirq_action *h)
  {
         struct tvec_base *base = __get_cpu_var(tvec_bases);
  
-       perf_counter_do_pending();
+       perf_event_do_pending();
  
         hrtimer_run_pending();
  
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c

index 8712ce3..233f348 100644 (file)
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,7 @@
  #include <trace/events/syscalls.h>
  #include <linux/kernel.h>
  #include <linux/ftrace.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <asm/syscall.h>
  
  #include "trace_output.h"
@@ -414,7 +414,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
                 rec->nr = syscall_nr;
                 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                                        (unsigned long *)&rec->args);
-               perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+               perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
         } while(0);
  }
  
@@ -476,7 +476,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
         rec.nr = syscall_nr;
         rec.ret = syscall_get_return_value(current, regs);
  
-       perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+       perf_tp_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
  }
  
  int reg_prof_syscall_exit(char *name)
diff --git a/mm/mmap.c b/mm/mmap.c

index 26892e3..376492e 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,7 +28,7 @@
  #include <linux/mempolicy.h>
  #include <linux/rmap.h>
  #include <linux/mmu_notifier.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  
  #include <asm/uaccess.h>
  #include <asm/cacheflush.h>
@@ -1220,7 +1220,7 @@ munmap_back:
         if (correct_wcount)
                 atomic_inc(&inode->i_writecount);
  out:
-       perf_counter_mmap(vma);
+       perf_event_mmap(vma);
  
         mm->total_vm += len >> PAGE_SHIFT;
         vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2308,7 +2308,7 @@ int install_special_mapping(struct mm_struct *mm,
  
         mm->total_vm += len >> PAGE_SHIFT;
  
-       perf_counter_mmap(vma);
+       perf_event_mmap(vma);
  
         return 0;
  }
diff --git a/mm/mprotect.c b/mm/mprotect.c

index d80311b..8bc969d 100644 (file)
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,7 @@
  #include <linux/swapops.h>
  #include <linux/mmu_notifier.h>
  #include <linux/migrate.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
  #include <asm/uaccess.h>
  #include <asm/pgtable.h>
  #include <asm/cacheflush.h>
@@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
                 if (error)
                         goto out;
-               perf_counter_mmap(vma);
+               perf_event_mmap(vma);
                 nstart = tmp;
  
                 if (nstart < prev->vm_end)
diff --git a/tools/perf/Makefile b/tools/perf/Makefile

index 0aba8b6..b5f1953 100644 (file)
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -318,7 +318,7 @@ export PERL_PATH
  
  LIB_FILE=libperf.a
  
-LIB_H += ../../include/linux/perf_counter.h
+LIB_H += ../../include/linux/perf_event.h
  LIB_H += ../../include/linux/rbtree.h
  LIB_H += ../../include/linux/list.h
  LIB_H += util/include/linux/list.h
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c

index 043d85b..1ec7416 100644 (file)
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -505,7 +505,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                 return -1;
         }
  
-       if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+       if (event->header.misc & PERF_RECORD_MISC_KERNEL) {
                 show = SHOW_KERNEL;
                 level = 'k';
  
@@ -513,7 +513,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
  
                 dump_printf(" ...... dso: %s\n", dso->name);
  
-       } else if (event->header.misc & PERF_EVENT_MISC_USER) {
+       } else if (event->header.misc & PERF_RECORD_MISC_USER) {
  
                 show = SHOW_USER;
                 level = '.';
@@ -565,7 +565,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
  
         thread = threads__findnew(event->mmap.pid, &threads, &last_match);
  
-       dump_printf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+       dump_printf("%p [%p]: PERF_RECORD_MMAP %d: [%p(%p) @ %p]: %s\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->mmap.pid,
@@ -575,7 +575,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
                 event->mmap.filename);
  
         if (thread == NULL || map == NULL) {
-               dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+               dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
                 return 0;
         }
  
@@ -591,14 +591,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
         struct thread *thread;
  
         thread = threads__findnew(event->comm.pid, &threads, &last_match);
-       dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+       dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->comm.comm, event->comm.pid);
  
         if (thread == NULL ||
             thread__set_comm(thread, event->comm.comm)) {
-               dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+               dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
                 return -1;
         }
         total_comm++;
@@ -614,7 +614,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
  
         thread = threads__findnew(event->fork.pid, &threads, &last_match);
         parent = threads__findnew(event->fork.ppid, &threads, &last_match);
-       dump_printf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+       dump_printf("%p [%p]: PERF_RECORD_FORK: %d:%d\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->fork.pid, event->fork.ppid);
@@ -627,7 +627,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
                 return 0;
  
         if (!thread || !parent || thread__fork(thread, parent)) {
-               dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n");
+               dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
                 return -1;
         }
         total_fork++;
@@ -639,23 +639,23 @@ static int
  process_event(event_t *event, unsigned long offset, unsigned long head)
  {
         switch (event->header.type) {
-       case PERF_EVENT_SAMPLE:
+       case PERF_RECORD_SAMPLE:
                 return process_sample_event(event, offset, head);
  
-       case PERF_EVENT_MMAP:
+       case PERF_RECORD_MMAP:
                 return process_mmap_event(event, offset, head);
  
-       case PERF_EVENT_COMM:
+       case PERF_RECORD_COMM:
                 return process_comm_event(event, offset, head);
  
-       case PERF_EVENT_FORK:
+       case PERF_RECORD_FORK:
                 return process_fork_event(event, offset, head);
         /*
          * We dont process them right now but they are fine:
          */
  
-       case PERF_EVENT_THROTTLE:
-       case PERF_EVENT_UNTHROTTLE:
+       case PERF_RECORD_THROTTLE:
+       case PERF_RECORD_UNTHROTTLE:
                 return 0;
  
         default:
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c

index 2459e5a..a5a050a 100644 (file)
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -77,7 +77,7 @@ static struct mmap_data               mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
  
  static unsigned long mmap_read_head(struct mmap_data *md)
  {
-       struct perf_counter_mmap_page *pc = md->base;
+       struct perf_event_mmap_page *pc = md->base;
         long head;
  
         head = pc->data_head;
@@ -88,7 +88,7 @@ static unsigned long mmap_read_head(struct mmap_data *md)
  
  static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
  {
-       struct perf_counter_mmap_page *pc = md->base;
+       struct perf_event_mmap_page *pc = md->base;
  
         /*
          * ensure all reads are done before we write the tail out.
@@ -233,7 +233,7 @@ static pid_t pid_synthesize_comm_event(pid_t pid, int full)
                 }
         }
  
-       comm_ev.header.type = PERF_EVENT_COMM;
+       comm_ev.header.type = PERF_RECORD_COMM;
         size = ALIGN(size, sizeof(u64));
         comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
  
@@ -288,7 +288,7 @@ static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid)
         while (1) {
                 char bf[BUFSIZ], *pbf = bf;
                 struct mmap_event mmap_ev = {
-                       .header = { .type = PERF_EVENT_MMAP },
+                       .header = { .type = PERF_RECORD_MMAP },
                 };
                 int n;
                 size_t size;
@@ -355,7 +355,7 @@ static void synthesize_all(void)
  
  static int group_fd;
  
-static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr)
+static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
  {
         struct perf_header_attr *h_attr;
  
@@ -371,7 +371,7 @@ static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int
  
  static void create_counter(int counter, int cpu, pid_t pid)
  {
-       struct perf_counter_attr *attr = attrs + counter;
+       struct perf_event_attr *attr = attrs + counter;
         struct perf_header_attr *h_attr;
         int track = !counter; /* only the first counter needs these */
         struct {
@@ -417,7 +417,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
         attr->disabled          = 1;
  
  try_again:
-       fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
+       fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0);
  
         if (fd[nr_cpu][counter] < 0) {
                 int err = errno;
@@ -444,7 +444,7 @@ try_again:
                 printf("\n");
                 error("perfcounter syscall returned with %d (%s)\n",
                         fd[nr_cpu][counter], strerror(err));
-               die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+               die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
                 exit(-1);
         }
  
@@ -478,7 +478,7 @@ try_again:
         if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
                 int ret;
  
-               ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd);
+               ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd);
                 assert(ret != -1);
         } else {
                 event_array[nr_poll].fd = fd[nr_cpu][counter];
@@ -496,7 +496,7 @@ try_again:
                 }
         }
  
-       ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
+       ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE);
  }
  
  static void open_counters(int cpu, pid_t pid)
@@ -642,7 +642,7 @@ static int __cmd_record(int argc, const char **argv)
                 if (done) {
                         for (i = 0; i < nr_cpu; i++) {
                                 for (counter = 0; counter < nr_counters; counter++)
-                                       ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE);
+                                       ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE);
                         }
                 }
         }
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c

index cdf9a8d..19669c2 100644 (file)
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1121,7 +1121,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                 more_data += sizeof(u64);
         }
  
-       dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+       dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->header.misc,
@@ -1158,9 +1158,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
         if (comm_list && !strlist__has_entry(comm_list, thread->comm))
                 return 0;
  
-       cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+       cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
  
-       if (cpumode == PERF_EVENT_MISC_KERNEL) {
+       if (cpumode == PERF_RECORD_MISC_KERNEL) {
                 show = SHOW_KERNEL;
                 level = 'k';
  
@@ -1168,7 +1168,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
  
                 dump_printf(" ...... dso: %s\n", dso->name);
  
-       } else if (cpumode == PERF_EVENT_MISC_USER) {
+       } else if (cpumode == PERF_RECORD_MISC_USER) {
  
                 show = SHOW_USER;
                 level = '.';
@@ -1210,7 +1210,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
  
         thread = threads__findnew(event->mmap.pid, &threads, &last_match);
  
-       dump_printf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
+       dump_printf("%p [%p]: PERF_RECORD_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->mmap.pid,
@@ -1221,7 +1221,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
                 event->mmap.filename);
  
         if (thread == NULL || map == NULL) {
-               dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+               dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
                 return 0;
         }
  
@@ -1238,14 +1238,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
  
         thread = threads__findnew(event->comm.pid, &threads, &last_match);
  
-       dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+       dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->comm.comm, event->comm.pid);
  
         if (thread == NULL ||
             thread__set_comm_adjust(thread, event->comm.comm)) {
-               dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+               dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
                 return -1;
         }
         total_comm++;
@@ -1262,10 +1262,10 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
         thread = threads__findnew(event->fork.pid, &threads, &last_match);
         parent = threads__findnew(event->fork.ppid, &threads, &last_match);
  
-       dump_printf("%p [%p]: PERF_EVENT_%s: (%d:%d):(%d:%d)\n",
+       dump_printf("%p [%p]: PERF_RECORD_%s: (%d:%d):(%d:%d)\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
-               event->header.type == PERF_EVENT_FORK ? "FORK" : "EXIT",
+               event->header.type == PERF_RECORD_FORK ? "FORK" : "EXIT",
                 event->fork.pid, event->fork.tid,
                 event->fork.ppid, event->fork.ptid);
  
@@ -1276,11 +1276,11 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
         if (thread == parent)
                 return 0;
  
-       if (event->header.type == PERF_EVENT_EXIT)
+       if (event->header.type == PERF_RECORD_EXIT)
                 return 0;
  
         if (!thread || !parent || thread__fork(thread, parent)) {
-               dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n");
+               dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
                 return -1;
         }
         total_fork++;
@@ -1291,7 +1291,7 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
  static int
  process_lost_event(event_t *event, unsigned long offset, unsigned long head)
  {
-       dump_printf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n",
+       dump_printf("%p [%p]: PERF_RECORD_LOST: id:%Ld: lost:%Ld\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->lost.id,
@@ -1305,7 +1305,7 @@ process_lost_event(event_t *event, unsigned long offset, unsigned long head)
  static int
  process_read_event(event_t *event, unsigned long offset, unsigned long head)
  {
-       struct perf_counter_attr *attr;
+       struct perf_event_attr *attr;
  
         attr = perf_header__find_attr(event->read.id, header);
  
@@ -1319,7 +1319,7 @@ process_read_event(event_t *event, unsigned long offset, unsigned long head)
                                            event->read.value);
         }
  
-       dump_printf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n",
+       dump_printf("%p [%p]: PERF_RECORD_READ: %d %d %s %Lu\n",
                         (void *)(offset + head),
                         (void *)(long)(event->header.size),
                         event->read.pid,
@@ -1337,31 +1337,31 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
         trace_event(event);
  
         switch (event->header.type) {
-       case PERF_EVENT_SAMPLE:
+       case PERF_RECORD_SAMPLE:
                 return process_sample_event(event, offset, head);
  
-       case PERF_EVENT_MMAP:
+       case PERF_RECORD_MMAP:
                 return process_mmap_event(event, offset, head);
  
-       case PERF_EVENT_COMM:
+       case PERF_RECORD_COMM:
                 return process_comm_event(event, offset, head);
  
-       case PERF_EVENT_FORK:
-       case PERF_EVENT_EXIT:
+       case PERF_RECORD_FORK:
+       case PERF_RECORD_EXIT:
                 return process_task_event(event, offset, head);
  
-       case PERF_EVENT_LOST:
+       case PERF_RECORD_LOST:
                 return process_lost_event(event, offset, head);
  
-       case PERF_EVENT_READ:
+       case PERF_RECORD_READ:
                 return process_read_event(event, offset, head);
  
         /*
          * We dont process them right now but they are fine:
          */
  
-       case PERF_EVENT_THROTTLE:
-       case PERF_EVENT_UNTHROTTLE:
+       case PERF_RECORD_THROTTLE:
+       case PERF_RECORD_UNTHROTTLE:
                 return 0;
  
         default:
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c

index 275d79c..ea9c15c 100644 (file)
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1573,7 +1573,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                 more_data += sizeof(u64);
         }
  
-       dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+       dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->header.misc,
@@ -1589,9 +1589,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                 return -1;
         }
  
-       cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+       cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
  
-       if (cpumode == PERF_EVENT_MISC_KERNEL) {
+       if (cpumode == PERF_RECORD_MISC_KERNEL) {
                 show = SHOW_KERNEL;
                 level = 'k';
  
@@ -1599,7 +1599,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
  
                 dump_printf(" ...... dso: %s\n", dso->name);
  
-       } else if (cpumode == PERF_EVENT_MISC_USER) {
+       } else if (cpumode == PERF_RECORD_MISC_USER) {
  
                 show = SHOW_USER;
                 level = '.';
@@ -1626,23 +1626,23 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
  
         nr_events++;
         switch (event->header.type) {
-       case PERF_EVENT_MMAP:
+       case PERF_RECORD_MMAP:
                 return 0;
-       case PERF_EVENT_LOST:
+       case PERF_RECORD_LOST:
                 nr_lost_chunks++;
                 nr_lost_events += event->lost.lost;
                 return 0;
  
-       case PERF_EVENT_COMM:
+       case PERF_RECORD_COMM:
                 return process_comm_event(event, offset, head);
  
-       case PERF_EVENT_EXIT ... PERF_EVENT_READ:
+       case PERF_RECORD_EXIT ... PERF_RECORD_READ:
                 return 0;
  
-       case PERF_EVENT_SAMPLE:
+       case PERF_RECORD_SAMPLE:
                 return process_sample_event(event, offset, head);
  
-       case PERF_EVENT_MAX:
+       case PERF_RECORD_MAX:
         default:
                 return -1;
         }
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c

index 61b8282..16af2d8 100644 (file)
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -48,7 +48,7 @@
  #include <sys/prctl.h>
  #include <math.h>
  
-static struct perf_counter_attr default_attrs[] = {
+static struct perf_event_attr default_attrs[] = {
  
    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK     },
    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
@@ -130,11 +130,11 @@ struct stats                      runtime_cycles_stats;
          attrs[counter].config == PERF_COUNT_##c)
  
  #define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n"
+"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
  
  static void create_perf_stat_counter(int counter, int pid)
  {
-       struct perf_counter_attr *attr = attrs + counter;
+       struct perf_event_attr *attr = attrs + counter;
  
         if (scale)
                 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -144,7 +144,7 @@ static void create_perf_stat_counter(int counter, int pid)
                 unsigned int cpu;
  
                 for (cpu = 0; cpu < nr_cpus; cpu++) {
-                       fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
+                       fd[cpu][counter] = sys_perf_event_open(attr, -1, cpu, -1, 0);
                         if (fd[cpu][counter] < 0 && verbose)
                                 fprintf(stderr, ERR_PERF_OPEN, counter,
                                         fd[cpu][counter], strerror(errno));
@@ -154,7 +154,7 @@ static void create_perf_stat_counter(int counter, int pid)
                 attr->disabled       = 1;
                 attr->enable_on_exec = 1;
  
-               fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0);
+               fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0);
                 if (fd[0][counter] < 0 && verbose)
                         fprintf(stderr, ERR_PERF_OPEN, counter,
                                 fd[0][counter], strerror(errno));
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c

index 6004063..4405681 100644 (file)
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -937,21 +937,21 @@ process_event(event_t *event)
  
         switch (event->header.type) {
  
-       case PERF_EVENT_COMM:
+       case PERF_RECORD_COMM:
                 return process_comm_event(event);
-       case PERF_EVENT_FORK:
+       case PERF_RECORD_FORK:
                 return process_fork_event(event);
-       case PERF_EVENT_EXIT:
+       case PERF_RECORD_EXIT:
                 return process_exit_event(event);
-       case PERF_EVENT_SAMPLE:
+       case PERF_RECORD_SAMPLE:
                 return queue_sample_event(event);
  
         /*
          * We dont process them right now but they are fine:
          */
-       case PERF_EVENT_MMAP:
-       case PERF_EVENT_THROTTLE:
-       case PERF_EVENT_UNTHROTTLE:
+       case PERF_RECORD_MMAP:
+       case PERF_RECORD_THROTTLE:
+       case PERF_RECORD_UNTHROTTLE:
                 return 0;
  
         default:
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c

index 4002ccb..1ca8889 100644 (file)
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -901,7 +901,7 @@ struct mmap_data {
  
  static unsigned int mmap_read_head(struct mmap_data *md)
  {
-       struct perf_counter_mmap_page *pc = md->base;
+       struct perf_event_mmap_page *pc = md->base;
         int head;
  
         head = pc->data_head;
@@ -977,9 +977,9 @@ static void mmap_read_counter(struct mmap_data *md)
  
                 old += size;
  
-               if (event->header.type == PERF_EVENT_SAMPLE) {
+               if (event->header.type == PERF_RECORD_SAMPLE) {
                         int user =
-       (event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER;
+       (event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_USER;
                         process_event(event->ip.ip, md->counter, user);
                 }
         }
@@ -1005,7 +1005,7 @@ int group_fd;
  
  static void start_counter(int i, int counter)
  {
-       struct perf_counter_attr *attr;
+       struct perf_event_attr *attr;
         int cpu;
  
         cpu = profile_cpu;
@@ -1019,7 +1019,7 @@ static void start_counter(int i, int counter)
         attr->inherit           = (cpu < 0) && inherit;
  
  try_again:
-       fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
+       fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0);
  
         if (fd[i][counter] < 0) {
                 int err = errno;
@@ -1044,7 +1044,7 @@ try_again:
                 printf("\n");
                 error("perfcounter syscall returned with %d (%s)\n",
                         fd[i][counter], strerror(err));
-               die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+               die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
                 exit(-1);
         }
         assert(fd[i][counter] >= 0);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c

index 914ab36..e9d256e 100644 (file)
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -35,14 +35,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
  
         thread = threads__findnew(event->comm.pid, &threads, &last_match);
  
-       dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+       dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->comm.comm, event->comm.pid);
  
         if (thread == NULL ||
             thread__set_comm(thread, event->comm.comm)) {
-               dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+               dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
                 return -1;
         }
         total_comm++;
@@ -82,7 +82,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                 more_data += sizeof(u64);
         }
  
-       dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+       dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
                 (void *)(offset + head),
                 (void *)(long)(event->header.size),
                 event->header.misc,
@@ -98,9 +98,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                 return -1;
         }
  
-       cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+       cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
  
-       if (cpumode == PERF_EVENT_MISC_KERNEL) {
+       if (cpumode == PERF_RECORD_MISC_KERNEL) {
                 show = SHOW_KERNEL;
                 level = 'k';
  
@@ -108,7 +108,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
  
                 dump_printf(" ...... dso: %s\n", dso->name);
  
-       } else if (cpumode == PERF_EVENT_MISC_USER) {
+       } else if (cpumode == PERF_RECORD_MISC_USER) {
  
                 show = SHOW_USER;
                 level = '.';
@@ -146,19 +146,19 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
         trace_event(event);
  
         switch (event->header.type) {
-       case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+       case PERF_RECORD_MMAP ... PERF_RECORD_LOST:
                 return 0;
  
-       case PERF_EVENT_COMM:
+       case PERF_RECORD_COMM:
                 return process_comm_event(event, offset, head);
  
-       case PERF_EVENT_EXIT ... PERF_EVENT_READ:
+       case PERF_RECORD_EXIT ... PERF_RECORD_READ:
                 return 0;
  
-       case PERF_EVENT_SAMPLE:
+       case PERF_RECORD_SAMPLE:
                 return process_sample_event(event, offset, head);
  
-       case PERF_EVENT_MAX:
+       case PERF_RECORD_MAX:
         default:
                 return -1;
         }
diff --git a/tools/perf/design.txt b/tools/perf/design.txt

index f71e0d2..f1946d1 100644 (file)
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -18,10 +18,10 @@ underlying hardware counters.
  Performance counters are accessed via special file descriptors.
  There's one file descriptor per virtual counter used.
  
-The special file descriptor is opened via the perf_counter_open()
+The special file descriptor is opened via the perf_event_open()
  system call:
  
-   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+   int sys_perf_event_open(struct perf_event_hw_event *hw_event_uptr,
                              pid_t pid, int cpu, int group_fd,
                              unsigned long flags);
  
@@ -32,9 +32,9 @@ can be used to set the blocking mode, etc.
  Multiple counters can be kept open at a time, and the counters
  can be poll()ed.
  
-When creating a new counter fd, 'perf_counter_hw_event' is:
+When creating a new counter fd, 'perf_event_hw_event' is:
  
-struct perf_counter_hw_event {
+struct perf_event_hw_event {
          /*
           * The MSB of the config word signifies if the rest contains cpu
           * specific (raw) counter configuration data, if unset, the next
@@ -93,7 +93,7 @@ specified by 'event_id':
  
  /*
   * Generalized performance counter event types, used by the hw_event.event_id
- * parameter of the sys_perf_counter_open() syscall:
+ * parameter of the sys_perf_event_open() syscall:
   */
  enum hw_event_ids {
         /*
@@ -159,7 +159,7 @@ in size.
   * reads on the counter should return the indicated quantities,
   * in increasing order of bit value, after the counter value.
   */
-enum perf_counter_read_format {
+enum perf_event_read_format {
          PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
          PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
  };
@@ -178,7 +178,7 @@ interrupt:
   * Bits that can be set in hw_event.record_type to request information
   * in the overflow packets.
   */
-enum perf_counter_record_format {
+enum perf_event_record_format {
          PERF_RECORD_IP          = 1U << 0,
          PERF_RECORD_TID         = 1U << 1,
          PERF_RECORD_TIME        = 1U << 2,
@@ -228,7 +228,7 @@ these events are recorded in the ring-buffer (see below).
  The 'comm' bit allows tracking of process comm data on process creation.
  This too is recorded in the ring-buffer (see below).
  
-The 'pid' parameter to the perf_counter_open() system call allows the
+The 'pid' parameter to the perf_event_open() system call allows the
  counter to be specific to a task:
  
   pid == 0: if the pid parameter is zero, the counter is attached to the
@@ -258,7 +258,7 @@ The 'flags' parameter is currently unused and must be zero.
  
  The 'group_fd' parameter allows counter "groups" to be set up.  A
  counter group has one counter which is the group "leader".  The leader
-is created first, with group_fd = -1 in the perf_counter_open call
+is created first, with group_fd = -1 in the perf_event_open call
  that creates it.  The rest of the group members are created
  subsequently, with group_fd giving the fd of the group leader.
  (A single counter on its own is created with group_fd = -1 and is
@@ -277,13 +277,13 @@ tracking are logged into a ring-buffer. This ring-buffer is created and
  accessed through mmap().
  
  The mmap size should be 1+2^n pages, where the first page is a meta-data page
-(struct perf_counter_mmap_page) that contains various bits of information such
+(struct perf_event_mmap_page) that contains various bits of information such
  as where the ring-buffer head is.
  
  /*
   * Structure of the page that can be mapped via mmap
   */
-struct perf_counter_mmap_page {
+struct perf_event_mmap_page {
          __u32   version;                /* version number of this structure */
          __u32   compat_version;         /* lowest version this is compat with */
  
@@ -317,7 +317,7 @@ struct perf_counter_mmap_page {
           * Control data for the mmap() data buffer.
           *
           * User-space reading this value should issue an rmb(), on SMP capable
-         * platforms, after reading this value -- see perf_counter_wakeup().
+         * platforms, after reading this value -- see perf_event_wakeup().
           */
          __u32   data_head;              /* head in the data section */
  };
@@ -327,9 +327,9 @@ NOTE: the hw-counter userspace bits are arch specific and are currently only
  
  The following 2^n pages are the ring-buffer which contains events of the form:
  
-#define PERF_EVENT_MISC_KERNEL          (1 << 0)
-#define PERF_EVENT_MISC_USER            (1 << 1)
-#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+#define PERF_RECORD_MISC_KERNEL          (1 << 0)
+#define PERF_RECORD_MISC_USER            (1 << 1)
+#define PERF_RECORD_MISC_OVERFLOW        (1 << 2)
  
  struct perf_event_header {
          __u32   type;
@@ -353,8 +353,8 @@ enum perf_event_type {
           *      char                            filename[];
           * };
           */
-        PERF_EVENT_MMAP                 = 1,
-        PERF_EVENT_MUNMAP               = 2,
+        PERF_RECORD_MMAP                 = 1,
+        PERF_RECORD_MUNMAP               = 2,
  
          /*
           * struct {
@@ -364,10 +364,10 @@ enum perf_event_type {
           *      char                            comm[];
           * };
           */
-        PERF_EVENT_COMM                 = 3,
+        PERF_RECORD_COMM                 = 3,
  
          /*
-         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+         * When header.misc & PERF_RECORD_MISC_OVERFLOW the event_type field
           * will be PERF_RECORD_*
           *
           * struct {
@@ -397,7 +397,7 @@ Notification of new events is possible through poll()/select()/epoll() and
  fcntl() managing signals.
  
  Normally a notification is generated for every page filled, however one can
-additionally set perf_counter_hw_event.wakeup_events to generate one every
+additionally set perf_event_hw_event.wakeup_events to generate one every
  so many counter overflow events.
  
  Future work will include a splice() interface to the ring-buffer.
@@ -409,11 +409,11 @@ events but does continue to exist and maintain its count value.
  
  An individual counter or counter group can be enabled with
  
-       ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+       ioctl(fd, PERF_EVENT_IOC_ENABLE);
  
  or disabled with
  
-       ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+       ioctl(fd, PERF_EVENT_IOC_DISABLE);
  
  Enabling or disabling the leader of a group enables or disables the
  whole group; that is, while the group leader is disabled, none of the
@@ -424,16 +424,16 @@ other counter.
  
  Additionally, non-inherited overflow counters can use
  
-       ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+       ioctl(fd, PERF_EVENT_IOC_REFRESH, nr);
  
  to enable a counter for 'nr' events, after which it gets disabled again.
  
  A process can enable or disable all the counter groups that are
  attached to it, using prctl:
  
-       prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+       prctl(PR_TASK_PERF_EVENTS_ENABLE);
  
-       prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+       prctl(PR_TASK_PERF_EVENTS_DISABLE);
  
  This applies to all counters on the current process, whether created
  by this process or by another, and doesn't affect any counters that
@@ -447,11 +447,11 @@ Arch requirements
  If your architecture does not have hardware performance metrics, you can
  still use the generic software counters based on hrtimers for sampling.
  
-So to start with, in order to add HAVE_PERF_COUNTERS to your Kconfig, you
+So to start with, in order to add HAVE_PERF_EVENTS to your Kconfig, you
  will need at least this:
-       - asm/perf_counter.h - a basic stub will suffice at first
+       - asm/perf_event.h - a basic stub will suffice at first
         - support for atomic64 types (and associated helper functions)
-       - set_perf_counter_pending() implemented
+       - set_perf_event_pending() implemented
  
  If your architecture does have hardware capabilities, you can override the
-weak stub hw_perf_counter_init() to register hardware counters.
+weak stub hw_perf_event_init() to register hardware counters.
diff --git a/tools/perf/perf.h b/tools/perf/perf.h

index 2abeb20..8cc4623 100644 (file)
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -52,15 +52,15 @@
  #include <sys/types.h>
  #include <sys/syscall.h>
  
-#include "../../include/linux/perf_counter.h"
+#include "../../include/linux/perf_event.h"
  #include "util/types.h"
  
  /*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
   * counters in the current task.
   */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
+#define PR_TASK_PERF_EVENTS_DISABLE   31
+#define PR_TASK_PERF_EVENTS_ENABLE    32
  
  #ifndef NSEC_PER_SEC
  # define NSEC_PER_SEC                  1000000000ULL
@@ -90,12 +90,12 @@ static inline unsigned long long rdclock(void)
         _min1 < _min2 ? _min1 : _min2; })
  
  static inline int
-sys_perf_counter_open(struct perf_counter_attr *attr,
+sys_perf_event_open(struct perf_event_attr *attr,
                       pid_t pid, int cpu, int group_fd,
                       unsigned long flags)
  {
         attr->size = sizeof(*attr);
-       return syscall(__NR_perf_counter_open, attr, pid, cpu,
+       return syscall(__NR_perf_event_open, attr, pid, cpu,
                        group_fd, flags);
  }
  
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h

index 018d414..2c9c26d 100644 (file)
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -1,5 +1,5 @@
-#ifndef __PERF_EVENT_H
-#define __PERF_EVENT_H
+#ifndef __PERF_RECORD_H
+#define __PERF_RECORD_H
  #include "../perf.h"
  #include "util.h"
  #include <linux/list.h>
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c

index bb4fca3..e306857 100644 (file)
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -9,7 +9,7 @@
  /*
   * Create new perf.data header attribute:
   */
-struct perf_header_attr *perf_header_attr__new(struct perf_counter_attr *attr)
+struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr)
  {
         struct perf_header_attr *self = malloc(sizeof(*self));
  
@@ -134,7 +134,7 @@ struct perf_file_section {
  };
  
  struct perf_file_attr {
-       struct perf_counter_attr        attr;
+       struct perf_event_attr  attr;
         struct perf_file_section        ids;
  };
  
@@ -320,7 +320,7 @@ u64 perf_header__sample_type(struct perf_header *header)
         return type;
  }
  
-struct perf_counter_attr *
+struct perf_event_attr *
  perf_header__find_attr(u64 id, struct perf_header *header)
  {
         int i;
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h

index 7b0e84a..a0761bc 100644 (file)
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -1,12 +1,12 @@
  #ifndef _PERF_HEADER_H
  #define _PERF_HEADER_H
  
-#include "../../../include/linux/perf_counter.h"
+#include "../../../include/linux/perf_event.h"
  #include <sys/types.h>
  #include "types.h"
  
  struct perf_header_attr {
-       struct perf_counter_attr attr;
+       struct perf_event_attr attr;
         int ids, size;
         u64 *id;
         off_t id_offset;
@@ -34,11 +34,11 @@ char *perf_header__find_event(u64 id);
  
  
  struct perf_header_attr *
-perf_header_attr__new(struct perf_counter_attr *attr);
+perf_header_attr__new(struct perf_event_attr *attr);
  void perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
  
  u64 perf_header__sample_type(struct perf_header *header);
-struct perf_counter_attr *
+struct perf_event_attr *
  perf_header__find_attr(u64 id, struct perf_header *header);
  
  
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c

index 89172fd..13ab4b8 100644 (file)
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -10,7 +10,7 @@
  
  int                                    nr_counters;
  
-struct perf_counter_attr               attrs[MAX_COUNTERS];
+struct perf_event_attr         attrs[MAX_COUNTERS];
  
  struct event_symbol {
         u8              type;
@@ -48,13 +48,13 @@ static struct event_symbol event_symbols[] = {
    { CSW(CPU_MIGRATIONS),       "cpu-migrations",       "migrations"    },
  };
  
-#define __PERF_COUNTER_FIELD(config, name) \
-       ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+#define __PERF_EVENT_FIELD(config, name) \
+       ((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT)
  
-#define PERF_COUNTER_RAW(config)       __PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)    __PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)      __PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)                __PERF_COUNTER_FIELD(config, EVENT)
+#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW)
+#define PERF_EVENT_CONFIG(config)      __PERF_EVENT_FIELD(config, CONFIG)
+#define PERF_EVENT_TYPE(config)        __PERF_EVENT_FIELD(config, TYPE)
+#define PERF_EVENT_ID(config)          __PERF_EVENT_FIELD(config, EVENT)
  
  static const char *hw_event_names[] = {
         "cycles",
@@ -352,7 +352,7 @@ static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int
  }
  
  static enum event_result
-parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
+parse_generic_hw_event(const char **str, struct perf_event_attr *attr)
  {
         const char *s = *str;
         int cache_type = -1, cache_op = -1, cache_result = -1;
@@ -417,7 +417,7 @@ parse_single_tracepoint_event(char *sys_name,
                               const char *evt_name,
                               unsigned int evt_length,
                               char *flags,
-                             struct perf_counter_attr *attr,
+                             struct perf_event_attr *attr,
                               const char **strp)
  {
         char evt_path[MAXPATHLEN];
@@ -505,7 +505,7 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags)
  
  
  static enum event_result parse_tracepoint_event(const char **strp,
-                                   struct perf_counter_attr *attr)
+                                   struct perf_event_attr *attr)
  {
         const char *evt_name;
         char *flags;
@@ -563,7 +563,7 @@ static int check_events(const char *str, unsigned int i)
  }
  
  static enum event_result
-parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
+parse_symbolic_event(const char **strp, struct perf_event_attr *attr)
  {
         const char *str = *strp;
         unsigned int i;
@@ -582,7 +582,7 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
  }
  
  static enum event_result
-parse_raw_event(const char **strp, struct perf_counter_attr *attr)
+parse_raw_event(const char **strp, struct perf_event_attr *attr)
  {
         const char *str = *strp;
         u64 config;
@@ -601,7 +601,7 @@ parse_raw_event(const char **strp, struct perf_counter_attr *attr)
  }
  
  static enum event_result
-parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
+parse_numeric_event(const char **strp, struct perf_event_attr *attr)
  {
         const char *str = *strp;
         char *endp;
@@ -623,7 +623,7 @@ parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
  }
  
  static enum event_result
-parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
+parse_event_modifier(const char **strp, struct perf_event_attr *attr)
  {
         const char *str = *strp;
         int eu = 1, ek = 1, eh = 1;
@@ -656,7 +656,7 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
   * Symbolic names are (almost) exactly matched.
   */
  static enum event_result
-parse_event_symbols(const char **str, struct perf_counter_attr *attr)
+parse_event_symbols(const char **str, struct perf_event_attr *attr)
  {
         enum event_result ret;
  
@@ -711,7 +711,7 @@ static void store_event_type(const char *orgname)
  
  int parse_events(const struct option *opt __used, const char *str, int unset __used)
  {
-       struct perf_counter_attr attr;
+       struct perf_event_attr attr;
         enum event_result ret;
  
         if (strchr(str, ':'))
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h

index 60704c1..30c6081 100644 (file)
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -16,7 +16,7 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
  
  extern int                     nr_counters;
  
-extern struct perf_counter_attr attrs[MAX_COUNTERS];
+extern struct perf_event_attr attrs[MAX_COUNTERS];
  
  extern const char *event_name(int ctr);
  extern const char *__event_name(int type, u64 config);
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c

index 1fd824c..af4b057 100644 (file)
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -480,12 +480,12 @@ out:
  }
  
  static struct tracepoint_path *
-get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters)
+get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
  {
         struct tracepoint_path path, *ppath = &path;
         int i;
  
-       for (i = 0; i < nb_counters; i++) {
+       for (i = 0; i < nb_events; i++) {
                 if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
                         continue;
                 ppath->next = tracepoint_id_to_path(pattrs[i].config);
@@ -496,7 +496,7 @@ get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters)
  
         return path.next;
  }
-void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters)
+void read_tracing_data(struct perf_event_attr *pattrs, int nb_events)
  {
         char buf[BUFSIZ];
         struct tracepoint_path *tps;
@@ -530,7 +530,7 @@ void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters)
         page_size = getpagesize();
         write_or_die(&page_size, 4);
  
-       tps = get_tracepoints_path(pattrs, nb_counters);
+       tps = get_tracepoints_path(pattrs, nb_events);
  
         read_header_files();
         read_ftrace_files(tps);
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h

index d35ebf1..693f815 100644 (file)
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -240,6 +240,6 @@ unsigned long long
  raw_field_value(struct event *event, const char *name, void *data);
  void *raw_field_ptr(struct event *event, const char *name, void *data);
  
-void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);
+void read_tracing_data(struct perf_event_attr *pattrs, int nb_events);
  
  #endif /* _TRACE_EVENTS_H */
author	Ingo Molnar <mingo@elte.hu>
	Mon, 21 Sep 2009 10:02:48 +0000 (12:02 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Mon, 21 Sep 2009 12:28:04 +0000 (14:28 +0200)
arch/arm/include/asm/unistd.h		patch \| blob \| history
arch/arm/kernel/calls.S		patch \| blob \| history
arch/blackfin/include/asm/unistd.h		patch \| blob \| history
arch/blackfin/mach-common/entry.S		patch \| blob \| history
arch/frv/Kconfig		patch \| blob \| history
arch/frv/include/asm/perf_counter.h	[deleted file]	patch \| blob \| history
arch/frv/include/asm/perf_event.h	[new file with mode: 0644]	patch \| blob
arch/frv/include/asm/unistd.h		patch \| blob \| history
arch/frv/kernel/entry.S		patch \| blob \| history
arch/frv/lib/Makefile		patch \| blob \| history
arch/frv/lib/perf_counter.c	[deleted file]	patch \| blob \| history
arch/frv/lib/perf_event.c	[new file with mode: 0644]	patch \| blob
arch/m68k/include/asm/unistd.h		patch \| blob \| history
arch/m68k/kernel/entry.S		patch \| blob \| history
arch/m68knommu/kernel/syscalltable.S		patch \| blob \| history
arch/microblaze/include/asm/unistd.h		patch \| blob \| history
arch/microblaze/kernel/syscall_table.S		patch \| blob \| history
arch/mips/include/asm/unistd.h		patch \| blob \| history
arch/mips/kernel/scall32-o32.S		patch \| blob \| history
arch/mips/kernel/scall64-64.S		patch \| blob \| history
arch/mips/kernel/scall64-n32.S		patch \| blob \| history
arch/mips/kernel/scall64-o32.S		patch \| blob \| history
arch/mn10300/include/asm/unistd.h		patch \| blob \| history
arch/mn10300/kernel/entry.S		patch \| blob \| history
arch/parisc/Kconfig		patch \| blob \| history
arch/parisc/include/asm/perf_counter.h	[deleted file]	patch \| blob \| history
arch/parisc/include/asm/perf_event.h	[new file with mode: 0644]	patch \| blob
arch/parisc/include/asm/unistd.h		patch \| blob \| history
arch/parisc/kernel/syscall_table.S		patch \| blob \| history
arch/powerpc/Kconfig		patch \| blob \| history
arch/powerpc/include/asm/hw_irq.h		patch \| blob \| history
arch/powerpc/include/asm/paca.h		patch \| blob \| history
arch/powerpc/include/asm/perf_counter.h	[deleted file]	patch \| blob \| history
arch/powerpc/include/asm/perf_event.h	[new file with mode: 0644]	patch \| blob
arch/powerpc/include/asm/systbl.h		patch \| blob \| history
arch/powerpc/include/asm/unistd.h		patch \| blob \| history
arch/powerpc/kernel/Makefile		patch \| blob \| history
arch/powerpc/kernel/asm-offsets.c		patch \| blob \| history
arch/powerpc/kernel/entry_64.S		patch \| blob \| history
arch/powerpc/kernel/irq.c		patch \| blob \| history
arch/powerpc/kernel/mpc7450-pmu.c		patch \| blob \| history
arch/powerpc/kernel/perf_callchain.c		patch \| blob \| history
arch/powerpc/kernel/perf_counter.c	[deleted file]	patch \| blob \| history
arch/powerpc/kernel/perf_event.c	[new file with mode: 0644]	patch \| blob
arch/powerpc/kernel/power4-pmu.c		patch \| blob \| history
arch/powerpc/kernel/power5+-pmu.c		patch \| blob \| history
arch/powerpc/kernel/power5-pmu.c		patch \| blob \| history
arch/powerpc/kernel/power6-pmu.c		patch \| blob \| history
arch/powerpc/kernel/power7-pmu.c		patch \| blob \| history
arch/powerpc/kernel/ppc970-pmu.c		patch \| blob \| history
arch/powerpc/kernel/time.c		patch \| blob \| history
arch/powerpc/mm/fault.c		patch \| blob \| history
arch/powerpc/platforms/Kconfig.cputype		patch \| blob \| history
arch/s390/Kconfig		patch \| blob \| history
arch/s390/include/asm/perf_counter.h	[deleted file]	patch \| blob \| history
arch/s390/include/asm/perf_event.h	[new file with mode: 0644]	patch \| blob
arch/s390/include/asm/unistd.h		patch \| blob \| history
arch/s390/kernel/compat_wrapper.S		patch \| blob \| history
arch/s390/kernel/syscalls.S		patch \| blob \| history
arch/s390/mm/fault.c		patch \| blob \| history
arch/sh/Kconfig		patch \| blob \| history
arch/sh/include/asm/perf_counter.h	[deleted file]	patch \| blob \| history
arch/sh/include/asm/perf_event.h	[new file with mode: 0644]	patch \| blob
arch/sh/include/asm/unistd_32.h		patch \| blob \| history
arch/sh/include/asm/unistd_64.h		patch \| blob \| history
arch/sh/kernel/syscalls_32.S		patch \| blob \| history
arch/sh/kernel/syscalls_64.S		patch \| blob \| history
arch/sh/mm/fault_32.c		patch \| blob \| history
arch/sh/mm/tlbflush_64.c		patch \| blob \| history
arch/sparc/Kconfig		patch \| blob \| history
arch/sparc/include/asm/perf_counter.h	[deleted file]	patch \| blob \| history
arch/sparc/include/asm/perf_event.h	[new file with mode: 0644]	patch \| blob
arch/sparc/include/asm/unistd.h		patch \| blob \| history
arch/sparc/kernel/Makefile		patch \| blob \| history
arch/sparc/kernel/nmi.c		patch \| blob \| history
arch/sparc/kernel/pcr.c		patch \| blob \| history
arch/sparc/kernel/perf_counter.c	[deleted file]	patch \| blob \| history
arch/sparc/kernel/perf_event.c	[new file with mode: 0644]	patch \| blob
arch/sparc/kernel/systbls_32.S		patch \| blob \| history
arch/sparc/kernel/systbls_64.S		patch \| blob \| history
arch/x86/Kconfig		patch \| blob \| history
arch/x86/ia32/ia32entry.S		patch \| blob \| history
arch/x86/include/asm/entry_arch.h		patch \| blob \| history
arch/x86/include/asm/perf_counter.h	[deleted file]	patch \| blob \| history
arch/x86/include/asm/perf_event.h	[new file with mode: 0644]	patch \| blob
arch/x86/include/asm/unistd_32.h		patch \| blob \| history
arch/x86/include/asm/unistd_64.h		patch \| blob \| history
arch/x86/kernel/apic/apic.c		patch \| blob \| history
arch/x86/kernel/cpu/Makefile		patch \| blob \| history
arch/x86/kernel/cpu/common.c		patch \| blob \| history
arch/x86/kernel/cpu/perf_counter.c	[deleted file]	patch \| blob \| history
arch/x86/kernel/cpu/perf_event.c	[new file with mode: 0644]	patch \| blob
arch/x86/kernel/cpu/perfctr-watchdog.c		patch \| blob \| history
arch/x86/kernel/entry_64.S		patch \| blob \| history
arch/x86/kernel/irqinit.c		patch \| blob \| history
arch/x86/kernel/syscall_table_32.S		patch \| blob \| history
arch/x86/mm/fault.c		patch \| blob \| history
arch/x86/oprofile/op_model_ppro.c		patch \| blob \| history
arch/x86/oprofile/op_x86_model.h		patch \| blob \| history
drivers/char/sysrq.c		patch \| blob \| history
fs/exec.c		patch \| blob \| history
include/asm-generic/unistd.h		patch \| blob \| history
include/linux/init_task.h		patch \| blob \| history
include/linux/perf_counter.h	[deleted file]	patch \| blob \| history
include/linux/perf_event.h	[new file with mode: 0644]	patch \| blob
include/linux/prctl.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/syscalls.h		patch \| blob \| history
include/trace/ftrace.h		patch \| blob \| history
init/Kconfig		patch \| blob \| history
kernel/Makefile		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/perf_counter.c	[deleted file]	patch \| blob \| history
kernel/perf_event.c	[new file with mode: 0644]	patch \| blob
kernel/sched.c		patch \| blob \| history
kernel/sys.c		patch \| blob \| history
kernel/sys_ni.c		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
kernel/timer.c		patch \| blob \| history
kernel/trace/trace_syscalls.c		patch \| blob \| history
mm/mmap.c		patch \| blob \| history
mm/mprotect.c		patch \| blob \| history
tools/perf/Makefile		patch \| blob \| history
tools/perf/builtin-annotate.c		patch \| blob \| history
tools/perf/builtin-record.c		patch \| blob \| history
tools/perf/builtin-report.c		patch \| blob \| history
tools/perf/builtin-sched.c		patch \| blob \| history
tools/perf/builtin-stat.c		patch \| blob \| history
tools/perf/builtin-timechart.c		patch \| blob \| history
tools/perf/builtin-top.c		patch \| blob \| history
tools/perf/builtin-trace.c		patch \| blob \| history
tools/perf/design.txt		patch \| blob \| history
tools/perf/perf.h		patch \| blob \| history
tools/perf/util/event.h		patch \| blob \| history
tools/perf/util/header.c		patch \| blob \| history
tools/perf/util/header.h		patch \| blob \| history
tools/perf/util/parse-events.c		patch \| blob \| history
tools/perf/util/parse-events.h		patch \| blob \| history
tools/perf/util/trace-event-info.c		patch \| blob \| history
tools/perf/util/trace-event.h		patch \| blob \| history