/*
 * This file implements the perfmon subsystem which is used
 * to program the IA-64 Performance Monitoring Unit (PMU).
 *
 * Originally Written by Ganesh Venkitachalam, IBM Corp.
 * Copyright (C) 1999 Ganesh Venkitachalam <venkitac@us.ibm.com>
 *
 * Modifications by Stephane Eranian, Hewlett-Packard Co.
 * Modifications by David Mosberger-Tang, Hewlett-Packard Co.
 *
 * Copyright (C) 1999-2003 Hewlett Packard Co
 *               Stephane Eranian <eranian@hpl.hp.com>
 *               David Mosberger-Tang <davidm@hpl.hp.com>
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#include <linux/sysctl.h>
#include <linux/smp.h>

#include <asm/bitops.h>
#include <asm/errno.h>

#include <asm/perfmon.h>
#include <asm/processor.h>
#include <asm/signal.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/delay.h>		/* for ia64_get_itc() */
/*
 * For PMUs which rely on the debug registers for some features, you must
 * enable the following flag to activate the support for
 * accessing the registers via the perfmonctl() interface.
 */
#if defined(CONFIG_ITANIUM) || defined(CONFIG_MCKINLEY)
#define PFM_PMU_USES_DBR	1
#endif
/*
 * perfmon context states
 */
#define PFM_CTX_DISABLED	0
#define PFM_CTX_ENABLED		1

/*
 * Reset register flags
 */
#define PFM_PMD_LONG_RESET	1
#define PFM_PMD_SHORT_RESET	2
/*
 * Misc macros and definitions
 */
#define PMU_FIRST_COUNTER	4
#define PMU_MAX_PMCS		256
#define PMU_MAX_PMDS		256
/*
 * type of a PMU register (bitmask).
 * bit0   : register implemented
 * bit1   : end marker
 * bit4-7 : register type
 */
#define PFM_REG_IMPL		0x1			/* register implemented */
#define PFM_REG_END		0x2			/* end marker */
#define PFM_REG_MONITOR		(0x1<<4|PFM_REG_IMPL)	/* a PMC with a pmc.pm field only */
#define PFM_REG_COUNTING	(0x2<<4|PFM_REG_IMPL)	/* a PMC with a pmc.pm AND pmc.oi, a PMD used as a counter */
#define PFM_REG_CONTROL		(0x3<<4|PFM_REG_IMPL)	/* PMU control register */
#define PFM_REG_CONFIG		(0x4<<4|PFM_REG_IMPL)	/* refine configuration */
#define PFM_REG_BUFFER		(0x5<<4|PFM_REG_IMPL)	/* PMD used as buffer */
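
/*
 * Illustrative sketch (not part of the original code): the type byte is
 * compared as a whole rather than bit by bit. For instance, a PMD used
 * as a counter carries type (0x2<<4)|PFM_REG_IMPL == 0x21, so a check
 * looks like:
 *
 *	if (pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING)
 *		... register i is a counting PMD ...
 */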
#define PMC_IS_LAST(i)	(pmu_conf.pmc_desc[i].type & PFM_REG_END)
#define PMD_IS_LAST(i)	(pmu_conf.pmd_desc[i].type & PFM_REG_END)

#define PFM_IS_DISABLED() pmu_conf.disabled

#define PMC_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_soft_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)
#define PFM_FL_INHERIT_MASK	(PFM_FL_INHERIT_NONE|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)

/* i is assumed to be unsigned */
#define PMC_IS_IMPL(i)	  (i < PMU_MAX_PMCS && (pmu_conf.pmc_desc[i].type & PFM_REG_IMPL))
#define PMD_IS_IMPL(i)	  (i < PMU_MAX_PMDS && (pmu_conf.pmd_desc[i].type & PFM_REG_IMPL))

/* XXX: these three assume that register i is implemented */
#define PMD_IS_COUNTING(i) (pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING)
#define PMC_IS_COUNTING(i) (pmu_conf.pmc_desc[i].type == PFM_REG_COUNTING)
#define PMC_IS_MONITOR(i)  (pmu_conf.pmc_desc[i].type == PFM_REG_MONITOR)
#define PMC_DFL_VAL(i)     pmu_conf.pmc_desc[i].default_value
#define PMC_RSVD_MASK(i)   pmu_conf.pmc_desc[i].reserved_mask
#define PMD_PMD_DEP(i)	   pmu_conf.pmd_desc[i].dep_pmd[0]
#define PMC_PMD_DEP(i)	   pmu_conf.pmc_desc[i].dep_pmd[0]

/* k is assumed to be unsigned */
#define IBR_IS_IMPL(k)	  (k < pmu_conf.num_ibrs)
#define DBR_IS_IMPL(k)	  (k < pmu_conf.num_dbrs)
#define CTX_IS_ENABLED(c)	((c)->ctx_flags.state == PFM_CTX_ENABLED)
#define CTX_OVFL_NOBLOCK(c)	((c)->ctx_fl_block == 0)
#define CTX_INHERIT_MODE(c)	((c)->ctx_fl_inherit)
#define CTX_HAS_SMPL(c)		((c)->ctx_psb != NULL)
/* XXX: does not support more than 64 PMDs */
#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)

#define CTX_USED_IBR(ctx,n)	(ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
#define CTX_USED_DBR(ctx,n)	(ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
#define CTX_USES_DBREGS(ctx)	(((pfm_context_t *)(ctx))->ctx_fl_using_dbreg == 1)
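
/*
 * Worked example (illustrative only): CTX_USED_DBR(ctx, 65) selects word
 * 65>>6 == 1 and sets bit 65 % 64 == 1, i.e. it ORs 0x2 into
 * ctx->ctx_used_dbrs[1]. The PMD bookkeeping above is simpler because,
 * per the XXX note, it assumes at most 64 PMDs and always uses word 0.
 */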
#define LOCK_CTX(ctx)	spin_lock(&(ctx)->ctx_lock)
#define UNLOCK_CTX(ctx)	spin_unlock(&(ctx)->ctx_lock)

#define SET_PMU_OWNER(t)    do { pmu_owners[smp_processor_id()].owner = (t); } while(0)
#define PMU_OWNER()	    pmu_owners[smp_processor_id()].owner

#define LOCK_PFS()	    spin_lock(&pfm_sessions.pfs_lock)
#define UNLOCK_PFS()	    spin_unlock(&pfm_sessions.pfs_lock)

#define PFM_REG_RETFLAG_SET(flags, val)	do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)

#define PFM_CPUINFO_CLEAR(v)	__get_cpu_var(pfm_syst_info) &= ~(v)
#define PFM_CPUINFO_SET(v)	__get_cpu_var(pfm_syst_info) |= (v)
#define DBprintk(a) \
	do { \
		if (pfm_sysctl.debug > 0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
	} while (0)

#define DBprintk_ovfl(a) \
	do { \
		if (pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl > 0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
	} while (0)
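
/*
 * Usage sketch (illustrative): the extra parentheses are required so the
 * whole printf-style argument list is passed as a single macro argument,
 * e.g.:
 *
 *	DBprintk(("ctx=%p last_cpu=%d\n", ctx, atomic_read(&ctx->ctx_last_cpu)));
 */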
/*
 * Architected PMC structure
 */
typedef struct {
	unsigned long pmc_plm:4;	/* privilege level mask */
	unsigned long pmc_ev:1;		/* external visibility */
	unsigned long pmc_oi:1;		/* overflow interrupt */
	unsigned long pmc_pm:1;		/* privileged monitor */
	unsigned long pmc_ig1:1;	/* reserved */
	unsigned long pmc_es:8;		/* event select */
	unsigned long pmc_ig2:48;	/* reserved */
} pfm_monitor_t;
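
/*
 * Illustrative sketch (not from the original code): pfm_write_pmcs()
 * below overlays this structure on the raw 64-bit PMC value to force
 * individual fields, e.g.:
 *
 *	pfm_monitor_t *p = (pfm_monitor_t *)&value;
 *	p->pmc_oi = 1;		(enforce overflow interrupt generation)
 */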
/*
 * There is one such data structure per perfmon context. It is used to describe the
 * sampling buffer. It is to be shared among siblings whereas the pfm_context
 * is not.
 * Therefore we maintain a refcnt which is incremented on fork().
 * This structure is private to the kernel; only the actual sampling buffer,
 * including its header, is exposed to the user. This construct allows us to
 * export the buffer read-write, if needed, without worrying about security
 * problems.
 */
typedef struct _pfm_smpl_buffer_desc {
	spinlock_t		psb_lock;	/* protection lock */
	unsigned long		psb_refcnt;	/* how many users for the buffer */
	int			psb_flags;	/* bitvector of flags (not yet used) */

	void			*psb_addr;	/* points to location of first entry */
	unsigned long		psb_entries;	/* maximum number of entries */
	unsigned long		psb_size;	/* aligned size of buffer */
	unsigned long		psb_index;	/* next free entry slot XXX: must use the one in buffer */
	unsigned long		psb_entry_size;	/* size of each entry including entry header */

	perfmon_smpl_hdr_t	*psb_hdr;	/* points to sampling buffer header */

	struct _pfm_smpl_buffer_desc *psb_next;	/* next psb, used for rvfreeing of psb_hdr */

} pfm_smpl_buffer_desc_t;
#define PSB_HAS_VMA	0x1		/* a virtual mapping for the buffer exists */

#define LOCK_PSB(p)	spin_lock(&(p)->psb_lock)
#define UNLOCK_PSB(p)	spin_unlock(&(p)->psb_lock)
/*
 * 64-bit software counter structure
 */
typedef struct {
	u64 val;		/* virtual 64bit counter value */
	u64 lval;		/* last value */
	u64 long_reset;		/* reset value on sampling overflow */
	u64 short_reset;	/* reset value on overflow */
	u64 reset_pmds[4];	/* which other pmds to reset when this counter overflows */
	u64 seed;		/* seed for random-number generator */
	u64 mask;		/* mask for random-number generator */
	unsigned int flags;	/* notify/do not notify */
} pfm_counter_t;
/*
 * perfmon context. One per process, is cloned on fork() depending on
 * the inheritance flags.
 */
typedef struct {
	unsigned int state:1;		/* 0=disabled, 1=enabled */
	unsigned int inherit:2;		/* inherit mode */
	unsigned int block:1;		/* when 1, task will block on user notifications */
	unsigned int system:1;		/* do system wide monitoring */
	unsigned int frozen:1;		/* pmu must be kept frozen on ctxsw in */
	unsigned int protected:1;	/* allow access to creator of context only */
	unsigned int using_dbreg:1;	/* using range restrictions (debug registers) */
	unsigned int excl_idle:1;	/* exclude idle task in system wide session */
	unsigned int unsecure:1;	/* sp = 0 for non self-monitored task */
	unsigned int trap_reason:2;	/* reason for going into pfm_block_ovfl_reset() */
	unsigned int reserved:20;
} pfm_context_flags_t;
#define PFM_TRAP_REASON_NONE		0x0	/* default value */
#define PFM_TRAP_REASON_BLOCKSIG	0x1	/* we need to block on overflow and signal user */
#define PFM_TRAP_REASON_SIG		0x2	/* we simply need to signal user */
#define PFM_TRAP_REASON_RESET		0x3	/* we need to reset PMDs */
/*
 * perfmon context: encapsulates all the state of a monitoring session
 * XXX: probably need to change layout
 */
typedef struct pfm_context {
	pfm_smpl_buffer_desc_t	*ctx_psb;		/* sampling buffer, if any */
	unsigned long		ctx_smpl_vaddr;		/* user level virtual address of smpl buffer */

	spinlock_t		ctx_lock;
	pfm_context_flags_t	ctx_flags;		/* block/noblock */

	struct task_struct	*ctx_notify_task;	/* who to notify on overflow */
	struct task_struct	*ctx_owner;		/* pid of creator (debug) */

	unsigned long		ctx_ovfl_regs[4];	/* which registers overflowed (notification) */
	unsigned long		ctx_smpl_regs[4];	/* which registers to record on overflow */

	struct semaphore	ctx_restart_sem;	/* use for blocking notification mode */

	unsigned long		ctx_used_pmds[4];	/* bitmask of PMD used */
	unsigned long		ctx_reload_pmds[4];	/* bitmask of PMD to reload on ctxsw */

	unsigned long		ctx_used_pmcs[4];	/* bitmask PMC used by context */
	unsigned long		ctx_reload_pmcs[4];	/* bitmask of PMC to reload on ctxsw */

	unsigned long		ctx_used_ibrs[4];	/* bitmask of used IBR (speedup ctxsw) */
	unsigned long		ctx_used_dbrs[4];	/* bitmask of used DBR (speedup ctxsw) */

	pfm_counter_t		ctx_soft_pmds[IA64_NUM_PMD_REGS]; /* XXX: size should be dynamic */

	u64			ctx_saved_psr;		/* copy of psr used for lazy ctxsw */
	unsigned long		ctx_saved_cpus_allowed;	/* copy of the task cpus_allowed (system wide) */
	unsigned int		ctx_cpu;		/* CPU used by system wide session */

	atomic_t		ctx_last_cpu;		/* CPU id of current or last CPU used */
} pfm_context_t;
#define ctx_fl_inherit		ctx_flags.inherit
#define ctx_fl_block		ctx_flags.block
#define ctx_fl_system		ctx_flags.system
#define ctx_fl_frozen		ctx_flags.frozen
#define ctx_fl_protected	ctx_flags.protected
#define ctx_fl_using_dbreg	ctx_flags.using_dbreg
#define ctx_fl_excl_idle	ctx_flags.excl_idle
#define ctx_fl_trap_reason	ctx_flags.trap_reason
#define ctx_fl_unsecure		ctx_flags.unsecure
/*
 * global information about all sessions
 * mostly used to synchronize between system wide and per-process
 */
typedef struct {
	spinlock_t		pfs_lock;		   /* lock the structure */

	unsigned int		pfs_task_sessions;	   /* number of per task sessions */
	unsigned int		pfs_sys_sessions;	   /* number of per system wide sessions */
	unsigned int		pfs_sys_use_dbregs;	   /* incremented when a system wide session uses debug regs */
	unsigned int		pfs_ptrace_use_dbregs;	   /* incremented when a process uses debug regs */
	struct task_struct	*pfs_sys_session[NR_CPUS]; /* points to task owning a system-wide session */
} pfm_session_t;
/*
 * information about a PMC or PMD.
 * dep_pmd[]: a bitmask of dependent PMD registers
 * dep_pmc[]: a bitmask of dependent PMC registers
 */
typedef struct {
	unsigned int	type;
	int		pm_pos;
	unsigned long	default_value;	/* power-on default value */
	unsigned long	reserved_mask;	/* bitmask of reserved bits */
	int		(*read_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
	int		(*write_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
	unsigned long	dep_pmd[4];
	unsigned long	dep_pmc[4];
} pfm_reg_desc_t;
/* assume cnum is a valid monitor */
#define PMC_PM(cnum, val)	(((val) >> (pmu_conf.pmc_desc[cnum].pm_pos)) & 0x1)
#define PMC_WR_FUNC(cnum)	(pmu_conf.pmc_desc[cnum].write_check)
#define PMD_WR_FUNC(cnum)	(pmu_conf.pmd_desc[cnum].write_check)
#define PMD_RD_FUNC(cnum)	(pmu_conf.pmd_desc[cnum].read_check)
/*
 * This structure is initialized at boot time and contains
 * a description of the PMU main characteristics.
 */
typedef struct {
	unsigned int  disabled;		/* indicates if perfmon is working properly */
	unsigned long ovfl_val;		/* overflow value for generic counters */
	unsigned long impl_pmcs[4];	/* bitmask of implemented PMCS */
	unsigned long impl_pmds[4];	/* bitmask of implemented PMDS */
	unsigned int  num_pmcs;		/* number of implemented PMCS */
	unsigned int  num_pmds;		/* number of implemented PMDS */
	unsigned int  num_ibrs;		/* number of implemented IBRS */
	unsigned int  num_dbrs;		/* number of implemented DBRS */
	unsigned int  num_counters;	/* number of PMD/PMC counters */
	pfm_reg_desc_t *pmc_desc;	/* detailed PMC register dependencies descriptions */
	pfm_reg_desc_t *pmd_desc;	/* detailed PMD register dependencies descriptions */
} pmu_config_t;
/*
 * structure used to pass argument to/from remote CPU
 * using IPI to check and possibly save the PMU context on SMP systems.
 *
 * not used in UP kernels
 */
	struct task_struct *task;	/* which task we are interested in */
	int retval;			/* return value of the call: 0=you can proceed, 1=need to wait for completion */
/*
 * perfmon command descriptions
 */
	int		(*cmd_func)(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);

	unsigned int	cmd_narg;
#define PFM_CMD_PID		0x1	/* command requires pid argument */
#define PFM_CMD_ARG_READ	0x2	/* command must read argument(s) */
#define PFM_CMD_ARG_RW		0x4	/* command must read/write argument(s) */
#define PFM_CMD_CTX		0x8	/* command needs a perfmon context */
#define PFM_CMD_NOCHK		0x10	/* command does not need to check task's state */
#define PFM_CMD_IDX(cmd)	(cmd)

#define PFM_CMD_IS_VALID(cmd)	((PFM_CMD_IDX(cmd) >= 0) \
				  && (PFM_CMD_IDX(cmd) < (int) PFM_CMD_COUNT) \
				  && pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func != NULL)

#define PFM_CMD_USE_PID(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_PID) != 0)
#define PFM_CMD_READ_ARG(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_READ) != 0)
#define PFM_CMD_RW_ARG(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_RW) != 0)
#define PFM_CMD_USE_CTX(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_CTX) != 0)
#define PFM_CMD_CHK(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_NOCHK) == 0)

#define PFM_CMD_ARG_MANY	-1 /* cannot be zero */
#define PFM_CMD_NARG(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_narg)
#define PFM_CMD_ARG_SIZE(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_argsize)
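
/*
 * Illustrative sketch (assumed, not from this file): sys_perfmonctl() is
 * expected to validate a request roughly as follows before dispatching
 * through pfm_cmd_tab:
 *
 *	if (!PFM_CMD_IS_VALID(cmd)) return -EINVAL;
 *	narg = PFM_CMD_NARG(cmd);
 *	if (narg == PFM_CMD_ARG_MANY && count <= 0) return -EINVAL;
 *	else if (narg > 0 && narg != count) return -EINVAL;
 */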
typedef struct {
	int	debug;		/* turn on/off debugging via syslog */
	int	debug_ovfl;	/* turn on/off debug printk in overflow handler */
	int	fastctxsw;	/* turn on/off fast (unsecure) ctxsw */
} pfm_sysctl_t;

typedef struct {
	unsigned long pfm_spurious_ovfl_intr_count;	/* keep track of spurious ovfl interrupts */
	unsigned long pfm_ovfl_intr_count;		/* keep track of ovfl interrupts */
	unsigned long pfm_recorded_samples_count;
	unsigned long pfm_full_smpl_buffer_count;	/* how many times the sampling buffer was full */
	char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
} pfm_stats_t;
/*
 * perfmon internal variables
 */
static pfm_session_t	pfm_sessions;	/* global sessions information */
static struct proc_dir_entry *perfmon_dir; /* for debug only */
static pfm_stats_t	pfm_stats[NR_CPUS];
static pfm_intr_handler_desc_t	*pfm_alternate_intr_handler;

DEFINE_PER_CPU(unsigned long, pfm_syst_info);

/* sysctl() controls */
static pfm_sysctl_t pfm_sysctl;
static ctl_table pfm_ctl_table[]={
	{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
	{2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
	{3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
	{0,},
};
static ctl_table pfm_sysctl_dir[] = {
	{1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
	{0,},
};
static ctl_table pfm_sysctl_root[] = {
	{1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
	{0,},
};
static struct ctl_table_header *pfm_sysctl_header;
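
/*
 * Note (sketch, assuming the standard 2.5-era sysctl API): the table is
 * presumably registered at init time with
 *
 *	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
 *
 * which exposes /proc/sys/kernel/perfmon/{debug,debug_ovfl,fastctxsw}.
 */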
static void pfm_vm_close(struct vm_area_struct * area);

static struct vm_operations_struct pfm_vm_ops={
	.close = pfm_vm_close
};
/*
 * keep track of task owning the PMU per CPU.
 */
static struct {
	struct task_struct *owner;
	char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
} pmu_owners[NR_CPUS];
/*
 * forward declarations
 */
static void pfm_reset_pmu(struct task_struct *);
static void pfm_lazy_save_regs (struct task_struct *ta);

#if defined(CONFIG_ITANIUM)
#include "perfmon_itanium.h"
#elif defined(CONFIG_MCKINLEY)
#include "perfmon_mckinley.h"
#else
#include "perfmon_generic.h"
#endif
static inline void
pfm_clear_psr_pp(void)
{
	__asm__ __volatile__ ("rsm psr.pp;; srlz.i;;"::: "memory");
}

static inline void
pfm_set_psr_pp(void)
{
	__asm__ __volatile__ ("ssm psr.pp;; srlz.i;;"::: "memory");
}

static inline void
pfm_clear_psr_up(void)
{
	__asm__ __volatile__ ("rum psr.up;; srlz.i;;"::: "memory");
}

static inline void
pfm_set_psr_up(void)
{
	__asm__ __volatile__ ("sum psr.up;; srlz.i;;"::: "memory");
}

static inline unsigned long
pfm_get_psr(void)
{
	unsigned long tmp;
	__asm__ __volatile__ ("mov %0=psr;;": "=r"(tmp) :: "memory");
	return tmp;
}

static inline void
pfm_set_psr_l(unsigned long val)
{
	__asm__ __volatile__ ("mov psr.l=%0;; srlz.i;;"::"r"(val): "memory");
}
static inline void
pfm_unfreeze_pmu(void)
{
	ia64_set_pmc(0,0UL);
	ia64_srlz_d();
}
static inline unsigned long
pfm_read_soft_counter(pfm_context_t *ctx, int i)
{
	return ctx->ctx_soft_pmds[i].val + (ia64_get_pmd(i) & pmu_conf.ovfl_val);
}

static inline void
pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
{
	ctx->ctx_soft_pmds[i].val = val & ~pmu_conf.ovfl_val;
	/*
	 * writing to the unimplemented part is ignored, so we do not need to
	 * mask off the top part
	 */
	ia64_set_pmd(i, val & pmu_conf.ovfl_val);
}
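
/*
 * Worked example (illustrative): with a 32-bit hardware counter,
 * pmu_conf.ovfl_val == 0xffffffff. Writing val == 0x100000004 stores
 * 0x100000000 in ctx_soft_pmds[i].val and 0x4 in the hardware PMD;
 * reading adds the two parts back together. This is how the 64-bit
 * counter is virtualized on top of a narrower hardware register.
 */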
/*
 * Generates a unique (per CPU) timestamp
 */
static inline unsigned long
pfm_get_stamp(void)
{
	/*
	 * XXX: must find something more efficient
	 */
	return ia64_get_itc();
}
/* Here we want the physical address of the memory.
 * This is used when initializing the contents of the
 * area and marking the pages as reserved.
 */
static inline unsigned long
pfm_kvirt_to_pa(unsigned long adr)
{
	__u64 pa = ia64_tpa(adr);
	//DBprintk(("kv2pa(%lx-->%lx)\n", adr, pa));
	return pa;
}
static void *
pfm_rvmalloc(unsigned long size)
{
	void *mem;
	unsigned long adr;

	size=PAGE_ALIGN(size);
	mem=vmalloc(size);
	if (mem) {
		//printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
		memset(mem, 0, size); /* Clear the ram out, no junk to the user */
		adr=(unsigned long) mem;
		while (size > 0) {
			SetPageReserved(vmalloc_to_page((void *)adr));
			adr+=PAGE_SIZE;
			size-=PAGE_SIZE;
		}
	}
	return mem;
}
static void
pfm_rvfree(void *mem, unsigned long size)
{
	unsigned long adr;

	if (mem) {
		adr=(unsigned long) mem;
		while ((long) size > 0) {
			ClearPageReserved(vmalloc_to_page((void*)adr));
			adr+=PAGE_SIZE;
			size-=PAGE_SIZE;
		}
		vfree(mem);
	}
}
/*
 * This function gets called from mm/mmap.c:exit_mmap() only when there is a sampling buffer
 * attached to the context AND the current task has a mapping for it, i.e., it is the original
 * creator of the context.
 *
 * This function is used to remember the fact that the vma describing the sampling buffer
 * has now been removed. It can only be called when no other tasks share the same mm context.
 */
static void
pfm_vm_close(struct vm_area_struct *vma)
{
	pfm_smpl_buffer_desc_t *psb = (pfm_smpl_buffer_desc_t *)vma->vm_private_data;

	if (psb == NULL) {
		printk(KERN_DEBUG "perfmon: psb is null in [%d]\n", current->pid);
		return;
	}
	/*
	 * Add PSB to list of buffers to free on release_thread() when no more users
	 *
	 * This call is safe because, once the count is zero, it cannot be modified anymore.
	 * The fact that there are no more users of the mm context does not mean that the
	 * sampling buffer is no longer being used outside of this task. In fact, it can still
	 * be accessed from within the kernel by another task (such as the monitored task).
	 *
	 * Therefore, we only move the psb into the list of buffers to free when we know
	 * nobody else is using it.
	 * The linked list is independent of the perfmon context, because in the case of
	 * multi-threaded processes, the last thread may not have been involved with
	 * monitoring; however, it will be the one removing the vma and it should therefore
	 * also remove the sampling buffer. This buffer cannot be removed until the vma
	 * is removed.
	 *
	 * This function cannot remove the buffer from here, because exit_mmap() must first
	 * complete. Given that there is no other vma related callback in the generic code,
	 * we have created our own with the linked list of sampling buffers to free. The list
	 * is part of the thread structure. In release_thread() we check if the list is
	 * empty. If not we call into perfmon to free the buffer and psb. That is the only
	 * way to ensure a safe deallocation of the sampling buffer which works when
	 * the buffer is shared between distinct processes or with multi-threaded programs.
	 *
	 * We need to lock the psb because the refcnt test and flag manipulation must
	 * look like an atomic operation vis-a-vis pfm_context_exit()
	 */
	LOCK_PSB(psb);

	if (psb->psb_refcnt == 0) {

		psb->psb_next = current->thread.pfm_smpl_buf_list;
		current->thread.pfm_smpl_buf_list = psb;

		DBprintk(("[%d] add smpl @%p size %lu to smpl_buf_list psb_flags=0x%x\n",
			current->pid, psb->psb_hdr, psb->psb_size, psb->psb_flags));
	}
	DBprintk(("[%d] clearing psb_flags=0x%x smpl @%p size %lu\n",
			current->pid, psb->psb_flags, psb->psb_hdr, psb->psb_size));
	/*
	 * the vma mapping the buffer has been removed
	 */
	psb->psb_flags &= ~PSB_HAS_VMA;

	UNLOCK_PSB(psb);
}
/*
 * This function is called from pfm_destroy_context() and also from pfm_inherit()
 * to explicitly remove the sampling buffer mapping from the user level address space.
 */
static int
pfm_remove_smpl_mapping(struct task_struct *task)
{
	pfm_context_t *ctx = task->thread.pfm_context;
	pfm_smpl_buffer_desc_t *psb;
	int r;

	/*
	 * some sanity checks first
	 */
	if (ctx == NULL || task->mm == NULL || ctx->ctx_smpl_vaddr == 0 || ctx->ctx_psb == NULL) {
		printk(KERN_DEBUG "perfmon: invalid context mm=%p\n", task->mm);
		return -1;
	}
	psb = ctx->ctx_psb;

	down_write(&task->mm->mmap_sem);

	r = do_munmap(task->mm, ctx->ctx_smpl_vaddr, psb->psb_size);

	up_write(&task->mm->mmap_sem);
	if (r != 0) {
		printk(KERN_DEBUG "perfmon: pid %d unable to unmap sampling buffer "
			"@0x%lx size=%ld\n", task->pid, ctx->ctx_smpl_vaddr, psb->psb_size);
	}

	DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d refcnt=%lu psb_flags=0x%x\n",
		task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r, psb->psb_refcnt, psb->psb_flags));

	return 0;
}
static pfm_context_t *
pfm_context_alloc(void)
{
	pfm_context_t *ctx;

	/* allocate context descriptor */
	ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
	if (ctx) memset(ctx, 0, sizeof(pfm_context_t));

	return ctx;
}

static void
pfm_context_free(pfm_context_t *ctx)
{
	if (ctx) kfree(ctx);
}
static int
pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
{
	unsigned long page;

	DBprintk(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));

	while (size > 0) {
		page = pfm_kvirt_to_pa(buf);

		if (remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM;

		addr += PAGE_SIZE;
		buf  += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	return 0;
}
/*
 * counts the number of PMDS to save per entry.
 * This code is generic enough to accommodate more than 64 PMDS when they become available
 */
static unsigned long
pfm_smpl_entry_size(unsigned long *which, unsigned long size)
{
	unsigned long i, res = 0;

	for (i=0; i < size; i++, which++) res += hweight64(*which);

	DBprintk(("weight=%ld\n", res));

	return res;
}
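
/*
 * Worked example (illustrative): with which[0] == 0x30 (PMD4 and PMD5
 * selected) and size == 1, hweight64(0x30) == 2, so each sampling-buffer
 * entry carries the entry header plus two 64-bit PMD values.
 */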
/*
 * Allocates the sampling buffer and remaps it into the caller's address space
 */
static int
pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned long entries,
		      void **user_vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	unsigned long size, regcount;
	void *smpl_buf;
	pfm_smpl_buffer_desc_t *psb;

	/* note that regcount might be 0, in this case only the header for each
	 * entry will be recorded.
	 */
	regcount = pfm_smpl_entry_size(which_pmds, 1);

	if ((sizeof(perfmon_smpl_hdr_t)+ entries*sizeof(perfmon_smpl_entry_t)) <= entries) {
		DBprintk(("requested entries %lu is too big\n", entries));
		return -EINVAL;
	}

	/*
	 * 1 buffer hdr and for each entry a header + regcount PMDs to save
	 */
	size = PAGE_ALIGN(  sizeof(perfmon_smpl_hdr_t)
			  + entries * (sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64)));

	DBprintk(("sampling buffer size=%lu bytes\n", size));

	/*
	 * check requested size to avoid Denial-of-service attacks
	 * XXX: may have to refine this test
	 * Check against address space limit.
	 *
	 * if ((mm->total_vm << PAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur)
	 *	return -ENOMEM;
	 */
	if (size > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN;

	/*
	 * We do the easy to undo allocations first.
	 *
	 * pfm_rvmalloc(), clears the buffer, so there is no leak
	 */
	smpl_buf = pfm_rvmalloc(size);
	if (smpl_buf == NULL) {
		DBprintk(("Can't allocate sampling buffer\n"));
		return -ENOMEM;
	}

	DBprintk(("smpl_buf @%p\n", smpl_buf));

	/* allocate sampling buffer descriptor now */
	psb = kmalloc(sizeof(*psb), GFP_KERNEL);
	if (psb == NULL) {
		DBprintk(("Can't allocate sampling buffer descriptor\n"));
		goto error_kmalloc;
	}

	/* allocate vma */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (vma == NULL) {
		DBprintk(("Cannot allocate vma\n"));
		goto error_kmalloc;
	}
	/*
	 * partially initialize the vma for the sampling buffer
	 *
	 * The VM_DONTCOPY flag is very important as it ensures that the mapping
	 * will never be inherited for any child process (via fork()) which is always
	 * what we want.
	 */
	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED|VM_DONTCOPY;
	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
	vma->vm_ops	     = &pfm_vm_ops; /* necessary to get the close() callback */
	vma->vm_private_data = psb;	/* information needed by the pfm_vm_close() function */

	/*
	 * Now we have everything we need and we can initialize
	 * and connect all the data structures
	 */

	psb->psb_hdr	 = smpl_buf;
	psb->psb_addr	 = ((char *)smpl_buf)+sizeof(perfmon_smpl_hdr_t); /* first entry */
	psb->psb_size	 = size; /* aligned size */
	psb->psb_index	 = 0;
	psb->psb_entries = entries;
	psb->psb_refcnt	 = 1;
	psb->psb_flags	 = PSB_HAS_VMA;

	spin_lock_init(&psb->psb_lock);

	/*
	 * XXX: will need to do cacheline alignment to avoid false sharing in SMP mode and
	 * multitask monitoring.
	 */
	psb->psb_entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64);

	DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p refcnt=%lu psb_flags=0x%x\n",
		  (void *)psb, psb->psb_entry_size, (void *)psb->psb_hdr,
		  (void *)psb->psb_addr, psb->psb_refcnt, psb->psb_flags));

	/* initialize some of the fields of user visible buffer header */
	psb->psb_hdr->hdr_version    = PFM_SMPL_VERSION;
	psb->psb_hdr->hdr_entry_size = psb->psb_entry_size;
	psb->psb_hdr->hdr_pmds[0]    = which_pmds[0];

	/*
	 * Let's do the difficult operations next.
	 *
	 * now we atomically find some area in the address space and
	 * remap the buffer into it.
	 */
	down_write(&current->mm->mmap_sem);

	/* find some free area in address space, must have mmap sem held */
	vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
	if (vma->vm_start == 0UL) {
		DBprintk(("Cannot find unmapped area for size %ld\n", size));
		up_write(&current->mm->mmap_sem);
		goto error;
	}
	vma->vm_end = vma->vm_start + size;

	DBprintk(("entries=%ld aligned size=%ld, unmapped @0x%lx\n", entries, size, vma->vm_start));

	/* can only be applied to current, need to have the mm semaphore held when called */
	if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
		DBprintk(("Can't remap buffer\n"));
		up_write(&current->mm->mmap_sem);
		goto error;
	}

	/*
	 * now insert the vma in the vm list for the process, must be
	 * done with mmap lock held
	 */
	insert_vm_struct(mm, vma);

	mm->total_vm  += size >> PAGE_SHIFT;

	up_write(&current->mm->mmap_sem);

	/* store which PMDS to record */
	ctx->ctx_smpl_regs[0] = which_pmds[0];

	/* link to perfmon context */
	ctx->ctx_psb	    = psb;

	/*
	 * keep track of user level virtual address
	 */
	ctx->ctx_smpl_vaddr = *(unsigned long *)user_vaddr = vma->vm_start;

	return 0;

error:
	kmem_cache_free(vm_area_cachep, vma);
error_kmalloc:
	kfree(psb);
	pfm_rvfree(smpl_buf, size);
	return -ENOMEM;
}
static int
pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned long cpu_mask)
{
	unsigned long m, undo_mask;
	unsigned int n, i;

	/*
	 * validity checks on cpu_mask have been done upstream
	 */
	LOCK_PFS();

	if (is_syswide) {
		/*
		 * cannot mix system wide and per-task sessions
		 */
		if (pfm_sessions.pfs_task_sessions > 0UL) {
			DBprintk(("system wide not possible, %u conflicting task_sessions\n",
				pfm_sessions.pfs_task_sessions));
			goto abort;
		}

		m = cpu_mask; undo_mask = 0UL; n = 0;
		DBprintk(("cpu_mask=0x%lx\n", cpu_mask));
		for(i=0; m; i++, m>>=1) {

			if ((m & 0x1) == 0UL) continue;

			if (pfm_sessions.pfs_sys_session[i]) goto undo;

			DBprintk(("reserving CPU%d currently on CPU%d\n", i, smp_processor_id()));

			pfm_sessions.pfs_sys_session[i] = task;
			undo_mask |= 1UL << i;
			n++;
		}
		pfm_sessions.pfs_sys_sessions += n;
	} else {
		if (pfm_sessions.pfs_sys_sessions) goto abort;
		pfm_sessions.pfs_task_sessions++;
	}
	DBprintk(("task_sessions=%u sys_session[%d]=%d",
		  pfm_sessions.pfs_task_sessions,
		  smp_processor_id(), pfm_sessions.pfs_sys_session[smp_processor_id()] ? 1 : 0));

	UNLOCK_PFS();

	return 0;

undo:
	DBprintk(("system wide not possible, conflicting session [%d] on CPU%d\n",
		pfm_sessions.pfs_sys_session[i]->pid, i));

	for(i=0; undo_mask; i++, undo_mask >>=1) {
		if (undo_mask & 0x1) pfm_sessions.pfs_sys_session[i] = NULL;
	}
abort:
	UNLOCK_PFS();

	return -EBUSY;
}
static int
pfm_unreserve_session(struct task_struct *task, int is_syswide, unsigned long cpu_mask)
{
	pfm_context_t *ctx;
	unsigned long m;
	unsigned int n, i;

	ctx = task ? task->thread.pfm_context : NULL;

	/*
	 * validity checks on cpu_mask have been done upstream
	 */
	LOCK_PFS();

	DBprintk(("[%d] sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu_mask=0x%lx\n",
		task->pid,
		pfm_sessions.pfs_sys_sessions,
		pfm_sessions.pfs_task_sessions,
		pfm_sessions.pfs_sys_use_dbregs,
		is_syswide,
		cpu_mask));

	if (is_syswide) {
		m = cpu_mask; n = 0;
		for(i=0; m; i++, m>>=1) {
			if ((m & 0x1) == 0UL) continue;
			pfm_sessions.pfs_sys_session[i] = NULL;
			n++;
		}
		/*
		 * would not work with perfmon+more than one bit in cpu_mask
		 */
		if (ctx && ctx->ctx_fl_using_dbreg) {
			if (pfm_sessions.pfs_sys_use_dbregs == 0) {
				printk(KERN_DEBUG "perfmon: invalid release for [%d] "
					"sys_use_dbregs=0\n", task->pid);
			} else {
				pfm_sessions.pfs_sys_use_dbregs--;
			}
		}
		pfm_sessions.pfs_sys_sessions -= n;

		DBprintk(("CPU%d sys_sessions=%u\n",
			smp_processor_id(), pfm_sessions.pfs_sys_sessions));
	} else {
		pfm_sessions.pfs_task_sessions--;
		DBprintk(("[%d] task_sessions=%u\n",
			task->pid, pfm_sessions.pfs_task_sessions));
	}

	UNLOCK_PFS();

	return 0;
}
/*
 * XXX: do something better here
 */
static int
pfm_bad_permissions(struct task_struct *task)
{
	/* stolen from bad_signal() */
	return (current->session != task->session)
	    && (current->euid ^ task->suid) && (current->euid ^ task->uid)
	    && (current->uid ^ task->suid) && (current->uid ^ task->uid);
}
static int
pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
{
	unsigned long smpl_pmds = pfx->ctx_smpl_regs[0];
	int ctx_flags;
	int cpu;

	/* cannot send to process 1, 0 means do not notify */
	if (pfx->ctx_notify_pid == 1) {
		DBprintk(("invalid notify_pid %d\n", pfx->ctx_notify_pid));
		return -EINVAL;
	}
	ctx_flags = pfx->ctx_flags;

	if ((ctx_flags & PFM_FL_INHERIT_MASK) == (PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)) {
		DBprintk(("invalid inherit mask 0x%x\n",ctx_flags & PFM_FL_INHERIT_MASK));
		return -EINVAL;
	}

	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
		DBprintk(("cpu_mask=0x%lx\n", pfx->ctx_cpu_mask));

		/*
		 * cannot block in this mode
		 */
		if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
			DBprintk(("cannot use blocking mode when in system wide monitoring\n"));
			return -EINVAL;
		}
		/*
		 * must only have one bit set in the CPU mask
		 */
		if (hweight64(pfx->ctx_cpu_mask) != 1UL) {
			DBprintk(("invalid CPU mask specified\n"));
			return -EINVAL;
		}
		/*
		 * and it must be a valid CPU
		 */
		cpu = ffz(~pfx->ctx_cpu_mask);
		if (cpu_online(cpu) == 0) {
			DBprintk(("CPU%d is not online\n", cpu));
			return -EINVAL;
		}
		/*
		 * check for pre-existing pinning, if conflicting reject
		 */
		if (task->cpus_allowed != ~0UL && (task->cpus_allowed & (1UL<<cpu)) == 0) {
			DBprintk(("[%d] pinned on 0x%lx, mask for CPU%d \n", task->pid,
				task->cpus_allowed, cpu));
			return -EINVAL;
		}
	}
	/*
	 * must provide a target for the signal in blocking mode even when
	 * no counter is configured with PFM_FL_REG_OVFL_NOTIFY
	 */
	if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) {
		DBprintk(("must have notify_pid when blocking for [%d]\n", task->pid));
		return -EINVAL;
	}

	if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == task->pid) {
		DBprintk(("cannot notify self when blocking for [%d]\n", task->pid));
		return -EINVAL;
	}

	/* verify validity of smpl_regs */
	if ((smpl_pmds & pmu_conf.impl_pmds[0]) != smpl_pmds) {
		DBprintk(("invalid smpl_regs 0x%lx\n", smpl_pmds));
		return -EINVAL;
	}
	/* probably more to add here */

	return 0;
}
static int
pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int count,
		   struct pt_regs *regs)
{
	pfarg_context_t tmp;
	void *uaddr = NULL;
	int ret;
	int ctx_flags;
	pid_t notify_pid;

	/* a context has already been defined */
	if (ctx) return -EBUSY;

	/*
	 * not yet supported
	 */
	if (task != current) return -EINVAL;

	if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

	ret = pfx_is_sane(task, &tmp);
	if (ret < 0) return ret;

	ctx_flags = tmp.ctx_flags;

	ret = pfm_reserve_session(task, ctx_flags & PFM_FL_SYSTEM_WIDE, tmp.ctx_cpu_mask);
	if (ret) goto abort;

	ret = -ENOMEM;

	ctx = pfm_context_alloc();
	if (!ctx) goto error;

	/* record the creator (important for inheritance) */
	ctx->ctx_owner = current;

	notify_pid = tmp.ctx_notify_pid;

	spin_lock_init(&ctx->ctx_lock);

	if (notify_pid == current->pid) {

		ctx->ctx_notify_task = current;
		task->thread.pfm_context = ctx;

	} else if (notify_pid != 0) {
		struct task_struct *notify_task;

		read_lock(&tasklist_lock);

		notify_task = find_task_by_pid(notify_pid);

		if (notify_task) {

			ret = -EPERM;

			/*
			 * check if we can send this task a signal
			 */
			if (pfm_bad_permissions(notify_task)) {
				read_unlock(&tasklist_lock);
				goto buffer_error;
			}

			/*
			 * make visible
			 * must be done inside critical section
			 *
			 * if the initialization does not go through it is still
			 * okay because child will do the scan for nothing which
			 * won't hurt.
			 */
			task->thread.pfm_context = ctx;

			/*
			 * will cause task to check on exit for monitored
			 * processes that would notify it. see release_thread()
			 * Note: the scan MUST be done in release thread, once the
			 * task has been detached from the tasklist otherwise you are
			 * exposed to race conditions.
			 */
			atomic_add(1, &ctx->ctx_notify_task->thread.pfm_notifiers_check);

			ctx->ctx_notify_task = notify_task;
		}
		read_unlock(&tasklist_lock);
	}

	/*
	 * notification process does not exist
	 */
	if (notify_pid != 0 && ctx->ctx_notify_task == NULL) {
		ret = -EINVAL;
		goto buffer_error;
	}

	if (tmp.ctx_smpl_entries) {
		DBprintk(("sampling entries=%lu\n",tmp.ctx_smpl_entries));

		ret = pfm_smpl_buffer_alloc(ctx, tmp.ctx_smpl_regs,
						tmp.ctx_smpl_entries, &uaddr);
		if (ret<0) goto buffer_error;

		tmp.ctx_smpl_vaddr = uaddr;
	}
	/* initialization of context's flags */
	ctx->ctx_fl_inherit   = ctx_flags & PFM_FL_INHERIT_MASK;
	ctx->ctx_fl_block     = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
	ctx->ctx_fl_system    = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
	ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
	ctx->ctx_fl_unsecure  = (ctx_flags & PFM_FL_UNSECURE) ? 1: 0;
	ctx->ctx_fl_frozen    = 0;
	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;

	/*
	 * setting this flag to 0 here means that the creator or the task the
	 * context is being attached to is granted access. Given that a context
	 * can only be created for the calling process, this in effect only
	 * allows the creator to access the context. See pfm_protect() for more.
	 */
	ctx->ctx_fl_protected = 0;

	/* for system wide mode only (only 1 bit set) */
	ctx->ctx_cpu = ffz(~tmp.ctx_cpu_mask);

	atomic_set(&ctx->ctx_last_cpu,-1); /* SMP only, means no CPU */

	sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */

	if (__copy_to_user(req, &tmp, sizeof(tmp))) {
		ret = -EFAULT;
		goto buffer_error;
	}

	DBprintk(("context=%p, pid=%d notify_task=%p\n",
			(void *)ctx, task->pid, ctx->ctx_notify_task));

	DBprintk(("context=%p, pid=%d flags=0x%x inherit=%d block=%d system=%d excl_idle=%d unsecure=%d\n",
			(void *)ctx, task->pid, ctx_flags, ctx->ctx_fl_inherit,
			ctx->ctx_fl_block, ctx->ctx_fl_system,
			ctx->ctx_fl_excl_idle,
			ctx->ctx_fl_unsecure));

	/*
	 * when no notification is required, we can make this visible at the last moment
	 */
	if (notify_pid == 0) task->thread.pfm_context = ctx;
	/*
	 * pin task to CPU and force reschedule on exit to ensure
	 * that when back to user level the task runs on the designated
	 * CPU.
	 */
	if (ctx->ctx_fl_system) {
		ctx->ctx_saved_cpus_allowed = task->cpus_allowed;
		set_cpus_allowed(task, tmp.ctx_cpu_mask);
		DBprintk(("[%d] rescheduled allowed=0x%lx\n", task->pid, task->cpus_allowed));
	}

	return 0;

buffer_error:
	pfm_context_free(ctx);
error:
	pfm_unreserve_session(task, ctx_flags & PFM_FL_SYSTEM_WIDE , tmp.ctx_cpu_mask);
abort:
	/* make sure we don't leave anything behind */
	task->thread.pfm_context = NULL;

	return ret;
}
static inline unsigned long
pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
{
	unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
	unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
	extern unsigned long carta_random32 (unsigned long seed);

	if (reg->flags & PFM_REGFL_RANDOM) {
		new_seed = carta_random32(old_seed);
		val -= (old_seed & mask);	/* counter values are negative numbers! */
		if ((mask >> 32) != 0)
			/* construct a full 64-bit random value: */
			new_seed |= carta_random32(old_seed >> 32) << 32;
		reg->seed = new_seed;
	}
	reg->lval = val;
	return val;
}
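
/*
 * Worked example (illustrative): with short_reset == -1000 and
 * mask == 0xff, "val -= old_seed & mask" yields a reset value between
 * -1000 and -1255, which randomizes the sampling period around 1000
 * events and avoids always sampling the same instruction.
 */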
static void
pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
{
	unsigned long mask = ovfl_regs[0];
	unsigned long reset_others = 0UL;
	unsigned long val;
	int i, is_long_reset = (flag == PFM_PMD_LONG_RESET);

	/*
	 * now restore reset value on sampling overflowed counters
	 */
	mask >>= PMU_FIRST_COUNTER;
	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {

		if ((mask & 0x1UL) == 0UL) continue;

		val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);
		reset_others |= ctx->ctx_soft_pmds[i].reset_pmds[0];

		DBprintk_ovfl(("[%d] %s reset soft_pmd[%d]=%lx\n", current->pid,
			  is_long_reset ? "long" : "short", i, val));

		/* upper part is ignored on rval */
		pfm_write_soft_counter(ctx, i, val);
	}

	/*
	 * Now take care of resetting the other registers
	 */
	for(i = 0; reset_others; i++, reset_others >>= 1) {

		if ((reset_others & 0x1) == 0) continue;

		val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);

		if (PMD_IS_COUNTING(i)) {
			pfm_write_soft_counter(ctx, i, val);
		} else {
			ia64_set_pmd(i, val);
		}
		DBprintk_ovfl(("[%d] %s reset_others pmd[%d]=%lx\n", current->pid,
			  is_long_reset ? "long" : "short", i, val));
	}
}
static int
pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
{
	struct thread_struct *th = &task->thread;
	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
	unsigned long value, reset_pmds;
	unsigned int cnum, reg_flags, flags;
	int i;
	int ret = -EINVAL;

	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/* XXX: ctx locking may be required here */

	for (i = 0; i < count; i++, req++) {

		if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		cnum       = tmp.reg_num;
		reg_flags  = tmp.reg_flags;
		value      = tmp.reg_value;
		reset_pmds = tmp.reg_reset_pmds[0];
		flags      = 0;

		/*
		 * we reject all non implemented PMC as well
		 * as attempts to modify PMC[0-3] which are used
		 * as status registers by the PMU
		 */
		if (!PMC_IS_IMPL(cnum) || cnum < 4) {
			DBprintk(("pmc[%u] is unimplemented or invalid\n", cnum));
			goto error;
		}
		/*
		 * A PMC used to configure monitors must be:
		 *	- system-wide session: privileged monitor
		 *	- per-task : user monitor
		 * any other configuration is rejected.
		 */
		if (PMC_IS_MONITOR(cnum) || PMC_IS_COUNTING(cnum)) {
			DBprintk(("pmc[%u].pm=%ld\n", cnum, PMC_PM(cnum, value)));

			if (ctx->ctx_fl_system ^ PMC_PM(cnum, value)) {
				DBprintk(("pmc_pm=%ld fl_system=%d\n", PMC_PM(cnum, value), ctx->ctx_fl_system));
				goto error;
			}
		}

		if (PMC_IS_COUNTING(cnum)) {
			pfm_monitor_t *p = (pfm_monitor_t *)&value;

			/*
			 * enforce generation of overflow interrupt. Necessary on all
			 * CPUs.
			 */
			p->pmc_oi = 1;

			if (reg_flags & PFM_REGFL_OVFL_NOTIFY) {
				/*
				 * must have a target for the signal
				 */
				if (ctx->ctx_notify_task == NULL) {
					DBprintk(("cannot set ovfl_notify: no notify_task\n"));
					goto error;
				}
				flags |= PFM_REGFL_OVFL_NOTIFY;
			}

			if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM;

			/* verify validity of reset_pmds */
			if ((reset_pmds & pmu_conf.impl_pmds[0]) != reset_pmds) {
				DBprintk(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum));
				goto error;
			}
		} else if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) {
			DBprintk(("cannot set ovfl_notify or random on pmc%u\n", cnum));
			goto error;
		}

		/*
		 * execute write checker, if any
		 */
		if (PMC_WR_FUNC(cnum)) {
			ret = PMC_WR_FUNC(cnum)(task, cnum, &value, regs);
			if (ret) goto error;
			ret = -EINVAL;
		}

		/*
		 * no error on this register
		 */
		PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);

		/*
		 * update register return value, abort all if problem during copy.
		 * we only modify the reg_flags field. no check mode is fine because
		 * access has been verified upfront in sys_perfmonctl().
		 *
		 * If this fails, then the software state is not modified
		 */
		if (__put_user(tmp.reg_flags, &req->reg_flags)) return -EFAULT;

		/*
		 * Now we commit the changes to the software state
		 */

		/*
		 * full flag update each time a register is programmed
		 */
		ctx->ctx_soft_pmds[cnum].flags = flags;

		if (PMC_IS_COUNTING(cnum)) {
			ctx->ctx_soft_pmds[cnum].reset_pmds[0] = reset_pmds;

			/* mark all PMDS to be accessed as used */
			CTX_USED_PMD(ctx, reset_pmds);
		}

		/*
		 * Needed in case the user does not initialize the equivalent
		 * PMD. Clearing is done in reset_pmu() so there is no possible
		 * leak here.
		 */
		CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]);

		/*
		 * keep a copy of the pmc, used for register reload
		 */
		th->pmc[cnum] = value;

		ia64_set_pmc(cnum, value);

		DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x used_pmds=0x%lx\n",
			  task->pid, cnum, value,
			  ctx->ctx_soft_pmds[cnum].flags,
			  ctx->ctx_used_pmds[0]));
	}

	return 0;

error:
	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);

	if (__put_user(tmp.reg_flags, &req->reg_flags)) ret = -EFAULT;

	DBprintk(("[%d] pmc[%u]=0x%lx error %d\n", task->pid, cnum, value, ret));

	return ret;
}
static int
pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
{
	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
	unsigned long value, hw_value;
	unsigned int cnum;
	int i;
	int ret = -EINVAL;

	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	/*
	 * Cannot do anything before PMU is enabled
	 */
	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/* XXX: ctx locking may be required here */

	for (i = 0; i < count; i++, req++) {

		if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		cnum  = tmp.reg_num;
		value = tmp.reg_value;

		if (!PMD_IS_IMPL(cnum)) {
			DBprintk(("pmd[%u] is unimplemented or invalid\n", cnum));
			goto abort_mission;
		}

		/*
		 * execute write checker, if any
		 */
		if (PMD_WR_FUNC(cnum)) {
			unsigned long v = value;
			ret = PMD_WR_FUNC(cnum)(task, cnum, &v, regs);
			if (ret) goto abort_mission;
			value = v;
			ret = -EINVAL;
		}
		hw_value = value;

		/*
		 * no error on this register
		 */
		PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);

		if (__put_user(tmp.reg_flags, &req->reg_flags)) return -EFAULT;

		/*
		 * now commit changes to software state
		 */

		/* update virtualized (64bits) counter */
		if (PMD_IS_COUNTING(cnum)) {
			ctx->ctx_soft_pmds[cnum].lval = value;
			ctx->ctx_soft_pmds[cnum].val  = value & ~pmu_conf.ovfl_val;

			hw_value = value & pmu_conf.ovfl_val;

			ctx->ctx_soft_pmds[cnum].long_reset  = tmp.reg_long_reset;
			ctx->ctx_soft_pmds[cnum].short_reset = tmp.reg_short_reset;

			ctx->ctx_soft_pmds[cnum].seed = tmp.reg_random_seed;
			ctx->ctx_soft_pmds[cnum].mask = tmp.reg_random_mask;
		}

		/* keep track of what we use */
		CTX_USED_PMD(ctx, pmu_conf.pmd_desc[(cnum)].dep_pmd[0]);

		/* mark this register as used as well */
		CTX_USED_PMD(ctx, RDEP(cnum));

		/* writes to the unimplemented part are ignored, so this is safe */
		ia64_set_pmd(cnum, hw_value);

		DBprintk(("[%d] pmd[%u]: value=0x%lx hw_value=0x%lx soft_pmd=0x%lx short_reset=0x%lx "
			  "long_reset=0x%lx hw_pmd=%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx\n",
				task->pid, cnum,
				value, hw_value,
				ctx->ctx_soft_pmds[cnum].val,
				ctx->ctx_soft_pmds[cnum].short_reset,
				ctx->ctx_soft_pmds[cnum].long_reset,
				ia64_get_pmd(cnum) & pmu_conf.ovfl_val,
				PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
				ctx->ctx_used_pmds[0],
				ctx->ctx_soft_pmds[cnum].reset_pmds[0]));
	}

	return 0;

abort_mission:
	/*
	 * for now, we have only one possibility for error
	 */
	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);

	/*
	 * we change the return value to EFAULT in case we cannot write register return code.
	 * The caller first must correct this error, then a resubmission of the request will
	 * eventually yield the EINVAL.
	 */
	if (__put_user(tmp.reg_flags, &req->reg_flags)) ret = -EFAULT;

	DBprintk(("[%d] pmd[%u]=0x%lx ret %d\n", task->pid, cnum, value, ret));

	return ret;
}
static int
pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
{
	struct thread_struct *th = &task->thread;
	unsigned long val, lval;
	pfarg_reg_t *req = (pfarg_reg_t *)arg;
	unsigned int cnum, reg_flags = 0;
	int i, ret = 0;
	int me;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/*
	 * XXX: MUST MAKE SURE WE DON'T HAVE ANY PENDING OVERFLOW BEFORE READING
	 * This is required when the monitoring has been stopped by user or kernel.
	 * If it is still going on, then that's fine because we are not guaranteed
	 * to return an accurate value in this case.
	 */

	/* XXX: ctx locking may be required here */

	me = smp_processor_id();

	DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), task->pid));

	for (i = 0; i < count; i++, req++) {

		lval = 0UL;

		if (__get_user(cnum, &req->reg_num)) return -EFAULT;
		if (__get_user(reg_flags, &req->reg_flags)) return -EFAULT;

		if (!PMD_IS_IMPL(cnum)) goto abort_mission;
		/*
		 * we can only read the register that we use. That includes
		 * the one we explicitly initialize AND the one we want included
		 * in the sampling buffer (smpl_regs).
		 *
		 * Having this restriction allows optimization in the ctxsw routine
		 * without compromising security (leaks)
		 */
		if (!CTX_IS_USED_PMD(ctx, cnum)) goto abort_mission;

		/*
		 * If the task is not the current one, then we check if the
		 * PMU state is still in the local live register due to lazy ctxsw.
		 * If true, then we read directly from the registers.
		 */
		if (atomic_read(&ctx->ctx_last_cpu) == me){
			ia64_srlz_d();
			val = ia64_get_pmd(cnum);
			DBprintk(("reading pmd[%u]=0x%lx from hw\n", cnum, val));
		} else {
			val = th->pmd[cnum];
		}

		if (PMD_IS_COUNTING(cnum)) {
			/*
			 * XXX: need to check for overflow
			 */
			val &= pmu_conf.ovfl_val;
			val += ctx->ctx_soft_pmds[cnum].val;

			lval = ctx->ctx_soft_pmds[cnum].lval;
		}

		/*
		 * execute read checker, if any
		 */
		if (PMD_RD_FUNC(cnum)) {
			unsigned long v = val;
			ret = PMD_RD_FUNC(cnum)(task, cnum, &v, regs);
			val = v;
		}

		PFM_REG_RETFLAG_SET(reg_flags, ret);

		DBprintk(("read pmd[%u] ret=%d value=0x%lx pmc=0x%lx\n",
			  cnum, ret, val, ia64_get_pmc(cnum)));

		/*
		 * update register return value, abort all if problem during copy.
		 * we only modify the reg_flags field. no check mode is fine because
		 * access has been verified upfront in sys_perfmonctl().
		 */
		if (__put_user(cnum, &req->reg_num)) return -EFAULT;
		if (__put_user(val, &req->reg_value)) return -EFAULT;
		if (__put_user(reg_flags, &req->reg_flags)) return -EFAULT;
		if (__put_user(lval, &req->reg_last_reset_value)) return -EFAULT;
	}

	return 0;

abort_mission:
	PFM_REG_RETFLAG_SET(reg_flags, PFM_REG_RETFL_EINVAL);
	/*
	 * XXX: if this fails, we stick with the original failure, flag not updated!
	 */
	__put_user(reg_flags, &req->reg_flags);

	return -EINVAL;
}
#ifdef PFM_PMU_USES_DBR
/*
 * Only call this function when a process is trying to
 * write the debug registers (reading is always allowed)
 */
int
pfm_use_debug_registers(struct task_struct *task)
{
	pfm_context_t *ctx = task->thread.pfm_context;
	int ret = 0;

	DBprintk(("called for [%d]\n", task->pid));

	/*
	 * do it only once
	 */
	if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;

	/*
	 * Even on SMP, we do not need to use an atomic here because
	 * the only way in is via ptrace() and this is possible only when the
	 * process is stopped. Even in the case where the ctxsw out is not totally
	 * completed by the time we come here, there is no way the 'stopped' process
	 * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
	 * So this is always safe.
	 */
	if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;

	LOCK_PFS();

	/*
	 * We cannot allow setting breakpoints when system wide monitoring
	 * sessions are using the debug registers.
	 */
	if (pfm_sessions.pfs_sys_use_dbregs > 0)
		ret = -1;
	else
		pfm_sessions.pfs_ptrace_use_dbregs++;

	DBprintk(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n",
		  pfm_sessions.pfs_ptrace_use_dbregs,
		  pfm_sessions.pfs_sys_use_dbregs,
		  task->pid, ret));

	UNLOCK_PFS();

	return ret;
}

/*
 * This function is called for every task that exits with the
 * IA64_THREAD_DBG_VALID set. This indicates a task which was
 * able to use the debug registers for debugging purposes via
 * ptrace(). Therefore we know it was not using them for
 * performance monitoring, so we only decrement the number
 * of "ptraced" debug register users to keep the count up to date.
 */
int
pfm_release_debug_registers(struct task_struct *task)
{
	int ret;

	LOCK_PFS();
	if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
		printk(KERN_DEBUG "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n",
			task->pid);
		ret = -1;
	} else {
		pfm_sessions.pfs_ptrace_use_dbregs--;
		ret = 0;
	}
	UNLOCK_PFS();

	return ret;
}

#else /* !PFM_PMU_USES_DBR */
/*
 * in case the PMU does not use the debug registers, these two functions are nops.
 * The first function is called from arch/ia64/kernel/ptrace.c.
 * The second function is called from arch/ia64/kernel/process.c.
 */
int
pfm_use_debug_registers(struct task_struct *task)
{
	return 0;
}

int
pfm_release_debug_registers(struct task_struct *task)
{
	return 0;
}
#endif /* PFM_PMU_USES_DBR */
static int
pfm_restart(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	    struct pt_regs *regs)
{
	void *sem = &ctx->ctx_restart_sem;

	/*
	 * Cannot do anything before PMU is enabled
	 */
	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	if (task == current) {
		DBprintk(("restarting self %d frozen=%d ovfl_regs=0x%lx\n",
			task->pid,
			ctx->ctx_fl_frozen,
			ctx->ctx_ovfl_regs[0]));

		pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);

		ctx->ctx_ovfl_regs[0] = 0UL;

		/*
		 * We ignore block/don't block because we never block
		 * for a self-monitoring process.
		 */
		ctx->ctx_fl_frozen = 0;

		if (CTX_HAS_SMPL(ctx)) {
			ctx->ctx_psb->psb_hdr->hdr_count = 0;
			ctx->ctx_psb->psb_index = 0;
		}

		/* simply unfreeze */
		pfm_unfreeze_pmu();

		return 0;
	}
	/* restart on another task */

	/*
	 * if blocking, then post the semaphore.
	 * if non-blocking, then we ensure that the task will go into
	 * pfm_overflow_must_block() before returning to user mode.
	 * We cannot explicitly reset another task, it MUST always
	 * be done by the task itself. This works for system wide because
	 * the tool that is controlling the session is doing "self-monitoring".
	 *
	 * XXX: what if the task never goes back to user?
	 */
	if (CTX_OVFL_NOBLOCK(ctx) == 0) {
		DBprintk(("unblocking %d \n", task->pid));
		up(sem);
	} else {
		struct thread_info *info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE);
		task->thread.pfm_ovfl_block_reset = 1;
		ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET;
		set_bit(TIF_NOTIFY_RESUME, &info->flags);
	}

	/*
	 * in case of non blocking mode, then it's just a matter of
	 * resetting the sampling buffer (if any) index. The PMU
	 * is already active.
	 */

	/*
	 * must reset the header count first
	 */
	if (CTX_HAS_SMPL(ctx)) {
		DBprintk(("resetting sampling indexes for %d \n", task->pid));
		ctx->ctx_psb->psb_hdr->hdr_count = 0;
		ctx->ctx_psb->psb_index = 0;
	}

	return 0;
}
static int
pfm_stop(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	 struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	/*
	 * Cannot do anything before PMU is enabled
	 */
	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
				current->pid,
				ctx->ctx_fl_system, PMU_OWNER(),
				current));

	/* simply stop monitoring but not the PMU */
	if (ctx->ctx_fl_system) {

		/* disable dcr pp */
		ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);

		/* stop monitoring */
		pfm_clear_psr_pp();
		ia64_srlz_i();

		PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);

		ia64_psr(regs)->pp = 0;

	} else {

		/* stop monitoring */
		pfm_clear_psr_up();
		ia64_srlz_i();

		/*
		 * clear user level psr.up
		 */
		ia64_psr(regs)->up = 0;
	}
	return 0;
}
static int
pfm_disable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	    struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

	/*
	 * stop monitoring, freeze PMU, and save state in context
	 * this call will clear IA64_THREAD_PM_VALID for per-task sessions.
	 */
	pfm_flush_regs(task);

	if (ctx->ctx_fl_system) {
		ia64_psr(regs)->pp = 0;
	} else {
		ia64_psr(regs)->up = 0;
	}
	/*
	 * goes back to default behavior: no user level control
	 * no need to change live psr.sp because useless at the kernel level
	 */
	ia64_psr(regs)->sp = 1;

	DBprintk(("enabling psr.sp for [%d]\n", current->pid));

	ctx->ctx_flags.state = PFM_CTX_DISABLED;

	return 0;
}
static int
pfm_context_destroy(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
		    struct pt_regs *regs)
{
	/* we don't quite support this right now */
	if (task != current) return -EINVAL;

	/*
	 * if context was never enabled, then there is not much
	 * to do
	 */
	if (!CTX_IS_ENABLED(ctx)) goto skipped_stop;

	/*
	 * Disable context: stop monitoring, flush regs to software state (useless here),
	 * and freeze PMU
	 *
	 * The IA64_THREAD_PM_VALID is cleared by pfm_flush_regs() called from pfm_disable()
	 */
	pfm_disable(task, ctx, arg, count, regs);

	if (ctx->ctx_fl_system) {
		ia64_psr(regs)->pp = 0;
	} else {
		ia64_psr(regs)->up = 0;
	}

skipped_stop:
	/*
	 * remove sampling buffer mapping, if any
	 */
	if (ctx->ctx_smpl_vaddr) {
		pfm_remove_smpl_mapping(task);
		ctx->ctx_smpl_vaddr = 0UL;
	}
	/* now free context and related state */
	pfm_context_exit(task);

	return 0;
}
/*
 * does nothing at the moment
 */
static int
pfm_context_unprotect(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
		      struct pt_regs *regs)
{
	return 0;
}

static int
pfm_protect_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
		    struct pt_regs *regs)
{
	DBprintk(("context from [%d] is protected\n", task->pid));
	/*
	 * from now on, only the creator of the context has access to it
	 */
	ctx->ctx_fl_protected = 1;

	/*
	 * reinforce secure monitoring: cannot toggle psr.up
	 */
	if (ctx->ctx_fl_unsecure == 0) ia64_psr(regs)->sp = 1;

	return 0;
}
static int
pfm_debug(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
	  struct pt_regs *regs)
{
	unsigned int mode = *(unsigned int *)arg;

	pfm_sysctl.debug = mode == 0 ? 0 : 1;

	printk(KERN_INFO "perfmon debugging %s\n", pfm_sysctl.debug ? "on" : "off");

	return 0;
}
#ifdef PFM_PMU_USES_DBR

typedef struct {
	unsigned long ibr_mask:56;
	unsigned long ibr_plm:4;
	unsigned long ibr_ig:3;
	unsigned long ibr_x:1;
} ibr_mask_reg_t;

typedef struct {
	unsigned long dbr_mask:56;
	unsigned long dbr_plm:4;
	unsigned long dbr_ig:2;
	unsigned long dbr_w:1;
	unsigned long dbr_r:1;
} dbr_mask_reg_t;

typedef union {
	unsigned long	val;
	ibr_mask_reg_t	ibr;
	dbr_mask_reg_t	dbr;
} dbreg_t;
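
/*
 * Illustrative sketch (not from the original code): the union lets the
 * code manipulate a debug register either as a raw 64-bit value or field
 * by field, e.g.:
 *
 *	dbreg_t d;
 *	d.val = tmp.dbreg_value;
 *	d.dbr.dbr_r = d.dbr.dbr_w = 0;	(disarm the data breakpoint)
 */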
2109 pfm_write_ibr_dbr(int mode, struct task_struct *task, void *arg, int count, struct pt_regs *regs)
2111 struct thread_struct *thread = &task->thread;
2112 pfm_context_t *ctx = task->thread.pfm_context;
2113 pfarg_dbreg_t tmp, *req = (pfarg_dbreg_t *)arg;
2120 * we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w
2121 * ensuring that no real breakpoint can be installed via this call.
2124 first_time = ctx->ctx_fl_using_dbreg == 0;
2127 * check for debug registers in system wide mode
2131 if (ctx->ctx_fl_system && first_time) {
2132 if (pfm_sessions.pfs_ptrace_use_dbregs)
2135 pfm_sessions.pfs_sys_use_dbregs++;
2139 if (ret != 0) return ret;
2141 if (ctx->ctx_fl_system) {
2142 /* we mark ourselves as owner of the debug registers */
2143 ctx->ctx_fl_using_dbreg = 1;
2144 DBprintk(("system-wide setting fl_using_dbreg for [%d]\n", task->pid));
2145 } else if (first_time) {
2147 if ((thread->flags & IA64_THREAD_DBG_VALID) != 0) {
2148 DBprintk(("debug registers already in use for [%d]\n", task->pid));
2151 /* we mark ourselves as owner of the debug registers */
2152 ctx->ctx_fl_using_dbreg = 1;
2154 DBprintk(("setting fl_using_dbreg for [%d]\n", task->pid));
2156 * Given debug registers cannot be used for both debugging
2157 * and performance monitoring at the same time, we reuse
2158 * the storage area to save and restore the registers on ctxsw.
2160 memset(task->thread.dbr, 0, sizeof(task->thread.dbr));
2161 memset(task->thread.ibr, 0, sizeof(task->thread.ibr));
2165 DBprintk(("[%d] clearing ibrs,dbrs\n", task->pid));
2167 * clear hardware registers to make sure we don't
2168 * pick up stale state.
2170 * for a system wide session, we do not use
2171 * thread.dbr, thread.ibr because this process
2172 * never leaves the current CPU and the state
2173 * is shared by all processes running on it
2175 for (i=0; i < (int) pmu_conf.num_ibrs; i++) {
2176 ia64_set_ibr(i, 0UL);
2179 for (i=0; i < (int) pmu_conf.num_dbrs; i++) {
2180 ia64_set_dbr(i, 0UL);
2188 * Now install the values into the registers
2190 for (i = 0; i < count; i++, req++) {
2192 if (__copy_from_user(&tmp, req, sizeof(tmp))) goto abort_mission;
2194 rnum = tmp.dbreg_num;
2195 dbreg.val = tmp.dbreg_value;
2199 if ((mode == 0 && !IBR_IS_IMPL(rnum)) || ((mode == 1) && !DBR_IS_IMPL(rnum))) {
2200 DBprintk(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
2201 rnum, dbreg.val, mode, i, count));
2207 * make sure we do not install an enabled breakpoint
2211 dbreg.ibr.ibr_x = 0;
2213 dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
2217 * clear return flags and copy back to user
2219 * XXX: fix once EAGAIN is implemented
2223 PFM_REG_RETFLAG_SET(tmp.dbreg_flags, 0);
2225 if (__copy_to_user(req, &tmp, sizeof(tmp))) goto abort_mission;
2228 * Debug registers, just like PMCs, can only be modified
2229 * by a kernel call. Moreover, perfmon() accesses to those
2230 * registers are centralized in this routine. The hardware
2231 * does not modify the value of these registers, therefore,
2232 * if we save them as they are written, we can avoid having
2233 * to save them on context switch out. This is made possible
2234 * by the fact that when perfmon uses debug registers, ptrace()
2235 * won't be able to modify them concurrently.
2238 CTX_USED_IBR(ctx, rnum);
2240 ia64_set_ibr(rnum, dbreg.val);
2243 thread->ibr[rnum] = dbreg.val;
2245 DBprintk(("write ibr%u=0x%lx used_ibrs=0x%lx\n", rnum, dbreg.val, ctx->ctx_used_ibrs[0]));
2247 CTX_USED_DBR(ctx, rnum);
2249 ia64_set_dbr(rnum, dbreg.val);
2252 thread->dbr[rnum] = dbreg.val;
2254 DBprintk(("write dbr%u=0x%lx used_dbrs=0x%lx\n", rnum, dbreg.val, ctx->ctx_used_dbrs[0]));
2262 * in case it was our first attempt, we undo the global modifications
2266 if (ctx->ctx_fl_system) {
2267 pfm_sessions.pfs_sys_use_dbregs--;
2270 ctx->ctx_fl_using_dbreg = 0;
2273 * install error return flag
2275 if (ret != -EFAULT) {
2277 * XXX: for now we can only come here on EINVAL
2279 PFM_REG_RETFLAG_SET(tmp.dbreg_flags, PFM_REG_RETFL_EINVAL);
2280 if (__put_user(tmp.dbreg_flags, &req->dbreg_flags)) ret = -EFAULT;
2286 pfm_write_ibrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2287 struct pt_regs *regs)
2289 /* we don't quite support this right now */
2290 if (task != current) return -EINVAL;
2292 if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
2294 return pfm_write_ibr_dbr(0, task, arg, count, regs);
2298 pfm_write_dbrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2299 struct pt_regs *regs)
2301 /* we don't quite support this right now */
2302 if (task != current) return -EINVAL;
2304 if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
2306 return pfm_write_ibr_dbr(1, task, arg, count, regs);
2309 #endif /* PFM_PMU_USES_DBR */
2312 pfm_get_features(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
2314 pfarg_features_t tmp;
2316 memset(&tmp, 0, sizeof(tmp));
2318 tmp.ft_version = PFM_VERSION;
2319 tmp.ft_smpl_version = PFM_SMPL_VERSION;
2321 if (__copy_to_user(arg, &tmp, sizeof(tmp))) return -EFAULT;
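/*
 * Editor's usage sketch (not in the original source): querying the interface
 * version from user level. Assumes the ia64 perfmonctl() wrapper and the
 * PFM_GET_FEATURES command name from <asm/perfmon.h>; the major/minor packing
 * (major in the upper 16 bits) is also an assumption:
 *
 *	pfarg_features_t ft;
 *	memset(&ft, 0, sizeof(ft));
 *	if (perfmonctl(0, PFM_GET_FEATURES, &ft, 1) == 0)
 *		printf("perfmon v%lu.%lu\n",
 *		       (unsigned long)(ft.ft_version >> 16),
 *		       (unsigned long)(ft.ft_version & 0xffff));
 */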
2327 pfm_start(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2328 struct pt_regs *regs)
2330 /* we don't quite support this right now */
2331 if (task != current) return -EINVAL;
2334 * Cannot do anything before PMU is enabled
2336 if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
2338 DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
2340 ctx->ctx_fl_system, PMU_OWNER(),
2343 if (PMU_OWNER() != task) {
2344 printk(KERN_DEBUG "perfmon: pfm_start task [%d] not pmu owner\n", task->pid);
2349 if (ctx->ctx_fl_system) {
2351 PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP);
2353 /* set user level psr.pp */
2354 ia64_psr(regs)->pp = 1;
2356 /* start monitoring at kernel level */
2360 ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);
2365 if ((task->thread.flags & IA64_THREAD_PM_VALID) == 0) {
2367 printk(KERN_DEBUG "perfmon: pfm_start task flag not set for [%d]\n",
2371 /* set user level psr.up */
2372 ia64_psr(regs)->up = 1;
2374 /* start monitoring at kernel level */
2385 pfm_enable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2386 struct pt_regs *regs)
2390 /* we don't quite support this right now */
2391 if (task != current) return -EINVAL;
2393 me = get_cpu(); /* make sure we're not migrated or preempted */
2395 if (ctx->ctx_fl_system == 0 && PMU_OWNER() && PMU_OWNER() != current)
2396 pfm_lazy_save_regs(PMU_OWNER());
2398 /* reset all registers to stable quiet state */
2399 pfm_reset_pmu(task);
2401 /* make sure nothing starts */
2402 if (ctx->ctx_fl_system) {
2403 ia64_psr(regs)->pp = 0;
2404 ia64_psr(regs)->up = 0; /* just to make sure! */
2406 /* make sure monitoring is stopped */
2410 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
2411 PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE);
2412 if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE);
2415 * needed in case the task was a passive task during
2416 * a system wide session and now wants its own session
2419 ia64_psr(regs)->pp = 0; /* just to make sure! */
2420 ia64_psr(regs)->up = 0;
2422 /* make sure monitoring is stopped */
2426 DBprintk(("clearing psr.sp for [%d]\n", current->pid));
2428 /* allow user level control */
2429 ia64_psr(regs)->sp = 0;
2431 /* PMU state will be saved/restored on ctxsw */
2432 task->thread.flags |= IA64_THREAD_PM_VALID;
2435 SET_PMU_OWNER(task);
2437 ctx->ctx_flags.state = PFM_CTX_ENABLED;
2438 atomic_set(&ctx->ctx_last_cpu, me);
2440 /* simply unfreeze */
2449 pfm_get_pmc_reset(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2450 struct pt_regs *regs)
2452 pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
2454 int i, ret = -EINVAL;
2456 for (i = 0; i < count; i++, req++) {
2458 if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
2462 if (!PMC_IS_IMPL(cnum)) goto abort_mission;
2464 tmp.reg_value = PMC_DFL_VAL(cnum);
2466 PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
2468 DBprintk(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, tmp.reg_value));
2470 if (__copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
2474 PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
2475 if (__copy_to_user(req, &tmp, sizeof(tmp))) ret = -EFAULT;
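/*
 * Editor's usage sketch (not in the original source): reading the power-on
 * default of a PMC from user level; PFM_GET_PMC_RESET_VAL is the command
 * name assumed for index 15 of the table below:
 *
 *	pfarg_reg_t req;
 *	memset(&req, 0, sizeof(req));
 *	req.reg_num = 4;		// first counting PMC on Itanium
 *	if (perfmonctl(0, PFM_GET_PMC_RESET_VAL, &req, 1) == 0)
 *		printf("pmc4 default = 0x%lx\n", req.reg_value);
 */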
2481 * functions MUST be listed in the increasing order of their index (see perfmon.h)
2483 static pfm_cmd_desc_t pfm_cmd_tab[]={
2484 /* 0 */{ NULL, 0, 0, 0}, /* not used */
2485 /* 1 */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2486 /* 2 */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2487 /* 3 */{ pfm_read_pmds,PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2488 /* 4 */{ pfm_stop, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2489 /* 5 */{ pfm_start, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2490 /* 6 */{ pfm_enable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2491 /* 7 */{ pfm_disable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2492 /* 8 */{ pfm_context_create, PFM_CMD_PID|PFM_CMD_ARG_RW, 1, sizeof(pfarg_context_t)},
2493 /* 9 */{ pfm_context_destroy, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2494 /* 10 */{ pfm_restart, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_NOCHK, 0, 0},
2495 /* 11 */{ pfm_protect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2496 /* 12 */{ pfm_get_features, PFM_CMD_ARG_RW, 0, 0},
2497 /* 13 */{ pfm_debug, 0, 1, sizeof(unsigned int)},
2498 /* 14 */{ pfm_context_unprotect, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2499 /* 15 */{ pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2500 /* 16 */{ NULL, 0, 0, 0}, /* not used */
2501 /* 17 */{ NULL, 0, 0, 0}, /* not used */
2502 /* 18 */{ NULL, 0, 0, 0}, /* not used */
2503 /* 19 */{ NULL, 0, 0, 0}, /* not used */
2504 /* 20 */{ NULL, 0, 0, 0}, /* not used */
2505 /* 21 */{ NULL, 0, 0, 0}, /* not used */
2506 /* 22 */{ NULL, 0, 0, 0}, /* not used */
2507 /* 23 */{ NULL, 0, 0, 0}, /* not used */
2508 /* 24 */{ NULL, 0, 0, 0}, /* not used */
2509 /* 25 */{ NULL, 0, 0, 0}, /* not used */
2510 /* 26 */{ NULL, 0, 0, 0}, /* not used */
2511 /* 27 */{ NULL, 0, 0, 0}, /* not used */
2512 /* 28 */{ NULL, 0, 0, 0}, /* not used */
2513 /* 29 */{ NULL, 0, 0, 0}, /* not used */
2514 /* 30 */{ NULL, 0, 0, 0}, /* not used */
2515 /* 31 */{ NULL, 0, 0, 0}, /* not used */
2516 #ifdef PFM_PMU_USES_DBR
2517 /* 32 */{ pfm_write_ibrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)},
2518 /* 33 */{ pfm_write_dbrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)}
2521 #define PFM_CMD_COUNT ARRAY_SIZE(pfm_cmd_tab)
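/*
 * Editor's usage sketch (not in the original source): the typical
 * self-monitoring sequence through the table above. Command names are
 * assumed from <asm/perfmon.h> (indices 8, 1, 2, 6, 5, 4, 3, 9); the event
 * encoding written into the PMC is PMU specific and left symbolic:
 *
 *	pfarg_context_t c;
 *	pfarg_reg_t pc, pd;
 *	unsigned long event_sel = MY_EVENT_ENCODING;	// hypothetical encoding
 *
 *	memset(&c, 0, sizeof(c));
 *	memset(&pc, 0, sizeof(pc));
 *	memset(&pd, 0, sizeof(pd));
 *
 *	perfmonctl(getpid(), PFM_CREATE_CONTEXT, &c, 1);	// 8: attach context
 *	pc.reg_num = 4; pc.reg_value = event_sel;
 *	perfmonctl(getpid(), PFM_WRITE_PMCS, &pc, 1);		// 1: program PMC4
 *	pd.reg_num = 4; pd.reg_value = 0UL;
 *	perfmonctl(getpid(), PFM_WRITE_PMDS, &pd, 1);		// 2: clear PMD4
 *	perfmonctl(getpid(), PFM_ENABLE, NULL, 0);		// 6: load PMU state
 *	perfmonctl(getpid(), PFM_START, NULL, 0);		// 5: set psr.up
 *	// ... workload ...
 *	perfmonctl(getpid(), PFM_STOP, NULL, 0);		// 4
 *	perfmonctl(getpid(), PFM_READ_PMDS, &pd, 1);		// 3: 64-bit soft value
 *	perfmonctl(getpid(), PFM_DESTROY_CONTEXT, NULL, 0);	// 9
 */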
2524 check_task_state(struct task_struct *task)
2528 /* We must wait until the state has been completely
2529 * saved. There can be situations where the reader arrives
2530 * after the task is marked as STOPPED but before pfm_save_regs() has completed.
2533 if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) return -EBUSY;
2534 DBprintk(("before wait_task_inactive [%d] state %ld\n", task->pid, task->state));
2535 wait_task_inactive(task);
2536 DBprintk(("after wait_task_inactive [%d] state %ld\n", task->pid, task->state));
2538 if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) {
2539 DBprintk(("warning [%d] not in stable state %ld\n", task->pid, task->state));
2547 sys_perfmonctl (pid_t pid, int cmd, void *arg, int count, long arg5, long arg6, long arg7,
2548 long arg8, long stack)
2550 struct pt_regs *regs = (struct pt_regs *)&stack;
2551 struct task_struct *task = current;
2557 * reject any call if perfmon was disabled at initialization time
2559 if (PFM_IS_DISABLED()) return -ENOSYS;
2561 DBprintk(("cmd=%d idx=%d valid=%d narg=0x%x\n", cmd, PFM_CMD_IDX(cmd),
2562 PFM_CMD_IS_VALID(cmd), PFM_CMD_NARG(cmd)));
2564 if (PFM_CMD_IS_VALID(cmd) == 0) return -EINVAL;
2566 /* ignore arguments when command has none */
2567 narg = PFM_CMD_NARG(cmd);
2568 if ((narg == PFM_CMD_ARG_MANY && count == 0) || (narg > 0 && narg != count)) return -EINVAL;
2570 sz = PFM_CMD_ARG_SIZE(cmd);
2572 if (PFM_CMD_READ_ARG(cmd) && !access_ok(VERIFY_READ, arg, sz*count)) return -EFAULT;
2574 if (PFM_CMD_RW_ARG(cmd) && !access_ok(VERIFY_WRITE, arg, sz*count)) return -EFAULT;
2576 if (PFM_CMD_USE_PID(cmd)) {
2578 * XXX: may need to fine tune this one
2580 if (pid < 2) return -EPERM;
2582 if (pid != current->pid) {
2586 read_lock(&tasklist_lock);
2588 task = find_task_by_pid(pid);
2590 if (task) get_task_struct(task);
2592 read_unlock(&tasklist_lock);
2594 if (!task) goto abort_call;
2598 if (pfm_bad_permissions(task)) goto abort_call;
2600 if (PFM_CMD_CHK(cmd)) {
2601 ret = check_task_state(task);
2602 if (ret != 0) goto abort_call;
2607 ctx = task->thread.pfm_context;
2609 if (PFM_CMD_USE_CTX(cmd)) {
2612 DBprintk(("no context for task %d\n", task->pid));
2617 * we only grant access to the context if:
2618 * - the caller is the creator of the context (ctx_owner)
2619 * OR - the context is attached to the caller AND the context is NOT protected
2622 if (ctx->ctx_owner != current && (ctx->ctx_fl_protected || task != current)) {
2623 DBprintk(("context protected, no access for [%d]\n", task->pid));
2628 ret = (*pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func)(task, ctx, arg, count, regs);
2631 if (task && task != current) put_task_struct(task);
2637 * send SIGPROF to the registered notify task; must be invoked when it
2638 * is safe to send a signal, e.g., not holding any runqueue locks
2642 pfm_notify_user(pfm_context_t *ctx)
2647 if (ctx->ctx_notify_task == NULL) {
2648 DBprintk(("[%d] no notifier\n", current->pid));
2654 si.si_pid = current->pid; /* who is sending */
2655 si.si_signo = SIGPROF;
2656 si.si_code = PROF_OVFL;
2658 si.si_pfm_ovfl[0] = ctx->ctx_ovfl_regs[0];
2661 * when the target of the signal is not ourselves, we have to be more
2662 * careful. The notify_task may be cleared by the target task itself
2663 * in release_thread(). We must ensure mutual exclusion here such that
2664 * the signal is delivered (even to a dying task) safely.
2667 if (ctx->ctx_notify_task != current) {
2669 * grab the notification lock for this task
2670 * This guarantees that the sequence: test + send_signal
2671 * is atomic with regards to the ctx_notify_task field.
2673 * We need a spinlock and not just an atomic variable for this.
2676 spin_lock(&ctx->ctx_lock);
2679 * now notify_task cannot be modified until we're done
2680 * if NULL, then it got modified while we were in the handler
2682 if (ctx->ctx_notify_task == NULL) {
2684 spin_unlock(&ctx->ctx_lock);
2687 * If we've lost the notified task, then we will run
2688 * to completion but keep the PMU frozen. Results
2689 * will be incorrect anyway. We do not kill the task,
2690 * to leave open the possibility of attaching a perfmon
2691 * context to an already running task.
2693 printk("perfmon: pfm_notify_user() lost notify_task\n");
2694 DBprintk_ovfl(("notification task has disappeared !\n"));
2696 /* we cannot afford to block now */
2697 ctx->ctx_fl_block = 0;
2703 * required by send_sig_info() to make sure the target
2704 * task does not disappear on us.
2706 read_lock(&tasklist_lock);
2709 * in this case, we don't stop the task, we let it go on. It will
2710 * necessarily go to the signal handler (if any) when it goes back to user mode.
2713 DBprintk_ovfl(("[%d] sending notification to [%d]\n",
2714 current->pid, ctx->ctx_notify_task->pid));
2717 * this call is safe in an interrupt handler, as is read_lock() on tasklist_lock
2719 ret = send_sig_info(SIGPROF, &si, ctx->ctx_notify_task);
2721 printk("perfmon: send_sig_info(process %d, SIGPROF)=%d\n",
2722 ctx->ctx_notify_task->pid, ret);
2726 * now undo the protections in order
2728 if (ctx->ctx_notify_task != current) {
2729 read_unlock(&tasklist_lock);
2730 spin_unlock(&ctx->ctx_lock);
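/*
 * Editor's sketch (not in the original source) of the receiving side: the
 * notify task installs an SA_SIGINFO handler and reads the bitmask of
 * overflowed PMDs from si_pfm_ovfl[0], matching the siginfo fields filled
 * in above:
 *
 *	static void ovfl_handler(int sig, siginfo_t *info, void *uctx)
 *	{
 *		unsigned long ovfl = info->si_pfm_ovfl[0];	// bit i => pmd[i]
 *		// consume the data, then release the monitored task (PFM_RESTART)
 *	}
 *
 *	struct sigaction sa;
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_sigaction = ovfl_handler;
 *	sa.sa_flags     = SA_SIGINFO;
 *	sigaction(SIGPROF, &sa, NULL);
 */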
2736 pfm_ovfl_block_reset(void)
2738 struct thread_struct *th = &current->thread;
2739 pfm_context_t *ctx = current->thread.pfm_context;
2740 unsigned int reason;
2744 * clear the flag, to make sure we won't get here
2747 th->pfm_ovfl_block_reset = 0;
2748 clear_thread_flag(TIF_NOTIFY_RESUME);
2751 * do some sanity checks first
2754 printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
2758 * extract reason for being here and clear
2760 reason = ctx->ctx_fl_trap_reason;
2761 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
2763 DBprintk(("[%d] reason=%d\n", current->pid, reason));
2766 * just here for a reset (non-blocking context only)
2768 if (reason == PFM_TRAP_REASON_RESET) goto non_blocking;
2771 * first notify user. This can fail if notify_task has disappeared.
2773 if (reason == PFM_TRAP_REASON_SIG || reason == PFM_TRAP_REASON_BLOCKSIG) {
2774 ret = pfm_notify_user(ctx);
2779 * came here just to signal (non-blocking)
2781 if (reason == PFM_TRAP_REASON_SIG) return;
2783 DBprintk(("[%d] before sleeping\n", current->pid));
2786 * may go through without blocking on SMP systems
2787 * if restart has been received already by the time we call down()
2789 ret = down_interruptible(&ctx->ctx_restart_sem);
2791 DBprintk(("[%d] after sleeping ret=%d\n", current->pid, ret));
2794 * in case of interruption of down() we don't restart anything
2799 /* we reactivate on context switch */
2800 ctx->ctx_fl_frozen = 0;
2802 * the ovfl_sem is cleared by the restart task and this is safe because we always
2803 * use the local reference
2806 pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
2808 ctx->ctx_ovfl_regs[0] = 0UL;
2811 * Unlock sampling buffer and reset index atomically
2812 * XXX: not really needed when blocking
2814 if (CTX_HAS_SMPL(ctx)) {
2815 ctx->ctx_psb->psb_hdr->hdr_count = 0;
2816 ctx->ctx_psb->psb_index = 0;
2821 /* state restored, can go back to work (user mode) */
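/*
 * Editor's sketch (not in the original source) of the other half of this
 * handshake: once the notified task has consumed the overflow data, it
 * releases the blocked (or reset-pending) monitored task with PFM_RESTART
 * (index 10 in the command table), which ups ctx_restart_sem and leads to
 * the long reset performed above:
 *
 *	// in the monitoring tool, typically from the SIGPROF handler:
 *	perfmonctl(monitored_pid, PFM_RESTART, NULL, 0);	// monitored_pid: hypothetical
 */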
2826 * This function will record an entry in the sampling buffer if it is not full already.
2828 * 0 : buffer is not full (did not BECOME full: still space or was already full)
2829 * 1 : buffer is full (recorded the last entry)
2832 pfm_record_sample(struct task_struct *task, pfm_context_t *ctx, unsigned long ovfl_mask, struct pt_regs *regs)
2834 pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;
2835 unsigned long *e, m, idx;
2836 perfmon_smpl_entry_t *h;
2840 idx = ia64_fetch_and_add(1, &psb->psb_index);
2841 DBprintk_ovfl(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));
2844 * XXX: there is a small chance that we could run out of index values before resetting,
2845 * but index is an unsigned long, so it would take some time.
2846 * We use > instead of == because fetch_and_add() is off by one (see below)
2848 * This case can happen in non-blocking mode or with multiple processes.
2849 * For non-blocking, we need to reload and continue.
2851 if (idx > psb->psb_entries) return 0;
2853 /* first entry is really entry 0, not 1 caused by fetch_and_add */
2856 h = (perfmon_smpl_entry_t *)(((char *)psb->psb_addr) + idx*(psb->psb_entry_size));
2859 * initialize entry header
2861 h->pid = current->pid;
2863 h->last_reset_value = ovfl_mask ? ctx->ctx_soft_pmds[ffz(~ovfl_mask)].lval : 0UL;
2864 h->ip = regs ? regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3): 0x0UL;
2865 h->regs = ovfl_mask; /* which registers overflowed */
2867 /* guaranteed to monotonically increase on each cpu */
2868 h->stamp = pfm_get_stamp();
2870 /* position for first pmd */
2871 e = (unsigned long *)(h+1);
2874 * selectively store PMDs in increasing index number
2876 m = ctx->ctx_smpl_regs[0];
2877 for (j=0; m; m >>=1, j++) {
2879 if ((m & 0x1) == 0) continue;
2881 if (PMD_IS_COUNTING(j)) {
2882 *e = pfm_read_soft_counter(ctx, j);
2884 *e = ia64_get_pmd(j); /* slow */
2886 DBprintk_ovfl(("e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
2889 pfm_stats[h->cpu].pfm_recorded_samples_count++;
2892 * make the new entry visible to user, needs to be atomic
2894 ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count);
2896 DBprintk_ovfl(("index=%ld entries=%ld hdr_count=%ld\n",
2897 idx, psb->psb_entries, psb->psb_hdr->hdr_count));
2899 * sampling buffer full ?
2901 if (idx == (psb->psb_entries-1)) {
2902 DBprintk_ovfl(("sampling buffer full\n"));
2904 * XXX: must reset buffer in blocking mode and when the notified task is lost
2906 pfm_stats[h->cpu].pfm_full_smpl_buffer_count++;
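/*
 * Editor's sketch (not in the original source): how user level would walk
 * the buffer published by this function. hdr_count is the atomically
 * incremented entry count; the hdr_entry_size field and exact header layout
 * are assumptions from the perfmon.h of this era:
 *
 *	perfmon_smpl_hdr_t *hdr = smpl_vaddr;	// vaddr returned at context creation
 *	char *pos = (char *)(hdr + 1);
 *	unsigned long k;
 *
 *	for (k = 0; k < hdr->hdr_count; k++) {
 *		perfmon_smpl_entry_t *ent = (perfmon_smpl_entry_t *)pos;
 *		printf("pid=%d ip=0x%lx ovfl=0x%lx\n", ent->pid, ent->ip, ent->regs);
 *		pos += hdr->hdr_entry_size;	// entry header + selected PMD values
 *	}
 */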
2915 * main overflow processing routine.
2916 * it can be called from the interrupt path or explicitly during the context switch code
2918 * mode: 0=coming from PMU interrupt, 1=coming from ctxsw
2921 * new value of pmc[0]. if 0x0 then unfreeze, else keep frozen
2923 static unsigned long
2924 pfm_overflow_handler(int mode, struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
2926 struct thread_struct *t;
2928 unsigned long old_val;
2929 unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL;
2933 * It is never safe to access the task for which the overflow interrupt is destined
2934 * using the current variable as the interrupt may occur in the middle of a context switch
2935 * where current does not hold the task that is running yet.
2937 * For monitoring, however, we do need to get access to the task which caused the overflow
2938 * to account for overflow on the counters.
2940 * We accomplish this by maintaining a current owner of the PMU per CPU. During context
2941 * switch the ownership is changed in a way such that the reflected owner is always the
2942 * valid one, i.e. the one that caused the interrupt.
2951 * This should not happen, given the upfront tests
2953 if ((t->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system == 0) {
2954 printk(KERN_DEBUG "perfmon: Spurious overflow interrupt: process %d not "
2955 "using perfmon\n", task->pid);
2956 preempt_enable_no_resched();
2960 * sanity test. Should never happen
2962 if ((pmc0 & 0x1) == 0) {
2963 printk(KERN_DEBUG "perfmon: pid %d pmc0=0x%lx assumption error for freeze bit\n",
2965 preempt_enable_no_resched();
2969 mask = pmc0 >> PMU_FIRST_COUNTER;
2971 DBprintk_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
2972 " mode used_pmds=0x%lx used_pmcs=0x%lx reload_pmcs=0x%lx\n",
2973 pmc0, task->pid, (regs ? regs->cr_iip : 0),
2974 CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
2975 ctx->ctx_used_pmds[0],
2976 ctx->ctx_used_pmcs[0],
2977 ctx->ctx_reload_pmcs[0]));
2980 * First we update the virtual counters
2982 for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) {
2984 /* skip pmd which did not overflow */
2985 if ((mask & 0x1) == 0) continue;
2987 DBprintk_ovfl(("pmd[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n",
2988 i, ia64_get_pmd(i), ctx->ctx_soft_pmds[i].val));
2991 * Note that the pmd is not necessarily 0 at this point as qualified events
2992 * may have happened before the PMU was frozen. The residual count is not
2993 * taken into consideration here but will be with any read of the pmd via pfm_read_pmds().
2996 old_val = ctx->ctx_soft_pmds[i].val;
2997 ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.ovfl_val;
3000 * check for overflow condition
3002 if (old_val > ctx->ctx_soft_pmds[i].val) {
3004 ovfl_pmds |= 1UL << i;
3006 if (PMC_OVFL_NOTIFY(ctx, i)) {
3007 ovfl_notify |= 1UL << i;
3010 DBprintk_ovfl(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n",
3011 i, ctx->ctx_soft_pmds[i].val, old_val,
3012 ia64_get_pmd(i) & pmu_conf.ovfl_val, ovfl_pmds, ovfl_notify));
3016 * check for sampling buffer
3018 * if present, record sample only when a 64-bit counter has overflowed.
3019 * We propagate notification ONLY when buffer becomes full.
3021 if (CTX_HAS_SMPL(ctx) && ovfl_pmds) {
3022 ret = pfm_record_sample(task, ctx, ovfl_pmds, regs);
3025 * Sampling buffer became full
3026 * If no notification was requested, then we reset buffer index
3027 * and reset registers (done below) and resume.
3028 * If notification requested, then defer reset until pfm_restart()
3030 if (ovfl_notify == 0UL) {
3031 ctx->ctx_psb->psb_hdr->hdr_count = 0UL;
3032 ctx->ctx_psb->psb_index = 0UL;
3036 * sample recorded in buffer, no need to notify user
3043 * No overflow requiring a user level notification
3045 if (ovfl_notify == 0UL) {
3047 pfm_reset_regs(ctx, &ovfl_pmds, PFM_PMD_SHORT_RESET);
3048 preempt_enable_no_resched();
3053 * keep track of what to reset when unblocking
3055 ctx->ctx_ovfl_regs[0] = ovfl_pmds;
3057 DBprintk_ovfl(("block=%d notify [%d] current [%d]\n",
3059 ctx->ctx_notify_task ? ctx->ctx_notify_task->pid: -1,
3063 * ctx_notify_task could already be NULL, checked in pfm_notify_user()
3065 if (CTX_OVFL_NOBLOCK(ctx) == 0 && ctx->ctx_notify_task != task) {
3066 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCKSIG;
3068 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_SIG;
3071 * we cannot block in system wide mode and we do not go
3072 * through the PMU ctxsw code. Therefore we can generate
3073 * the notification here. In system wide mode, the current
3074 * task may be different from the task controlling the session
3075 * on this CPU, therefore owner can be different from current.
3077 * In per-process mode, this function gets called from
3078 * the interrupt handler or pfm_load_regs(). The mode argument
3079 * tells where we are coming from. When coming from the interrupt
3080 * handler, it is safe to notify (send signal) right here because
3081 * we do not hold any runqueue locks needed by send_sig_info().
3083 * However when coming from ctxsw, we cannot send the signal here.
3084 * It must be deferred until we are sure we do not hold any runqueue
3085 * related locks. The current task may be different from the owner
3086 * only in UP mode. The deferral is implemented using the
3087 * TIF_NOTIFY_RESUME mechanism. In this case, the pending work
3088 * is checked when the task is about to leave the kernel (see
3089 * entry.S). As of this version of perfmon, a kernel-only
3090 * task cannot be monitored in per-process mode. Therefore,
3091 * when this function gets called from pfm_load_regs(), we know
3092 * we have a user level task which will eventually either exit
3093 * or leave the kernel, and thereby go through the checkpoint
3096 if (ctx->ctx_fl_system || mode == 0) {
3097 pfm_notify_user(ctx);
3098 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
3100 struct thread_info *info;
3103 * given that TIF_NOTIFY_RESUME is not specific to
3104 * perfmon, we need to have a second level check to
3105 * verify the source of the notification.
3107 task->thread.pfm_ovfl_block_reset = 1;
3109 * when coming from ctxsw, current still points to the
3110 * previous task, therefore we must work with task and not current.
3112 info = ((struct thread_info *) ((char *) task + IA64_TASK_SIZE));
3113 set_bit(TIF_NOTIFY_RESUME, &info->flags);
3117 * keep the PMU frozen until either pfm_restart() or
3118 * task completes (non-blocking or notify_task gone).
3120 ctx->ctx_fl_frozen = 1;
3122 DBprintk_ovfl(("current [%d] owner [%d] mode=%d return pmc0=0x%x must_block=%ld reason=%d\n",
3124 PMU_OWNER() ? PMU_OWNER()->pid : -1,
3126 ctx->ctx_fl_frozen ? 0x1 : 0x0,
3127 t->pfm_ovfl_block_reset,
3128 ctx->ctx_fl_trap_reason));
3130 preempt_enable_no_resched();
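/*
 * Editor's worked example (not in the original source) of the counter
 * virtualization performed above: with generic 47-bit counters,
 * ovfl_val = (1UL << 47) - 1, so each hardware wrap adds
 * 1 + ovfl_val = 2^47 to the 64-bit software count, and a reader
 * reconstructs the full value as
 *
 *	value = ctx->ctx_soft_pmds[i].val + (ia64_get_pmd(i) & ovfl_val);
 *
 * e.g. one wrap with the hardware counter back at 5 yields
 * value = 2^47 + 5 = 140737488355333.
 */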
3135 pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
3138 struct task_struct *task;
3141 pfm_stats[get_cpu()].pfm_ovfl_intr_count++;
3144 * if an alternate handler is registered, just bypass the default one
3146 if (pfm_alternate_intr_handler) {
3147 (*pfm_alternate_intr_handler->handler)(irq, arg, regs);
3153 * srlz.d done before arriving here
3157 pmc0 = ia64_get_pmc(0);
3160 * if we have some pending bits set
3161 * assumes : if any PMC[0].bit[63-1] is set, then PMC[0].fr = 1
3163 if ((pmc0 & ~0x1UL) != 0UL && (task = PMU_OWNER()) != NULL) {
3165 * we assume that pmc0.fr is always set here
3167 ctx = task->thread.pfm_context;
3171 printk(KERN_DEBUG "perfmon: Spurious overflow interrupt: process %d has "
3172 "no PFM context\n", task->pid);
3178 * assume PMC[0].fr = 1 at this point
3180 pmc0 = pfm_overflow_handler(0, task, ctx, pmc0, regs);
3182 * we can only update pmc0 when the overflow
3183 * is for the current context or we are in system
3184 * wide mode. In UP (per-task) the current
3185 * task may not be the one owning the PMU,
3186 * same thing for system-wide.
3188 if (task == current || ctx->ctx_fl_system) {
3190 * We always clear the overflow status bits and either unfreeze
3191 * or keep the PMU frozen.
3193 ia64_set_pmc(0, pmc0);
3196 task->thread.pmc[0] = pmc0;
3199 pfm_stats[smp_processor_id()].pfm_spurious_ovfl_intr_count++;
3201 put_cpu_no_resched();
3205 /* for debug only */
3207 pfm_proc_info(char *page)
3212 p += sprintf(p, "fastctxsw : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No");
3213 p += sprintf(p, "ovfl_mask : 0x%lx\n", pmu_conf.ovfl_val);
3215 for(i=0; i < NR_CPUS; i++) {
3216 if (cpu_online(i) == 0) continue;
3217 p += sprintf(p, "CPU%-2d overflow intrs : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_count);
3218 p += sprintf(p, "CPU%-2d spurious intrs : %lu\n", i, pfm_stats[i].pfm_spurious_ovfl_intr_count);
3219 p += sprintf(p, "CPU%-2d recorded samples : %lu\n", i, pfm_stats[i].pfm_recorded_samples_count);
3220 p += sprintf(p, "CPU%-2d smpl buffer full : %lu\n", i, pfm_stats[i].pfm_full_smpl_buffer_count);
3221 p += sprintf(p, "CPU%-2d syst_wide : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_SYST_WIDE ? 1 : 0);
3222 p += sprintf(p, "CPU%-2d dcr_pp : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_DCR_PP ? 1 : 0);
3223 p += sprintf(p, "CPU%-2d exclude idle : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0);
3224 p += sprintf(p, "CPU%-2d owner : %d\n", i, pmu_owners[i].owner ? pmu_owners[i].owner->pid: -1);
3229 p += sprintf(p, "proc_sessions : %u\n"
3230 "sys_sessions : %u\n"
3231 "sys_use_dbregs : %u\n"
3232 "ptrace_use_dbregs : %u\n",
3233 pfm_sessions.pfs_task_sessions,
3234 pfm_sessions.pfs_sys_sessions,
3235 pfm_sessions.pfs_sys_use_dbregs,
3236 pfm_sessions.pfs_ptrace_use_dbregs);
3243 /* /proc interface, for debug only */
3245 perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
3247 int len = pfm_proc_info(page);
3249 if (len <= off+count) *eof = 1;
3251 *start = page + off;
3254 if (len>count) len = count;
3261 * we come here as soon as PFM_CPUINFO_SYST_WIDE is set. This happens
3262 * during pfm_enable() hence before pfm_start(). We cannot assume monitoring
3263 * is active or inactive based on mode. We must rely on the value in
3264 * cpu_data(i)->pfm_syst_info
3267 pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin)
3269 struct pt_regs *regs;
3271 unsigned long dcr_pp;
3274 dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0;
3277 * pid 0 is guaranteed to be the idle task. There is one such task with pid 0
3278 * on every CPU, so we can rely on the pid to identify the idle task.
3280 if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
3281 regs = (struct pt_regs *)((unsigned long) task + IA64_STK_OFFSET);
3283 ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
3288 * if monitoring has started
3291 dcr = ia64_get_dcr();
3293 * context switching in?
3296 /* mask monitoring for the idle task */
3297 ia64_set_dcr(dcr & ~IA64_DCR_PP);
3304 * context switching out
3305 * restore monitoring for next task
3307 * Due to inlining, this odd if-then-else construction generates better code.
3310 ia64_set_dcr(dcr | IA64_DCR_PP);
3318 pfm_save_regs (struct task_struct *task)
3327 ctx = task->thread.pfm_context;
3331 * save current PSR: needed because we modify it
3333 psr = pfm_get_psr();
3337 * This is the last instruction which can generate an overflow
3339 * We do not need to set psr.sp because it is irrelevant in the kernel.
3340 * It will be restored from ipsr when going back to user level
3345 ctx->ctx_saved_psr = psr;
3349 * We do not use a lazy scheme in SMP because
3350 * of the new scheduler which masks interrupts
3351 * during low-level context switch. So we save
3352 * all the PMD registers we use and restore them on ctxsw in.
3355 * release ownership of this PMU.
3356 * This must be done before we save the registers.
3358 SET_PMU_OWNER(NULL);
3365 mask = ctx->ctx_used_pmds[0];
3366 for (i=0; mask; i++, mask>>=1) {
3367 if (mask & 0x1) task->thread.pmd[i] = ia64_get_pmd(i);
3373 task->thread.pmc[0] = ia64_get_pmc(0);
3376 * force a full reload
3378 atomic_set(&ctx->ctx_last_cpu, -1);
3384 pfm_lazy_save_regs (struct task_struct *task)
3387 struct thread_struct *t;
3392 DBprintk(("on [%d] by [%d]\n", task->pid, current->pid));
3395 ctx = task->thread.pfm_context;
3398 * do not own the PMU
3400 SET_PMU_OWNER(NULL);
3405 * XXX needs further optimization.
3406 * Also must take holes into account
3408 mask = ctx->ctx_used_pmds[0];
3409 for (i=0; mask; i++, mask>>=1) {
3410 if (mask & 0x1) t->pmd[i] = ia64_get_pmd(i);
3414 t->pmc[0] = ia64_get_pmc(0);
3416 /* not owned by this CPU */
3417 atomic_set(&ctx->ctx_last_cpu, -1);
3422 pfm_load_regs (struct task_struct *task)
3424 struct thread_struct *t;
3426 struct task_struct *owner;
3433 owner = PMU_OWNER();
3434 ctx = task->thread.pfm_context;
3439 printk("perfmon: pfm_load_regs: null ctx for [%d]\n", task->pid);
3444 * we restore ALL the debug registers to avoid picking up stale state.
3447 * This must be done even when the task is still the owner
3448 * as the registers may have been modified via ptrace()
3449 * (not perfmon) by the previous task.
3451 * XXX: dealing with this in a lazy fashion requires modifications
3452 * to the way the debug registers are managed. This will be done
3453 * in the next version of perfmon.
3455 if (ctx->ctx_fl_using_dbreg) {
3456 for (i=0; i < (int) pmu_conf.num_ibrs; i++) {
3457 ia64_set_ibr(i, t->ibr[i]);
3460 for (i=0; i < (int) pmu_conf.num_dbrs; i++) {
3461 ia64_set_dbr(i, t->dbr[i]);
3467 * if we were the last user, then nothing to do except restore psr
3468 * this path cannot be used in SMP
3470 if (owner == task) {
3471 if ((unsigned int) atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
3472 DBprintk(("invalid last_cpu=%d for [%d]\n",
3473 atomic_read(&ctx->ctx_last_cpu), task->pid));
3475 psr = ctx->ctx_saved_psr;
3482 * someone else is still using the PMU, first push it out and
3483 * then we'll be able to install our stuff!
3485 * not possible in SMP
3487 if (owner) pfm_lazy_save_regs(owner);
3490 * To avoid leaking information to the user level when psr.sp=0,
3491 * we must reload ALL implemented pmds (even the ones we don't use).
3492 * In the kernel we only allow PFM_READ_PMDS on registers which
3493 * we initialized or requested (sampling) so there is no risk there.
3495 * As an optimization, we will only reload the PMD that we use when
3496 * the context is in protected mode, i.e. psr.sp=1 because then there
3497 * is no leak possible.
3499 mask = pfm_sysctl.fastctxsw || ctx->ctx_fl_protected ? ctx->ctx_used_pmds[0] : ctx->ctx_reload_pmds[0];
3500 for (i=0; mask; i++, mask>>=1) {
3501 if (mask & 0x1) ia64_set_pmd(i, t->pmd[i] & pmu_conf.ovfl_val);
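/*
 * Editor's note (not in the original source): only the low ovfl_val bits of
 * a counting PMD are kept in hardware; the upper part of the 64-bit count
 * lives in ctx_soft_pmds[].val, hence the masking with ovfl_val on reload.
 */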
3505 * PMC0 is never set in the mask because it is always restored separately.
3508 * ALL PMCs are systematically reloaded, unused registers
3509 * get their default (PAL reset) values to avoid picking up
3510 * stale configuration.
3512 mask = ctx->ctx_reload_pmcs[0];
3513 for (i=0; mask; i++, mask>>=1) {
3514 if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
3518 * manually invoke core interrupt handler
3519 * if the task had a pending overflow when it was ctxsw out.
3520 * Side effect on ctx_fl_frozen is possible.
3522 if (t->pmc[0] & ~0x1) {
3523 t->pmc[0] = pfm_overflow_handler(1, task, ctx, t->pmc[0], NULL);
3527 * unfreeze PMU if possible
3529 if (ctx->ctx_fl_frozen == 0) pfm_unfreeze_pmu();
3531 atomic_set(&ctx->ctx_last_cpu, smp_processor_id());
3533 SET_PMU_OWNER(task);
3536 * restore the psr we changed in pfm_save_regs()
3538 psr = ctx->ctx_saved_psr;
3544 * XXX: make this routine able to work with a non-current context
3547 pfm_reset_pmu(struct task_struct *task)
3549 struct thread_struct *t = &task->thread;
3550 pfm_context_t *ctx = t->pfm_context;
3553 if (task != current) {
3554 printk("perfmon: invalid task in pfm_reset_pmu()\n");
3559 /* Let's make sure the PMU is frozen */
3563 * install reset values for PMCs. We skip PMC0 (done above).
3564 * XXX: good up to 64 PMCs
3566 for (i=1; (pmu_conf.pmc_desc[i].type & PFM_REG_END) == 0; i++) {
3567 if ((pmu_conf.pmc_desc[i].type & PFM_REG_IMPL) == 0) continue;
3568 ia64_set_pmc(i, PMC_DFL_VAL(i));
3570 * When restoring context, we must restore ALL pmcs, even the ones
3571 * that the task does not use to avoid leaks and possibly corruption
3572 * of the session because of configuration conflicts. So here, we
3573 * initialize the entire set used in the context switch restore routine.
3575 t->pmc[i] = PMC_DFL_VAL(i);
3576 DBprintk(("pmc[%d]=0x%lx\n", i, t->pmc[i]));
3580 * clear reset values for PMD.
3581 * XXX: good up to 64 PMDs.
3583 for (i=0; (pmu_conf.pmd_desc[i].type & PFM_REG_END) == 0; i++) {
3584 if ((pmu_conf.pmd_desc[i].type & PFM_REG_IMPL) == 0) continue;
3585 ia64_set_pmd(i, 0UL);
3590 * On context-switch restore, we must restore ALL pmcs and ALL pmds even
3591 * when they are not actively used by the task. In UP, the incoming process
3592 * may otherwise pick up left over PMC, PMD state from the previous process.
3593 * As opposed to PMD, stale PMC can cause harm to the incoming
3594 * process because they may change what is being measured.
3595 * Therefore, we must systematically reinstall the entire
3596 * PMC state. In SMP, the same thing is possible on the
3597 * same CPU but also between 2 CPUs.
3599 * The problem with PMDs is information leaking, especially
3600 * to the user level when psr.sp=0
3602 * There is unfortunately no easy way to avoid this problem
3603 * on either UP or SMP. This definitely slows down the
3604 * pfm_load_regs() function.
3608 * We must include all the PMC in this mask to make sure we don't
3609 * see any side effect of a stale state, such as opcode matching
3610 * or range restrictions, for instance.
3612 * We never directly restore PMC0 so we do not include it in the mask.
3614 ctx->ctx_reload_pmcs[0] = pmu_conf.impl_pmcs[0] & ~0x1;
3616 * We must include all the PMDs in this mask to avoid picking
3617 * up stale values and leaking information, especially directly
3618 * at the user level when psr.sp=0
3620 ctx->ctx_reload_pmds[0] = pmu_conf.impl_pmds[0];
3623 * Keep track of the pmds we want to sample
3624 * XXX: maybe we don't need to save/restore the DEAR/IEAR pmds
3625 * but we do need the BTB for sure. This is because of a hardware
3626 * buffer of 1 only for non-BTB pmds.
3628 * We ignore the unimplemented pmds specified by the user
3630 ctx->ctx_used_pmds[0] = ctx->ctx_smpl_regs[0];
3631 ctx->ctx_used_pmcs[0] = 1; /* always save/restore PMC[0] */
3634 * useful in case of re-enable after disable
3636 ctx->ctx_used_ibrs[0] = 0UL;
3637 ctx->ctx_used_dbrs[0] = 0UL;
3644 * This function is called when a thread exits (from exit_thread()).
3645 * This is a simplified pfm_save_regs() that simply flushes the current
3646 * register state into the save area taking into account any pending
3647 * overflow. This time no notification is sent because the task is dying
3648 * anyway. The inline processing of overflows avoids losing some counts.
3649 * The PMU is frozen on exit from this call and is never to be re-enabled
3650 * again for this task.
3654 pfm_flush_regs (struct task_struct *task)
3658 unsigned long mask2, val;
3661 ctx = task->thread.pfm_context;
3663 if (ctx == NULL) return;
3666 * that's it if context already disabled
3668 if (ctx->ctx_flags.state == PFM_CTX_DISABLED) return;
3673 * This is the only way to stop monitoring without destroying overflow
3674 * information in PMC[0].
3675 * This is the last instruction which can cause an overflow while monitoring is active.
3677 * By now, we could still have an overflow interrupt in-flight.
3679 if (ctx->ctx_fl_system) {
3682 /* disable dcr pp */
3683 ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
3685 /* stop monitoring */
3690 PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE);
3691 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
3692 PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE);
3695 /* stop monitoring */
3700 /* no more save/restore on ctxsw */
3701 current->thread.flags &= ~IA64_THREAD_PM_VALID;
3705 * Mark the PMU as not owned
3706 * This will cause the interrupt handler to do nothing in case an overflow
3707 * interrupt was in-flight
3708 * This also guarantees that pmc0 will contain the final state
3709 * It virtually gives us full control on overflow processing from that point
3711 * It must be an atomic operation.
3713 SET_PMU_OWNER(NULL);
3716 * read current overflow status:
3718 * we are guaranteed to read the final stable state
3721 pmc0 = ia64_get_pmc(0); /* slow */
3726 * This destroys the overflow information. This is required to make sure
3727 * next process does not start with monitoring on if not requested
3732 * We don't need to restore psr, because we are on our way out
3736 * This loop flushes the PMD into the PFM context.
3737 * It also processes overflow inline.
3739 * IMPORTANT: No notification is sent at this point as the process is dying.
3740 * The implicit notification will come from a SIGCHLD or a return from a wait() call.
3745 if ((unsigned int) atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
3746 printk(KERN_DEBUG "perfmon: [%d] last_cpu=%d\n",
3747 task->pid, atomic_read(&ctx->ctx_last_cpu));
3750 * we save all the used pmds
3751 * we take care of overflows for pmds used as counters
3753 mask2 = ctx->ctx_used_pmds[0];
3754 for (i = 0; mask2; i++, mask2>>=1) {
3756 /* skip unused pmds */
3757 if ((mask2 & 0x1) == 0) continue;
3759 val = ia64_get_pmd(i);
3761 if (PMD_IS_COUNTING(i)) {
3762 DBprintk(("[%d] pmd[%d] soft_pmd=0x%lx hw_pmd=0x%lx\n",
3765 ctx->ctx_soft_pmds[i].val,
3766 val & pmu_conf.ovfl_val));
3768 /* collect latest results */
3769 ctx->ctx_soft_pmds[i].val += val & pmu_conf.ovfl_val;
3772 * now everything is in ctx_soft_pmds[] and we need
3773 * to clear the saved context from save_regs() such that
3774 * pfm_read_pmds() gets the correct value
3776 task->thread.pmd[i] = 0;
3779 * take care of overflow inline
3781 if (pmc0 & (1UL << i)) {
3782 ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.ovfl_val;
3783 DBprintk(("[%d] pmd[%d] overflowed soft_pmd=0x%lx\n",
3784 task->pid, i, ctx->ctx_soft_pmds[i].val));
3787 DBprintk(("[%d] pmd[%d] hw_pmd=0x%lx\n", task->pid, i, val));
3789 * not a counter, just save value as is
3791 task->thread.pmd[i] = val;
3795 * indicates that context has been saved
3797 atomic_set(&ctx->ctx_last_cpu, -1);
3803 * task is the newly created task; regs are the pt_regs for the new child
3806 pfm_inherit(struct task_struct *task, struct pt_regs *regs)
3809 pfm_context_t *nctx;
3810 struct thread_struct *thread;
3815 * the new task was copied from parent and therefore points
3816 * to the parent's context at this point
3818 ctx = task->thread.pfm_context;
3819 thread = &task->thread;
3823 * for secure sessions, make sure child cannot mess up
3824 * the monitoring session.
3826 if (ctx->ctx_fl_unsecure == 0) {
3827 ia64_psr(regs)->sp = 1;
3828 DBprintk(("enabling psr.sp for [%d]\n", task->pid));
3830 DBprintk(("psr.sp=%d [%d]\n", ia64_psr(regs)->sp, task->pid));
3834 * if there was a virtual mapping for the sampling buffer
3835 * the mapping is NOT inherited across fork() (see VM_DONTCOPY),
3836 * so we don't have to explicitly remove it here.
3839 * Part of the clearing of fields is also done in
3840 * copy_thread() because the fields are outside the
3841 * pfm_context structure and can affect tasks not using perfmon.
3845 /* clear pending notification */
3846 task->thread.pfm_ovfl_block_reset = 0;
3849 * clear cpu pinning restriction for child
3851 if (ctx->ctx_fl_system) {
3852 set_cpus_allowed(task, ctx->ctx_saved_cpus_allowed);
3854 DBprintk(("setting cpus_allowed for [%d] to 0x%lx from 0x%lx\n",
3856 ctx->ctx_saved_cpus_allowed,
3857 current->cpus_allowed));
3861 * take care of the easiest case first
3863 if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
3865 DBprintk(("removing PFM context for [%d]\n", task->pid));
3867 task->thread.pfm_context = NULL;
3870 * we must clear psr.up because the new child does
3871 * not have a context and the PM_VALID flag is cleared
3874 * we do not clear psr.pp because it is always
3875 * controlled by the system wide logic and we should
3876 * never be here when system wide is running anyway
3878 ia64_psr(regs)->up = 0;
3882 /* copy_thread() clears IA64_THREAD_PM_VALID */
3885 nctx = pfm_context_alloc();
3886 if (nctx == NULL) return -ENOMEM;
3892 if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
3893 nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
3894 DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));
3897 * task is not yet visible in the tasklist, so we do
3898 * not need to lock the newly created context.
3899 * However, we must grab the tasklist_lock to ensure
3900 * that the ctx_owner or ctx_notify_task do not disappear
3901 * while we increment their check counters.
3903 read_lock(&tasklist_lock);
3905 if (nctx->ctx_notify_task)
3906 atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);
3908 if (nctx->ctx_owner)
3909 atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);
3911 read_unlock(&tasklist_lock);
3915 pfm_sessions.pfs_task_sessions++;
3918 /* initialize counters in new context */
3919 m = nctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
3920 for(i = PMU_FIRST_COUNTER ; m ; m>>=1, i++) {
3921 if ((m & 0x1) && pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING) {
3922 nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].lval & ~pmu_conf.ovfl_val;
3923 thread->pmd[i] = nctx->ctx_soft_pmds[i].lval & pmu_conf.ovfl_val;
3925 thread->pmd[i] = 0UL; /* reset to initial state */
3929 nctx->ctx_fl_frozen = 0;
3930 nctx->ctx_ovfl_regs[0] = 0UL;
3931 nctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
3932 atomic_set(&nctx->ctx_last_cpu, -1);
3935 * here nctx->ctx_psb == ctx->ctx_psb
3937 * increment reference count to sampling
3938 * buffer, if any. Note that this is independent
3939 * from the virtual mapping. The latter is never
3940 * inherited while the former will be if context
3941 * is set up to something different from PFM_FL_INHERIT_NONE
3943 if (nctx->ctx_psb) {
3944 LOCK_PSB(nctx->ctx_psb);
3946 nctx->ctx_psb->psb_refcnt++;
3948 DBprintk(("updated smpl @ %p refcnt=%lu psb_flags=0x%x\n",
3949 ctx->ctx_psb->psb_hdr,
3950 ctx->ctx_psb->psb_refcnt,
3951 ctx->ctx_psb->psb_flags));
3953 UNLOCK_PSB(nctx->ctx_psb);
3956 * remove any pointer to sampling buffer mapping
3958 nctx->ctx_smpl_vaddr = 0;
3961 sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */
3964 * propagate kernel psr in new context (used for first ctxsw in the new task)
3966 nctx->ctx_saved_psr = pfm_get_psr();
3973 /* link with new task */
3974 thread->pfm_context = nctx;
3976 DBprintk(("nctx=%p for process [%d]\n", (void *)nctx, task->pid));
3979 * the copy_thread routine automatically clears
3980 * IA64_THREAD_PM_VALID, so we need to reenable it, if it was used by the caller
3982 if (current->thread.flags & IA64_THREAD_PM_VALID) {
3983 DBprintk(("setting PM_VALID for [%d]\n", task->pid));
3984 thread->flags |= IA64_THREAD_PM_VALID;
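/*
 * Editor's usage sketch (not in the original source): the inheritance
 * behavior resolved above is selected at context creation time via the
 * ctx_flags field, e.g.:
 *
 *	pfarg_context_t c;
 *	memset(&c, 0, sizeof(c));
 *	c.ctx_flags = PFM_FL_INHERIT_ONCE;	// children inherit, grandchildren do not
 *	perfmonctl(getpid(), PFM_CREATE_CONTEXT, &c, 1);
 */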
3994 * We cannot touch any of the PMU registers at this point as we may
3995 * not be running on the same CPU the task was last run on. Therefore
3996 * it is assumed that the PMU has been stopped appropriately in
3997 * pfm_flush_regs() called from exit_thread().
3999 * The function is called in the context of the parent via a release_thread()
4000 * and wait4(). The task is not in the tasklist anymore.
4003 pfm_context_exit(struct task_struct *task)
4005 pfm_context_t *ctx = task->thread.pfm_context;
4008 * check sampling buffer
4012 pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;
4016 DBprintk(("sampling buffer from [%d] @%p size %ld refcnt=%lu psb_flags=0x%x\n",
4018 psb->psb_hdr, psb->psb_size, psb->psb_refcnt, psb->psb_flags));
4021 * in the case where we are the last user, we may be able to free
4026 if (psb->psb_refcnt == 0) {
4029 * The flag is cleared in pfm_vm_close(), which gets
4030 * called from do_exit() via exit_mm().
4031 * By the time we come here, the task has no more mm context.
4033 * We can only free the psb and buffer here after the vm area
4034 * describing the buffer has been removed. This normally happens
4035 * as part of do_exit() but the entire mm context is ONLY removed
4036 * once its reference count goes to zero. This is typically
4037 * the case except for multi-threaded (several tasks) processes.
4039 * See pfm_vm_close() and pfm_cleanup_smpl_buf() for more details.
4041 if ((psb->psb_flags & PSB_HAS_VMA) == 0) {
4043 DBprintk(("cleaning sampling buffer from [%d] @%p size %ld\n",
4045 psb->psb_hdr, psb->psb_size));
4048 * free the buffer and psb
4050 pfm_rvfree(psb->psb_hdr, psb->psb_size);
4055 /* psb may have been deleted */
4056 if (psb) UNLOCK_PSB(psb);
4059 DBprintk(("cleaning [%d] pfm_context @%p notify_task=%p check=%d mm=%p\n",
4061 ctx->ctx_notify_task,
4062 atomic_read(&task->thread.pfm_notifiers_check), task->mm));
4065 * To avoid having the notified task or the owner task scan the entire process
4066 * list when they exit, we decrement notifiers_check and owners_check respectively.
4068 * Of course, there is a race condition between decreasing the value and the
4069 * task exiting. The danger comes from the fact that, in both cases, we have a
4070 * direct pointer to a task structure thereby bypassing the tasklist.
4071 * We must make sure that, if we have task!= NULL, the target task is still
4072 * present and is identical to the initial task specified
4073 * during pfm_context_create(). It may already be detached from the tasklist but
4074 * that's okay. Note that it is okay if we miss the deadline and the task scans
4075 * the list for nothing, it will affect performance but not correctness.
4076 * The correctness is ensured by using the ctx_lock which prevents the
4077 * notify_task from changing the fields in our context.
4078 * Once holding this lock, if we see task != NULL, then it will stay like
4079 * that until we release the lock. If it is NULL already then we came too late.
4083 if (ctx->ctx_notify_task != NULL) {
4084 DBprintk(("[%d], [%d] atomic_sub on [%d] notifiers=%u\n", current->pid,
4086 ctx->ctx_notify_task->pid,
4087 atomic_read(&ctx->ctx_notify_task->thread.pfm_notifiers_check)));
4089 atomic_dec(&ctx->ctx_notify_task->thread.pfm_notifiers_check);
4092 if (ctx->ctx_owner != NULL) {
4093 DBprintk(("[%d], [%d] atomic_sub on [%d] owners=%u\n",
4096 ctx->ctx_owner->pid,
4097 atomic_read(&ctx->ctx_owner->thread.pfm_owners_check)));
4099 atomic_dec(&ctx->ctx_owner->thread.pfm_owners_check);
4105 pfm_unreserve_session(task, ctx->ctx_fl_system, 1UL << ctx->ctx_cpu);
4107 if (ctx->ctx_fl_system) {
4109 * remove any CPU pinning
4111 set_cpus_allowed(task, ctx->ctx_saved_cpus_allowed);
4114 pfm_context_free(ctx);
4116 * clean pfm state in thread structure,
4118 task->thread.pfm_context = NULL;
4119 task->thread.pfm_ovfl_block_reset = 0;
4121 /* pfm_notifiers is cleaned in pfm_cleanup_notifiers() */
4125 * function invoked from release_thread when pfm_smpl_buf_list is not NULL
4128 pfm_cleanup_smpl_buf(struct task_struct *task)
4130 pfm_smpl_buffer_desc_t *tmp, *psb = task->thread.pfm_smpl_buf_list;
4133 printk(KERN_DEBUG "perfmon: psb is null in [%d]\n", current->pid);
4137 * Walk through the list and free the sampling buffer and psb
4140 DBprintk(("[%d] freeing smpl @%p size %ld\n", current->pid, psb->psb_hdr, psb->psb_size));
4142 pfm_rvfree(psb->psb_hdr, psb->psb_size);
4143 tmp = psb->psb_next;
4149 task->thread.pfm_smpl_buf_list = NULL;
4155 * function invoked from release_thread to make sure that the ctx_owner field does not
4156 * point to a nonexistent task.
4159 pfm_cleanup_owners(struct task_struct *task)
4161 struct task_struct *g, *p;
4164 DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));
4166 read_lock(&tasklist_lock);
4168 do_each_thread(g, p) {
4170 * It is safe to do the 2-step test here, because thread.ctx
4171 * is cleaned up only in release_thread() and at that point
4172 * the task has been detached from the tasklist which is an
4173 * operation which uses the write_lock() on the tasklist_lock
4174 * so it cannot run concurrently with this loop. So we have the
4175 * guarantee that if we find p and it has a perfmon ctx then
4176 * it is going to stay like this for the entire execution of this
4179 ctx = p->thread.pfm_context;
4181 //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));
4183 if (ctx && ctx->ctx_owner == task) {
4184 DBprintk(("trying for owner [%d] in [%d]\n", task->pid, p->pid));
4186 * the spinlock is required to take care of a race condition
4187 * with the send_sig_info() call. We must make sure that
4188 * either the send_sig_info() completes using a valid task,
4189 * or the notify_task is cleared before the send_sig_info()
4190 * can pick up a stale value. Note that by the time this
4191 * function is executed the 'task' is already detached from the
4192 * tasklist. The problem is that the notifiers have a direct
4193 * pointer to it. It is okay to send a signal to a task in this
4194 * stage, it simply will have no effect. But it is better than sending
4195 * to a completely destroyed task or worse to a new task using the same
4196 * task_struct address.
4200 ctx->ctx_owner = NULL;
4204 DBprintk(("done for owner [%d] in [%d]\n", task->pid, p->pid));
4206 } while_each_thread(g, p);
4208 read_unlock(&tasklist_lock);
4210 atomic_set(&task->thread.pfm_owners_check, 0);
4215 * function called from release_thread to make sure that the ctx_notify_task is not pointing
4216 * to a nonexistent task
4219 pfm_cleanup_notifiers(struct task_struct *task)
4221 struct task_struct *g, *p;
4224 DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));
4226 read_lock(&tasklist_lock);
4228 do_each_thread(g, p) {
4230 * It is safe to do the 2-step test here, because thread.ctx is cleaned up
4231 * only in release_thread() and at that point the task has been detached
4232 * from the tasklist which is an operation which uses the write_lock() on
4233 * the tasklist_lock so it cannot run concurrently with this loop. So we
4234 * have the guarantee that if we find p and it has a perfmon ctx then it
4235 * is going to stay like this for the entire execution of this loop.
4237 ctx = p->thread.pfm_context;
4239 //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));
4241 if (ctx && ctx->ctx_notify_task == task) {
4242 DBprintk(("trying for notifier [%d] in [%d]\n", task->pid, p->pid));
4244 * the spinlock is required to take care of a race condition
4245 * with the send_sig_info() call. We must make sure that
4246 * either the send_sig_info() completes using a valid task,
4247 * or the notify_task is cleared before the send_sig_info()
4248 * can pick up a stale value. Note that by the time this
4249 * function is executed the 'task' is already detached from the
4250 * tasklist. The problem is that the notifiers have a direct
4251 * pointer to it. It is okay to send a signal to a task in this
4252 * stage, it simply will have no effect. But it is better than sending
4253 * to a completely destroyed task or worse to a new task using the same
4254 * task_struct address.
4258 ctx->ctx_notify_task = NULL;
4262 DBprintk(("done for notifier [%d] in [%d]\n", task->pid, p->pid));
4264 } while_each_thread(g, p);
4266 read_unlock(&tasklist_lock);
4268 atomic_set(&task->thread.pfm_notifiers_check, 0);
4271 static struct irqaction perfmon_irqaction = {
4272 .handler = pfm_interrupt_handler,
4273 .flags = SA_INTERRUPT,
4278 pfm_install_alternate_syswide_subsystem(pfm_intr_handler_desc_t *hdl)
4283 /* some sanity checks */
4284 if (hdl == NULL || hdl->handler == NULL) {
4288 /* do the easy test first */
4289 if (pfm_alternate_intr_handler) {
4294 /* reserve our session */
4295 ret = pfm_reserve_session(NULL, 1, cpu_online_map);
4301 if (pfm_alternate_intr_handler) {
4303 printk(KERN_DEBUG "perfmon: install_alternate, intr_handler not NULL "
4308 pfm_alternate_intr_handler = hdl;
4315 pfm_remove_alternate_syswide_subsystem(pfm_intr_handler_desc_t *hdl)
4320 /* cannot remove someone else's handler! */
4321 if (pfm_alternate_intr_handler != hdl)
4325 pfm_alternate_intr_handler = NULL;
4328 * XXX: assume cpu_online_map has not changed since reservation
4330 pfm_unreserve_session(NULL, 1, cpu_online_map);
4338 * perfmon initialization routine, called from the initcall() table
4343 unsigned int n, n_counters, i;
4345 pmu_conf.disabled = 1;
4347 printk(KERN_INFO "perfmon: version %u.%u IRQ %u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN,
4348 IA64_PERFMON_VECTOR);
4351 * compute the number of implemented PMD/PMC from the
4352 * description tables
4355 for (i=0; PMC_IS_LAST(i) == 0; i++) {
4356 if (PMC_IS_IMPL(i) == 0) continue;
4357 pmu_conf.impl_pmcs[i>>6] |= 1UL << (i&63);
4360 pmu_conf.num_pmcs = n;
4362 n = 0; n_counters = 0;
4363 for (i=0; PMD_IS_LAST(i) == 0; i++) {
4364 if (PMD_IS_IMPL(i) == 0) continue;
4365 pmu_conf.impl_pmds[i>>6] |= 1UL << (i&63);
4367 if (PMD_IS_COUNTING(i)) n_counters++;
4369 pmu_conf.num_pmds = n;
4370 pmu_conf.num_counters = n_counters;
4372 printk(KERN_INFO "perfmon: %u PMCs, %u PMDs, %u counters (%lu bits)\n",
4375 pmu_conf.num_counters,
4376 ffz(pmu_conf.ovfl_val));
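/*
 * Editor's note (not in the original source): ffz() returns the index of the
 * first zero bit, so with 47-bit wide counters ovfl_val = 0x00007fffffffffff
 * and ffz(ovfl_val) = 47, which is the counter width printed above.
 */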
4379 if (pmu_conf.num_pmds >= IA64_NUM_PMD_REGS || pmu_conf.num_pmcs >= IA64_NUM_PMC_REGS) {
4380 printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
4385 * for now here for debug purposes
4387 perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL);
4388 if (perfmon_dir == NULL) {
4389 printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
4394 * create /proc/sys/kernel/perfmon
4396 pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
4399 * initialize all our spinlocks
4401 spin_lock_init(&pfm_sessions.pfs_lock);
4403 /* we are all set */
4404 pmu_conf.disabled = 0;
4408 __initcall(pfm_init);
4411 pfm_init_percpu(void)
4417 register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
4419 ia64_set_pmv(IA64_PERFMON_VECTOR);
4423 * we first initialize the PMU to a stable state.
4424 * the values may have been changed from their power-up
4425 * values by software executed before the kernel took over.
4427 * At this point, pmu_conf has not yet been initialized
4429 * On McKinley, this code is ineffective until PMC4 is initialized.
4431 for (i=1; PMC_IS_LAST(i) == 0; i++) {
4432 if (PMC_IS_IMPL(i) == 0) continue;
4433 ia64_set_pmc(i, PMC_DFL_VAL(i));
4436 for (i=0; PMD_IS_LAST(i) == 0; i++) {
4437 if (PMD_IS_IMPL(i) == 0) continue;
4438 ia64_set_pmd(i, 0UL);
4444 #else /* !CONFIG_PERFMON */
4447 sys_perfmonctl (int pid, int cmd, void *req, int count, long arg5, long arg6,
4448 long arg7, long arg8, long stack)
4453 #endif /* !CONFIG_PERFMON */