arch/ia64/kernel/perfmon.c

   1 /*
   2  * This file implements the perfmon subsystem which is used
   3  * to program the IA-64 Performance Monitoring Unit (PMU).
   4  *
   5  * Originally written by Ganesh Venkitachalam, IBM Corp.
   6  * Copyright (C) 1999 Ganesh Venkitachalam <venkitac@us.ibm.com>
   7  *
   8  * Modifications by Stephane Eranian, Hewlett-Packard Co.
   9  * Modifications by David Mosberger-Tang, Hewlett-Packard Co.
  10  *
  11  * Copyright (C) 1999-2003  Hewlett Packard Co
  12  *               Stephane Eranian <eranian@hpl.hp.com>
  13  *               David Mosberger-Tang <davidm@hpl.hp.com>
  14  */
  15
  16 #include <linux/config.h>
  17 #include <linux/kernel.h>
  18 #include <linux/sched.h>
  19 #include <linux/interrupt.h>
  20 #include <linux/smp_lock.h>
  21 #include <linux/proc_fs.h>
  22 #include <linux/init.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/mm.h>
  25 #include <linux/sysctl.h>
  26 #include <linux/smp.h>
  27
  28 #include <asm/bitops.h>
  29 #include <asm/errno.h>
  30 #include <asm/page.h>
  31 #include <asm/perfmon.h>
  32 #include <asm/processor.h>
  33 #include <asm/signal.h>
  34 #include <asm/system.h>
  35 #include <asm/uaccess.h>
  36 #include <asm/delay.h> /* for ia64_get_itc() */
  37
  38 #ifdef CONFIG_PERFMON
  39
  40 /*
  41  * For PMUs which rely on the debug registers for some features,
  42  * you must enable the following flag to activate the support for
  43  * accessing the registers via the perfmonctl() interface.
  44  */
  45 #if defined(CONFIG_ITANIUM) || defined(CONFIG_MCKINLEY)
  46 #define PFM_PMU_USES_DBR        1
  47 #endif
  48
  49 /*
  50  * perfmon context states
      * (compared against ctx_flags.state by CTX_IS_ENABLED())
  51  */
  52 #define PFM_CTX_DISABLED        0
  53 #define PFM_CTX_ENABLED         1
  54
  55 /*
  56  * Reset register flags
  57  */
  58 #define PFM_PMD_LONG_RESET      1
  59 #define PFM_PMD_SHORT_RESET     2
  60
  61 /*
  62  * Misc macros and definitions
      * (PMU_MAX_PMCS/PMU_MAX_PMDS are the index bounds checked by
      * PMC_IS_IMPL()/PMD_IS_IMPL())
  63  */
  64 #define PMU_FIRST_COUNTER       4
  65 #define PMU_MAX_PMCS            256
  66 #define PMU_MAX_PMDS            256
  67
  68 /*
  69  * type of a PMU register (bitmask).
  70  * bitmask structure:
  71  *      bit0   : register implemented
  72  *      bit1   : end marker
  73  *      bit2-3 : reserved
  74  *      bit4-7 : register type
  75  *      bit8-31: reserved
      *
      * Every concrete register type below also carries PFM_REG_IMPL, so a
      * type value equal to one of them implies the register is implemented.
  76  */
  77 #define PFM_REG_IMPL            0x1 /* register implemented */
  78 #define PFM_REG_END             0x2 /* end marker */
  79 #define PFM_REG_MONITOR         (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */
  80 #define PFM_REG_COUNTING        (0x2<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm AND pmc.oi, a PMD used as a counter */
  81 #define PFM_REG_CONTROL         (0x3<<4|PFM_REG_IMPL) /* PMU control register */
  82 #define PFM_REG_CONFIG          (0x4<<4|PFM_REG_IMPL) /* refine configuration */
  83 #define PFM_REG_BUFFER          (0x5<<4|PFM_REG_IMPL) /* PMD used as buffer */
  84
      /* true when descriptor i is the end marker of the pmc/pmd description table */
  85 #define PMC_IS_LAST(i)  (pmu_conf.pmc_desc[i].type & PFM_REG_END)
  86 #define PMD_IS_LAST(i)  (pmu_conf.pmd_desc[i].type & PFM_REG_END)
  87
  88 #define PFM_IS_DISABLED() pmu_conf.disabled
  89
      /* note: despite the PMC_ prefix, this tests the flags of software PMD i */
  90 #define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_soft_pmds[i].flags &  PFM_REGFL_OVFL_NOTIFY)
  91 #define PFM_FL_INHERIT_MASK     (PFM_FL_INHERIT_NONE|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)
  92
  93 /*
      * Register implementation predicates.
      * i is assumed to be unsigned (no lower-bound check is made) and is
      * evaluated more than once: do not pass expressions with side effects.
      * The argument is parenthesized so that compound expressions
      * (e.g. a ?: b) are compared as a whole.
      */
  94 #define PMC_IS_IMPL(i)    ((i) < PMU_MAX_PMCS && (pmu_conf.pmc_desc[i].type & PFM_REG_IMPL))
  95 #define PMD_IS_IMPL(i)    ((i) < PMU_MAX_PMDS && (pmu_conf.pmd_desc[i].type & PFM_REG_IMPL))
  96
  97 /* XXX: the three type tests below assume that register i is implemented */
  98 #define PMD_IS_COUNTING(i) (pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING)
  99 #define PMC_IS_COUNTING(i) (pmu_conf.pmc_desc[i].type == PFM_REG_COUNTING)
 100 #define PMC_IS_MONITOR(i)  (pmu_conf.pmc_desc[i].type == PFM_REG_MONITOR)
      /* accessors for fields of the register descriptor (first 64 dependent PMDs only) */
 101 #define PMC_DFL_VAL(i)     pmu_conf.pmc_desc[i].default_value
 102 #define PMC_RSVD_MASK(i)   pmu_conf.pmc_desc[i].reserved_mask
 103 #define PMD_PMD_DEP(i)     pmu_conf.pmd_desc[i].dep_pmd[0]
 104 #define PMC_PMD_DEP(i)     pmu_conf.pmc_desc[i].dep_pmd[0]
 105
 106 /* k is assumed to be unsigned */
 107 #define IBR_IS_IMPL(k)    ((k) < pmu_conf.num_ibrs)
 108 #define DBR_IS_IMPL(k)    ((k) < pmu_conf.num_dbrs)
 109
 110 #define CTX_IS_ENABLED(c)       ((c)->ctx_flags.state == PFM_CTX_ENABLED)
 111 #define CTX_OVFL_NOBLOCK(c)     ((c)->ctx_fl_block == 0)
 112 #define CTX_INHERIT_MODE(c)     ((c)->ctx_fl_inherit)
 113 #define CTX_HAS_SMPL(c)         ((c)->ctx_psb != NULL)
 114 /* XXX: does not support more than 64 PMDs */
 115 #define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
 116 #define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
 117
 118
 119 #define CTX_USED_IBR(ctx,n)     (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
 120 #define CTX_USED_DBR(ctx,n)     (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
 121 #define CTX_USES_DBREGS(ctx)    (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1)
 122
 123 #define LOCK_CTX(ctx)   spin_lock(&(ctx)->ctx_lock)
 124 #define UNLOCK_CTX(ctx) spin_unlock(&(ctx)->ctx_lock)
 125
 126 #define SET_PMU_OWNER(t)    do { pmu_owners[smp_processor_id()].owner = (t); } while(0)
 127 #define PMU_OWNER()         pmu_owners[smp_processor_id()].owner
 128
 129 #define LOCK_PFS()          spin_lock(&pfm_sessions.pfs_lock)
 130 #define UNLOCK_PFS()        spin_unlock(&pfm_sessions.pfs_lock)
 131
 132 #define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)
 133
 134 #define PFM_CPUINFO_CLEAR(v)    __get_cpu_var(pfm_syst_info) &= ~(v)
 135 #define PFM_CPUINFO_SET(v)      __get_cpu_var(pfm_syst_info) |= (v)
 136
 137 /*
 138  * debugging
      *
      * Both macros take ONE argument: a fully parenthesized printk() argument
      * list, e.g. DBprintk(("x=%d\n", x)). A "function.line: CPUn " prefix is
      * printed first. DBprintk prints only when pfm_sysctl.debug > 0;
      * DBprintk_ovfl additionally requires pfm_sysctl.debug_ovfl > 0.
 139  */
 140 #define DBprintk(a) \
 141         do { \
 142                 if (pfm_sysctl.debug >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
 143         } while (0)
 144
 145 #define DBprintk_ovfl(a) \
 146         do { \
 147                 if (pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
 148         } while (0)
 149
 150
 151
 152 /*
 153  * Architected PMC structure
      * (bitfield view of a monitor PMC; the field widths add up to 64 bits)
 154  */
 155 typedef struct {
 156         unsigned long pmc_plm:4;        /* privilege level mask */
 157         unsigned long pmc_ev:1;         /* external visibility */
 158         unsigned long pmc_oi:1;         /* overflow interrupt */
 159         unsigned long pmc_pm:1;         /* privileged monitor */
 160         unsigned long pmc_ig1:1;        /* reserved */
 161         unsigned long pmc_es:8;         /* event select */
 162         unsigned long pmc_ig2:48;       /* reserved */
 163 } pfm_monitor_t;
 164
 165 /*
 166  * There is one such data structure per perfmon context. It is used to describe the
 167  * sampling buffer. It is to be shared among siblings whereas the pfm_context
 168  * is not.
 169  * Therefore we maintain a refcnt which is incremented on fork().
 170  * This descriptor is private to the kernel; only the actual sampling buffer,
 171  * including its header, is exposed to the user. This construct allows us to
 172  * export the buffer read-write, if needed, without worrying about security
 173  * problems.
 174  */
 175 typedef struct _pfm_smpl_buffer_desc {
 176         spinlock_t              psb_lock;       /* protection lock */
 177         unsigned long           psb_refcnt;     /* how many users for the buffer */
 178         int                     psb_flags;      /* bitvector of PSB_* flags (see below) */
 179
 180         void                    *psb_addr;      /* points to location of first entry */
 181         unsigned long           psb_entries;    /* maximum number of entries */
 182         unsigned long           psb_size;       /* aligned size of buffer */
 183         unsigned long           psb_index;      /* next free entry slot XXX: must use the one in buffer */
 184         unsigned long           psb_entry_size; /* size of each entry including entry header */
 185
 186         perfmon_smpl_hdr_t      *psb_hdr;       /* points to sampling buffer header */
 187
 188         struct _pfm_smpl_buffer_desc *psb_next; /* next psb, used for rvfreeing of psb_hdr */
 189
 190 } pfm_smpl_buffer_desc_t;
 191
 192 /*
 193  * psb_flags
 194  */
 195 #define PSB_HAS_VMA     0x1             /* a virtual mapping for the buffer exists */
 196
      /* take/release the per-descriptor spinlock */
 197 #define LOCK_PSB(p)     spin_lock(&(p)->psb_lock)
 198 #define UNLOCK_PSB(p)   spin_unlock(&(p)->psb_lock)
 199
 200 /*
 201  * 64-bit software counter structure
      * (one per PMD in pfm_context_t.ctx_soft_pmds[]; "val" holds the part of
      * the virtual counter kept in software, see pfm_read_soft_counter())
 202  */
 203 typedef struct {
 204         u64 val;        /* virtual 64bit counter value */
 205         u64 lval;       /* last value */
 206         u64 long_reset; /* reset value on sampling overflow */
 207         u64 short_reset;/* reset value on overflow */
 208         u64 reset_pmds[4]; /* which other pmds to reset when this counter overflows */
 209         u64 seed;       /* seed for random-number generator */
 210         u64 mask;       /* mask for random-number generator */
 211         unsigned int flags; /* notify/do not notify */
 212 } pfm_counter_t;
 213
 214 /*
 215  * perfmon context flags. A context is per process and is cloned on fork()
 216  * depending on the inheritance flags. This bitfield is embedded in
      * pfm_context_t as ctx_flags; the field widths add up to 32 bits.
 217  */
 218 typedef struct {
 219         unsigned int state:1;           /* 0=disabled, 1=enabled */
 220         unsigned int inherit:2;         /* inherit mode */
 221         unsigned int block:1;           /* when 1, task will block on user notifications */
 222         unsigned int system:1;          /* do system wide monitoring */
 223         unsigned int frozen:1;          /* pmu must be kept frozen on ctxsw in */
 224         unsigned int protected:1;       /* allow access to creator of context only */
 225         unsigned int using_dbreg:1;     /* using range restrictions (debug registers) */
 226         unsigned int excl_idle:1;       /* exclude idle task in system wide session */
 227         unsigned int trap_reason:2;     /* reason for going into pfm_block_ovfl_reset() */
 228         unsigned int reserved:21;
 229 } pfm_context_flags_t;
 230
      /* values for the 2-bit trap_reason field */
 231 #define PFM_TRAP_REASON_NONE            0x0     /* default value */
 232 #define PFM_TRAP_REASON_BLOCKSIG        0x1     /* we need to block on overflow and signal user */
 233 #define PFM_TRAP_REASON_SIG             0x2     /* we simply need to signal user */
 234 #define PFM_TRAP_REASON_RESET           0x3     /* we need to reset PMDs */
 235
 236 /*
 237  * perfmon context: encapsulates all the state of a monitoring session
 238  * XXX: probably need to change layout
      *
      * The bitmask arrays have 4 words of 64 bits, i.e. room for 256
      * registers, but some helpers (e.g. CTX_USED_PMD) only use word 0.
 239  */
 240 typedef struct pfm_context {
 241         pfm_smpl_buffer_desc_t  *ctx_psb;               /* sampling buffer, if any */
 242         unsigned long           ctx_smpl_vaddr;         /* user level virtual address of smpl buffer */
 243
 244         spinlock_t              ctx_lock;
 245         pfm_context_flags_t     ctx_flags;              /* block/noblock */
 246
 247         struct task_struct      *ctx_notify_task;       /* who to notify on overflow */
 248         struct task_struct      *ctx_owner;             /* task which created the context (debug) */
 249
 250         unsigned long           ctx_ovfl_regs[4];       /* which registers overflowed (notification) */
 251         unsigned long           ctx_smpl_regs[4];       /* which registers to record on overflow */
 252
 253         struct semaphore        ctx_restart_sem;        /* use for blocking notification mode */
 254
 255         unsigned long           ctx_used_pmds[4];       /* bitmask of PMD used                 */
 256         unsigned long           ctx_reload_pmds[4];     /* bitmask of PMD to reload on ctxsw   */
 257
 258         unsigned long           ctx_used_pmcs[4];       /* bitmask PMC used by context         */
 259         unsigned long           ctx_reload_pmcs[4];     /* bitmask of PMC to reload on ctxsw   */
 260
 261         unsigned long           ctx_used_ibrs[4];       /* bitmask of used IBR (speedup ctxsw) */
 262         unsigned long           ctx_used_dbrs[4];       /* bitmask of used DBR (speedup ctxsw) */
 263
 264         pfm_counter_t           ctx_soft_pmds[IA64_NUM_PMD_REGS]; /* XXX: size should be dynamic */
 265
 266         u64                     ctx_saved_psr;          /* copy of psr used for lazy ctxsw */
 267         unsigned long           ctx_saved_cpus_allowed; /* copy of the task cpus_allowed (system wide) */
 268         unsigned int            ctx_cpu;                /* CPU used by system wide session */
 269
 270         atomic_t                ctx_last_cpu;           /* CPU id of current or last CPU used */
 271 } pfm_context_t;
 272
      /* shorthands for the members of ctx_flags */
 273 #define ctx_fl_inherit          ctx_flags.inherit
 274 #define ctx_fl_block            ctx_flags.block
 275 #define ctx_fl_system           ctx_flags.system
 276 #define ctx_fl_frozen           ctx_flags.frozen
 277 #define ctx_fl_protected        ctx_flags.protected
 278 #define ctx_fl_using_dbreg      ctx_flags.using_dbreg
 279 #define ctx_fl_excl_idle        ctx_flags.excl_idle
 280 #define ctx_fl_trap_reason      ctx_flags.trap_reason
 281
 282 /*
 283  * global information about all sessions
 284  * mostly used to synchronize between system wide and per-process
      * sessions. Accesses are serialized with LOCK_PFS()/UNLOCK_PFS(), which
      * take pfs_lock.
 285  */
 286 typedef struct {
 287         spinlock_t              pfs_lock;                  /* lock the structure */
 288
 289         unsigned int            pfs_task_sessions;         /* number of per task sessions */
 290         unsigned int            pfs_sys_sessions;          /* number of per system wide sessions */
 291         unsigned int            pfs_sys_use_dbregs;        /* incremented when a system wide session uses debug regs */
 292         unsigned int            pfs_ptrace_use_dbregs;     /* incremented when a process uses debug regs */
 293         struct task_struct      *pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */
 294 } pfm_session_t;
 295
 296 /*
 297  * information about a PMC or PMD.
 298  * dep_pmd[]: a bitmask of dependent PMD registers
 299  * dep_pmc[]: a bitmask of dependent PMC registers
      * type     : one of the PFM_REG_* values
      * pm_pos   : bit position of the pm field in the register value (see PMC_PM())
      * read_check/write_check: optional per-register hooks, may be NULL
      *           (NOTE(review): presumably invoked on user reads/writes of the
      *           register; the callers are not visible here - confirm)
 300  */
 301 typedef struct {
 302         unsigned int            type;
 303         int                     pm_pos;
 304         unsigned long           default_value;  /* power-on default value */
 305         unsigned long           reserved_mask;  /* bitmask of reserved bits */
 306         int                     (*read_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
 307         int                     (*write_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
 308         unsigned long           dep_pmd[4];
 309         unsigned long           dep_pmc[4];
 310 } pfm_reg_desc_t;
 311
 312 /* assume cnum is a valid monitor */
      /* PMC_PM() extracts the pm bit from a PMC value; the others fetch the hooks */
 313 #define PMC_PM(cnum, val)       (((val) >> (pmu_conf.pmc_desc[cnum].pm_pos)) & 0x1)
 314 #define PMC_WR_FUNC(cnum)       (pmu_conf.pmc_desc[cnum].write_check)
 315 #define PMD_WR_FUNC(cnum)       (pmu_conf.pmd_desc[cnum].write_check)
 316 #define PMD_RD_FUNC(cnum)       (pmu_conf.pmd_desc[cnum].read_check)
 317
 318 /*
 319  * This structure is initialized at boot time and contains
 320  * a description of the PMU main characteristics.
      * The single instance used throughout this file is pmu_conf.
 321  */
 322 typedef struct {
 323         unsigned int  disabled;         /* indicates if perfmon is working properly */
 324         unsigned long ovfl_val;         /* overflow value for generic counters   */
 325         unsigned long impl_pmcs[4];     /* bitmask of implemented PMCS */
 326         unsigned long impl_pmds[4];     /* bitmask of implemented PMDS */
 327         unsigned int  num_pmcs;         /* number of implemented PMCS */
 328         unsigned int  num_pmds;         /* number of implemented PMDS */
 329         unsigned int  num_ibrs;         /* number of implemented IBRS */
 330         unsigned int  num_dbrs;         /* number of implemented DBRS */
 331         unsigned int  num_counters;     /* number of PMD/PMC counters */
 332         pfm_reg_desc_t *pmc_desc;       /* detailed PMC register dependencies descriptions */
 333         pfm_reg_desc_t *pmd_desc;       /* detailed PMD register dependencies descriptions */
 334 } pmu_config_t;
 335
 336 /*
 337  * structure used to pass arguments to/from a remote CPU
 338  * using IPI to check and possibly save the PMU context on SMP systems.
 339  *
 340  * not used in UP kernels
 341  */
 342 typedef struct {
 343         struct task_struct *task;       /* which task we are interested in */
 344         int retval;                     /* return value of the call: 0=you can proceed, 1=need to wait for completion */
 345 } pfm_smp_ipi_arg_t;
 346
 347 /*
 348  * perfmon command descriptions
 349  */
 350 typedef struct {
 351         int             (*cmd_func)(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
 352         int             cmd_flags;
 353         unsigned int    cmd_narg;
 354         size_t          cmd_argsize;
 355 } pfm_cmd_desc_t;
 356
 357 #define PFM_CMD_PID             0x1     /* command requires pid argument */
 358 #define PFM_CMD_ARG_READ        0x2     /* command must read argument(s) */
 359 #define PFM_CMD_ARG_RW          0x4     /* command must read/write argument(s) */
 360 #define PFM_CMD_CTX             0x8     /* command needs a perfmon context */
 361 #define PFM_CMD_NOCHK           0x10    /* command does not need to check task's state */
 362
 363 #define PFM_CMD_IDX(cmd)        (cmd)
 364
 365 #define PFM_CMD_IS_VALID(cmd)   ((PFM_CMD_IDX(cmd) >= 0) && (PFM_CMD_IDX(cmd) < PFM_CMD_COUNT) \
 366                                   && pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func != NULL)
 367
 368 #define PFM_CMD_USE_PID(cmd)    ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_PID) != 0)
 369 #define PFM_CMD_READ_ARG(cmd)   ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_READ) != 0)
 370 #define PFM_CMD_RW_ARG(cmd)     ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_RW) != 0)
 371 #define PFM_CMD_USE_CTX(cmd)    ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_CTX) != 0)
 372 #define PFM_CMD_CHK(cmd)        ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_NOCHK) == 0)
 373
 374 #define PFM_CMD_ARG_MANY        -1 /* cannot be zero */
 375 #define PFM_CMD_NARG(cmd)       (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_narg)
 376 #define PFM_CMD_ARG_SIZE(cmd)   (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_argsize)
 377
 378 typedef struct {
      /* run-time tunables, exported through the sysctl table pfm_ctl_table[] */
 379         int     debug;          /* turn on/off debugging via syslog */
 380         int     debug_ovfl;     /* turn on/off debug printk in overflow handler */
 381         int     fastctxsw;      /* turn on/off fast (unsecure) ctxsw */
 382 } pfm_sysctl_t;
 383
 384 typedef struct {
      /* statistics counters; used as one element per CPU (pfm_stats[NR_CPUS]) */
 385         unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
 386         unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
 387         unsigned long pfm_recorded_samples_count;
 388         unsigned long pfm_full_smpl_buffer_count; /* how many times the sampling buffer was full */
 389         char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
 390 } pfm_stats_t;
 391
 392 /*
 393  * perfmon internal variables
 394  */
 395 static pfm_session_t    pfm_sessions;   /* global sessions information */
 396 static struct proc_dir_entry *perfmon_dir; /* for debug only */
 397 static pfm_stats_t      pfm_stats[NR_CPUS];
 398 static pfm_intr_handler_desc_t  *pfm_alternate_intr_handler;
 399
 400 DEFINE_PER_CPU(unsigned long, pfm_syst_info);
 401
 402 /* sysctl() controls */
 403 static pfm_sysctl_t pfm_sysctl;
 404
      /*
       * The three tables below nest as kernel -> perfmon -> {debug, debug_ovfl,
       * fastctxsw}; each is terminated by a zero entry.
       * NOTE(review): the 0666 value on debug and debug_ovfl appears to be the
       * file mode, i.e. writable by any user - confirm that this is intended.
       */
 405 static ctl_table pfm_ctl_table[]={
 406         {1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
 407         {2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
 408         {3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
 409         { 0, },
 410 };
 411 static ctl_table pfm_sysctl_dir[] = {
 412         {1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
 413         {0,},
 414 };
 415 static ctl_table pfm_sysctl_root[] = {
 416         {1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
 417         {0,},
 418 };
 419 static struct ctl_table_header *pfm_sysctl_header;
 420
 421 static void pfm_vm_close(struct vm_area_struct * area);
 422
      /* only the close callback is provided, see pfm_vm_close() */
 423 static struct vm_operations_struct pfm_vm_ops={
 424         .close = pfm_vm_close
 425 };
 426
 427 /*
 428  * keep track of task owning the PMU per CPU.
      * Accessed through SET_PMU_OWNER()/PMU_OWNER(), indexed by
      * smp_processor_id().
 429  */
 430 static struct {
 431         struct task_struct *owner;
 432         char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
 433 } pmu_owners[NR_CPUS];
 434
 435
 436
 437 /*
 438  * forward declarations
 439  */
 440 static void pfm_reset_pmu(struct task_struct *);
 441 static void pfm_lazy_save_regs (struct task_struct *ta);
 442
 443 #if   defined(CONFIG_ITANIUM)
 444 #include "perfmon_itanium.h"
 445 #elif defined(CONFIG_MCKINLEY)
 446 #include "perfmon_mckinley.h"
 447 #else
 448 #include "perfmon_generic.h"
 449 #endif
 450
 451 static inline void
 452 pfm_clear_psr_pp(void)
 453 {
             /*
              * Clear psr.pp with rsm, then serialize with srlz.i. psr.pp is the
              * privileged performance-monitor enable bit per the IA-64
              * architecture. The "memory" clobber keeps the compiler from
              * moving memory accesses across the asm.
              */
 454         __asm__ __volatile__ ("rsm psr.pp;; srlz.i;;"::: "memory");
 455 }
 456
 457 static inline void
 458 pfm_set_psr_pp(void)
 459 {
             /* set psr.pp with ssm, then serialize with srlz.i */
 460         __asm__ __volatile__ ("ssm psr.pp;; srlz.i;;"::: "memory");
 461 }
 462
 463 static inline void
 464 pfm_clear_psr_up(void)
 465 {
             /*
              * Clear psr.up with rum (user-mask form), then serialize with
              * srlz.i. psr.up is the user performance-monitor enable bit per
              * the IA-64 architecture.
              */
 466         __asm__ __volatile__ ("rum psr.up;; srlz.i;;"::: "memory");
 467 }
 468
 469 static inline void
 470 pfm_set_psr_up(void)
 471 {
             /* set psr.up with sum (user-mask form), then serialize with srlz.i */
 472         __asm__ __volatile__ ("sum psr.up;; srlz.i;;"::: "memory");
 473 }
 474
 475 static inline unsigned long
 476 pfm_get_psr(void)
 477 {
             /* returns the value read from psr with a "mov from psr" */
 478         unsigned long tmp;
 479         __asm__ __volatile__ ("mov %0=psr;;": "=r"(tmp) :: "memory");
 480         return tmp;
 481 }
 482
 483 static inline void
 484 pfm_set_psr_l(unsigned long val)
 485 {
             /* writes val to psr.l, then serializes with srlz.i */
 486         __asm__ __volatile__ ("mov psr.l=%0;; srlz.i;;"::"r"(val): "memory");
 487 }
 488
 489 static inline void
 490 pfm_freeze_pmu(void)
 491 {
             /*
              * Write 1 to pmc0 and serialize. The whole register is
              * overwritten, so any other pmc0 bits are cleared.
              * NOTE(review): bit 0 of pmc0 is presumably the architected freeze
              * bit - confirm against the IA-64 manual.
              */
 492         ia64_set_pmc(0,1UL);
 493         ia64_srlz_d();
 494 }
 495
 496 static inline void
 497 pfm_unfreeze_pmu(void)
 498 {
             /* write 0 to pmc0 (clears every bit, including bit 0) and serialize */
 499         ia64_set_pmc(0,0UL);
 500         ia64_srlz_d();
 501 }
 502
 503 static inline unsigned long
 504 pfm_read_soft_counter(pfm_context_t *ctx, int i)
 505 {
             /*
              * The virtual counter is the software-maintained part plus the
              * bits of hardware PMD i selected by pmu_conf.ovfl_val.
              */
             unsigned long hw_part = ia64_get_pmd(i) & pmu_conf.ovfl_val;

 506         return ctx->ctx_soft_pmds[i].val + hw_part;
 507 }
 508
 509 static inline void
 510 pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
 511 {
             /* bits of val outside pmu_conf.ovfl_val are kept in the software counter */
 512         ctx->ctx_soft_pmds[i].val = val  & ~pmu_conf.ovfl_val;
 513         /*
 514          * writing to the unimplemented part is ignored, so masking off the
 515          * top part is not strictly required; it is done here anyway
 516          */
 517         ia64_set_pmd(i, val & pmu_conf.ovfl_val);
 518 }
 519
 520 /*
 521  * Generates a unique (per CPU) timestamp
      * by returning the value of ia64_get_itc().
 522  */
 523 static inline unsigned long
 524 pfm_get_stamp(void)
 525 {
 526         /*
 527          * XXX: must find something more efficient
 528          */
 529         return ia64_get_itc();
 530 }
 531
 532 /* Here we want the physical address of the memory.
 533  * This is used when initializing the contents of the
 534  * area and marking the pages as reserved.
 535  */
 536 static inline unsigned long
 537 pfm_kvirt_to_pa(unsigned long adr)
 538 {
             /* ia64_tpa() yields the physical address backing adr */
 539         return ia64_tpa(adr);
 542 }
 543
 544 static void *
 545 pfm_rvmalloc(unsigned long size)
 546 {
             /*
              * Allocate a zeroed, page-aligned vmalloc() area and mark every
              * page of it reserved. Returns NULL when vmalloc() fails.
              */
 547         unsigned long remaining, addr;
 548         void *buf;
 549
 550         size = PAGE_ALIGN(size);
 551         buf  = vmalloc(size);
 552         if (buf == NULL) return NULL;
 553
 554         memset(buf, 0, size); /* Clear the ram out, no junk to the user */
 555
 556         addr = (unsigned long) buf;
 557         for (remaining = size; remaining > 0; remaining -= PAGE_SIZE) {
 558                 SetPageReserved(vmalloc_to_page((void *)addr));
 559                 addr += PAGE_SIZE;
 560         }
 562         return buf;
 563 }
 564
 565 static void
 566 pfm_rvfree(void *mem, unsigned long size)
 567 {
             /*
              * Undo pfm_rvmalloc(): clear the reserved bit on every page, then
              * vfree() the area. A NULL mem is ignored. The signed test ends
              * the loop even when size is not a multiple of PAGE_SIZE.
              */
 568         unsigned long addr;
 569
 570         if (mem == NULL) return;
 571
 572         for (addr = (unsigned long) mem; (long) size > 0; addr += PAGE_SIZE, size -= PAGE_SIZE) {
 573                 ClearPageReserved(vmalloc_to_page((void*)addr));
 574         }
 577         vfree(mem);
 580 }
 581
 582 /*
 583  * This function gets called from mm/mmap.c:exit_mmap() only when there is a sampling buffer
 584  * attached to the context AND the current task has a mapping for it, i.e., it is the original
 585  * creator of the context.
 586  *
 587  * This function is used to remember the fact that the vma describing the sampling buffer
 588  * has now been removed. It can only be called when no other tasks share the same mm context.
 589  *
      * It is installed as the .close callback of pfm_vm_ops and expects
      * vma->vm_private_data to point to the sampling buffer descriptor.
 590  */
 591 static void
 592 pfm_vm_close(struct vm_area_struct *vma)
 593 {
 594         pfm_smpl_buffer_desc_t *psb = (pfm_smpl_buffer_desc_t *)vma->vm_private_data;
 595
 596         if (psb == NULL) {
 597                 printk(KERN_DEBUG "perfmon: psb is null in [%d]\n", current->pid);
 598                 return;
 599         }
 600         /*
 601          * Add PSB to list of buffers to free on release_thread() when no more users
 602          *
 603          * This call is safe because, once the count is zero it cannot be modified anymore.
 604          * The fact that there is no more user of the mm context does not mean that the
 605          * sampling buffer is not being used anymore outside of this task. In fact, it can still
 606          * be accessed from within the kernel by another task (such as the monitored task).
 607          *
 608          * Therefore, we only move the psb into the list of buffers to free when we know
 609          * nobody else is using it.
 610          * The linked list is independent of the perfmon context, because in the case of
 611          * multi-threaded processes, the last thread may not have been involved with
 612          * monitoring however it will be the one removing the vma and it should therefore
 613          * also remove the sampling buffer. This buffer cannot be removed until the vma
 614          * is removed.
 615          *
 616          * This function cannot remove the buffer from here, because exit_mmap() must first
 617          * complete. Given that there is no other vma related callback in the generic code,
 618          * we have created our own with the linked list of sampling buffers to free. The list
 619          * is part of the thread structure. In release_thread() we check if the list is
 620          * empty. If not we call into perfmon to free the buffer and psb. That is the only
 621          * way to ensure a safe deallocation of the sampling buffer which works when
 622          * the buffer is shared between distinct processes or with multi-threaded programs.
 623          *
 624          * We need to lock the psb because the refcnt test and flag manipulation must
 625          * look like an atomic operation vis a vis pfm_context_exit()
 626          */
 627         LOCK_PSB(psb);
 628
 629         if (psb->psb_refcnt == 0) {
 630
                     /* push the psb at the head of the current task's to-free list */
 631                 psb->psb_next = current->thread.pfm_smpl_buf_list;
 632                 current->thread.pfm_smpl_buf_list = psb;
 633
 634                 DBprintk(("[%d] add smpl @%p size %lu to smpl_buf_list psb_flags=0x%x\n",
 635                         current->pid, psb->psb_hdr, psb->psb_size, psb->psb_flags));
 636         }
 637         DBprintk(("[%d] clearing psb_flags=0x%x smpl @%p size %lu\n",
 638                         current->pid, psb->psb_flags, psb->psb_hdr, psb->psb_size));
 639         /*
 640          * the buffer no longer has a user level mapping: clear the flag
 641          */
 642         psb->psb_flags &= ~PSB_HAS_VMA;
 643
 644         UNLOCK_PSB(psb);
 645 }
 646
 647 /*
 648  * This function is called from pfm_destroy_context() and also from pfm_inherit()
 649  * to explicitly remove the sampling buffer mapping from the user level address space.
      *
      * Returns -1 when the task has no context, no mm, or no sampling buffer
      * mapping recorded in its context; returns 0 otherwise. A failure of
      * do_munmap() is only logged, it is not reported to the caller.
 650  */
 651 static int
 652 pfm_remove_smpl_mapping(struct task_struct *task)
 653 {
 654         pfm_context_t *ctx = task->thread.pfm_context;
 655         pfm_smpl_buffer_desc_t *psb;
 656         int r;
 657
 658         /*
 659          * some sanity checks first
 660          */
 661         if (ctx == NULL || task->mm == NULL || ctx->ctx_smpl_vaddr == 0 || ctx->ctx_psb == NULL) {
 662                 printk(KERN_DEBUG "perfmon: invalid context mm=%p\n", task->mm);
 663                 return -1;
 664         }
 665         psb = ctx->ctx_psb;
 666
             /* do_munmap() is called with mmap_sem held for writing */
 667         down_write(&task->mm->mmap_sem);
 668
 669         r = do_munmap(task->mm, ctx->ctx_smpl_vaddr, psb->psb_size);
 670
 671         up_write(&task->mm->mmap_sem);
 672         if (r !=0) {
 673                 printk(KERN_DEBUG "perfmon: pid %d unable to unmap sampling buffer "
 674                        "@0x%lx size=%ld\n", task->pid, ctx->ctx_smpl_vaddr, psb->psb_size);
 675         }
 676
 677         DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d refcnt=%lu psb_flags=0x%x\n",
 678                 task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r, psb->psb_refcnt, psb->psb_flags));
 679
 680         return 0;
 681 }
 682
 683 static pfm_context_t *
 684 pfm_context_alloc(void)
 685 {
             /* returns a zero-filled context descriptor, or NULL when kmalloc() fails */
 686         pfm_context_t *ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
 687
 688         if (ctx == NULL) return NULL;
 689
 690         memset(ctx, 0, sizeof(pfm_context_t));
 692         return ctx;
 693 }
 694
 695 static void
 696 pfm_context_free(pfm_context_t *ctx)
 697 {
             /* release a descriptor from pfm_context_alloc(); kfree(NULL) is a no-op */
 698         kfree(ctx);
 699 }
 700
 701 static int
 702 pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
 703 {
             /*
              * Map the kernel buffer starting at buf into vma at user address
              * addr, one page at a time, read-only (PAGE_READONLY).
              *
              * Returns 0 on success, -ENOMEM as soon as remap_page_range()
              * fails; pages mapped before the failure are left in place.
              *
              * size is expected to be a multiple of PAGE_SIZE. The signed test,
              * the same one pfm_rvfree() uses, stops the loop if it is not,
              * instead of letting the unsigned subtraction wrap around.
              */
 704         unsigned long page;
 705
 706         DBprintk(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
 707
 708         while ((long) size > 0) {
 709                 page = pfm_kvirt_to_pa(buf);
 710
 711                 if (remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM;
 712
 713                 addr  += PAGE_SIZE;
 714                 buf   += PAGE_SIZE;
 715                 size  -= PAGE_SIZE;
 716         }
 717         return 0;
 718 }
 719
 720 /*
 721  * counts the number of PMDS to save per entry.
 722  * This code is generic enough to accommodate more than 64 PMDS when they become available
      *
      * which: array of 64-bit bitmasks, one bit per PMD
      * size : number of 64-bit words in the array
      * Returns the total number of bits set in the first size words.
 723  */
 724 static unsigned long
 725 pfm_smpl_entry_size(unsigned long *which, unsigned long size)
 726 {
 727         unsigned long res = 0;
 728         unsigned long i;        /* same type as size: no signed/unsigned comparison */
 729
 730         for (i=0; i < size; i++, which++) res += hweight64(*which);
 731
 732         DBprintk(("weight=%ld\n", res));
 733
 734         return res;
 735 }
 736
 737 /*
 738  * Allocates the sampling buffer and remaps it into caller's address space
 739  */
 740 static int
 741 pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned long entries,
 742                       void **user_vaddr)
 743 {
 744         struct mm_struct *mm = current->mm;
 745         struct vm_area_struct *vma = NULL;
 746         unsigned long size, regcount;
 747         void *smpl_buf;
 748         pfm_smpl_buffer_desc_t *psb;
 749
 750
 751         /* note that regcount might be 0, in this case only the header for each
 752          * entry will be recorded.
 753          */
 754         regcount = pfm_smpl_entry_size(which_pmds, 1);
 755
 756         if ((sizeof(perfmon_smpl_hdr_t)+ entries*sizeof(perfmon_smpl_entry_t)) <= entries) {
 757                 DBprintk(("requested entries %lu is too big\n", entries));
 758                 return -EINVAL;
 759         }
 760
 761         /*
 762          * 1 buffer hdr and for each entry a header + regcount PMDs to save
 763          */
 764         size = PAGE_ALIGN(  sizeof(perfmon_smpl_hdr_t)
 765                           + entries * (sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64)));
 766
 767         DBprintk(("sampling buffer size=%lu bytes\n", size));
 768
 769         /*
 770          * check requested size to avoid Denial-of-service attacks
 771          * XXX: may have to refine this test
 772          * Check against address space limit.
 773          *
 774          * if ((mm->total_vm << PAGE_SHIFT) + len> current->rlim[RLIMIT_AS].rlim_cur)
 775          *      return -ENOMEM;
 776          */
 777         if (size > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN;
 778
 779         /*
 780          * We do the easy to undo allocations first.
 781          *
 782          * pfm_rvmalloc(), clears the buffer, so there is no leak
 783          */
 784         smpl_buf = pfm_rvmalloc(size);
 785         if (smpl_buf == NULL) {
 786                 DBprintk(("Can't allocate sampling buffer\n"));
 787                 return -ENOMEM;
 788         }
 789
 790         DBprintk(("smpl_buf @%p\n", smpl_buf));
 791
 792         /* allocate sampling buffer descriptor now */
 793         psb = kmalloc(sizeof(*psb), GFP_KERNEL);
 794         if (psb == NULL) {
 795                 DBprintk(("Can't allocate sampling buffer descriptor\n"));
 796                 goto error_kmalloc;
 797         }
 798
 799         /* allocate vma */
 800         vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 801         if (!vma) {
 802                 DBprintk(("Cannot allocate vma\n"));
 803                 goto error_kmem;
 804         }
 805         /*
 806          * partially initialize the vma for the sampling buffer
 807          *
 808          * The VM_DONTCOPY flag is very important as it ensures that the mapping
 809          * will never be inherited for any child process (via fork()) which is always
 810          * what we want.
 811          */
 812         vma->vm_mm           = mm;
 813         vma->vm_flags        = VM_READ| VM_MAYREAD |VM_RESERVED|VM_DONTCOPY;
 814         vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 815         vma->vm_ops          = &pfm_vm_ops; /* necesarry to get the close() callback */
 816         vma->vm_pgoff        = 0;
 817         vma->vm_file         = NULL;
 818         vma->vm_private_data = psb;     /* information needed by the pfm_vm_close() function */
 819
 820         /*
 821          * Now we have everything we need and we can initialize
 822          * and connect all the data structures
 823          */
 824
 825         psb->psb_hdr     = smpl_buf;
 826         psb->psb_addr    = ((char *)smpl_buf)+sizeof(perfmon_smpl_hdr_t); /* first entry */
 827         psb->psb_size    = size; /* aligned size */
 828         psb->psb_index   = 0;
 829         psb->psb_entries = entries;
 830         psb->psb_refcnt  = 1;
 831         psb->psb_flags   = PSB_HAS_VMA;
 832
 833         spin_lock_init(&psb->psb_lock);
 834
 835         /*
 836          * XXX: will need to do cacheline alignment to avoid false sharing in SMP mode and
 837          * multitask monitoring.
 838          */
 839         psb->psb_entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64);
 840
 841         DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p refcnt=%lu psb_flags=0x%x\n",
 842                   (void *)psb,psb->psb_entry_size, (void *)psb->psb_hdr,
 843                   (void *)psb->psb_addr, psb->psb_refcnt, psb->psb_flags));
 844
 845         /* initialize some of the fields of user visible buffer header */
 846         psb->psb_hdr->hdr_version    = PFM_SMPL_VERSION;
 847         psb->psb_hdr->hdr_entry_size = psb->psb_entry_size;
 848         psb->psb_hdr->hdr_pmds[0]    = which_pmds[0];
 849
 850         /*
 851          * Let's do the difficult operations next.
 852          *
 853          * now we atomically find some area in the address space and
 854          * remap the buffer in it.
 855          */
 856         down_write(&current->mm->mmap_sem);
 857
 858
 859         /* find some free area in address space, must have mmap sem held */
 860         vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
 861         if (vma->vm_start == 0UL) {
 862                 DBprintk(("Cannot find unmapped area for size %ld\n", size));
 863                 up_write(&current->mm->mmap_sem);
 864                 goto error;
 865         }
 866         vma->vm_end = vma->vm_start + size;
 867
 868         DBprintk(("entries=%ld aligned size=%ld, unmapped @0x%lx\n", entries, size, vma->vm_start));
 869
 870         /* can only be applied to current, need to have the mm semaphore held when called */
 871         if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
 872                 DBprintk(("Can't remap buffer\n"));
 873                 up_write(&current->mm->mmap_sem);
 874                 goto error;
 875         }
 876
 877         /*
 878          * now insert the vma in the vm list for the process, must be
 879          * done with mmap lock held
 880          */
 881         insert_vm_struct(mm, vma);
 882
 883         mm->total_vm  += size >> PAGE_SHIFT;
 884
 885         up_write(&current->mm->mmap_sem);
 886
 887         /* store which PMDS to record */
 888         ctx->ctx_smpl_regs[0] = which_pmds[0];
 889
 890
 891         /* link to perfmon context */
 892         ctx->ctx_psb        = psb;
 893
 894         /*
 895          * keep track of user level virtual address
 896          */
 897         ctx->ctx_smpl_vaddr = *(unsigned long *)user_vaddr = vma->vm_start;
 898
 899         return 0;
 900
 901 error:
 902         kmem_cache_free(vm_area_cachep, vma);
 903 error_kmem:
 904         kfree(psb);
 905 error_kmalloc:
 906         pfm_rvfree(smpl_buf, size);
 907         return -ENOMEM;
 908 }
 909
 910 static int
 911 pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned long cpu_mask)
 912 {
 913         unsigned long m, undo_mask;
 914         unsigned int n, i;
 915
 916         /*
 917          * validy checks on cpu_mask have been done upstream
 918          */
 919         LOCK_PFS();
 920
 921         if (is_syswide) {
 922                 /*
 923                  * cannot mix system wide and per-task sessions
 924                  */
 925                 if (pfm_sessions.pfs_task_sessions > 0UL) {
 926                         DBprintk(("system wide not possible, %u conflicting task_sessions\n",
 927                                 pfm_sessions.pfs_task_sessions));
 928                         goto abort;
 929                 }
 930
 931                 m = cpu_mask; undo_mask = 0UL; n = 0;
 932                 DBprintk(("cpu_mask=0x%lx\n", cpu_mask));
 933                 for(i=0; m; i++, m>>=1) {
 934
 935                         if ((m & 0x1) == 0UL) continue;
 936
 937                         if (pfm_sessions.pfs_sys_session[i]) goto undo;
 938
 939                         DBprintk(("reserving CPU%d currently on CPU%d\n", i, smp_processor_id()));
 940
 941                         pfm_sessions.pfs_sys_session[i] = task;
 942                         undo_mask |= 1UL << i;
 943                         n++;
 944                 }
 945                 pfm_sessions.pfs_sys_sessions += n;
 946         } else {
 947                 if (pfm_sessions.pfs_sys_sessions) goto abort;
 948                 pfm_sessions.pfs_task_sessions++;
 949         }
 950         DBprintk(("task_sessions=%u sys_session[%d]=%d",
 951                   pfm_sessions.pfs_task_sessions,
 952                   smp_processor_id(), pfm_sessions.pfs_sys_session[smp_processor_id()] ? 1 : 0));
 953         UNLOCK_PFS();
 954         return 0;
 955 undo:
 956         DBprintk(("system wide not possible, conflicting session [%d] on CPU%d\n",
 957                 pfm_sessions.pfs_sys_session[i]->pid, i));
 958
 959         for(i=0; undo_mask; i++, undo_mask >>=1) {
 960                 pfm_sessions.pfs_sys_session[i] = NULL;
 961         }
 962 abort:
 963         UNLOCK_PFS();
 964
 965         return -EBUSY;
 966
 967 }
 968
 969 static int
 970 pfm_unreserve_session(struct task_struct *task, int is_syswide, unsigned long cpu_mask)
 971 {
 972         pfm_context_t *ctx;
 973         unsigned long m;
 974         unsigned int n, i;
 975
 976         ctx = task ? task->thread.pfm_context : NULL;
 977
 978         /*
 979          * validy checks on cpu_mask have been done upstream
 980          */
 981         LOCK_PFS();
 982
 983         DBprintk(("[%d] sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu_mask=0x%lx\n",
 984                 task->pid,
 985                 pfm_sessions.pfs_sys_sessions,
 986                 pfm_sessions.pfs_task_sessions,
 987                 pfm_sessions.pfs_sys_use_dbregs,
 988                 is_syswide,
 989                 cpu_mask));
 990
 991
 992         if (is_syswide) {
 993                 m = cpu_mask; n = 0;
 994                 for(i=0; m; i++, m>>=1) {
 995                         if ((m & 0x1) == 0UL) continue;
 996                         pfm_sessions.pfs_sys_session[i] = NULL;
 997                         n++;
 998                 }
 999                 /*
1000                  * would not work with perfmon+more than one bit in cpu_mask
1001                  */
1002                 if (ctx && ctx->ctx_fl_using_dbreg) {
1003                         if (pfm_sessions.pfs_sys_use_dbregs == 0) {
1004                                 printk(KERN_DEBUG "perfmon: invalid release for [%d] "
1005                                        "sys_use_dbregs=0\n", task->pid);
1006                         } else {
1007                                 pfm_sessions.pfs_sys_use_dbregs--;
1008                         }
1009                 }
1010                 pfm_sessions.pfs_sys_sessions -= n;
1011
1012                 DBprintk(("CPU%d sys_sessions=%u\n",
1013                         smp_processor_id(), pfm_sessions.pfs_sys_sessions));
1014         } else {
1015                 pfm_sessions.pfs_task_sessions--;
1016                 DBprintk(("[%d] task_sessions=%u\n",
1017                         task->pid, pfm_sessions.pfs_task_sessions));
1018         }
1019
1020         UNLOCK_PFS();
1021
1022         return 0;
1023 }
1024
1025 /*
1026  * XXX: do something better here
1027  */
1028 static int
1029 pfm_bad_permissions(struct task_struct *task)
1030 {
1031         /* stolen from bad_signal() */
1032         return (current->session != task->session)
1033             && (current->euid ^ task->suid) && (current->euid ^ task->uid)
1034             && (current->uid ^ task->suid) && (current->uid ^ task->uid);
1035 }
1036
1037
1038 static int
1039 pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
1040 {
1041         unsigned long smpl_pmds = pfx->ctx_smpl_regs[0];
1042         int ctx_flags;
1043         int cpu;
1044
1045         /* valid signal */
1046
1047         /* cannot send to process 1, 0 means do not notify */
1048         if (pfx->ctx_notify_pid == 1) {
1049                 DBprintk(("invalid notify_pid %d\n", pfx->ctx_notify_pid));
1050                 return -EINVAL;
1051         }
1052         ctx_flags = pfx->ctx_flags;
1053
1054         if ((ctx_flags & PFM_FL_INHERIT_MASK) == (PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)) {
1055                 DBprintk(("invalid inherit mask 0x%x\n",ctx_flags & PFM_FL_INHERIT_MASK));
1056                 return -EINVAL;
1057         }
1058
1059         if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
1060                 DBprintk(("cpu_mask=0x%lx\n", pfx->ctx_cpu_mask));
1061                 /*
1062                  * cannot block in this mode
1063                  */
1064                 if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
1065                         DBprintk(("cannot use blocking mode when in system wide monitoring\n"));
1066                         return -EINVAL;
1067                 }
1068                 /*
1069                  * must only have one bit set in the CPU mask
1070                  */
1071                 if (hweight64(pfx->ctx_cpu_mask) != 1UL) {
1072                         DBprintk(("invalid CPU mask specified\n"));
1073                         return -EINVAL;
1074                 }
1075                 /*
1076                  * and it must be a valid CPU
1077                  */
1078                 cpu = ffz(~pfx->ctx_cpu_mask);
1079                 if (cpu_online(cpu) == 0) {
1080                         DBprintk(("CPU%d is not online\n", cpu));
1081                         return -EINVAL;
1082                 }
1083                 /*
1084                  * check for pre-existing pinning, if conflicting reject
1085                  */
1086                 if (task->cpus_allowed != ~0UL && (task->cpus_allowed & (1UL<<cpu)) == 0) {
1087                         DBprintk(("[%d] pinned on 0x%lx, mask for CPU%d \n", task->pid,
1088                                 task->cpus_allowed, cpu));
1089                         return -EINVAL;
1090                 }
1091
1092         } else {
1093                 /*
1094                  * must provide a target for the signal in blocking mode even when
1095                  * no counter is configured with PFM_FL_REG_OVFL_NOTIFY
1096                  */
1097                 if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) {
1098                         DBprintk(("must have notify_pid when blocking for [%d]\n", task->pid));
1099                         return -EINVAL;
1100                 }
1101 #if 0
1102                 if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == task->pid) {
1103                         DBprintk(("cannot notify self when blocking for [%d]\n", task->pid));
1104                         return -EINVAL;
1105                 }
1106 #endif
1107         }
1108         /* verify validity of smpl_regs */
1109         if ((smpl_pmds & pmu_conf.impl_pmds[0]) != smpl_pmds) {
1110                 DBprintk(("invalid smpl_regs 0x%lx\n", smpl_pmds));
1111                 return -EINVAL;
1112         }
1113         /* probably more to add here */
1114
1115         return 0;
1116 }
1117
1118 static int
1119 pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int count,
1120                    struct pt_regs *regs)
1121 {
1122         pfarg_context_t tmp;
1123         void *uaddr = NULL;
1124         int ret;
1125         int ctx_flags;
1126         pid_t notify_pid;
1127
1128         /* a context has already been defined */
1129         if (ctx) return -EBUSY;
1130
1131         /*
1132          * not yet supported
1133          */
1134         if (task != current) return -EINVAL;
1135
1136         if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
1137
1138         ret = pfx_is_sane(task, &tmp);
1139         if (ret < 0) return ret;
1140
1141         ctx_flags = tmp.ctx_flags;
1142
1143         ret = pfm_reserve_session(task, ctx_flags & PFM_FL_SYSTEM_WIDE, tmp.ctx_cpu_mask);
1144         if (ret) goto abort;
1145
1146         ret = -ENOMEM;
1147
1148         ctx = pfm_context_alloc();
1149         if (!ctx) goto error;
1150
1151         /* record the creator (important for inheritance) */
1152         ctx->ctx_owner = current;
1153
1154         notify_pid = tmp.ctx_notify_pid;
1155
1156         spin_lock_init(&ctx->ctx_lock);
1157
1158         if (notify_pid == current->pid) {
1159
1160                 ctx->ctx_notify_task = current;
1161                 task->thread.pfm_context = ctx;
1162
1163         } else if (notify_pid!=0) {
1164                 struct task_struct *notify_task;
1165
1166                 read_lock(&tasklist_lock);
1167
1168                 notify_task = find_task_by_pid(notify_pid);
1169
1170                 if (notify_task) {
1171
1172                         ret = -EPERM;
1173
1174                         /*
1175                          * check if we can send this task a signal
1176                          */
1177                         if (pfm_bad_permissions(notify_task)) {
1178                                 read_unlock(&tasklist_lock);
1179                                 goto buffer_error;
1180                         }
1181
1182                         /*
1183                          * make visible
1184                          * must be done inside critical section
1185                          *
1186                          * if the initialization does not go through it is still
1187                          * okay because child will do the scan for nothing which
1188                          * won't hurt.
1189                          */
1190                         task->thread.pfm_context = ctx;
1191
1192                         /*
1193                          * will cause task to check on exit for monitored
1194                          * processes that would notify it. see release_thread()
1195                          * Note: the scan MUST be done in release thread, once the
1196                          * task has been detached from the tasklist otherwise you are
1197                          * exposed to race conditions.
1198                          */
1199                         atomic_add(1, &ctx->ctx_notify_task->thread.pfm_notifiers_check);
1200
1201                         ctx->ctx_notify_task = notify_task;
1202                 }
1203                 read_unlock(&tasklist_lock);
1204         }
1205
1206         /*
1207          * notification process does not exist
1208          */
1209         if (notify_pid != 0 && ctx->ctx_notify_task == NULL) {
1210                 ret = -EINVAL;
1211                 goto buffer_error;
1212         }
1213
1214         if (tmp.ctx_smpl_entries) {
1215                 DBprintk(("sampling entries=%lu\n",tmp.ctx_smpl_entries));
1216
1217                 ret = pfm_smpl_buffer_alloc(ctx, tmp.ctx_smpl_regs,
1218                                                  tmp.ctx_smpl_entries, &uaddr);
1219                 if (ret<0) goto buffer_error;
1220
1221                 tmp.ctx_smpl_vaddr = uaddr;
1222         }
1223         /* initialization of context's flags */
1224         ctx->ctx_fl_inherit   = ctx_flags & PFM_FL_INHERIT_MASK;
1225         ctx->ctx_fl_block     = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
1226         ctx->ctx_fl_system    = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
1227         ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
1228         ctx->ctx_fl_frozen    = 0;
1229         ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
1230
1231         /*
1232          * setting this flag to 0 here means, that the creator or the task that the
1233          * context is being attached are granted access. Given that a context can only
1234          * be created for the calling process this, in effect only allows the creator
1235          * to access the context. See pfm_protect() for more.
1236          */
1237         ctx->ctx_fl_protected = 0;
1238
1239         /* for system wide mode only (only 1 bit set) */
1240         ctx->ctx_cpu = ffz(~tmp.ctx_cpu_mask);
1241
1242         atomic_set(&ctx->ctx_last_cpu,-1); /* SMP only, means no CPU */
1243
1244         sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */
1245
1246         if (__copy_to_user(req, &tmp, sizeof(tmp))) {
1247                 ret = -EFAULT;
1248                 goto buffer_error;
1249         }
1250
1251         DBprintk(("context=%p, pid=%d notify_task=%p\n",
1252                         (void *)ctx, task->pid, ctx->ctx_notify_task));
1253
1254         DBprintk(("context=%p, pid=%d flags=0x%x inherit=%d block=%d system=%d excl_idle=%d\n",
1255                         (void *)ctx, task->pid, ctx_flags, ctx->ctx_fl_inherit,
1256                         ctx->ctx_fl_block, ctx->ctx_fl_system, ctx->ctx_fl_excl_idle));
1257
1258         /*
1259          * when no notification is required, we can make this visible at the last moment
1260          */
1261         if (notify_pid == 0) task->thread.pfm_context = ctx;
1262         /*
1263          * pin task to CPU and force reschedule on exit to ensure
1264          * that when back to user level the task runs on the designated
1265          * CPU.
1266          */
1267         if (ctx->ctx_fl_system) {
1268                 ctx->ctx_saved_cpus_allowed = task->cpus_allowed;
1269                 set_cpus_allowed(task, tmp.ctx_cpu_mask);
1270                 DBprintk(("[%d] rescheduled allowed=0x%lx\n", task->pid, task->cpus_allowed));
1271         }
1272
1273         return 0;
1274
1275 buffer_error:
1276         pfm_context_free(ctx);
1277 error:
1278         pfm_unreserve_session(task, ctx_flags & PFM_FL_SYSTEM_WIDE , tmp.ctx_cpu_mask);
1279 abort:
1280         /* make sure we don't leave anything behind */
1281         task->thread.pfm_context = NULL;
1282
1283         return ret;
1284 }
1285
1286 static inline unsigned long
1287 pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
1288 {
1289         unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
1290         unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
1291         extern unsigned long carta_random32 (unsigned long seed);
1292
1293         if (reg->flags & PFM_REGFL_RANDOM) {
1294                 new_seed = carta_random32(old_seed);
1295                 val -= (old_seed & mask);       /* counter values are negative numbers! */
1296                 if ((mask >> 32) != 0)
1297                         /* construct a full 64-bit random value: */
1298                         new_seed |= carta_random32(old_seed >> 32) << 32;
1299                 reg->seed = new_seed;
1300         }
1301         reg->lval = val;
1302         return val;
1303 }
1304
1305 static void
1306 pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
1307 {
1308         unsigned long mask = ovfl_regs[0];
1309         unsigned long reset_others = 0UL;
1310         unsigned long val;
1311         int i, is_long_reset = (flag == PFM_PMD_LONG_RESET);
1312
1313         /*
1314          * now restore reset value on sampling overflowed counters
1315          */
1316         mask >>= PMU_FIRST_COUNTER;
1317         for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
1318                 if (mask & 0x1) {
1319                         val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);
1320                         reset_others |= ctx->ctx_soft_pmds[i].reset_pmds[0];
1321
1322                         DBprintk_ovfl(("[%d] %s reset soft_pmd[%d]=%lx\n", current->pid,
1323                                   is_long_reset ? "long" : "short", i, val));
1324
1325                         /* upper part is ignored on rval */
1326                         pfm_write_soft_counter(ctx, i, val);
1327                 }
1328         }
1329
1330         /*
1331          * Now take care of resetting the other registers
1332          */
1333         for(i = 0; reset_others; i++, reset_others >>= 1) {
1334
1335                 if ((reset_others & 0x1) == 0) continue;
1336
1337                 val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);
1338
1339                 if (PMD_IS_COUNTING(i)) {
1340                         pfm_write_soft_counter(ctx, i, val);
1341                 } else {
1342                         ia64_set_pmd(i, val);
1343                 }
1344                 DBprintk_ovfl(("[%d] %s reset_others pmd[%d]=%lx\n", current->pid,
1345                           is_long_reset ? "long" : "short", i, val));
1346         }
1347         ia64_srlz_d();
1348 }
1349
1350 static int
1351 pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
1352 {
1353         struct thread_struct *th = &task->thread;
1354         pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
1355         unsigned long value, reset_pmds;
1356         unsigned int cnum, reg_flags, flags;
1357         int i;
1358         int ret = -EINVAL;
1359
1360         /* we don't quite support this right now */
1361         if (task != current) return -EINVAL;
1362
1363         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1364
1365         /* XXX: ctx locking may be required here */
1366
1367         for (i = 0; i < count; i++, req++) {
1368
1369                 if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
1370
1371                 cnum       = tmp.reg_num;
1372                 reg_flags  = tmp.reg_flags;
1373                 value      = tmp.reg_value;
1374                 reset_pmds = tmp.reg_reset_pmds[0];
1375                 flags      = 0;
1376
1377                 /*
1378                  * we reject all non implemented PMC as well
1379                  * as attempts to modify PMC[0-3] which are used
1380                  * as status registers by the PMU
1381                  */
1382                 if (!PMC_IS_IMPL(cnum) || cnum < 4) {
1383                         DBprintk(("pmc[%u] is unimplemented or invalid\n", cnum));
1384                         goto error;
1385                 }
1386                 /*
1387                  * A PMC used to configure monitors must be:
1388                  *      - system-wide session: privileged monitor
1389                  *      - per-task : user monitor
1390                  * any other configuration is rejected.
1391                  */
1392                 if (PMC_IS_MONITOR(cnum) || PMC_IS_COUNTING(cnum)) {
1393                         DBprintk(("pmc[%u].pm=%ld\n", cnum, PMC_PM(cnum, value)));
1394
1395                         if (ctx->ctx_fl_system ^ PMC_PM(cnum, value)) {
1396                                 DBprintk(("pmc_pm=%ld fl_system=%d\n", PMC_PM(cnum, value), ctx->ctx_fl_system));
1397                                 goto error;
1398                         }
1399                 }
1400
1401                 if (PMC_IS_COUNTING(cnum)) {
1402                         pfm_monitor_t *p = (pfm_monitor_t *)&value;
1403                         /*
1404                          * enforce generation of overflow interrupt. Necessary on all
1405                          * CPUs.
1406                          */
1407                         p->pmc_oi = 1;
1408
1409                         if (reg_flags & PFM_REGFL_OVFL_NOTIFY) {
1410                                 /*
1411                                  * must have a target for the signal
1412                                  */
1413                                 if (ctx->ctx_notify_task == NULL) {
1414                                         DBprintk(("cannot set ovfl_notify: no notify_task\n"));
1415                                         goto error;
1416                                 }
1417                                 flags |= PFM_REGFL_OVFL_NOTIFY;
1418                         }
1419
1420                         if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM;
1421
1422                         /* verify validity of reset_pmds */
1423                         if ((reset_pmds & pmu_conf.impl_pmds[0]) != reset_pmds) {
1424                                 DBprintk(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum));
1425                                 goto error;
1426                         }
1427                 } else if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) {
1428                                 DBprintk(("cannot set ovfl_notify or random on pmc%u\n", cnum));
1429                                 goto error;
1430                 }
1431
1432                 /*
1433                  * execute write checker, if any
1434                  */
1435                 if (PMC_WR_FUNC(cnum)) {
1436                         ret = PMC_WR_FUNC(cnum)(task, cnum, &value, regs);
1437                         if (ret) goto error;
1438                         ret = -EINVAL;
1439                 }
1440
1441                 /*
1442                  * no error on this register
1443                  */
1444                 PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
1445
1446                 /*
1447                  * update register return value, abort all if problem during copy.
1448                  * we only modify the reg_flags field. no check mode is fine because
1449                  * access has been verified upfront in sys_perfmonctl().
1450                  *
1451                  * If this fails, then the software state is not modified
1452                  */
1453                 if (__put_user(tmp.reg_flags, &req->reg_flags)) return -EFAULT;
1454
1455                 /*
1456                  * Now we commit the changes to the software state
1457                  */
1458
1459                 /*
1460                  * full flag update each time a register is programmed
1461                  */
1462                 ctx->ctx_soft_pmds[cnum].flags = flags;
1463
1464                 if (PMC_IS_COUNTING(cnum)) {
1465                         ctx->ctx_soft_pmds[cnum].reset_pmds[0] = reset_pmds;
1466
1467                         /* mark all PMDS to be accessed as used */
1468                         CTX_USED_PMD(ctx, reset_pmds);
1469                 }
1470
1471                 /*
1472                  * Needed in case the user does not initialize the equivalent
1473                  * PMD. Clearing is done in reset_pmu() so there is no possible
1474                  * leak here.
1475                  */
1476                 CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]);
1477
1478                 /*
1479                  * keep copy the pmc, used for register reload
1480                  */
1481                 th->pmc[cnum] = value;
1482
1483                 ia64_set_pmc(cnum, value);
1484
1485                 DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x used_pmds=0x%lx\n",
1486                           task->pid, cnum, value,
1487                           ctx->ctx_soft_pmds[cnum].flags,
1488                           ctx->ctx_used_pmds[0]));
1489
1490         }
1491
1492         return 0;
1493
1494 error:
1495         PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
1496
1497         if (__put_user(tmp.reg_flags, &req->reg_flags)) ret = -EFAULT;
1498
1499         DBprintk(("[%d] pmc[%u]=0x%lx error %d\n", task->pid, cnum, value, ret));
1500
1501         return ret;
1502 }
1503
1504 static int
1505 pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
1506 {
1507         pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
1508         unsigned long value, hw_value;
1509         unsigned int cnum;
1510         int i;
1511         int ret = -EINVAL;
1512
1513         /* we don't quite support this right now */
1514         if (task != current) return -EINVAL;
1515
1516         /*
1517          * Cannot do anything before PMU is enabled
1518          */
1519         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1520         preempt_disable();
1521
1522         /* XXX: ctx locking may be required here */
1523
1524
1525         for (i = 0; i < count; i++, req++) {
1526
1527                 if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
1528
1529                 cnum  = tmp.reg_num;
1530                 value = tmp.reg_value;
1531
1532                 if (!PMD_IS_IMPL(cnum)) {
1533                         DBprintk(("pmd[%u] is unimplemented or invalid\n", cnum));
1534                         goto abort_mission;
1535                 }
1536
1537                 /*
1538                  * execute write checker, if any
1539                  */
1540                 if (PMD_WR_FUNC(cnum)) {
1541                         unsigned long v = value;
1542                         ret = PMD_WR_FUNC(cnum)(task, cnum, &v, regs);
1543                         if (ret) goto abort_mission;
1544                         value = v;
1545                         ret = -EINVAL;
1546                 }
1547                 hw_value = value;
1548                 /*
1549                  * no error on this register
1550                  */
1551                 PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
1552
1553                 if (__put_user(tmp.reg_flags, &req->reg_flags)) return -EFAULT;
1554
1555                 /*
1556                  * now commit changes to software state
1557                  */
1558
1559                 /* update virtualized (64bits) counter */
1560                 if (PMD_IS_COUNTING(cnum)) {
1561                         ctx->ctx_soft_pmds[cnum].lval = value;
1562                         ctx->ctx_soft_pmds[cnum].val  = value & ~pmu_conf.ovfl_val;
1563
1564                         hw_value = value & pmu_conf.ovfl_val;
1565
1566                         ctx->ctx_soft_pmds[cnum].long_reset  = tmp.reg_long_reset;
1567                         ctx->ctx_soft_pmds[cnum].short_reset = tmp.reg_short_reset;
1568
1569                         ctx->ctx_soft_pmds[cnum].seed = tmp.reg_random_seed;
1570                         ctx->ctx_soft_pmds[cnum].mask = tmp.reg_random_mask;
1571                 }
1572
1573                 /* keep track of what we use */
1574                 CTX_USED_PMD(ctx, pmu_conf.pmd_desc[(cnum)].dep_pmd[0]);
1575
1576                 /* mark this register as used as well */
1577                 CTX_USED_PMD(ctx, RDEP(cnum));
1578
1579                 /* writes to unimplemented part is ignored, so this is safe */
1580                 ia64_set_pmd(cnum, hw_value);
1581
1582                 /* to go away */
1583                 ia64_srlz_d();
1584
1585                 DBprintk(("[%d] pmd[%u]: value=0x%lx hw_value=0x%lx soft_pmd=0x%lx  short_reset=0x%lx "
1586                           "long_reset=0x%lx hw_pmd=%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx\n",
1587                                 task->pid, cnum,
1588                                 value, hw_value,
1589                                 ctx->ctx_soft_pmds[cnum].val,
1590                                 ctx->ctx_soft_pmds[cnum].short_reset,
1591                                 ctx->ctx_soft_pmds[cnum].long_reset,
1592                                 ia64_get_pmd(cnum) & pmu_conf.ovfl_val,
1593                                 PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
1594                                 ctx->ctx_used_pmds[0],
1595                                 ctx->ctx_soft_pmds[cnum].reset_pmds[0]));
1596         }
1597         preempt_enable();
1598         return 0;
1599
1600 abort_mission:
1601         preempt_enable();
1602
1603         /*
1604          * for now, we have only one possibility for error
1605          */
1606         PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
1607
1608         /*
1609          * we change the return value to EFAULT in case we cannot write register return code.
1610          * The caller first must correct this error, then a resubmission of the request will
1611          * eventually yield the EINVAL.
1612          */
1613         if (__put_user(tmp.reg_flags, &req->reg_flags)) ret = -EFAULT;
1614
1615         DBprintk(("[%d] pmc[%u]=0x%lx ret %d\n", task->pid, cnum, value, ret));
1616
1617         return ret;
1618 }
1619
1620 static int
1621 pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
1622 {
1623         struct thread_struct *th = &task->thread;
1624         unsigned long val, lval;
1625         pfarg_reg_t *req = (pfarg_reg_t *)arg;
1626         unsigned int cnum, reg_flags = 0;
1627         int i, ret = 0;
1628
1629 #if __GNUC__ < 3
1630         int foo;
1631 #endif
1632
1633         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1634
1635         /*
1636          * XXX: MUST MAKE SURE WE DON"T HAVE ANY PENDING OVERFLOW BEFORE READING
1637          * This is required when the monitoring has been stoppped by user or kernel.
1638          * If it is still going on, then that's fine because we a re not guaranteed
1639          * to return an accurate value in this case.
1640          */
1641
1642         /* XXX: ctx locking may be required here */
1643
1644         DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), task->pid));
1645
1646         for (i = 0; i < count; i++, req++) {
1647                 int me;
1648 #if __GNUC__ < 3
1649                 foo = __get_user(cnum, &req->reg_num);
1650                 if (foo) return -EFAULT;
1651                 foo = __get_user(reg_flags, &req->reg_flags);
1652                 if (foo) return -EFAULT;
1653 #else
1654                 if (__get_user(cnum, &req->reg_num)) return -EFAULT;
1655                 if (__get_user(reg_flags, &req->reg_flags)) return -EFAULT;
1656 #endif
1657                 lval = 0UL;
1658
1659                 if (!PMD_IS_IMPL(cnum)) goto abort_mission;
1660                 /*
1661                  * we can only read the register that we use. That includes
1662                  * the one we explicitely initialize AND the one we want included
1663                  * in the sampling buffer (smpl_regs).
1664                  *
1665                  * Having this restriction allows optimization in the ctxsw routine
1666                  * without compromising security (leaks)
1667                  */
1668                 if (!CTX_IS_USED_PMD(ctx, cnum)) goto abort_mission;
1669
1670                 /*
1671                  * If the task is not the current one, then we check if the
1672                  * PMU state is still in the local live register due to lazy ctxsw.
1673                  * If true, then we read directly from the registers.
1674                  */
1675                 me = get_cpu();
1676                 if (atomic_read(&ctx->ctx_last_cpu) == me){
1677                         ia64_srlz_d();
1678                         val = ia64_get_pmd(cnum);
1679                         DBprintk(("reading pmd[%u]=0x%lx from hw\n", cnum, val));
1680                 } else {
1681                         val = th->pmd[cnum];
1682                 }
1683
1684
1685                 if (PMD_IS_COUNTING(cnum)) {
1686                         /*
1687                          * XXX: need to check for overflow
1688                          */
1689                         val &= pmu_conf.ovfl_val;
1690                         val += ctx->ctx_soft_pmds[cnum].val;
1691
1692                         lval = ctx->ctx_soft_pmds[cnum].lval;
1693                 }
1694
1695                 /*
1696                  * execute read checker, if any
1697                  */
1698                 if (PMD_RD_FUNC(cnum)) {
1699                         unsigned long v = val;
1700                         ret = PMD_RD_FUNC(cnum)(task, cnum, &v, regs);
1701                         val = v;
1702                 }
1703
1704                 PFM_REG_RETFLAG_SET(reg_flags, ret);
1705
1706                 put_cpu();
1707
1708                 DBprintk(("read pmd[%u] ret=%d value=0x%lx pmc=0x%lx\n",
1709                                         cnum, ret, val, ia64_get_pmc(cnum)));
1710
1711                 /*
1712                  * update register return value, abort all if problem during copy.
1713                  * we only modify the reg_flags field. no check mode is fine because
1714                  * access has been verified upfront in sys_perfmonctl().
1715                  */
1716                 if (__put_user(cnum, &req->reg_num)) return -EFAULT;
1717                 if (__put_user(val, &req->reg_value)) return -EFAULT;
1718                 if (__put_user(reg_flags, &req->reg_flags)) return -EFAULT;
1719                 if (__put_user(lval, &req->reg_last_reset_value)) return -EFAULT;
1720         }
1721
1722         return 0;
1723
1724 abort_mission:
1725         PFM_REG_RETFLAG_SET(reg_flags, PFM_REG_RETFL_EINVAL);
1726         /*
1727          * XXX: if this fails, we stick with the original failure, flag not updated!
1728          */
1729         __put_user(reg_flags, &req->reg_flags);
1730
1731         return -EINVAL;
1732 }
1733
1734 #ifdef PFM_PMU_USES_DBR
1735 /*
1736  * Only call this function when a process it trying to
1737  * write the debug registers (reading is always allowed)
1738  */
1739 int
1740 pfm_use_debug_registers(struct task_struct *task)
1741 {
1742         pfm_context_t *ctx = task->thread.pfm_context;
1743         int ret = 0;
1744
1745         DBprintk(("called for [%d]\n", task->pid));
1746
1747         /*
1748          * do it only once
1749          */
1750         if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;
1751
1752         /*
1753          * Even on SMP, we do not need to use an atomic here because
1754          * the only way in is via ptrace() and this is possible only when the
1755          * process is stopped. Even in the case where the ctxsw out is not totally
1756          * completed by the time we come here, there is no way the 'stopped' process
1757          * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
1758          * So this is always safe.
1759          */
1760         if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;
1761
1762         LOCK_PFS();
1763
1764         /*
1765          * We cannot allow setting breakpoints when system wide monitoring
1766          * sessions are using the debug registers.
1767          */
1768         if (pfm_sessions.pfs_sys_use_dbregs> 0)
1769                 ret = -1;
1770         else
1771                 pfm_sessions.pfs_ptrace_use_dbregs++;
1772
1773         DBprintk(("ptrace_use_dbregs=%u  sys_use_dbregs=%u by [%d] ret = %d\n",
1774                   pfm_sessions.pfs_ptrace_use_dbregs,
1775                   pfm_sessions.pfs_sys_use_dbregs,
1776                   task->pid, ret));
1777
1778         UNLOCK_PFS();
1779
1780         return ret;
1781 }
1782
1783 /*
1784  * This function is called for every task that exits with the
1785  * IA64_THREAD_DBG_VALID set. This indicates a task which was
1786  * able to use the debug registers for debugging purposes via
1787  * ptrace(). Therefore we know it was not using them for
1788  * perfmormance monitoring, so we only decrement the number
1789  * of "ptraced" debug register users to keep the count up to date
1790  */
1791 int
1792 pfm_release_debug_registers(struct task_struct *task)
1793 {
1794         int ret;
1795
1796         LOCK_PFS();
1797         if (pfm_sessions.pfs_ptrace_use_dbregs == 0) {
1798                 printk(KERN_DEBUG "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n",
1799                        task->pid);
1800                 ret = -1;
1801         }  else {
1802                 pfm_sessions.pfs_ptrace_use_dbregs--;
1803                 ret = 0;
1804         }
1805         UNLOCK_PFS();
1806
1807         return ret;
1808 }
1809 #else /* !PFM_PMU_USES_DBR */
/*
 * When the PMU does not use the debug registers, pfm_use_debug_registers()
 * and pfm_release_debug_registers() are no-ops that always succeed.
 * pfm_use_debug_registers() is called from arch/ia64/kernel/ptrace.c and
 * pfm_release_debug_registers() from arch/ia64/kernel/process.c.
 */
int
pfm_use_debug_registers(struct task_struct *task)
{
	(void)task;	/* nothing to account for */

	return 0;
}
1820
/* No-op variant: there is no debug register accounting to undo. */
int
pfm_release_debug_registers(struct task_struct *task)
{
	(void)task;

	return 0;
}
1826 #endif /* PFM_PMU_USES_DBR */
1827
1828 static int
1829 pfm_restart(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1830          struct pt_regs *regs)
1831 {
1832         void *sem = &ctx->ctx_restart_sem;
1833
1834         /*
1835          * Cannot do anything before PMU is enabled
1836          */
1837         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1838
1839         if (task == current) {
1840                 DBprintk(("restarting self %d frozen=%d ovfl_regs=0x%lx\n",
1841                         task->pid,
1842                         ctx->ctx_fl_frozen,
1843                         ctx->ctx_ovfl_regs[0]));
1844
1845                 preempt_disable();
1846                 pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
1847
1848                 ctx->ctx_ovfl_regs[0] = 0UL;
1849
1850                 /*
1851                  * We ignore block/don't block because we never block
1852                  * for a self-monitoring process.
1853                  */
1854                 ctx->ctx_fl_frozen = 0;
1855
1856                 if (CTX_HAS_SMPL(ctx)) {
1857                         ctx->ctx_psb->psb_hdr->hdr_count = 0;
1858                         ctx->ctx_psb->psb_index = 0;
1859                 }
1860
1861                 /* simply unfreeze */
1862                 pfm_unfreeze_pmu();
1863
1864                 preempt_enable();
1865
1866                 return 0;
1867         }
1868         /* restart on another task */
1869
1870         /*
1871          * if blocking, then post the semaphore.
1872          * if non-blocking, then we ensure that the task will go into
1873          * pfm_overflow_must_block() before returning to user mode.
1874          * We cannot explicitely reset another task, it MUST always
1875          * be done by the task itself. This works for system wide because
1876          * the tool that is controlling the session is doing "self-monitoring".
1877          *
1878          * XXX: what if the task never goes back to user?
1879          *
1880          */
1881         if (CTX_OVFL_NOBLOCK(ctx) == 0) {
1882                 DBprintk(("unblocking %d \n", task->pid));
1883                 up(sem);
1884         } else {
1885                 task->thread.pfm_ovfl_block_reset = 1;
1886         }
1887 #if 0
1888         /*
1889          * in case of non blocking mode, then it's just a matter of
1890          * of reseting the sampling buffer (if any) index. The PMU
1891          * is already active.
1892          */
1893
1894         /*
1895          * must reset the header count first
1896          */
1897         if (CTX_HAS_SMPL(ctx)) {
1898                 DBprintk(("resetting sampling indexes for %d \n", task->pid));
1899                 ctx->ctx_psb->psb_hdr->hdr_count = 0;
1900                 ctx->ctx_psb->psb_index = 0;
1901         }
1902 #endif
1903         return 0;
1904 }
1905
1906 static int
1907 pfm_stop(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1908          struct pt_regs *regs)
1909 {
1910         /* we don't quite support this right now */
1911         if (task != current) return -EINVAL;
1912
1913         /*
1914          * Cannot do anything before PMU is enabled
1915          */
1916         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1917
1918         DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
1919                                 current->pid,
1920                                 ctx->ctx_fl_system, PMU_OWNER(),
1921                                 current));
1922
1923         preempt_disable();
1924         /* simply stop monitoring but not the PMU */
1925         if (ctx->ctx_fl_system) {
1926
1927                 /* disable dcr pp */
1928                 ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
1929
1930                 /* stop monitoring */
1931                 pfm_clear_psr_pp();
1932
1933                 ia64_srlz_i();
1934
1935                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
1936
1937                 ia64_psr(regs)->pp = 0;
1938
1939         } else {
1940
1941                 /* stop monitoring */
1942                 pfm_clear_psr_up();
1943
1944                 ia64_srlz_i();
1945
1946                 /*
1947                  * clear user level psr.up
1948                  */
1949                 ia64_psr(regs)->up = 0;
1950         }
1951         preempt_enable();
1952         return 0;
1953 }
1954
1955 static int
1956 pfm_disable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1957            struct pt_regs *regs)
1958 {
1959         /* we don't quite support this right now */
1960         if (task != current) return -EINVAL;
1961
1962         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1963
1964         preempt_disable();
1965         /*
1966          * stop monitoring, freeze PMU, and save state in context
1967          * this call will clear IA64_THREAD_PM_VALID for per-task sessions.
1968          */
1969         pfm_flush_regs(task);
1970
1971         if (ctx->ctx_fl_system) {
1972                 ia64_psr(regs)->pp = 0;
1973         } else {
1974                 ia64_psr(regs)->up = 0;
1975         }
1976         /*
1977          * goes back to default behavior: no user level control
1978          * no need to change live psr.sp because useless at the kernel level
1979          */
1980         ia64_psr(regs)->sp = 1;
1981
1982         DBprintk(("enabling psr.sp for [%d]\n", current->pid));
1983
1984         ctx->ctx_flags.state = PFM_CTX_DISABLED;
1985         preempt_enable();
1986
1987         return 0;
1988 }
1989
1990 static int
1991 pfm_context_destroy(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1992          struct pt_regs *regs)
1993 {
1994         /* we don't quite support this right now */
1995         if (task != current) return -EINVAL;
1996
1997         /*
1998          * if context was never enabled, then there is not much
1999          * to do
2000          */
2001         if (!CTX_IS_ENABLED(ctx)) goto skipped_stop;
2002
2003         /*
2004          * Disable context: stop monitoring, flush regs to software state (useless here),
2005          * and freeze PMU
2006          *
2007          * The IA64_THREAD_PM_VALID is cleared by pfm_flush_regs() called from pfm_disable()
2008          */
2009         pfm_disable(task, ctx, arg, count, regs);
2010
2011         if (ctx->ctx_fl_system) {
2012                 ia64_psr(regs)->pp = 0;
2013         } else {
2014                 ia64_psr(regs)->up = 0;
2015         }
2016
2017 skipped_stop:
2018         /*
2019          * remove sampling buffer mapping, if any
2020          */
2021         if (ctx->ctx_smpl_vaddr) {
2022                 pfm_remove_smpl_mapping(task);
2023                 ctx->ctx_smpl_vaddr = 0UL;
2024         }
2025         /* now free context and related state */
2026         pfm_context_exit(task);
2027
2028         return 0;
2029 }
2030
2031 /*
2032  * does nothing at the moment
2033  */
2034 static int
2035 pfm_context_unprotect(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2036          struct pt_regs *regs)
2037 {
2038         return 0;
2039 }
2040
2041 static int
2042 pfm_protect_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2043          struct pt_regs *regs)
2044 {
2045         DBprintk(("context from [%d] is protected\n", task->pid));
2046         /*
2047          * from now on, only the creator of the context has access to it
2048          */
2049         ctx->ctx_fl_protected = 1;
2050
2051         /*
2052          * reinforce secure monitoring: cannot toggle psr.up
2053          */
2054         ia64_psr(regs)->sp = 1;
2055
2056         return 0;
2057 }
2058
2059 static int
2060 pfm_debug(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2061          struct pt_regs *regs)
2062 {
2063         unsigned int mode = *(unsigned int *)arg;
2064
2065         pfm_sysctl.debug = mode == 0 ? 0 : 1;
2066
2067         printk(KERN_INFO "perfmon debugging %s\n", pfm_sysctl.debug ? "on" : "off");
2068
2069         return 0;
2070 }
2071
2072 #ifdef PFM_PMU_USES_DBR
2073
/*
 * Bit-field view of an instruction breakpoint register value as written by
 * pfm_write_ibr_dbr().
 */
typedef struct {
	unsigned long ibr_mask:56;	/* presumably the address mask -- confirm against the architecture manual */
	unsigned long ibr_plm:4;	/* presumably the privilege level mask -- confirm */
	unsigned long ibr_ig:3;		/* presumably ignored bits -- confirm */
	unsigned long ibr_x:1;		/* cleared by perfmon so that no enabled breakpoint is installed */
} ibr_mask_reg_t;

/*
 * Bit-field view of a data breakpoint register value as written by
 * pfm_write_ibr_dbr().
 */
typedef struct {
	unsigned long dbr_mask:56;	/* presumably the address mask -- confirm against the architecture manual */
	unsigned long dbr_plm:4;	/* presumably the privilege level mask -- confirm */
	unsigned long dbr_ig:2;		/* presumably ignored bits -- confirm */
	unsigned long dbr_w:1;		/* cleared by perfmon, see dbr_r */
	unsigned long dbr_r:1;		/* cleared by perfmon so that no enabled breakpoint is installed */
} dbr_mask_reg_t;

/*
 * A debug register value, accessible either as a raw word or through one
 * of the two bit-field views.
 */
typedef union {
	unsigned long  val;
	ibr_mask_reg_t ibr;
	dbr_mask_reg_t dbr;
} dbreg_t;
2094
2095 static int
2096 pfm_write_ibr_dbr(int mode, struct task_struct *task, void *arg, int count, struct pt_regs *regs)
2097 {
2098         struct thread_struct *thread = &task->thread;
2099         pfm_context_t *ctx = task->thread.pfm_context;
2100         pfarg_dbreg_t tmp, *req = (pfarg_dbreg_t *)arg;
2101         dbreg_t dbreg;
2102         unsigned int rnum;
2103         int first_time;
2104         int i, ret = 0;
2105
2106         /*
2107          * we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w
2108          * ensuring that no real breakpoint can be installed via this call.
2109          */
2110
2111         first_time = ctx->ctx_fl_using_dbreg == 0;
2112
2113         /*
2114          * check for debug registers in system wide mode
2115          *
2116          */
2117         LOCK_PFS();
2118         if (ctx->ctx_fl_system && first_time) {
2119                 if (pfm_sessions.pfs_ptrace_use_dbregs)
2120                         ret = -EBUSY;
2121                 else
2122                         pfm_sessions.pfs_sys_use_dbregs++;
2123         }
2124         UNLOCK_PFS();
2125
2126         if (ret != 0) return ret;
2127
2128         if (ctx->ctx_fl_system) {
2129                 /* we mark ourselves as owner  of the debug registers */
2130                 ctx->ctx_fl_using_dbreg = 1;
2131                 DBprintk(("system-wide setting fl_using_dbreg for [%d]\n", task->pid));
2132         } else if (first_time) {
2133                         ret= -EBUSY;
2134                         if ((thread->flags & IA64_THREAD_DBG_VALID) != 0) {
2135                                 DBprintk(("debug registers already in use for [%d]\n", task->pid));
2136                                 goto abort_mission;
2137                         }
2138                         /* we mark ourselves as owner  of the debug registers */
2139                         ctx->ctx_fl_using_dbreg = 1;
2140
2141                         DBprintk(("setting fl_using_dbreg for [%d]\n", task->pid));
2142                         /*
2143                          * Given debug registers cannot be used for both debugging
2144                          * and performance monitoring at the same time, we reuse
2145                          * the storage area to save and restore the registers on ctxsw.
2146                          */
2147                         memset(task->thread.dbr, 0, sizeof(task->thread.dbr));
2148                         memset(task->thread.ibr, 0, sizeof(task->thread.ibr));
2149         }
2150
2151         if (first_time) {
2152                 DBprintk(("[%d] clearing ibrs,dbrs\n", task->pid));
2153                 /*
2154                  * clear hardware registers to make sure we don't
2155                  * pick up stale state.
2156                  *
2157                  * for a system wide session, we do not use
2158                  * thread.dbr, thread.ibr because this process
2159                  * never leaves the current CPU and the state
2160                  * is shared by all processes running on it
2161                  */
2162                 for (i=0; i < pmu_conf.num_ibrs; i++) {
2163                         ia64_set_ibr(i, 0UL);
2164                 }
2165                 ia64_srlz_i();
2166                 for (i=0; i < pmu_conf.num_dbrs; i++) {
2167                         ia64_set_dbr(i, 0UL);
2168                 }
2169                 ia64_srlz_d();
2170         }
2171
2172         ret = -EFAULT;
2173
2174         /*
2175          * Now install the values into the registers
2176          */
2177         for (i = 0; i < count; i++, req++) {
2178
2179                 if (__copy_from_user(&tmp, req, sizeof(tmp))) goto abort_mission;
2180
2181                 rnum      = tmp.dbreg_num;
2182                 dbreg.val = tmp.dbreg_value;
2183
2184                 ret = -EINVAL;
2185
2186                 if ((mode == 0 && !IBR_IS_IMPL(rnum)) || ((mode == 1) && !DBR_IS_IMPL(rnum))) {
2187                         DBprintk(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
2188                                   rnum, dbreg.val, mode, i, count));
2189
2190                         goto abort_mission;
2191                 }
2192
2193                 /*
2194                  * make sure we do not install enabled breakpoint
2195                  */
2196                 if (rnum & 0x1) {
2197                         if (mode == 0)
2198                                 dbreg.ibr.ibr_x = 0;
2199                         else
2200                                 dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
2201                 }
2202
2203                 /*
2204                  * clear return flags and copy back to user
2205                  *
2206                  * XXX: fix once EAGAIN is implemented
2207                  */
2208                 ret = -EFAULT;
2209
2210                 PFM_REG_RETFLAG_SET(tmp.dbreg_flags, 0);
2211
2212                 if (__copy_to_user(req, &tmp, sizeof(tmp))) goto abort_mission;
2213
2214                 /*
2215                  * Debug registers, just like PMC, can only be modified
2216                  * by a kernel call. Moreover, perfmon() access to those
2217                  * registers are centralized in this routine. The hardware
2218                  * does not modify the value of these registers, therefore,
2219                  * if we save them as they are written, we can avoid having
2220                  * to save them on context switch out. This is made possible
2221                  * by the fact that when perfmon uses debug registers, ptrace()
2222                  * won't be able to modify them concurrently.
2223                  */
2224                 if (mode == 0) {
2225                         CTX_USED_IBR(ctx, rnum);
2226
2227                         ia64_set_ibr(rnum, dbreg.val);
2228                         ia64_srlz_i();
2229
2230                         thread->ibr[rnum] = dbreg.val;
2231
2232                         DBprintk(("write ibr%u=0x%lx used_ibrs=0x%lx\n", rnum, dbreg.val, ctx->ctx_used_ibrs[0]));
2233                 } else {
2234                         CTX_USED_DBR(ctx, rnum);
2235
2236                         ia64_set_dbr(rnum, dbreg.val);
2237                         ia64_srlz_d();
2238
2239                         thread->dbr[rnum] = dbreg.val;
2240
2241                         DBprintk(("write dbr%u=0x%lx used_dbrs=0x%lx\n", rnum, dbreg.val, ctx->ctx_used_dbrs[0]));
2242                 }
2243         }
2244
2245         return 0;
2246
2247 abort_mission:
2248         /*
2249          * in case it was our first attempt, we undo the global modifications
2250          */
2251         if (first_time) {
2252                 LOCK_PFS();
2253                 if (ctx->ctx_fl_system) {
2254                         pfm_sessions.pfs_sys_use_dbregs--;
2255                 }
2256                 UNLOCK_PFS();
2257                 ctx->ctx_fl_using_dbreg = 0;
2258         }
2259         /*
2260          * install error return flag
2261          */
2262         if (ret != -EFAULT) {
2263                 /*
2264                  * XXX: for now we can only come here on EINVAL
2265                  */
2266                 PFM_REG_RETFLAG_SET(tmp.dbreg_flags, PFM_REG_RETFL_EINVAL);
2267                 if (__put_user(tmp.dbreg_flags, &req->dbreg_flags)) ret = -EFAULT;
2268         }
2269         return ret;
2270 }
2271
2272 static int
2273 pfm_write_ibrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2274          struct pt_regs *regs)
2275 {
2276         /* we don't quite support this right now */
2277         if (task != current) return -EINVAL;
2278
2279         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
2280
2281         return pfm_write_ibr_dbr(0, task, arg, count, regs);
2282 }
2283
2284 static int
2285 pfm_write_dbrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2286          struct pt_regs *regs)
2287 {
2288         /* we don't quite support this right now */
2289         if (task != current) return -EINVAL;
2290
2291         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
2292
2293         return pfm_write_ibr_dbr(1, task, arg, count, regs);
2294 }
2295
2296 #endif /* PFM_PMU_USES_DBR */
2297
2298 static int
2299 pfm_get_features(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
2300 {
2301         pfarg_features_t tmp;
2302
2303         memset(&tmp, 0, sizeof(tmp));
2304
2305         tmp.ft_version      = PFM_VERSION;
2306         tmp.ft_smpl_version = PFM_SMPL_VERSION;
2307
2308         if (__copy_to_user(arg, &tmp, sizeof(tmp))) return -EFAULT;
2309
2310         return 0;
2311 }
2312
2313 static int
2314 pfm_start(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2315           struct pt_regs *regs)
2316 {
2317         /* we don't quite support this right now */
2318         if (task != current) return -EINVAL;
2319
2320         /*
2321          * Cannot do anything before PMU is enabled
2322          */
2323         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
2324
2325         DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
2326                                 current->pid,
2327                                 ctx->ctx_fl_system, PMU_OWNER(),
2328                                 current));
2329
2330         if (PMU_OWNER() != task) {
2331                 printk(KERN_DEBUG "perfmon: pfm_start task [%d] not pmu owner\n", task->pid);
2332                 return -EINVAL;
2333         }
2334
2335         preempt_disable();
2336         if (ctx->ctx_fl_system) {
2337
2338                 PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP);
2339
2340                 /* set user level psr.pp */
2341                 ia64_psr(regs)->pp = 1;
2342
2343                 /* start monitoring at kernel level */
2344                 pfm_set_psr_pp();
2345
2346                 /* enable dcr pp */
2347                 ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);
2348
2349                 ia64_srlz_i();
2350
2351         } else {
2352                 if ((task->thread.flags & IA64_THREAD_PM_VALID) == 0) {
2353                         preempt_enable();
2354                         printk(KERN_DEBUG "perfmon: pfm_start task flag not set for [%d]\n",
2355                                task->pid);
2356                         return -EINVAL;
2357                 }
2358                 /* set user level psr.up */
2359                 ia64_psr(regs)->up = 1;
2360
2361                 /* start monitoring at kernel level */
2362                 pfm_set_psr_up();
2363
2364                 ia64_srlz_i();
2365         }
2366
2367         preempt_enable();
2368         return 0;
2369 }
2370
2371 static int
2372 pfm_enable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2373            struct pt_regs *regs)
2374 {
2375         int me;
2376
2377         /* we don't quite support this right now */
2378         if (task != current) return -EINVAL;
2379
2380         me = get_cpu();  /* make sure we're not migrated or preempted */
2381
2382         if (ctx->ctx_fl_system == 0 && PMU_OWNER()  && PMU_OWNER() != current)
2383                 pfm_lazy_save_regs(PMU_OWNER());
2384
2385         /* reset all registers to stable quiet state */
2386         pfm_reset_pmu(task);
2387
2388         /* make sure nothing starts */
2389         if (ctx->ctx_fl_system) {
2390                 ia64_psr(regs)->pp = 0;
2391                 ia64_psr(regs)->up = 0; /* just to make sure! */
2392
2393                 /* make sure monitoring is stopped */
2394                 pfm_clear_psr_pp();
2395                 ia64_srlz_i();
2396
2397                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
2398                 PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE);
2399                 if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE);
2400         } else {
2401                 /*
2402                  * needed in case the task was a passive task during
2403                  * a system wide session and now wants to have its own
2404                  * session
2405                  */
2406                 ia64_psr(regs)->pp = 0; /* just to make sure! */
2407                 ia64_psr(regs)->up = 0;
2408
2409                 /* make sure monitoring is stopped */
2410                 pfm_clear_psr_up();
2411                 ia64_srlz_i();
2412
2413                 DBprintk(("clearing psr.sp for [%d]\n", current->pid));
2414
2415                 /* allow user level control  */
2416                 ia64_psr(regs)->sp = 0;
2417
2418                 /* PMU state will be saved/restored on ctxsw */
2419                 task->thread.flags |= IA64_THREAD_PM_VALID;
2420         }
2421
2422         SET_PMU_OWNER(task);
2423
2424         ctx->ctx_flags.state = PFM_CTX_ENABLED;
2425         atomic_set(&ctx->ctx_last_cpu, me);
2426
2427         /* simply unfreeze */
2428         pfm_unfreeze_pmu();
2429
2430         put_cpu();
2431
2432         return 0;
2433 }
2434
2435 static int
2436 pfm_get_pmc_reset(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2437            struct pt_regs *regs)
2438 {
2439         pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
2440         unsigned int cnum;
2441         int i, ret = -EINVAL;
2442
2443         for (i = 0; i < count; i++, req++) {
2444
2445                 if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
2446
2447                 cnum = tmp.reg_num;
2448
2449                 if (!PMC_IS_IMPL(cnum)) goto abort_mission;
2450
2451                 tmp.reg_value = PMC_DFL_VAL(cnum);
2452
2453                 PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
2454
2455                 DBprintk(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, tmp.reg_value));
2456
2457                 if (__copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
2458         }
2459         return 0;
2460 abort_mission:
2461         PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
2462         if (__copy_to_user(req, &tmp, sizeof(tmp))) ret = -EFAULT;
2463
2464         return ret;
2465 }
2466
2467 /*
2468  * functions MUST be listed in the increasing order of their index (see permfon.h)
2469  */
2470 static pfm_cmd_desc_t pfm_cmd_tab[]={
2471 /* 0  */{ NULL, 0, 0, 0}, /* not used */
2472 /* 1  */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2473 /* 2  */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2474 /* 3  */{ pfm_read_pmds,PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2475 /* 4  */{ pfm_stop, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2476 /* 5  */{ pfm_start, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2477 /* 6  */{ pfm_enable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2478 /* 7  */{ pfm_disable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2479 /* 8  */{ pfm_context_create, PFM_CMD_PID|PFM_CMD_ARG_RW, 1, sizeof(pfarg_context_t)},
2480 /* 9  */{ pfm_context_destroy, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2481 /* 10 */{ pfm_restart, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_NOCHK, 0, 0},
2482 /* 11 */{ pfm_protect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2483 /* 12 */{ pfm_get_features, PFM_CMD_ARG_RW, 0, 0},
2484 /* 13 */{ pfm_debug, 0, 1, sizeof(unsigned int)},
2485 /* 14 */{ pfm_context_unprotect, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2486 /* 15 */{ pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2487 /* 16 */{ NULL, 0, 0, 0}, /* not used */
2488 /* 17 */{ NULL, 0, 0, 0}, /* not used */
2489 /* 18 */{ NULL, 0, 0, 0}, /* not used */
2490 /* 19 */{ NULL, 0, 0, 0}, /* not used */
2491 /* 20 */{ NULL, 0, 0, 0}, /* not used */
2492 /* 21 */{ NULL, 0, 0, 0}, /* not used */
2493 /* 22 */{ NULL, 0, 0, 0}, /* not used */
2494 /* 23 */{ NULL, 0, 0, 0}, /* not used */
2495 /* 24 */{ NULL, 0, 0, 0}, /* not used */
2496 /* 25 */{ NULL, 0, 0, 0}, /* not used */
2497 /* 26 */{ NULL, 0, 0, 0}, /* not used */
2498 /* 27 */{ NULL, 0, 0, 0}, /* not used */
2499 /* 28 */{ NULL, 0, 0, 0}, /* not used */
2500 /* 29 */{ NULL, 0, 0, 0}, /* not used */
2501 /* 30 */{ NULL, 0, 0, 0}, /* not used */
2502 /* 31 */{ NULL, 0, 0, 0}, /* not used */
2503 #ifdef PFM_PMU_USES_DBR
2504 /* 32 */{ pfm_write_ibrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)},
2505 /* 33 */{ pfm_write_dbrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)}
2506 #endif
2507 };
2508 #define PFM_CMD_COUNT   (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
2509
2510 static int
2511 check_task_state(struct task_struct *task)
2512 {
2513         int ret = 0;
2514 #ifdef CONFIG_SMP
2515         /* We must wait until the state has been completely
2516          * saved. There can be situations where the reader arrives before
2517          * after the task is marked as STOPPED but before pfm_save_regs()
2518          * is completed.
2519          */
2520         if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) return -EBUSY;
2521         DBprintk(("before wait_task_inactive [%d] state %ld\n", task->pid, task->state));
2522         wait_task_inactive(task);
2523         DBprintk(("after wait_task_inactive [%d] state %ld\n", task->pid, task->state));
2524 #else
2525         if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) {
2526                 DBprintk(("warning [%d] not in stable state %ld\n", task->pid, task->state));
2527                 ret = -EBUSY;
2528         }
2529 #endif
2530         return ret;
2531 }
2532
2533 asmlinkage long
2534 sys_perfmonctl (pid_t pid, int cmd, void *arg, int count, long arg5, long arg6, long arg7,
2535                 long arg8, long stack)
2536 {
2537         struct pt_regs *regs = (struct pt_regs *)&stack;
2538         struct task_struct *task = current;
2539         pfm_context_t *ctx;
2540         size_t sz;
2541         int ret, narg;
2542
2543         /*
2544          * reject any call if perfmon was disabled at initialization time
2545          */
2546         if (PFM_IS_DISABLED()) return -ENOSYS;
2547
2548         DBprintk(("cmd=%d idx=%d valid=%d narg=0x%x\n", cmd, PFM_CMD_IDX(cmd),
2549                   PFM_CMD_IS_VALID(cmd), PFM_CMD_NARG(cmd)));
2550
2551         if (PFM_CMD_IS_VALID(cmd) == 0) return -EINVAL;
2552
2553         /* ingore arguments when command has none */
2554         narg = PFM_CMD_NARG(cmd);
2555         if ((narg == PFM_CMD_ARG_MANY  && count == 0) || (narg > 0 && narg != count)) return -EINVAL;
2556
2557         sz = PFM_CMD_ARG_SIZE(cmd);
2558
2559         if (PFM_CMD_READ_ARG(cmd) && !access_ok(VERIFY_READ, arg, sz*count)) return -EFAULT;
2560
2561         if (PFM_CMD_RW_ARG(cmd) && !access_ok(VERIFY_WRITE, arg, sz*count)) return -EFAULT;
2562
2563         if (PFM_CMD_USE_PID(cmd))  {
2564                 /*
2565                  * XXX: may need to fine tune this one
2566                  */
2567                 if (pid < 2) return -EPERM;
2568
2569                 if (pid != current->pid) {
2570
2571                         ret = -ESRCH;
2572
2573                         read_lock(&tasklist_lock);
2574
2575                         task = find_task_by_pid(pid);
2576
2577                         if (task) get_task_struct(task);
2578
2579                         read_unlock(&tasklist_lock);
2580
2581                         if (!task) goto abort_call;
2582
2583                         ret = -EPERM;
2584
2585                         if (pfm_bad_permissions(task)) goto abort_call;
2586
2587                         if (PFM_CMD_CHK(cmd)) {
2588                                 ret = check_task_state(task);
2589                                 if (ret != 0) goto abort_call;
2590                         }
2591                 }
2592         }
2593
2594         ctx = task->thread.pfm_context;
2595
2596         if (PFM_CMD_USE_CTX(cmd)) {
2597                 ret = -EINVAL;
2598                if (ctx == NULL) {
2599                         DBprintk(("no context for task %d\n", task->pid));
2600                         goto abort_call;
2601                }
2602                ret = -EPERM;
2603                /*
2604                 * we only grant access to the context if:
2605                 *       - the caller is the creator of the context (ctx_owner)
2606                 *  OR   - the context is attached to the caller AND The context IS NOT
2607                 *         in protected mode
2608                 */
2609                if (ctx->ctx_owner != current && (ctx->ctx_fl_protected || task != current)) {
2610                                 DBprintk(("context protected, no access for [%d]\n", task->pid));
2611                                 goto abort_call;
2612                }
2613         }
2614
2615         ret = (*pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func)(task, ctx, arg, count, regs);
2616
2617 abort_call:
2618         if (task && task != current) put_task_struct(task);
2619
2620         return ret;
2621 }
2622
2623 /*
2624  * send SIGPROF to register task, must be invoked when it
2625  * is safe to send a signal, e.g., not holding any runqueue
2626  * related locks.
2627  */
2628 static int
2629 pfm_notify_user(pfm_context_t *ctx)
2630 {
2631         struct siginfo si;
2632         int ret;
2633
2634         if (ctx->ctx_notify_task == NULL) {
2635                 DBprintk(("[%d] no notifier\n", current->pid));
2636                 return -EINVAL;
2637         }
2638
2639         si.si_errno    = 0;
2640         si.si_addr     = NULL;
2641         si.si_pid      = current->pid; /* who is sending */
2642         si.si_signo    = SIGPROF;
2643         si.si_code     = PROF_OVFL;
2644
2645         si.si_pfm_ovfl[0] = ctx->ctx_ovfl_regs[0];
2646
2647         /*
2648          * when the target of the signal is not ourself, we have to be more
2649          * careful. The notify_task may being cleared by the target task itself
2650          * in release_thread(). We must ensure mutual exclusion here such that
2651          * the signal is delivered (even to a dying task) safely.
2652          */
2653
2654         if (ctx->ctx_notify_task != current) {
2655                 /*
2656                  * grab the notification lock for this task
2657                  * This guarantees that the sequence: test + send_signal
2658                  * is atomic with regards to the ctx_notify_task field.
2659                  *
2660                  * We need a spinlock and not just an atomic variable for this.
2661                  *
2662                  */
2663                 spin_lock(&ctx->ctx_lock);
2664
2665                 /*
2666                  * now notify_task cannot be modified until we're done
2667                  * if NULL, they it got modified while we were in the handler
2668                  */
2669                 if (ctx->ctx_notify_task == NULL) {
2670
2671                         spin_unlock(&ctx->ctx_lock);
2672
2673                         /*
2674                          * If we've lost the notified task, then we will run
2675                          * to completion wbut keep the PMU frozen. Results
2676                          * will be incorrect anyway. We do not kill task
2677                          * to leave it possible to attach perfmon context
2678                          * to already running task.
2679                          */
2680                         printk("perfmon: pfm_notify_user() lost notify_task\n");
2681                         DBprintk_ovfl(("notification task has disappeared !\n"));
2682
2683                         /* we cannot afford to block now */
2684                         ctx->ctx_fl_block = 0;
2685
2686                         return  -EINVAL;
2687                 }
2688
2689                 /*
2690                  * required by send_sig_info() to make sure the target
2691                  * task does not disappear on us.
2692                  */
2693                 read_lock(&tasklist_lock);
2694         }
2695         /*
2696          * in this case, we don't stop the task, we let it go on. It will
2697          * necessarily go to the signal handler (if any) when it goes back to
2698          * user mode.
2699          */
2700         DBprintk_ovfl(("[%d] sending notification to [%d]\n",
2701                         current->pid, ctx->ctx_notify_task->pid));
2702
2703         /*
2704          * this call is safe in an interrupt handler, so does read_lock() on tasklist_lock
2705          */
2706         ret = send_sig_info(SIGPROF, &si, ctx->ctx_notify_task);
2707         if (ret) {
2708                 printk("perfmon: send_sig_info(process %d, SIGPROF)=%d\n",
2709                                 ctx->ctx_notify_task->pid, ret);
2710         }
2711
2712         /*
2713          * now undo the protections in order
2714          */
2715         if (ctx->ctx_notify_task != current) {
2716                 read_unlock(&tasklist_lock);
2717                 spin_unlock(&ctx->ctx_lock);
2718         }
2719         return ret;
2720 }
2721
2722 void
2723 pfm_ovfl_block_reset(void)
2724 {
2725         struct thread_struct *th = &current->thread;
2726         pfm_context_t *ctx = current->thread.pfm_context;
2727         unsigned int reason;
2728         int ret;
2729
2730         /*
2731          * clear the flag, to make sure we won't get here
2732          * again
2733          */
2734         th->pfm_ovfl_block_reset = 0;
2735
2736         /*
2737          * do some sanity checks first
2738          */
2739         if (!ctx) {
2740                 printk(KERN_DEBUG "perfmon: [%d] has no PFM context\n", current->pid);
2741                 return;
2742         }
2743         /*
2744          * extract reason for being here and clear
2745          */
2746         reason = ctx->ctx_fl_trap_reason;
2747         ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
2748
2749         DBprintk(("[%d] reason=%d\n", current->pid, reason));
2750
2751         /*
2752          * just here for a reset (non-blocking context only)
2753          */
2754         if (reason == PFM_TRAP_REASON_RESET) goto non_blocking;
2755
2756         /*
2757          * first notify user. This can fail if notify_task has disappeared.
2758          */
2759         if (reason == PFM_TRAP_REASON_SIG || reason == PFM_TRAP_REASON_BLOCKSIG) {
2760                 ret = pfm_notify_user(ctx);
2761                 if (ret) return;
2762         }
2763
2764         /*
2765          * came here just to signal (non-blocking)
2766          */
2767         if (reason == PFM_TRAP_REASON_SIG) return;
2768
2769         DBprintk(("[%d] before sleeping\n", current->pid));
2770
2771         /*
2772          * may go through without blocking on SMP systems
2773          * if restart has been received already by the time we call down()
2774          */
2775         ret = down_interruptible(&ctx->ctx_restart_sem);
2776
2777         DBprintk(("[%d] after sleeping ret=%d\n", current->pid, ret));
2778
2779         /*
2780          * in case of interruption of down() we don't restart anything
2781          */
2782         if (ret >= 0) {
2783
2784 non_blocking:
2785                 /* we reactivate on context switch */
2786                 ctx->ctx_fl_frozen = 0;
2787                 /*
2788                  * the ovfl_sem is cleared by the restart task and this is safe because we always
2789                  * use the local reference
2790                  */
2791
2792                 pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
2793
2794                 ctx->ctx_ovfl_regs[0] = 0UL;
2795
2796                 /*
2797                  * Unlock sampling buffer and reset index atomically
2798                  * XXX: not really needed when blocking
2799                  */
2800                 if (CTX_HAS_SMPL(ctx)) {
2801                         ctx->ctx_psb->psb_hdr->hdr_count = 0;
2802                         ctx->ctx_psb->psb_index = 0;
2803                 }
2804
2805                 pfm_unfreeze_pmu();
2806
2807                 /* state restored, can go back to work (user mode) */
2808         }
2809 }
2810
2811 /*
2812  * This function will record an entry in the sampling if it is not full already.
2813  * Return:
2814  *      0 : buffer is not full (did not BECOME full: still space or was already full)
2815  *      1 : buffer is full (recorded the last entry)
2816  */
2817 static int
2818 pfm_record_sample(struct task_struct *task, pfm_context_t *ctx, unsigned long ovfl_mask, struct pt_regs *regs)
2819 {
2820         pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;
2821         unsigned long *e, m, idx;
2822         perfmon_smpl_entry_t *h;
2823         int j;
2824
2825
2826         idx = ia64_fetch_and_add(1, &psb->psb_index);
2827         DBprintk_ovfl(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));
2828
2829         /*
2830          * XXX: there is a small chance that we could run out on index before resetting
2831          * but index is unsigned long, so it will take some time.....
2832          * We use > instead of == because fetch_and_add() is off by one (see below)
2833          *
2834          * This case can happen in non-blocking mode or with multiple processes.
2835          * For non-blocking, we need to reload and continue.
2836          */
2837         if (idx > psb->psb_entries) return 0;
2838
2839         /* first entry is really entry 0, not 1 caused by fetch_and_add */
2840         idx--;
2841
2842         h = (perfmon_smpl_entry_t *)(((char *)psb->psb_addr) + idx*(psb->psb_entry_size));
2843
2844         /*
2845          * initialize entry header
2846          */
2847         h->pid  = current->pid;
2848         h->cpu  = get_cpu();
2849         h->last_reset_value = ovfl_mask ? ctx->ctx_soft_pmds[ffz(~ovfl_mask)].lval : 0UL;
2850         h->ip   = regs ? regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3): 0x0UL;
2851         h->regs = ovfl_mask;                    /* which registers overflowed */
2852
2853         /* guaranteed to monotonically increase on each cpu */
2854         h->stamp  = pfm_get_stamp();
2855
2856         /* position for first pmd */
2857         e = (unsigned long *)(h+1);
2858
2859         /*
2860          * selectively store PMDs in increasing index number
2861          */
2862         m = ctx->ctx_smpl_regs[0];
2863         for (j=0; m; m >>=1, j++) {
2864
2865                 if ((m & 0x1) == 0) continue;
2866
2867                 if (PMD_IS_COUNTING(j)) {
2868                         *e  =  pfm_read_soft_counter(ctx, j);
2869                 } else {
2870                         *e = ia64_get_pmd(j); /* slow */
2871                 }
2872                 DBprintk_ovfl(("e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
2873                 e++;
2874         }
2875         pfm_stats[h->cpu].pfm_recorded_samples_count++;
2876
2877         /*
2878          * make the new entry visible to user, needs to be atomic
2879          */
2880         ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count);
2881
2882         DBprintk_ovfl(("index=%ld entries=%ld hdr_count=%ld\n",
2883                                 idx, psb->psb_entries, psb->psb_hdr->hdr_count));
2884         /*
2885          * sampling buffer full ?
2886          */
2887         if (idx == (psb->psb_entries-1)) {
2888                 DBprintk_ovfl(("sampling buffer full\n"));
2889                 /*
2890                  * XXX: must reset buffer in blocking mode and lost notified
2891                  */
2892                 pfm_stats[h->cpu].pfm_full_smpl_buffer_count++;
2893                 put_cpu();
2894                 return 1;
2895         }
2896         put_cpu();
2897         return 0;
2898 }
2899
2900 /*
2901  * main overflow processing routine.
2902  * it can be called from the interrupt path or explicitely during the context switch code
2903  * Return:
2904  *      new value of pmc[0]. if 0x0 then unfreeze, else keep frozen
2905  */
2906 static unsigned long
2907 pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
2908 {
2909         unsigned long mask;
2910         struct thread_struct *t;
2911         unsigned long old_val;
2912         unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL;
2913         int i;
2914         int ret = 1;
2915         /*
2916          * It is never safe to access the task for which the overflow interrupt is destinated
2917          * using the current variable as the interrupt may occur in the middle of a context switch
2918          * where current does not hold the task that is running yet.
2919          *
2920          * For monitoring, however, we do need to get access to the task which caused the overflow
2921          * to account for overflow on the counters.
2922          *
2923          * We accomplish this by maintaining a current owner of the PMU per CPU. During context
2924          * switch the ownership is changed in a way such that the reflected owner is always the
2925          * valid one, i.e. the one that caused the interrupt.
2926          */
2927
2928         preempt_disable();
2929
2930         t   = &task->thread;
2931
2932         /*
2933          * XXX: debug test
2934          * Don't think this could happen given upfront tests
2935          */
2936         if ((t->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system == 0) {
2937                 printk(KERN_DEBUG "perfmon: Spurious overflow interrupt: process %d not "
2938                        "using perfmon\n", task->pid);
2939                 preempt_enable_no_resched();
2940                 return 0x1;
2941         }
2942         /*
2943          * sanity test. Should never happen
2944          */
2945         if ((pmc0 & 0x1) == 0) {
2946                 printk(KERN_DEBUG "perfmon: pid %d pmc0=0x%lx assumption error for freeze bit\n",
2947                        task->pid, pmc0);
2948                 preempt_enable_no_resched();
2949                 return 0x0;
2950         }
2951
2952         mask = pmc0 >> PMU_FIRST_COUNTER;
2953
2954         DBprintk_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
2955                   " mode used_pmds=0x%lx used_pmcs=0x%lx reload_pmcs=0x%lx\n",
2956                         pmc0, task->pid, (regs ? regs->cr_iip : 0),
2957                         CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
2958                         ctx->ctx_used_pmds[0],
2959                         ctx->ctx_used_pmcs[0],
2960                         ctx->ctx_reload_pmcs[0]));
2961
2962         /*
2963          * First we update the virtual counters
2964          */
2965         for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) {
2966
2967                 /* skip pmd which did not overflow */
2968                 if ((mask & 0x1) == 0) continue;
2969
2970                 DBprintk_ovfl(("pmd[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n",
2971                           i, ia64_get_pmd(i), ctx->ctx_soft_pmds[i].val));
2972
2973                 /*
2974                  * Note that the pmd is not necessarily 0 at this point as qualified events
2975                  * may have happened before the PMU was frozen. The residual count is not
2976                  * taken into consideration here but will be with any read of the pmd via
2977                  * pfm_read_pmds().
2978                  */
2979                 old_val = ctx->ctx_soft_pmds[i].val;
2980                 ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.ovfl_val;
2981
2982                 /*
2983                  * check for overflow condition
2984                  */
2985                 if (old_val > ctx->ctx_soft_pmds[i].val) {
2986
2987                         ovfl_pmds |= 1UL << i;
2988
2989                         if (PMC_OVFL_NOTIFY(ctx, i)) {
2990                                 ovfl_notify |= 1UL << i;
2991                         }
2992                 }
2993                 DBprintk_ovfl(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n",
2994                           i, ctx->ctx_soft_pmds[i].val, old_val,
2995                           ia64_get_pmd(i) & pmu_conf.ovfl_val, ovfl_pmds, ovfl_notify));
2996         }
2997
2998         /*
2999          * check for sampling buffer
3000          *
3001          * if present, record sample. We propagate notification ONLY when buffer
3002          * becomes full.
3003          */
3004         if(CTX_HAS_SMPL(ctx)) {
3005                 ret = pfm_record_sample(task, ctx, ovfl_pmds, regs);
3006                 if (ret == 1) {
3007                         /*
3008                          * Sampling buffer became full
3009                          * If no notication was requested, then we reset buffer index
3010                          * and reset registers (done below) and resume.
3011                          * If notification requested, then defer reset until pfm_restart()
3012                          */
3013                         if (ovfl_notify == 0UL) {
3014                                 ctx->ctx_psb->psb_hdr->hdr_count = 0UL;
3015                                 ctx->ctx_psb->psb_index          = 0UL;
3016                         }
3017                 } else {
3018                         /*
3019                          * sample recorded in buffer, no need to notify user
3020                          */
3021                         ovfl_notify = 0UL;
3022                 }
3023         }
3024
3025         /*
3026          * No overflow requiring a user level notification
3027          */
3028         if (ovfl_notify == 0UL) {
3029                 if (ovfl_pmds)
3030                         pfm_reset_regs(ctx, &ovfl_pmds, PFM_PMD_SHORT_RESET);
3031                 preempt_enable_no_resched();
3032                 return 0x0UL;
3033         }
3034
3035         /*
3036          * keep track of what to reset when unblocking
3037          */
3038         ctx->ctx_ovfl_regs[0]  = ovfl_pmds;
3039
3040         DBprintk_ovfl(("block=%d notify [%d] current [%d]\n",
3041                 ctx->ctx_fl_block,
3042                 ctx->ctx_notify_task ? ctx->ctx_notify_task->pid: -1,
3043                 current->pid ));
3044
3045         /*
3046          * ctx_notify_task could already be NULL, checked in pfm_notify_user()
3047          */
3048         if (CTX_OVFL_NOBLOCK(ctx) == 0 && ctx->ctx_notify_task != task) {
3049                 t->pfm_ovfl_block_reset = 1; /* will cause blocking */
3050                 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCKSIG;
3051         } else {
3052                 t->pfm_ovfl_block_reset = 1; /* will cause blocking */
3053                 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_SIG;
3054         }
3055
3056         /*
3057          * keep the PMU frozen until either pfm_restart() or
3058          * task completes (non-blocking or notify_task gone).
3059          */
3060         ctx->ctx_fl_frozen = 1;
3061
3062         DBprintk_ovfl(("return pmc0=0x%x must_block=%ld reason=%d\n",
3063                 ctx->ctx_fl_frozen ? 0x1 : 0x0,
3064                 t->pfm_ovfl_block_reset,
3065                 ctx->ctx_fl_trap_reason));
3066
3067         preempt_enable_no_resched();
3068         return 0x1UL;
3069 }
3070
3071 static void
3072 pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
3073 {
3074         u64 pmc0;
3075         struct task_struct *task;
3076         pfm_context_t *ctx;
3077
3078         pfm_stats[get_cpu()].pfm_ovfl_intr_count++;
3079
3080         /*
3081          * if an alternate handler is registered, just bypass the default one
3082          */
3083         if (pfm_alternate_intr_handler) {
3084                 (*pfm_alternate_intr_handler->handler)(irq, arg, regs);
3085                 put_cpu();
3086                 return;
3087         }
3088
3089         /*
3090          * srlz.d done before arriving here
3091          *
3092          * This is slow
3093          */
3094         pmc0 = ia64_get_pmc(0);
3095
3096         /*
3097          * if we have some pending bits set
3098          * assumes : if any PM[0].bit[63-1] is set, then PMC[0].fr = 1
3099          */
3100         if ((pmc0 & ~0x1UL)!=0UL && (task=PMU_OWNER())!= NULL) {
3101                 /*
3102                  * we assume that pmc0.fr is always set here
3103                  */
3104                 ctx = task->thread.pfm_context;
3105
3106                 /* sanity check */
3107                 if (!ctx) {
3108                         printk(KERN_DEBUG "perfmon: Spurious overflow interrupt: process %d has "
3109                                "no PFM context\n", task->pid);
3110                         put_cpu();
3111                         return;
3112                 }
3113
3114                 /*
3115                  * assume PMC[0].fr = 1 at this point
3116                  */
3117                 pmc0 = pfm_overflow_handler(task, ctx, pmc0, regs);
3118                 /*
3119                  * we can only update pmc0 when the overflow
3120                  * is for the current context. In UP the current
3121                  * task may not be the one owning the PMU
3122                  */
3123                 if (task == current) {
3124                         /*
3125                          * We always clear the overflow status bits and either unfreeze
3126                          * or keep the PMU frozen.
3127                          */
3128                         ia64_set_pmc(0, pmc0);
3129                         ia64_srlz_d();
3130                 } else {
3131                         task->thread.pmc[0] = pmc0;
3132                 }
3133         } else {
3134                 pfm_stats[smp_processor_id()].pfm_spurious_ovfl_intr_count++;
3135         }
3136         put_cpu_no_resched();
3137 }
3138
3139 /* for debug only */
3140 static int
3141 pfm_proc_info(char *page)
3142 {
3143         char *p = page;
3144         int i;
3145
3146         p += sprintf(p, "fastctxsw              : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No");
3147         p += sprintf(p, "ovfl_mask              : 0x%lx\n", pmu_conf.ovfl_val);
3148
3149         for(i=0; i < NR_CPUS; i++) {
3150                 if (cpu_online(i) == 0) continue;
3151                 p += sprintf(p, "CPU%-2d overflow intrs   : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_count);
3152                 p += sprintf(p, "CPU%-2d spurious intrs   : %lu\n", i, pfm_stats[i].pfm_spurious_ovfl_intr_count);
3153                 p += sprintf(p, "CPU%-2d recorded samples : %lu\n", i, pfm_stats[i].pfm_recorded_samples_count);
3154                 p += sprintf(p, "CPU%-2d smpl buffer full : %lu\n", i, pfm_stats[i].pfm_full_smpl_buffer_count);
3155                 p += sprintf(p, "CPU%-2d syst_wide        : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_SYST_WIDE ? 1 : 0);
3156                 p += sprintf(p, "CPU%-2d dcr_pp           : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_DCR_PP ? 1 : 0);
3157                 p += sprintf(p, "CPU%-2d exclude idle     : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0);
3158                 p += sprintf(p, "CPU%-2d owner            : %d\n", i, pmu_owners[i].owner ? pmu_owners[i].owner->pid: -1);
3159         }
3160
3161         LOCK_PFS();
3162
3163         p += sprintf(p, "proc_sessions          : %u\n"
3164                         "sys_sessions           : %u\n"
3165                         "sys_use_dbregs         : %u\n"
3166                         "ptrace_use_dbregs      : %u\n",
3167                         pfm_sessions.pfs_task_sessions,
3168                         pfm_sessions.pfs_sys_sessions,
3169                         pfm_sessions.pfs_sys_use_dbregs,
3170                         pfm_sessions.pfs_ptrace_use_dbregs);
3171
3172         UNLOCK_PFS();
3173
3174         return p - page;
3175 }
3176
3177 /* /proc interface, for debug only */
3178 static int
3179 perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
3180 {
3181         int len = pfm_proc_info(page);
3182
3183         if (len <= off+count) *eof = 1;
3184
3185         *start = page + off;
3186         len   -= off;
3187
3188         if (len>count) len = count;
3189         if (len<0) len = 0;
3190
3191         return len;
3192 }
3193
3194 /*
3195  * we come here as soon as PFM_CPUINFO_SYST_WIDE is set. This happens
3196  * during pfm_enable() hence before pfm_start(). We cannot assume monitoring
3197  * is active or inactive based on mode. We must rely on the value in
3198  * cpu_data(i)->pfm_syst_info
3199  */
3200 void
3201 pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin)
3202 {
3203         struct pt_regs *regs;
3204         unsigned long dcr;
3205         unsigned long dcr_pp;
3206
3207         preempt_disable();
3208         dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0;
3209
3210         /*
3211          * pid 0 is guaranteed to be the idle task. There is one such task with pid 0
3212          * on every CPU, so we can rely on the pid to identify the idle task.
3213          */
3214         if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
3215                 regs = (struct pt_regs *)((unsigned long) task + IA64_STK_OFFSET);
3216                 regs--;
3217                 ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
3218                 preempt_enable();
3219                 return;
3220         }
3221         /*
3222          * if monitoring has started
3223          */
3224         if (dcr_pp) {
3225                 dcr = ia64_get_dcr();
3226                 /*
3227                  * context switching in?
3228                  */
3229                 if (is_ctxswin) {
3230                         /* mask monitoring for the idle task */
3231                         ia64_set_dcr(dcr & ~IA64_DCR_PP);
3232                         pfm_clear_psr_pp();
3233                         ia64_srlz_i();
3234                         preempt_enable();
3235                         return;
3236                 }
3237                 /*
3238                  * context switching out
3239                  * restore monitoring for next task
3240                  *
3241                  * Due to inlining this odd if-then-else construction generates
3242                  * better code.
3243                  */
3244                 ia64_set_dcr(dcr |IA64_DCR_PP);
3245                 pfm_set_psr_pp();
3246                 ia64_srlz_i();
3247         }
3248         preempt_enable();
3249 }
3250
3251 void
3252 pfm_save_regs (struct task_struct *task)
3253 {
3254         pfm_context_t *ctx;
3255         unsigned long mask;
3256         u64 psr;
3257         int i;
3258
3259         preempt_disable();
3260
3261         ctx = task->thread.pfm_context;
3262
3263
3264         /*
3265          * save current PSR: needed because we modify it
3266          */
3267         psr = pfm_get_psr();
3268
3269         /*
3270          * stop monitoring:
3271          * This is the last instruction which can generate an overflow
3272          *
3273          * We do not need to set psr.sp because, it is irrelevant in kernel.
3274          * It will be restored from ipsr when going back to user level
3275          */
3276         pfm_clear_psr_up();
3277         ia64_srlz_i();
3278
3279         ctx->ctx_saved_psr = psr;
3280
3281 #ifdef CONFIG_SMP
3282         /*
3283          * We do not use a lazy scheme in SMP because
3284          * of the new scheduler which masks interrupts
3285          * during low-level context switch. So we save
3286          * all the PMD register we use and restore on
3287          * ctxsw in.
3288          *
3289          * release ownership of this PMU.
3290          * must be done before we save the registers.
3291          */
3292         SET_PMU_OWNER(NULL);
3293
3294         /*
3295          * save PMDs
3296          */
3297         ia64_srlz_d();
3298
3299         mask = ctx->ctx_used_pmds[0];
3300         for (i=0; mask; i++, mask>>=1) {
3301                 if (mask & 0x1) task->thread.pmd[i] =ia64_get_pmd(i);
3302         }
3303
3304         /*
3305          * save pmc0
3306          */
3307         task->thread.pmc[0] = ia64_get_pmc(0);
3308
3309         /*
3310          * force a full reload
3311          */
3312         atomic_set(&ctx->ctx_last_cpu, -1);
3313 #endif
3314         preempt_enable();
3315 }
3316
3317 static void
3318 pfm_lazy_save_regs (struct task_struct *task)
3319 {
3320         pfm_context_t *ctx;
3321         struct thread_struct *t;
3322         unsigned long mask;
3323         int i;
3324
3325         preempt_disable();
3326         DBprintk(("on [%d] by [%d]\n", task->pid, current->pid));
3327
3328         t   = &task->thread;
3329         ctx = task->thread.pfm_context;
3330
3331         /*
3332          * do not own the PMU
3333          */
3334         SET_PMU_OWNER(NULL);
3335
3336         ia64_srlz_d();
3337
3338         /*
3339          * XXX needs further optimization.
3340          * Also must take holes into account
3341          */
3342         mask = ctx->ctx_used_pmds[0];
3343         for (i=0; mask; i++, mask>>=1) {
3344                 if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
3345         }
3346
3347         /* save pmc0 */
3348         t->pmc[0] = ia64_get_pmc(0);
3349
3350         /* not owned by this CPU */
3351         atomic_set(&ctx->ctx_last_cpu, -1);
3352         preempt_enable();
3353 }
3354
3355 void
3356 pfm_load_regs (struct task_struct *task)
3357 {
3358         struct thread_struct *t;
3359         pfm_context_t *ctx;
3360         struct task_struct *owner;
3361         unsigned long mask;
3362         u64 psr;
3363         int i;
3364
3365         preempt_disable();
3366
3367         owner = PMU_OWNER();
3368         ctx   = task->thread.pfm_context;
3369         t     = &task->thread;
3370
3371         if (ctx == NULL) {
3372                 preempt_enable();
3373                 printk("perfmon: pfm_load_regs: null ctx for [%d]\n", task->pid);
3374                 return;
3375         }
3376
3377         /*
3378          * we restore ALL the debug registers to avoid picking up
3379          * stale state.
3380          *
3381          * This must be done even when the task is still the owner
3382          * as the registers may have been modified via ptrace()
3383          * (not perfmon) by the previous task.
3384          *
3385          * XXX: dealing with this in a lazy fashion requires modifications
3386          * to the way the the debug registers are managed. This is will done
3387          * in the next version of perfmon.
3388          */
3389         if (ctx->ctx_fl_using_dbreg) {
3390                 for (i=0; i < pmu_conf.num_ibrs; i++) {
3391                         ia64_set_ibr(i, t->ibr[i]);
3392                 }
3393                 ia64_srlz_i();
3394                 for (i=0; i < pmu_conf.num_dbrs; i++) {
3395                         ia64_set_dbr(i, t->dbr[i]);
3396                 }
3397                 ia64_srlz_d();
3398         }
3399
3400         /*
3401          * if we were the last user, then nothing to do except restore psr
3402          * this path cannot be used in SMP
3403          */
3404         if (owner == task) {
3405                 if (atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
3406                         DBprintk(("invalid last_cpu=%d for [%d]\n",
3407                                 atomic_read(&ctx->ctx_last_cpu), task->pid));
3408
3409                 psr = ctx->ctx_saved_psr;
3410                 pfm_set_psr_l(psr);
3411                 preempt_enable();
3412                 return;
3413         }
3414
3415         /*
3416          * someone else is still using the PMU, first push it out and
3417          * then we'll be able to install our stuff !
3418          *
3419          * not possible in SMP
3420          */
3421         if (owner) pfm_lazy_save_regs(owner);
3422
3423         /*
3424          * To avoid leaking information to the user level when psr.sp=0,
3425          * we must reload ALL implemented pmds (even the ones we don't use).
3426          * In the kernel we only allow PFM_READ_PMDS on registers which
3427          * we initialized or requested (sampling) so there is no risk there.
3428          *
3429          * As an optimization, we will only reload the PMD that we use when
3430          * the context is in protected mode, i.e. psr.sp=1 because then there
3431          * is no leak possible.
3432          */
3433         mask = pfm_sysctl.fastctxsw || ctx->ctx_fl_protected ?  ctx->ctx_used_pmds[0] : ctx->ctx_reload_pmds[0];
3434         for (i=0; mask; i++, mask>>=1) {
3435                 if (mask & 0x1) ia64_set_pmd(i, t->pmd[i] & pmu_conf.ovfl_val);
3436         }
3437
3438         /*
3439          * PMC0 is never set in the mask because it is always restored
3440          * separately.
3441          *
3442          * ALL PMCs are systematically reloaded, unused registers
3443          * get their default (PAL reset) values to avoid picking up
3444          * stale configuration.
3445          */
3446         mask = ctx->ctx_reload_pmcs[0];
3447         for (i=0; mask; i++, mask>>=1) {
3448                 if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
3449         }
3450
3451         /*
3452          * manually invoke core interrupt handler
3453          * if the task had a pending overflow when it was ctxsw out.
3454          * Side effect on ctx_fl_frozen is possible.
3455          */
3456         if (t->pmc[0] & ~0x1) {
3457                 t->pmc[0] = pfm_overflow_handler(task, ctx, t->pmc[0], NULL);
3458         }
3459
3460         /*
3461          * unfreeze PMU if possible
3462          */
3463         if (ctx->ctx_fl_frozen == 0) pfm_unfreeze_pmu();
3464
3465         atomic_set(&ctx->ctx_last_cpu, smp_processor_id());
3466
3467         SET_PMU_OWNER(task);
3468
3469         /*
3470          * restore the psr we changed in pfm_save_regs()
3471          */
3472         psr = ctx->ctx_saved_psr;
3473         preempt_enable();
3474         pfm_set_psr_l(psr);
3475 }
3476
3477 /*
3478  * XXX: make this routine able to work with non current context
3479  */
3480 static void
3481 pfm_reset_pmu(struct task_struct *task)
3482 {
3483         struct thread_struct *t = &task->thread;
3484         pfm_context_t *ctx = t->pfm_context;
3485         int i;
3486
3487         if (task != current) {
3488                 printk("perfmon: invalid task in pfm_reset_pmu()\n");
3489                 return;
3490         }
3491         preempt_disable();
3492
3493         /* Let's make sure the PMU is frozen */
3494         pfm_freeze_pmu();
3495
3496         /*
3497          * install reset values for PMC. We skip PMC0 (done above)
3498          * XX: good up to 64 PMCS
3499          */
3500         for (i=1; (pmu_conf.pmc_desc[i].type & PFM_REG_END) == 0; i++) {
3501                 if ((pmu_conf.pmc_desc[i].type & PFM_REG_IMPL) == 0) continue;
3502                 ia64_set_pmc(i, PMC_DFL_VAL(i));
3503                 /*
3504                  * When restoring context, we must restore ALL pmcs, even the ones
3505                  * that the task does not use to avoid leaks and possibly corruption
3506                  * of the sesion because of configuration conflicts. So here, we
3507                  * initialize the entire set used in the context switch restore routine.
3508                  */
3509                 t->pmc[i] = PMC_DFL_VAL(i);
3510                 DBprintk(("pmc[%d]=0x%lx\n", i, t->pmc[i]));
3511         }
3512
3513         /*
3514          * clear reset values for PMD.
3515          * XXX: good up to 64 PMDS.
3516          */
3517         for (i=0; (pmu_conf.pmd_desc[i].type & PFM_REG_END) == 0; i++) {
3518                 if ((pmu_conf.pmd_desc[i].type & PFM_REG_IMPL) == 0) continue;
3519                 ia64_set_pmd(i, 0UL);
3520                 t->pmd[i] = 0UL;
3521         }
3522
3523         /*
3524          * On context switched restore, we must restore ALL pmc and ALL pmd even
3525          * when they are not actively used by the task. In UP, the incoming process
3526          * may otherwise pick up left over PMC, PMD state from the previous process.
3527          * As opposed to PMD, stale PMC can cause harm to the incoming
3528          * process because they may change what is being measured.
3529          * Therefore, we must systematically reinstall the entire
3530          * PMC state. In SMP, the same thing is possible on the
3531          * same CPU but also on between 2 CPUs.
3532          *
3533          * The problem with PMD is information leaking especially
3534          * to user level when psr.sp=0
3535          *
3536          * There is unfortunately no easy way to avoid this problem
3537          * on either UP or SMP. This definitively slows down the
3538          * pfm_load_regs() function.
3539          */
3540
3541          /*
3542           * We must include all the PMC in this mask to make sure we don't
3543           * see any side effect of a stale state, such as opcode matching
3544           * or range restrictions, for instance.
3545           *
3546           * We never directly restore PMC0 so we do not include it in the mask.
3547           */
3548         ctx->ctx_reload_pmcs[0] = pmu_conf.impl_pmcs[0] & ~0x1;
3549         /*
3550          * We must include all the PMD in this mask to avoid picking
3551          * up stale value and leak information, especially directly
3552          * at the user level when psr.sp=0
3553          */
3554         ctx->ctx_reload_pmds[0] = pmu_conf.impl_pmds[0];
3555
3556         /*
3557          * Keep track of the pmds we want to sample
3558          * XXX: may be we don't need to save/restore the DEAR/IEAR pmds
3559          * but we do need the BTB for sure. This is because of a hardware
3560          * buffer of 1 only for non-BTB pmds.
3561          *
3562          * We ignore the unimplemented pmds specified by the user
3563          */
3564         ctx->ctx_used_pmds[0] = ctx->ctx_smpl_regs[0];
3565         ctx->ctx_used_pmcs[0] = 1; /* always save/restore PMC[0] */
3566
3567         /*
3568          * useful in case of re-enable after disable
3569          */
3570         ctx->ctx_used_ibrs[0] = 0UL;
3571         ctx->ctx_used_dbrs[0] = 0UL;
3572
3573         ia64_srlz_d();
3574         preempt_enable();
3575 }
3576
3577 /*
3578  * This function is called when a thread exits (from exit_thread()).
3579  * This is a simplified pfm_save_regs() that simply flushes the current
3580  * register state into the save area taking into account any pending
3581  * overflow. This time no notification is sent because the task is dying
3582  * anyway. The inline processing of overflows avoids loosing some counts.
3583  * The PMU is frozen on exit from this call and is to never be reenabled
3584  * again for this task.
3585  *
3586  */
3587 void
3588 pfm_flush_regs (struct task_struct *task)
3589 {
3590         pfm_context_t *ctx;
3591         u64 pmc0;
3592         unsigned long mask2, val;
3593         int i;
3594
3595         ctx = task->thread.pfm_context;
3596
3597         if (ctx == NULL) return;
3598
3599         /*
3600          * that's it if context already disabled
3601          */
3602         if (ctx->ctx_flags.state == PFM_CTX_DISABLED) return;
3603
3604         preempt_disable();
3605         /*
3606          * stop monitoring:
3607          * This is the only way to stop monitoring without destroying overflow
3608          * information in PMC[0].
3609          * This is the last instruction which can cause overflow when monitoring
3610          * in kernel.
3611          * By now, we could still have an overflow interrupt in-flight.
3612          */
3613         if (ctx->ctx_fl_system) {
3614
3615
3616                 /* disable dcr pp */
3617                 ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
3618
3619                 /* stop monitoring */
3620                 pfm_clear_psr_pp();
3621
3622                 ia64_srlz_i();
3623
3624                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE);
3625                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
3626                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE);
3627         } else  {
3628
3629                 /* stop monitoring */
3630                 pfm_clear_psr_up();
3631
3632                 ia64_srlz_i();
3633
3634                 /* no more save/restore on ctxsw */
3635                 current->thread.flags &= ~IA64_THREAD_PM_VALID;
3636         }
3637
3638         /*
3639          * Mark the PMU as not owned
3640          * This will cause the interrupt handler to do nothing in case an overflow
3641          * interrupt was in-flight
3642          * This also guarantees that pmc0 will contain the final state
3643          * It virtually gives us full control on overflow processing from that point
3644          * on.
3645          * It must be an atomic operation.
3646          */
3647         SET_PMU_OWNER(NULL);
3648
3649         /*
3650          * read current overflow status:
3651          *
3652          * we are guaranteed to read the final stable state
3653          */
3654         ia64_srlz_d();
3655         pmc0 = ia64_get_pmc(0); /* slow */
3656
3657         /*
3658          * freeze PMU:
3659          *
3660          * This destroys the overflow information. This is required to make sure
3661          * next process does not start with monitoring on if not requested
3662          */
3663         pfm_freeze_pmu();
3664
3665         /*
3666          * We don't need to restore psr, because we are on our way out
3667          */
3668
3669         /*
3670          * This loop flushes the PMD into the PFM context.
3671          * It also processes overflow inline.
3672          *
3673          * IMPORTANT: No notification is sent at this point as the process is dying.
3674          * The implicit notification will come from a SIGCHILD or a return from a
3675          * waitpid().
3676          *
3677          */
3678
3679         if (atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
3680                 printk(KERN_DEBUG "perfmon: [%d] last_cpu=%d\n",
3681                        task->pid, atomic_read(&ctx->ctx_last_cpu));
3682
3683         /*
3684          * we save all the used pmds
3685          * we take care of overflows for pmds used as counters
3686          */
3687         mask2 = ctx->ctx_used_pmds[0];
3688         for (i = 0; mask2; i++, mask2>>=1) {
3689
3690                 /* skip non used pmds */
3691                 if ((mask2 & 0x1) == 0) continue;
3692
3693                 val = ia64_get_pmd(i);
3694
3695                 if (PMD_IS_COUNTING(i)) {
3696                         DBprintk(("[%d] pmd[%d] soft_pmd=0x%lx hw_pmd=0x%lx\n",
3697                                 task->pid,
3698                                 i,
3699                                 ctx->ctx_soft_pmds[i].val,
3700                                 val & pmu_conf.ovfl_val));
3701
3702                         /* collect latest results */
3703                         ctx->ctx_soft_pmds[i].val += val & pmu_conf.ovfl_val;
3704
3705                         /*
3706                          * now everything is in ctx_soft_pmds[] and we need
3707                          * to clear the saved context from save_regs() such that
3708                          * pfm_read_pmds() gets the correct value
3709                          */
3710                         task->thread.pmd[i] = 0;
3711
3712                         /*
3713                          * take care of overflow inline
3714                          */
3715                         if (pmc0 & (1UL << i)) {
3716                                 ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.ovfl_val;
3717                                 DBprintk(("[%d] pmd[%d] overflowed soft_pmd=0x%lx\n",
3718                                         task->pid, i, ctx->ctx_soft_pmds[i].val));
3719                         }
3720                 } else {
3721                         DBprintk(("[%d] pmd[%d] hw_pmd=0x%lx\n", task->pid, i, val));
3722                         /*
3723                          * not a counter, just save value as is
3724                          */
3725                         task->thread.pmd[i] = val;
3726                 }
3727         }
3728         /*
3729          * indicates that context has been saved
3730          */
3731         atomic_set(&ctx->ctx_last_cpu, -1);
3732         preempt_enable();
3733 }
3734
3735
3736 /*
3737  * task is the newly created task, pt_regs for new child
3738  */
3739 int
3740 pfm_inherit(struct task_struct *task, struct pt_regs *regs)
3741 {
3742         pfm_context_t *ctx;
3743         pfm_context_t *nctx;
3744         struct thread_struct *thread;
3745         unsigned long m;
3746         int i;
3747
3748         /*
3749          * the new task was copied from parent and therefore points
3750          * to the parent's context at this point
3751          */
3752         ctx    = task->thread.pfm_context;
3753         thread = &task->thread;
3754
3755         preempt_disable();
3756         /*
3757          * make sure child cannot mess up the monitoring session
3758          */
3759          ia64_psr(regs)->sp = 1;
3760          DBprintk(("enabling psr.sp for [%d]\n", task->pid));
3761
3762
3763         /*
3764          * if there was a virtual mapping for the sampling buffer
3765          * the mapping is NOT inherited across fork() (see VM_DONTCOPY),
3766          * so we don't have to explicitely remove it here.
3767          *
3768          *
3769          * Part of the clearing of fields is also done in
3770          * copy_thread() because the fiels are outside the
3771          * pfm_context structure and can affect tasks not
3772          * using perfmon.
3773          */
3774
3775         /* clear pending notification */
3776         task->thread.pfm_ovfl_block_reset = 0;
3777
3778         /*
3779          * clear cpu pinning restriction for child
3780          */
3781         if (ctx->ctx_fl_system) {
3782                 set_cpus_allowed(task, ctx->ctx_saved_cpus_allowed);
3783
3784                 DBprintk(("setting cpus_allowed for [%d] to 0x%lx from 0x%lx\n",
3785                         task->pid,
3786                         ctx->ctx_saved_cpus_allowed,
3787                         current->cpus_allowed));
3788         }
3789
3790         /*
3791          * takes care of easiest case first
3792          */
3793         if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
3794
3795                 DBprintk(("removing PFM context for [%d]\n", task->pid));
3796
3797                 task->thread.pfm_context = NULL;
3798
3799                 /*
3800                  * we must clear psr.up because the new child does
3801                  * not have a context and the PM_VALID flag is cleared
3802                  * in copy_thread().
3803                  *
3804                  * we do not clear psr.pp because it is always
3805                  * controlled by the system wide logic and we should
3806                  * never be here when system wide is running anyway
3807                  */
3808                 ia64_psr(regs)->up = 0;
3809
3810                 preempt_enable();
3811
3812                 /* copy_thread() clears IA64_THREAD_PM_VALID */
3813                 return 0;
3814         }
3815         nctx = pfm_context_alloc();
3816         if (nctx == NULL) return -ENOMEM;
3817
3818         /* copy content */
3819         *nctx = *ctx;
3820
3821
3822         if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
3823                 nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
3824                 DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));
3825         }
3826         /*
3827          * task is not yet visible in the tasklist, so we do
3828          * not need to lock the newly created context.
3829          * However, we must grab the tasklist_lock to ensure
3830          * that the ctx_owner or ctx_notify_task do not disappear
3831          * while we increment their check counters.
3832          */
3833         read_lock(&tasklist_lock);
3834
3835         if (nctx->ctx_notify_task)
3836                 atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);
3837
3838         if (nctx->ctx_owner)
3839                 atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);
3840
3841         read_unlock(&tasklist_lock);
3842
3843
3844         LOCK_PFS();
3845         pfm_sessions.pfs_task_sessions++;
3846         UNLOCK_PFS();
3847
3848         /* initialize counters in new context */
3849         m = nctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
3850         for(i = PMU_FIRST_COUNTER ; m ; m>>=1, i++) {
3851                 if ((m & 0x1) && pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING) {
3852                         nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].lval & ~pmu_conf.ovfl_val;
3853                         thread->pmd[i]             = nctx->ctx_soft_pmds[i].lval & pmu_conf.ovfl_val;
3854                 } else {
3855                         thread->pmd[i]             = 0UL; /* reset to initial state */
3856                 }
3857         }
3858
3859         nctx->ctx_fl_frozen      = 0;
3860         nctx->ctx_ovfl_regs[0]   = 0UL;
3861         nctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
3862         atomic_set(&nctx->ctx_last_cpu, -1);
3863
3864         /*
3865          * here nctx->ctx_psb == ctx->ctx_psb
3866          *
3867          * increment reference count to sampling
3868          * buffer, if any. Note that this is independent
3869          * from the virtual mapping. The latter is never
3870          * inherited while the former will be if context
3871          * is setup to something different from PFM_FL_INHERIT_NONE
3872          */
3873         if (nctx->ctx_psb) {
3874                 LOCK_PSB(nctx->ctx_psb);
3875
3876                 nctx->ctx_psb->psb_refcnt++;
3877
3878                 DBprintk(("updated smpl @ %p refcnt=%lu psb_flags=0x%x\n",
3879                         ctx->ctx_psb->psb_hdr,
3880                         ctx->ctx_psb->psb_refcnt,
3881                         ctx->ctx_psb->psb_flags));
3882
3883                 UNLOCK_PSB(nctx->ctx_psb);
3884
3885                 /*
3886                  * remove any pointer to sampling buffer mapping
3887                  */
3888                 nctx->ctx_smpl_vaddr = 0;
3889         }
3890
3891         sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */
3892
3893         /*
3894          * propagate kernel psr in new context (used for first ctxsw in
3895          */
3896         nctx->ctx_saved_psr = pfm_get_psr();
3897
3898         /*
3899          * propagate kernel psr in new context (used for first ctxsw in
3900          */
3901         nctx->ctx_saved_psr = pfm_get_psr();
3902
3903         /* link with new task */
3904         thread->pfm_context = nctx;
3905
3906         DBprintk(("nctx=%p for process [%d]\n", (void *)nctx, task->pid));
3907
3908         /*
3909          * the copy_thread routine automatically clears
3910          * IA64_THREAD_PM_VALID, so we need to reenable it, if it was used by the caller
3911          */
3912         if (current->thread.flags & IA64_THREAD_PM_VALID) {
3913                 DBprintk(("setting PM_VALID for [%d]\n", task->pid));
3914                 thread->flags |= IA64_THREAD_PM_VALID;
3915         }
3916
3917         preempt_enable();
3918
3919         return 0;
3920 }
3921
3922 /*
3923  *
3924  * We cannot touch any of the PMU registers at this point as we may
3925  * not be running on the same CPU the task was last run on.  Therefore
3926  * it is assumed that the PMU has been stopped appropriately in
3927  * pfm_flush_regs() called from exit_thread().
3928  *
3929  * The function is called in the context of the parent via a release_thread()
3930  * and wait4(). The task is not in the tasklist anymore.
3931  */
3932 void
3933 pfm_context_exit(struct task_struct *task)
3934 {
3935         pfm_context_t *ctx = task->thread.pfm_context;
3936
3937         /*
3938          * check sampling buffer
3939          */
3940         preempt_disable();
3941         if (ctx->ctx_psb) {
3942                 pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;
3943
3944                 LOCK_PSB(psb);
3945
3946                 DBprintk(("sampling buffer from [%d] @%p size %ld refcnt=%lu psb_flags=0x%x\n",
3947                         task->pid,
3948                         psb->psb_hdr, psb->psb_size, psb->psb_refcnt, psb->psb_flags));
3949
3950                 /*
3951                  * in the case where we are the last user, we may be able to free
3952                  * the buffer
3953                  */
3954                 psb->psb_refcnt--;
3955
3956                 if (psb->psb_refcnt == 0) {
3957
3958                         /*
3959                          * The flag is cleared in pfm_vm_close(). which gets
3960                          * called from do_exit() via exit_mm().
3961                          * By the time we come here, the task has no more mm context.
3962                          *
3963                          * We can only free the psb and buffer here after the vm area
3964                          * describing the buffer has been removed. This normally happens
3965                          * as part of do_exit() but the entire mm context is ONLY removed
3966                          * once its reference counts goes to zero. This is typically
3967                          * the case except for multi-threaded (several tasks) processes.
3968                          *
3969                          * See pfm_vm_close() and pfm_cleanup_smpl_buf() for more details.
3970                          */
3971                         if ((psb->psb_flags & PSB_HAS_VMA) == 0) {
3972
3973                                 DBprintk(("cleaning sampling buffer from [%d] @%p size %ld\n",
3974                                         task->pid,
3975                                         psb->psb_hdr, psb->psb_size));
3976
3977                                 /*
3978                                  * free the buffer and psb
3979                                  */
3980                                 pfm_rvfree(psb->psb_hdr, psb->psb_size);
3981                                 kfree(psb);
3982                                 psb = NULL;
3983                         }
3984                 }
3985                 /* psb may have been deleted */
3986                 if (psb) UNLOCK_PSB(psb);
3987         }
3988
3989         DBprintk(("cleaning [%d] pfm_context @%p notify_task=%p check=%d mm=%p\n",
3990                 task->pid, ctx,
3991                 ctx->ctx_notify_task,
3992                 atomic_read(&task->thread.pfm_notifiers_check), task->mm));
3993
3994         /*
3995          * To avoid getting the notified task or owner task scan the entire process
3996          * list when they exit, we decrement notifiers_check and owners_check respectively.
3997          *
3998          * Of course, there is race condition between decreasing the value and the
3999          * task exiting. The danger comes from the fact that, in both cases, we have a
4000          * direct pointer to a task structure thereby bypassing the tasklist.
4001          * We must make sure that, if we have task!= NULL, the target task is still
4002          * present and is identical to the initial task specified
4003          * during pfm_context_create(). It may already be detached from the tasklist but
4004          * that's okay. Note that it is okay if we miss the deadline and the task scans
4005          * the list for nothing, it will affect performance but not correctness.
4006          * The correctness is ensured by using the ctx_lock which prevents the
4007          * notify_task from changing the fields in our context.
4008          * Once holding this lock, if we see task != NULL, then it will stay like
4009          * that until we release the lock. If it is NULL already then we came too late.
4010          */
4011         LOCK_CTX(ctx);
4012
4013         if (ctx->ctx_notify_task != NULL) {
4014                 DBprintk(("[%d], [%d] atomic_sub on [%d] notifiers=%u\n", current->pid,
4015                         task->pid,
4016                         ctx->ctx_notify_task->pid,
4017                         atomic_read(&ctx->ctx_notify_task->thread.pfm_notifiers_check)));
4018
4019                 atomic_dec(&ctx->ctx_notify_task->thread.pfm_notifiers_check);
4020         }
4021
4022         if (ctx->ctx_owner != NULL) {
4023                 DBprintk(("[%d], [%d] atomic_sub on [%d] owners=%u\n",
4024                          current->pid,
4025                          task->pid,
4026                          ctx->ctx_owner->pid,
4027                          atomic_read(&ctx->ctx_owner->thread.pfm_owners_check)));
4028
4029                 atomic_dec(&ctx->ctx_owner->thread.pfm_owners_check);
4030         }
4031
4032         UNLOCK_CTX(ctx);
4033         preempt_enable();
4034
4035         pfm_unreserve_session(task, ctx->ctx_fl_system, 1UL << ctx->ctx_cpu);
4036
4037         if (ctx->ctx_fl_system) {
4038                 /*
4039                  * remove any CPU pinning
4040                  */
4041                 set_cpus_allowed(task, ctx->ctx_saved_cpus_allowed);
4042         }
4043
4044         pfm_context_free(ctx);
4045         /*
4046          *  clean pfm state in thread structure,
4047          */
4048         task->thread.pfm_context          = NULL;
4049         task->thread.pfm_ovfl_block_reset = 0;
4050
4051         /* pfm_notifiers is cleaned in pfm_cleanup_notifiers() */
4052 }
4053
4054 /*
4055  * function invoked from release_thread when pfm_smpl_buf_list is not NULL
4056  */
4057 int
4058 pfm_cleanup_smpl_buf(struct task_struct *task)
4059 {
4060         pfm_smpl_buffer_desc_t *tmp, *psb = task->thread.pfm_smpl_buf_list;
4061
4062         if (psb == NULL) {
4063                 printk(KERN_DEBUG "perfmon: psb is null in [%d]\n", current->pid);
4064                 return -1;
4065         }
4066         /*
4067          * Walk through the list and free the sampling buffer and psb
4068          */
4069         while (psb) {
4070                 DBprintk(("[%d] freeing smpl @%p size %ld\n", current->pid, psb->psb_hdr, psb->psb_size));
4071
4072                 pfm_rvfree(psb->psb_hdr, psb->psb_size);
4073                 tmp = psb->psb_next;
4074                 kfree(psb);
4075                 psb = tmp;
4076         }
4077
4078         /* just in case */
4079         task->thread.pfm_smpl_buf_list = NULL;
4080
4081         return 0;
4082 }
4083
4084 /*
4085  * function invoked from release_thread to make sure that the ctx_owner field does not
4086  * point to an unexisting task.
4087  */
4088 void
4089 pfm_cleanup_owners(struct task_struct *task)
4090 {
4091         struct task_struct *g, *p;
4092         pfm_context_t *ctx;
4093
4094         DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));
4095
4096         read_lock(&tasklist_lock);
4097
4098         do_each_thread(g, p) {
4099                 /*
4100                  * It is safe to do the 2-step test here, because thread.ctx
4101                  * is cleaned up only in release_thread() and at that point
4102                  * the task has been detached from the tasklist which is an
4103                  * operation which uses the write_lock() on the tasklist_lock
4104                  * so it cannot run concurrently to this loop. So we have the
4105                  * guarantee that if we find p and it has a perfmon ctx then
4106                  * it is going to stay like this for the entire execution of this
4107                  * loop.
4108                  */
4109                 ctx = p->thread.pfm_context;
4110
4111                 //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));
4112
4113                 if (ctx && ctx->ctx_owner == task) {
4114                         DBprintk(("trying for owner [%d] in [%d]\n", task->pid, p->pid));
4115                         /*
4116                          * the spinlock is required to take care of a race condition
4117                          * with the send_sig_info() call. We must make sure that
4118                          * either the send_sig_info() completes using a valid task,
4119                          * or the notify_task is cleared before the send_sig_info()
4120                          * can pick up a stale value. Note that by the time this
4121                          * function is executed the 'task' is already detached from the
4122                          * tasklist. The problem is that the notifiers have a direct
4123                          * pointer to it. It is okay to send a signal to a task in this
4124                          * stage, it simply will have no effect. But it is better than sending
4125                          * to a completely destroyed task or worse to a new task using the same
4126                          * task_struct address.
4127                          */
4128                         LOCK_CTX(ctx);
4129
4130                         ctx->ctx_owner = NULL;
4131
4132                         UNLOCK_CTX(ctx);
4133
4134                         DBprintk(("done for notifier [%d] in [%d]\n", task->pid, p->pid));
4135                 }
4136         } while_each_thread(g, p);
4137
4138         read_unlock(&tasklist_lock);
4139
4140         atomic_set(&task->thread.pfm_owners_check, 0);
4141 }
4142
4143
4144 /*
4145  * function called from release_thread to make sure that the ctx_notify_task is not pointing
4146  * to an unexisting task
4147  */
4148 void
4149 pfm_cleanup_notifiers(struct task_struct *task)
4150 {
4151         struct task_struct *g, *p;
4152         pfm_context_t *ctx;
4153
4154         DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));
4155
4156         read_lock(&tasklist_lock);
4157
4158         do_each_thread(g, p) {
4159                 /*
4160                  * It is safe to do the 2-step test here, because thread.ctx is cleaned up
4161                  * only in release_thread() and at that point the task has been detached
4162                  * from the tasklist which is an operation which uses the write_lock() on
4163                  * the tasklist_lock so it cannot run concurrently to this loop. So we
4164                  * have the guarantee that if we find p and it has a perfmon ctx then it
4165                  * is going to stay like this for the entire execution of this loop.
4166                  */
4167                 ctx = p->thread.pfm_context;
4168
4169                 //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));
4170
4171                 if (ctx && ctx->ctx_notify_task == task) {
4172                         DBprintk(("trying for notifier [%d] in [%d]\n", task->pid, p->pid));
4173                         /*
4174                          * the spinlock is required to take care of a race condition
4175                          * with the send_sig_info() call. We must make sure that
4176                          * either the send_sig_info() completes using a valid task,
4177                          * or the notify_task is cleared before the send_sig_info()
4178                          * can pick up a stale value. Note that by the time this
4179                          * function is executed the 'task' is already detached from the
4180                          * tasklist. The problem is that the notifiers have a direct
4181                          * pointer to it. It is okay to send a signal to a task in this
4182                          * stage, it simply will have no effect. But it is better than sending
4183                          * to a completely destroyed task or worse to a new task using the same
4184                          * task_struct address.
4185                          */
4186                         LOCK_CTX(ctx);
4187
4188                         ctx->ctx_notify_task = NULL;
4189
4190                         UNLOCK_CTX(ctx);
4191
4192                         DBprintk(("done for notifier [%d] in [%d]\n", task->pid, p->pid));
4193                 }
4194         } while_each_thread(g, p);
4195
4196         read_unlock(&tasklist_lock);
4197
4198         atomic_set(&task->thread.pfm_notifiers_check, 0);
4199 }
4200
4201 static struct irqaction perfmon_irqaction = {
4202         .handler =      pfm_interrupt_handler,
4203         .flags   =      SA_INTERRUPT,
4204         .name    =      "perfmon"
4205 };
4206
4207 int
4208 pfm_install_alternate_syswide_subsystem(pfm_intr_handler_desc_t *hdl)
4209 {
4210         int ret;
4211
4212
4213         /* some sanity checks */
4214         if (hdl == NULL || hdl->handler == NULL) {
4215                 return -EINVAL;
4216         }
4217
4218         /* do the easy test first */
4219         if (pfm_alternate_intr_handler) {
4220                 return -EBUSY;
4221         }
4222
4223         preempt_disable();
4224         /* reserve our session */
4225         ret = pfm_reserve_session(NULL, 1, cpu_online_map);
4226         if (ret) {
4227                 preempt_enable();
4228                 return ret;
4229         }
4230
4231         if (pfm_alternate_intr_handler) {
4232                 preempt_enable();
4233                 printk(KERN_DEBUG "perfmon: install_alternate, intr_handler not NULL "
4234                        "after reserve\n");
4235                 return -EINVAL;
4236         }
4237
4238         pfm_alternate_intr_handler = hdl;
4239
4240         preempt_enable();
4241         return 0;
4242 }
4243
4244 int
4245 pfm_remove_alternate_syswide_subsystem(pfm_intr_handler_desc_t *hdl)
4246 {
4247         if (hdl == NULL)
4248                 return -EINVAL;
4249
4250         /* cannot remove someone else's handler! */
4251         if (pfm_alternate_intr_handler != hdl)
4252                 return -EINVAL;
4253
4254         preempt_disable();
4255         pfm_alternate_intr_handler = NULL;
4256
4257         /*
4258          * XXX: assume cpu_online_map has not changed since reservation
4259          */
4260         pfm_unreserve_session(NULL, 1, cpu_online_map);
4261
4262         preempt_enable();
4263
4264         return 0;
4265 }
4266
4267 /*
4268  * perfmon initialization routine, called from the initcall() table
4269  */
4270 int __init
4271 pfm_init(void)
4272 {
4273         unsigned int n, n_counters, i;
4274
4275         pmu_conf.disabled = 1;
4276
4277         printk(KERN_INFO "perfmon: version %u.%u IRQ %u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN,
4278                IA64_PERFMON_VECTOR);
4279
4280         /*
4281          * compute the number of implemented PMD/PMC from the
4282          * description tables
4283          */
4284         n = 0;
4285         for (i=0; PMC_IS_LAST(i) == 0;  i++) {
4286                 if (PMC_IS_IMPL(i) == 0) continue;
4287                 pmu_conf.impl_pmcs[i>>6] |= 1UL << (i&63);
4288                 n++;
4289         }
4290         pmu_conf.num_pmcs = n;
4291
4292         n = 0; n_counters = 0;
4293         for (i=0; PMD_IS_LAST(i) == 0;  i++) {
4294                 if (PMD_IS_IMPL(i) == 0) continue;
4295                 pmu_conf.impl_pmds[i>>6] |= 1UL << (i&63);
4296                 n++;
4297                 if (PMD_IS_COUNTING(i)) n_counters++;
4298         }
4299         pmu_conf.num_pmds      = n;
4300         pmu_conf.num_counters  = n_counters;
4301
4302         printk(KERN_INFO "perfmon: %u PMCs, %u PMDs, %u counters (%lu bits)\n",
4303                pmu_conf.num_pmcs,
4304                pmu_conf.num_pmds,
4305                pmu_conf.num_counters,
4306                ffz(pmu_conf.ovfl_val));
4307
4308         /* sanity check */
4309         if (pmu_conf.num_pmds >= IA64_NUM_PMD_REGS || pmu_conf.num_pmcs >= IA64_NUM_PMC_REGS) {
4310                 printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
4311                 return -1;
4312         }
4313
4314         /*
4315          * for now here for debug purposes
4316          */
4317         perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL);
4318         if (perfmon_dir == NULL) {
4319                 printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
4320                 return -1;
4321         }
4322
4323         /*
4324          * create /proc/perfmon
4325          */
4326         pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
4327
4328         /*
4329          * initialize all our spinlocks
4330          */
4331         spin_lock_init(&pfm_sessions.pfs_lock);
4332
4333         /* we are all set */
4334         pmu_conf.disabled = 0;
4335
4336         return 0;
4337 }
4338 __initcall(pfm_init);
4339
4340 void
4341 pfm_init_percpu(void)
4342 {
4343         int i;
4344         int me = get_cpu();
4345
4346         if (me == 0)
4347                 register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
4348
4349         ia64_set_pmv(IA64_PERFMON_VECTOR);
4350         ia64_srlz_d();
4351
4352         /*
4353          * we first initialize the PMU to a stable state.
4354          * the values may have been changed from their power-up
4355          * values by software executed before the kernel took over.
4356          *
4357          * At this point, pmu_conf has not yet been initialized
4358          *
4359          * On McKinley, this code is ineffective until PMC4 is initialized.
4360          */
4361         for (i=1; PMC_IS_LAST(i) == 0;  i++) {
4362                 if (PMC_IS_IMPL(i) == 0) continue;
4363                 ia64_set_pmc(i, PMC_DFL_VAL(i));
4364         }
4365
4366         for (i=0; PMD_IS_LAST(i); i++) {
4367                 if (PMD_IS_IMPL(i) == 0) continue;
4368                 ia64_set_pmd(i, 0UL);
4369         }
4370         put_cpu();
4371         pfm_freeze_pmu();
4372 }
4373
4374 #else /* !CONFIG_PERFMON */
4375
4376 asmlinkage long
4377 sys_perfmonctl (int pid, int cmd, void *req, int count, long arg5, long arg6,
4378                 long arg7, long arg8, long stack)
4379 {
4380         return -ENOSYS;
4381 }
4382
4383 #endif /* !CONFIG_PERFMON */