arch/ia64/kernel/perfmon.c

   1 /*
   2  * This file implements the perfmon subsystem which is used
   3  * to program the IA-64 Performance Monitoring Unit (PMU).
   4  *
   5  * Originally Written by Ganesh Venkitachalam, IBM Corp.
   6  * Copyright (C) 1999 Ganesh Venkitachalam <venkitac@us.ibm.com>
   7  *
   8  * Modifications by Stephane Eranian, Hewlett-Packard Co.
   9  * Modifications by David Mosberger-Tang, Hewlett-Packard Co.
  10  *
  11  * Copyright (C) 1999-2003  Hewlett Packard Co
  12  *               Stephane Eranian <eranian@hpl.hp.com>
  13  *               David Mosberger-Tang <davidm@hpl.hp.com>
  14  */
  15
  16 #include <linux/config.h>
  17 #include <linux/kernel.h>
  18 #include <linux/sched.h>
  19 #include <linux/interrupt.h>
  20 #include <linux/smp_lock.h>
  21 #include <linux/proc_fs.h>
  22 #include <linux/init.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/mm.h>
  25 #include <linux/sysctl.h>
  26 #include <linux/smp.h>
  27
  28 #include <asm/bitops.h>
  29 #include <asm/errno.h>
  30 #include <asm/page.h>
  31 #include <asm/perfmon.h>
  32 #include <asm/processor.h>
  33 #include <asm/signal.h>
  34 #include <asm/system.h>
  35 #include <asm/uaccess.h>
  36 #include <asm/delay.h> /* for ia64_get_itc() */
  37
  38 #ifdef CONFIG_PERFMON
  39
  40 /*
   41  * For PMUs which rely on the debug registers for some features, you must
   42  * enable the following flag to activate the support for
  43  * accessing the registers via the perfmonctl() interface.
  44  */
  45 #if defined(CONFIG_ITANIUM) || defined(CONFIG_MCKINLEY)
  46 #define PFM_PMU_USES_DBR        1
  47 #endif
  48
  49 /*
  50  * perfmon context states
  51  */
  52 #define PFM_CTX_DISABLED        0
  53 #define PFM_CTX_ENABLED         1
  54
  55 /*
  56  * Reset register flags
  57  */
  58 #define PFM_PMD_LONG_RESET      1
  59 #define PFM_PMD_SHORT_RESET     2
  60
  61 /*
  62  * Misc macros and definitions
  63  */
  64 #define PMU_FIRST_COUNTER       4
  65 #define PMU_MAX_PMCS            256
  66 #define PMU_MAX_PMDS            256
  67
  68 /*
  69  * type of a PMU register (bitmask).
  70  * bitmask structure:
  71  *      bit0   : register implemented
  72  *      bit1   : end marker
  73  *      bit2-3 : reserved
  74  *      bit4-7 : register type
  75  *      bit8-31: reserved
  76  */
  77 #define PFM_REG_IMPL            0x1 /* register implemented */
  78 #define PFM_REG_END             0x2 /* end marker */
  79 #define PFM_REG_MONITOR         (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */
  80 #define PFM_REG_COUNTING        (0x2<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm AND pmc.oi, a PMD used as a counter */
  81 #define PFM_REG_CONTROL         (0x3<<4|PFM_REG_IMPL) /* PMU control register */
  82 #define PFM_REG_CONFIG          (0x4<<4|PFM_REG_IMPL) /* refine configuration */
  83 #define PFM_REG_BUFFER          (0x5<<4|PFM_REG_IMPL) /* PMD used as buffer */
  84
  85 #define PMC_IS_LAST(i)  (pmu_conf.pmc_desc[i].type & PFM_REG_END)
  86 #define PMD_IS_LAST(i)  (pmu_conf.pmd_desc[i].type & PFM_REG_END)
  87
  88 #define PFM_IS_DISABLED() pmu_conf.disabled
  89
  90 #define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_soft_pmds[i].flags &  PFM_REGFL_OVFL_NOTIFY)
  91 #define PFM_FL_INHERIT_MASK     (PFM_FL_INHERIT_NONE|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)
  92
  93 /* i assume unsigned */
  94 #define PMC_IS_IMPL(i)    (i< PMU_MAX_PMCS && (pmu_conf.pmc_desc[i].type & PFM_REG_IMPL))
  95 #define PMD_IS_IMPL(i)    (i< PMU_MAX_PMDS && (pmu_conf.pmd_desc[i].type & PFM_REG_IMPL))
  96
  97 /* XXX: these three assume that register i is implemented */
  98 #define PMD_IS_COUNTING(i) (pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING)
  99 #define PMC_IS_COUNTING(i) (pmu_conf.pmc_desc[i].type == PFM_REG_COUNTING)
 100 #define PMC_IS_MONITOR(i)  (pmu_conf.pmc_desc[i].type == PFM_REG_MONITOR)
 101 #define PMC_DFL_VAL(i)     pmu_conf.pmc_desc[i].default_value
 102 #define PMC_RSVD_MASK(i)   pmu_conf.pmc_desc[i].reserved_mask
 103 #define PMD_PMD_DEP(i)     pmu_conf.pmd_desc[i].dep_pmd[0]
 104 #define PMC_PMD_DEP(i)     pmu_conf.pmc_desc[i].dep_pmd[0]
 105
 106 /* k assume unsigned */
 107 #define IBR_IS_IMPL(k)    (k<pmu_conf.num_ibrs)
 108 #define DBR_IS_IMPL(k)    (k<pmu_conf.num_dbrs)
 109
 110 #define CTX_IS_ENABLED(c)       ((c)->ctx_flags.state == PFM_CTX_ENABLED)
 111 #define CTX_OVFL_NOBLOCK(c)     ((c)->ctx_fl_block == 0)
 112 #define CTX_INHERIT_MODE(c)     ((c)->ctx_fl_inherit)
 113 #define CTX_HAS_SMPL(c)         ((c)->ctx_psb != NULL)
 114 /* XXX: does not support more than 64 PMDs */
 115 #define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
 116 #define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
 117
 118
 119 #define CTX_USED_IBR(ctx,n)     (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
 120 #define CTX_USED_DBR(ctx,n)     (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
 121 #define CTX_USES_DBREGS(ctx)    (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1)
 122
 123 #define LOCK_CTX(ctx)   spin_lock(&(ctx)->ctx_lock)
 124 #define UNLOCK_CTX(ctx) spin_unlock(&(ctx)->ctx_lock)
 125
 126 #define SET_PMU_OWNER(t)    do { pmu_owners[smp_processor_id()].owner = (t); } while(0)
 127 #define PMU_OWNER()         pmu_owners[smp_processor_id()].owner
 128
 129 #define LOCK_PFS()          spin_lock(&pfm_sessions.pfs_lock)
 130 #define UNLOCK_PFS()        spin_unlock(&pfm_sessions.pfs_lock)
 131
 132 #define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)
 133
 134 #define PFM_CPUINFO_CLEAR(v)    __get_cpu_var(pfm_syst_info) &= ~(v)
 135 #define PFM_CPUINFO_SET(v)      __get_cpu_var(pfm_syst_info) |= (v)
 136
 137 /*
 138  * debugging
 139  */
 140 #define DBprintk(a) \
 141         do { \
 142                 if (pfm_sysctl.debug >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
 143         } while (0)
 144
 145 #define DBprintk_ovfl(a) \
 146         do { \
 147                 if (pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
 148         } while (0)
 149
 150
 151
 152 /*
 153  * Architected PMC structure
 154  */
 155 typedef struct {
 156         unsigned long pmc_plm:4;        /* privilege level mask */
 157         unsigned long pmc_ev:1;         /* external visibility */
 158         unsigned long pmc_oi:1;         /* overflow interrupt */
 159         unsigned long pmc_pm:1;         /* privileged monitor */
 160         unsigned long pmc_ig1:1;        /* reserved */
 161         unsigned long pmc_es:8;         /* event select */
 162         unsigned long pmc_ig2:48;       /* reserved */
 163 } pfm_monitor_t;
 164
 165 /*
 166  * There is one such data structure per perfmon context. It is used to describe the
 167  * sampling buffer. It is to be shared among siblings whereas the pfm_context
 168  * is not.
 169  * Therefore we maintain a refcnt which is incremented on fork().
 170  * This buffer is private to the kernel only the actual sampling buffer
 171  * including its header are exposed to the user. This construct allows us to
 172  * export the buffer read-write, if needed, without worrying about security
 173  * problems.
 174  */
  175 typedef struct _pfm_smpl_buffer_desc {
  176         spinlock_t              psb_lock;       /* protection lock */
  177         unsigned long           psb_refcnt;     /* how many users for the buffer */
  178         int                     psb_flags;      /* bitvector of PSB_* flags (see PSB_HAS_VMA) */
  179
  180         void                    *psb_addr;      /* points to location of first entry */
  181         unsigned long           psb_entries;    /* maximum number of entries */
  182         unsigned long           psb_size;       /* aligned size of buffer */
  183         unsigned long           psb_index;      /* next free entry slot XXX: must use the one in buffer */
  184         unsigned long           psb_entry_size; /* size of each entry including entry header */
  185
  186         perfmon_smpl_hdr_t      *psb_hdr;       /* points to sampling buffer header */
  187
  188         struct _pfm_smpl_buffer_desc *psb_next; /* next psb, used for rvfreeing of psb_hdr */
  189
  190 } pfm_smpl_buffer_desc_t;
 191
 192 /*
 193  * psb_flags
 194  */
 195 #define PSB_HAS_VMA     0x1             /* a virtual mapping for the buffer exists */
 196
 197 #define LOCK_PSB(p)     spin_lock(&(p)->psb_lock)
 198 #define UNLOCK_PSB(p)   spin_unlock(&(p)->psb_lock)
 199
 200 /*
 201  * 64-bit software counter structure
 202  */
 203 typedef struct {
 204         u64 val;        /* virtual 64bit counter value */
 205         u64 lval;       /* last value */
 206         u64 long_reset; /* reset value on sampling overflow */
 207         u64 short_reset;/* reset value on overflow */
 208         u64 reset_pmds[4]; /* which other pmds to reset when this counter overflows */
 209         u64 seed;       /* seed for random-number generator */
 210         u64 mask;       /* mask for random-number generator */
 211         unsigned int flags; /* notify/do not notify */
 212 } pfm_counter_t;
 213
 214 /*
 215  * perfmon context. One per process, is cloned on fork() depending on
 216  * inheritance flags
 217  */
  218 typedef struct {
  219         unsigned int state:1;           /* 0=disabled, 1=enabled */
  220         unsigned int inherit:2;         /* inherit mode */
  221         unsigned int block:1;           /* when 1, task will block on user notifications */
  222         unsigned int system:1;          /* do system wide monitoring */
  223         unsigned int frozen:1;          /* pmu must be kept frozen on ctxsw in */
  224         unsigned int protected:1;       /* allow access to creator of context only */
  225         unsigned int using_dbreg:1;     /* using range restrictions (debug registers) */
  226         unsigned int excl_idle:1;       /* exclude idle task in system wide session */
  227         unsigned int unsecure:1;        /* sp = 0 for non self-monitored task */
  228         unsigned int trap_reason:2;     /* reason for going into pfm_block_ovfl_reset(), one of PFM_TRAP_REASON_* */
  229         unsigned int reserved:20;       /* pads the bitfield to 32 bits */
  230 } pfm_context_flags_t;
 231
 232 #define PFM_TRAP_REASON_NONE            0x0     /* default value */
 233 #define PFM_TRAP_REASON_BLOCKSIG        0x1     /* we need to block on overflow and signal user */
 234 #define PFM_TRAP_REASON_SIG             0x2     /* we simply need to signal user */
 235 #define PFM_TRAP_REASON_RESET           0x3     /* we need to reset PMDs */
 236
 237 /*
 238  * perfmon context: encapsulates all the state of a monitoring session
 239  * XXX: probably need to change layout
 240  */
  241 typedef struct pfm_context {
  242         pfm_smpl_buffer_desc_t  *ctx_psb;               /* sampling buffer, if any */
  243         unsigned long           ctx_smpl_vaddr;         /* user level virtual address of smpl buffer */
  244
  245         spinlock_t              ctx_lock;               /* taken via LOCK_CTX()/UNLOCK_CTX() */
  246         pfm_context_flags_t     ctx_flags;              /* state and mode bits (see ctx_fl_* accessors) */
  247
  248         struct task_struct      *ctx_notify_task;       /* who to notify on overflow */
  249         struct task_struct      *ctx_owner;             /* task that created the context (debug) */
  250
  251         unsigned long           ctx_ovfl_regs[4];       /* which registers overflowed (notification) */
  252         unsigned long           ctx_smpl_regs[4];       /* which registers to record on overflow */
  253
  254         struct semaphore        ctx_restart_sem;        /* use for blocking notification mode */
  255
  256         unsigned long           ctx_used_pmds[4];       /* bitmask of PMD used                 */
  257         unsigned long           ctx_reload_pmds[4];     /* bitmask of PMD to reload on ctxsw   */
  258
  259         unsigned long           ctx_used_pmcs[4];       /* bitmask PMC used by context         */
  260         unsigned long           ctx_reload_pmcs[4];     /* bitmask of PMC to reload on ctxsw   */
  261
  262         unsigned long           ctx_used_ibrs[4];       /* bitmask of used IBR (speedup ctxsw) */
  263         unsigned long           ctx_used_dbrs[4];       /* bitmask of used DBR (speedup ctxsw) */
  264
  265         pfm_counter_t           ctx_soft_pmds[IA64_NUM_PMD_REGS]; /* XXX: size should be dynamic */
  266
  267         u64                     ctx_saved_psr;          /* copy of psr used for lazy ctxsw */
  268         unsigned long           ctx_saved_cpus_allowed; /* copy of the task cpus_allowed (system wide) */
  269         unsigned int            ctx_cpu;                /* CPU used by system wide session */
  270
  271         atomic_t                ctx_last_cpu;           /* CPU id of current or last CPU used */
  272 } pfm_context_t;
 273
 274 #define ctx_fl_inherit          ctx_flags.inherit
 275 #define ctx_fl_block            ctx_flags.block
 276 #define ctx_fl_system           ctx_flags.system
 277 #define ctx_fl_frozen           ctx_flags.frozen
 278 #define ctx_fl_protected        ctx_flags.protected
 279 #define ctx_fl_using_dbreg      ctx_flags.using_dbreg
 280 #define ctx_fl_excl_idle        ctx_flags.excl_idle
 281 #define ctx_fl_trap_reason      ctx_flags.trap_reason
 282 #define ctx_fl_unsecure         ctx_flags.unsecure
 283
 284 /*
 285  * global information about all sessions
 286  * mostly used to synchronize between system wide and per-process
 287  */
 288 typedef struct {
 289         spinlock_t              pfs_lock;                  /* lock the structure */
 290
 291         unsigned int            pfs_task_sessions;         /* number of per task sessions */
 292         unsigned int            pfs_sys_sessions;          /* number of per system wide sessions */
 293         unsigned int            pfs_sys_use_dbregs;        /* incremented when a system wide session uses debug regs */
 294         unsigned int            pfs_ptrace_use_dbregs;     /* incremented when a process uses debug regs */
 295         struct task_struct      *pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */
 296 } pfm_session_t;
 297
 298 /*
 299  * information about a PMC or PMD.
 300  * dep_pmd[]: a bitmask of dependent PMD registers
 301  * dep_pmc[]: a bitmask of dependent PMC registers
 302  */
  303 typedef struct {
  304         unsigned int            type;           /* PFM_REG_* type bitmask (impl/end bits + register type) */
  305         int                     pm_pos;         /* bit position shifted out by PMC_PM() to read the pm bit */
  306         unsigned long           default_value;  /* power-on default value */
  307         unsigned long           reserved_mask;  /* bitmask of reserved bits */
  308         int                     (*read_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs); /* hook returned by PMD_RD_FUNC() */
  309         int                     (*write_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs); /* hook returned by PMC_WR_FUNC()/PMD_WR_FUNC() */
  310         unsigned long           dep_pmd[4];     /* bitmask of dependent PMD registers */
  311         unsigned long           dep_pmc[4];     /* bitmask of dependent PMC registers */
  312 } pfm_reg_desc_t;
 313
 314 /* assume cnum is a valid monitor */
 315 #define PMC_PM(cnum, val)       (((val) >> (pmu_conf.pmc_desc[cnum].pm_pos)) & 0x1)
 316 #define PMC_WR_FUNC(cnum)       (pmu_conf.pmc_desc[cnum].write_check)
 317 #define PMD_WR_FUNC(cnum)       (pmu_conf.pmd_desc[cnum].write_check)
 318 #define PMD_RD_FUNC(cnum)       (pmu_conf.pmd_desc[cnum].read_check)
 319
 320 /*
 321  * This structure is initialized at boot time and contains
 322  * a description of the PMU main characteristics.
 323  */
 324 typedef struct {
 325         unsigned int  disabled;         /* indicates if perfmon is working properly */
 326         unsigned long ovfl_val;         /* overflow value for generic counters   */
 327         unsigned long impl_pmcs[4];     /* bitmask of implemented PMCS */
 328         unsigned long impl_pmds[4];     /* bitmask of implemented PMDS */
 329         unsigned int  num_pmcs;         /* number of implemented PMCS */
 330         unsigned int  num_pmds;         /* number of implemented PMDS */
 331         unsigned int  num_ibrs;         /* number of implemented IBRS */
 332         unsigned int  num_dbrs;         /* number of implemented DBRS */
 333         unsigned int  num_counters;     /* number of PMD/PMC counters */
 334         pfm_reg_desc_t *pmc_desc;       /* detailed PMC register dependencies descriptions */
 335         pfm_reg_desc_t *pmd_desc;       /* detailed PMD register dependencies descriptions */
 336 } pmu_config_t;
 337
 338 /*
 339  * structure used to pass argument to/from remote CPU
 340  * using IPI to check and possibly save the PMU context on SMP systems.
 341  *
 342  * not used in UP kernels
 343  */
 344 typedef struct {
 345         struct task_struct *task;       /* which task we are interested in */
 346         int retval;                     /* return value of the call: 0=you can proceed, 1=need to wait for completion */
 347 } pfm_smp_ipi_arg_t;
 348
 349 /*
 350  * perfmon command descriptions
 351  */
  352 typedef struct {
  353         int             (*cmd_func)(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); /* handler; NULL marks an invalid command (see PFM_CMD_IS_VALID) */
  354         int             cmd_flags;      /* bitmask of PFM_CMD_* flags */
  355         unsigned int    cmd_narg;       /* argument count read by PFM_CMD_NARG(); presumably may be PFM_CMD_ARG_MANY -- confirm */
  356         size_t          cmd_argsize;    /* argument size read by PFM_CMD_ARG_SIZE() */
  357 } pfm_cmd_desc_t;
 358
 359 #define PFM_CMD_PID             0x1     /* command requires pid argument */
 360 #define PFM_CMD_ARG_READ        0x2     /* command must read argument(s) */
 361 #define PFM_CMD_ARG_RW          0x4     /* command must read/write argument(s) */
 362 #define PFM_CMD_CTX             0x8     /* command needs a perfmon context */
 363 #define PFM_CMD_NOCHK           0x10    /* command does not need to check task's state */
 364
 365 #define PFM_CMD_IDX(cmd)        (cmd)
 366
 367 #define PFM_CMD_IS_VALID(cmd)   ((PFM_CMD_IDX(cmd) >= 0)                                \
 368                                  && (PFM_CMD_IDX(cmd) < (int) PFM_CMD_COUNT)            \
 369                                  && pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func != NULL)
 370
 371 #define PFM_CMD_USE_PID(cmd)    ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_PID) != 0)
 372 #define PFM_CMD_READ_ARG(cmd)   ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_READ) != 0)
 373 #define PFM_CMD_RW_ARG(cmd)     ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_RW) != 0)
 374 #define PFM_CMD_USE_CTX(cmd)    ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_CTX) != 0)
 375 #define PFM_CMD_CHK(cmd)        ((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_NOCHK) == 0)
 376
 377 #define PFM_CMD_ARG_MANY        -1 /* cannot be zero */
 378 #define PFM_CMD_NARG(cmd)       (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_narg)
 379 #define PFM_CMD_ARG_SIZE(cmd)   (pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_argsize)
 380
 381 typedef struct {
 382         int     debug;          /* turn on/off debugging via syslog */
 383         int     debug_ovfl;     /* turn on/off debug printk in overflow handler */
 384         int     fastctxsw;      /* turn on/off fast (unsecure) ctxsw */
 385 } pfm_sysctl_t;
 386
 387 typedef struct {
 388         unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
 389         unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
 390         unsigned long pfm_recorded_samples_count;
 391         unsigned long pfm_full_smpl_buffer_count; /* how many times the sampling buffer was full */
 392         char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
 393 } pfm_stats_t;
 394
 395 /*
 396  * perfmon internal variables
 397  */
 398 static pfm_session_t    pfm_sessions;   /* global sessions information */
 399 static struct proc_dir_entry *perfmon_dir; /* for debug only */
 400 static pfm_stats_t      pfm_stats[NR_CPUS];
 401 static pfm_intr_handler_desc_t  *pfm_alternate_intr_handler;
 402
 403 DEFINE_PER_CPU(unsigned long, pfm_syst_info);
 404
 405 /* sysctl() controls */
 406 static pfm_sysctl_t pfm_sysctl;
 407
  408 static ctl_table pfm_ctl_table[]={
  409         {1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, /* NOTE(review): mode 0666 is world-writable -- confirm this is intended */
  410         {2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, /* NOTE(review): mode 0666 is world-writable -- confirm this is intended */
  411         {3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,}, /* owner-only access */
  412         { 0, }, /* table terminator */
  413 };
 414 static ctl_table pfm_sysctl_dir[] = {
 415         {1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
 416         {0,},
 417 };
 418 static ctl_table pfm_sysctl_root[] = {
 419         {1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
 420         {0,},
 421 };
 422 static struct ctl_table_header *pfm_sysctl_header;
 423
 424 static void pfm_vm_close(struct vm_area_struct * area);
 425
 426 static struct vm_operations_struct pfm_vm_ops={
 427         .close = pfm_vm_close
 428 };
 429
 430 /*
 431  * keep track of task owning the PMU per CPU.
 432  */
 433 static struct {
 434         struct task_struct *owner;
 435         char pad[SMP_CACHE_BYTES] ____cacheline_aligned;
 436 } pmu_owners[NR_CPUS];
 437
 438
 439
 440 /*
 441  * forward declarations
 442  */
 443 static void pfm_reset_pmu(struct task_struct *);
 444 static void pfm_lazy_save_regs (struct task_struct *ta);
 445
 446 #if   defined(CONFIG_ITANIUM)
 447 #include "perfmon_itanium.h"
 448 #elif defined(CONFIG_MCKINLEY)
 449 #include "perfmon_mckinley.h"
 450 #else
 451 #include "perfmon_generic.h"
 452 #endif
 453
 454 static inline void
 455 pfm_clear_psr_pp(void)
 456 {
 457         __asm__ __volatile__ ("rsm psr.pp;; srlz.i;;"::: "memory");
 458 }
 459
 460 static inline void
 461 pfm_set_psr_pp(void)
 462 {
 463         __asm__ __volatile__ ("ssm psr.pp;; srlz.i;;"::: "memory");
 464 }
 465
 466 static inline void
 467 pfm_clear_psr_up(void)
 468 {
 469         __asm__ __volatile__ ("rum psr.up;; srlz.i;;"::: "memory");
 470 }
 471
 472 static inline void
 473 pfm_set_psr_up(void)
 474 {
 475         __asm__ __volatile__ ("sum psr.up;; srlz.i;;"::: "memory");
 476 }
 477
 478 static inline unsigned long
 479 pfm_get_psr(void)
 480 {
 481         unsigned long tmp;
 482         __asm__ __volatile__ ("mov %0=psr;;": "=r"(tmp) :: "memory");
 483         return tmp;
 484 }
 485
 486 static inline void
 487 pfm_set_psr_l(unsigned long val)
 488 {
 489         __asm__ __volatile__ ("mov psr.l=%0;; srlz.i;;"::"r"(val): "memory");
 490 }
 491
 492 static inline void
 493 pfm_freeze_pmu(void)
 494 {
 495         ia64_set_pmc(0,1UL);
 496         ia64_srlz_d();
 497 }
 498
 499 static inline void
 500 pfm_unfreeze_pmu(void)
 501 {
 502         ia64_set_pmc(0,0UL);
 503         ia64_srlz_d();
 504 }
 505
 506 static inline unsigned long
 507 pfm_read_soft_counter(pfm_context_t *ctx, int i)
 508 {
 509         return ctx->ctx_soft_pmds[i].val + (ia64_get_pmd(i) & pmu_conf.ovfl_val);
 510 }
 511
 512 static inline void
 513 pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
 514 {
 515         ctx->ctx_soft_pmds[i].val = val  & ~pmu_conf.ovfl_val;
 516         /*
 517          * writing to unimplemented part is ignore, so we do not need to
 518          * mask off top part
 519          */
 520         ia64_set_pmd(i, val & pmu_conf.ovfl_val);
 521 }
 522
 523 /*
 524  * Generates a unique (per CPU) timestamp
 525  */
 526 static inline unsigned long
 527 pfm_get_stamp(void)
 528 {
 529         /*
 530          * XXX: must find something more efficient
 531          */
 532         return ia64_get_itc();
 533 }
 534
 535 /* Here we want the physical address of the memory.
 536  * This is used when initializing the contents of the
 537  * area and marking the pages as reserved.
 538  */
 539 static inline unsigned long
 540 pfm_kvirt_to_pa(unsigned long adr)
 541 {
 542         __u64 pa = ia64_tpa(adr);
 543         //DBprintk(("kv2pa(%lx-->%lx)\n", adr, pa));
 544         return pa;
 545 }
 546
 547 static void *
 548 pfm_rvmalloc(unsigned long size)
 549 {
 550         void *mem;
 551         unsigned long adr;
 552
 553         size=PAGE_ALIGN(size);
 554         mem=vmalloc(size);
 555         if (mem) {
 556                 //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
 557                 memset(mem, 0, size); /* Clear the ram out, no junk to the user */
 558                 adr=(unsigned long) mem;
 559                 while (size > 0) {
 560                         SetPageReserved(vmalloc_to_page((void *)adr));
 561                         adr+=PAGE_SIZE;
 562                         size-=PAGE_SIZE;
 563                 }
 564         }
 565         return mem;
 566 }
 567
 568 static void
 569 pfm_rvfree(void *mem, unsigned long size)
 570 {
 571         unsigned long adr;
 572
 573         if (mem) {
 574                 adr=(unsigned long) mem;
 575                 while ((long) size > 0) {
 576                         ClearPageReserved(vmalloc_to_page((void*)adr));
 577                         adr+=PAGE_SIZE;
 578                         size-=PAGE_SIZE;
 579                 }
 580                 vfree(mem);
 581         }
 582         return;
 583 }
 584
 585 /*
 586  * This function gets called from mm/mmap.c:exit_mmap() only when there is a sampling buffer
 587  * attached to the context AND the current task has a mapping for it, i.e., it is the original
 588  * creator of the context.
 589  *
 590  * This function is used to remember the fact that the vma describing the sampling buffer
 591  * has now been removed. It can only be called when no other tasks share the same mm context.
 592  *
 593  */
 594 static void
 595 pfm_vm_close(struct vm_area_struct *vma)
 596 {
 597         pfm_smpl_buffer_desc_t *psb = (pfm_smpl_buffer_desc_t *)vma->vm_private_data;
 598
 599         if (psb == NULL) {
 600                 printk(KERN_DEBUG "perfmon: psb is null in [%d]\n", current->pid);
 601                 return;
 602         }
 603         /*
 604          * Add PSB to list of buffers to free on release_thread() when no more users
 605          *
 606          * This call is safe because, once the count is zero is cannot be modified anymore.
 607          * This is not because there is no more user of the mm context, that the sampling
 608          * buffer is not being used anymore outside of this task. In fact, it can still
 609          * be accessed from within the kernel by another task (such as the monitored task).
 610          *
 611          * Therefore, we only move the psb into the list of buffers to free when we know
 612          * nobody else is using it.
 613          * The linked list if independent of the perfmon context, because in the case of
 614          * multi-threaded processes, the last thread may not have been involved with
 615          * monitoring however it will be the one removing the vma and it should therefore
 616          * also remove the sampling buffer. This buffer cannot be removed until the vma
 617          * is removed.
 618          *
 619          * This function cannot remove the buffer from here, because exit_mmap() must first
 620          * complete. Given that there is no other vma related callback in the generic code,
 621          * we have created our own with the linked list of sampling buffers to free. The list
 622          * is part of the thread structure. In release_thread() we check if the list is
 623          * empty. If not we call into perfmon to free the buffer and psb. That is the only
 624          * way to ensure a safe deallocation of the sampling buffer which works when
 625          * the buffer is shared between distinct processes or with multi-threaded programs.
 626          *
 627          * We need to lock the psb because the refcnt test and flag manipulation must
 628          * looked like an atomic operation vis a vis pfm_context_exit()
 629          */
 630         LOCK_PSB(psb);
 631
 632         if (psb->psb_refcnt == 0) {
 633
 634                 psb->psb_next = current->thread.pfm_smpl_buf_list;
 635                 current->thread.pfm_smpl_buf_list = psb;
 636
 637                 DBprintk(("[%d] add smpl @%p size %lu to smpl_buf_list psb_flags=0x%x\n",
 638                         current->pid, psb->psb_hdr, psb->psb_size, psb->psb_flags));
 639         }
 640         DBprintk(("[%d] clearing psb_flags=0x%x smpl @%p size %lu\n",
 641                         current->pid, psb->psb_flags, psb->psb_hdr, psb->psb_size));
 642         /*
 643          * decrement the number vma for the buffer
 644          */
 645         psb->psb_flags &= ~PSB_HAS_VMA;
 646
 647         UNLOCK_PSB(psb);
 648 }
 649
 650 /*
 651  * This function is called from pfm_destroy_context() and also from pfm_inherit()
 652  * to explicitly remove the sampling buffer mapping from the user level address space.
 653  */
 654 static int
 655 pfm_remove_smpl_mapping(struct task_struct *task)
 656 {
 657         pfm_context_t *ctx = task->thread.pfm_context;
 658         pfm_smpl_buffer_desc_t *psb;
 659         int r;
 660
 661         /*
 662          * some sanity checks first
 663          */
 664         if (ctx == NULL || task->mm == NULL || ctx->ctx_smpl_vaddr == 0 || ctx->ctx_psb == NULL) {
 665                 printk(KERN_DEBUG "perfmon: invalid context mm=%p\n", task->mm);
 666                 return -1;
 667         }
 668         psb = ctx->ctx_psb;
 669
 670         down_write(&task->mm->mmap_sem);
 671
 672         r = do_munmap(task->mm, ctx->ctx_smpl_vaddr, psb->psb_size);
 673
 674         up_write(&task->mm->mmap_sem);
 675         if (r !=0) {
 676                 printk(KERN_DEBUG "perfmon: pid %d unable to unmap sampling buffer "
 677                        "@0x%lx size=%ld\n", task->pid, ctx->ctx_smpl_vaddr, psb->psb_size);
 678         }
 679
 680         DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d refcnt=%lu psb_flags=0x%x\n",
 681                 task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r, psb->psb_refcnt, psb->psb_flags));
 682
 683         return 0;
 684 }
 685
 686 static pfm_context_t *
 687 pfm_context_alloc(void)
 688 {
 689         pfm_context_t *ctx;
 690
 691         /* allocate context descriptor */
 692         ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL);
 693         if (ctx) memset(ctx, 0, sizeof(pfm_context_t));
 694
 695         return ctx;
 696 }
 697
 698 static void
 699 pfm_context_free(pfm_context_t *ctx)
 700 {
 701         if (ctx) kfree(ctx);
 702 }
 703
 704 static int
 705 pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size)
 706 {
 707         unsigned long page;
 708
 709         DBprintk(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size));
 710
 711         while (size > 0) {
 712                 page = pfm_kvirt_to_pa(buf);
 713
 714                 if (remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM;
 715
 716                 addr  += PAGE_SIZE;
 717                 buf   += PAGE_SIZE;
 718                 size  -= PAGE_SIZE;
 719         }
 720         return 0;
 721 }
 722
 723 /*
 724  * counts the number of PMDS to save per entry.
 725  * This code is generic enough to accommodate more than 64 PMDS when they become available
 726  */
 727 static unsigned long
 728 pfm_smpl_entry_size(unsigned long *which, unsigned long size)
 729 {
 730         unsigned long i, res = 0;
 731
 732         for (i=0; i < size; i++, which++) res += hweight64(*which);
 733
 734         DBprintk(("weight=%ld\n", res));
 735
 736         return res;
 737 }
 738
 739 /*
 740  * Allocates the sampling buffer and remaps it into caller's address space
 741  */
 742 static int
 743 pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned long entries,
 744                       void **user_vaddr)
 745 {
 746         struct mm_struct *mm = current->mm;
 747         struct vm_area_struct *vma = NULL;
 748         unsigned long size, regcount;
 749         void *smpl_buf;
 750         pfm_smpl_buffer_desc_t *psb;
 751
 752
 753         /* note that regcount might be 0, in this case only the header for each
 754          * entry will be recorded.
 755          */
 756         regcount = pfm_smpl_entry_size(which_pmds, 1);
 757
 758         if ((sizeof(perfmon_smpl_hdr_t)+ entries*sizeof(perfmon_smpl_entry_t)) <= entries) {
 759                 DBprintk(("requested entries %lu is too big\n", entries));
 760                 return -EINVAL;
 761         }
 762
 763         /*
 764          * 1 buffer hdr and for each entry a header + regcount PMDs to save
 765          */
 766         size = PAGE_ALIGN(  sizeof(perfmon_smpl_hdr_t)
 767                           + entries * (sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64)));
 768
 769         DBprintk(("sampling buffer size=%lu bytes\n", size));
 770
 771         /*
 772          * check requested size to avoid Denial-of-service attacks
 773          * XXX: may have to refine this test
 774          * Check against address space limit.
 775          *
 776          * if ((mm->total_vm << PAGE_SHIFT) + len> current->rlim[RLIMIT_AS].rlim_cur)
 777          *      return -ENOMEM;
 778          */
 779         if (size > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN;
 780
 781         /*
 782          * We do the easy to undo allocations first.
 783          *
 784          * pfm_rvmalloc(), clears the buffer, so there is no leak
 785          */
 786         smpl_buf = pfm_rvmalloc(size);
 787         if (smpl_buf == NULL) {
 788                 DBprintk(("Can't allocate sampling buffer\n"));
 789                 return -ENOMEM;
 790         }
 791
 792         DBprintk(("smpl_buf @%p\n", smpl_buf));
 793
 794         /* allocate sampling buffer descriptor now */
 795         psb = kmalloc(sizeof(*psb), GFP_KERNEL);
 796         if (psb == NULL) {
 797                 DBprintk(("Can't allocate sampling buffer descriptor\n"));
 798                 goto error_kmalloc;
 799         }
 800
 801         /* allocate vma */
 802         vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 803         if (!vma) {
 804                 DBprintk(("Cannot allocate vma\n"));
 805                 goto error_kmem;
 806         }
 807         /*
 808          * partially initialize the vma for the sampling buffer
 809          *
 810          * The VM_DONTCOPY flag is very important as it ensures that the mapping
 811          * will never be inherited for any child process (via fork()) which is always
 812          * what we want.
 813          */
 814         vma->vm_mm           = mm;
 815         vma->vm_flags        = VM_READ| VM_MAYREAD |VM_RESERVED|VM_DONTCOPY;
 816         vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 817         vma->vm_ops          = &pfm_vm_ops; /* necesarry to get the close() callback */
 818         vma->vm_pgoff        = 0;
 819         vma->vm_file         = NULL;
 820         vma->vm_private_data = psb;     /* information needed by the pfm_vm_close() function */
 821
 822         /*
 823          * Now we have everything we need and we can initialize
 824          * and connect all the data structures
 825          */
 826
 827         psb->psb_hdr     = smpl_buf;
 828         psb->psb_addr    = ((char *)smpl_buf)+sizeof(perfmon_smpl_hdr_t); /* first entry */
 829         psb->psb_size    = size; /* aligned size */
 830         psb->psb_index   = 0;
 831         psb->psb_entries = entries;
 832         psb->psb_refcnt  = 1;
 833         psb->psb_flags   = PSB_HAS_VMA;
 834
 835         spin_lock_init(&psb->psb_lock);
 836
 837         /*
 838          * XXX: will need to do cacheline alignment to avoid false sharing in SMP mode and
 839          * multitask monitoring.
 840          */
 841         psb->psb_entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64);
 842
 843         DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p refcnt=%lu psb_flags=0x%x\n",
 844                   (void *)psb,psb->psb_entry_size, (void *)psb->psb_hdr,
 845                   (void *)psb->psb_addr, psb->psb_refcnt, psb->psb_flags));
 846
 847         /* initialize some of the fields of user visible buffer header */
 848         psb->psb_hdr->hdr_version    = PFM_SMPL_VERSION;
 849         psb->psb_hdr->hdr_entry_size = psb->psb_entry_size;
 850         psb->psb_hdr->hdr_pmds[0]    = which_pmds[0];
 851
 852         /*
 853          * Let's do the difficult operations next.
 854          *
 855          * now we atomically find some area in the address space and
 856          * remap the buffer in it.
 857          */
 858         down_write(&current->mm->mmap_sem);
 859
 860
 861         /* find some free area in address space, must have mmap sem held */
 862         vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
 863         if (vma->vm_start == 0UL) {
 864                 DBprintk(("Cannot find unmapped area for size %ld\n", size));
 865                 up_write(&current->mm->mmap_sem);
 866                 goto error;
 867         }
 868         vma->vm_end = vma->vm_start + size;
 869
 870         DBprintk(("entries=%ld aligned size=%ld, unmapped @0x%lx\n", entries, size, vma->vm_start));
 871
 872         /* can only be applied to current, need to have the mm semaphore held when called */
 873         if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) {
 874                 DBprintk(("Can't remap buffer\n"));
 875                 up_write(&current->mm->mmap_sem);
 876                 goto error;
 877         }
 878
 879         /*
 880          * now insert the vma in the vm list for the process, must be
 881          * done with mmap lock held
 882          */
 883         insert_vm_struct(mm, vma);
 884
 885         mm->total_vm  += size >> PAGE_SHIFT;
 886
 887         up_write(&current->mm->mmap_sem);
 888
 889         /* store which PMDS to record */
 890         ctx->ctx_smpl_regs[0] = which_pmds[0];
 891
 892
 893         /* link to perfmon context */
 894         ctx->ctx_psb        = psb;
 895
 896         /*
 897          * keep track of user level virtual address
 898          */
 899         ctx->ctx_smpl_vaddr = *(unsigned long *)user_vaddr = vma->vm_start;
 900
 901         return 0;
 902
 903 error:
 904         kmem_cache_free(vm_area_cachep, vma);
 905 error_kmem:
 906         kfree(psb);
 907 error_kmalloc:
 908         pfm_rvfree(smpl_buf, size);
 909         return -ENOMEM;
 910 }
 911
 912 static int
 913 pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned long cpu_mask)
 914 {
 915         unsigned long m, undo_mask;
 916         unsigned int n, i;
 917
 918         /*
 919          * validy checks on cpu_mask have been done upstream
 920          */
 921         LOCK_PFS();
 922
 923         if (is_syswide) {
 924                 /*
 925                  * cannot mix system wide and per-task sessions
 926                  */
 927                 if (pfm_sessions.pfs_task_sessions > 0UL) {
 928                         DBprintk(("system wide not possible, %u conflicting task_sessions\n",
 929                                 pfm_sessions.pfs_task_sessions));
 930                         goto abort;
 931                 }
 932
 933                 m = cpu_mask; undo_mask = 0UL; n = 0;
 934                 DBprintk(("cpu_mask=0x%lx\n", cpu_mask));
 935                 for(i=0; m; i++, m>>=1) {
 936
 937                         if ((m & 0x1) == 0UL) continue;
 938
 939                         if (pfm_sessions.pfs_sys_session[i]) goto undo;
 940
 941                         DBprintk(("reserving CPU%d currently on CPU%d\n", i, smp_processor_id()));
 942
 943                         pfm_sessions.pfs_sys_session[i] = task;
 944                         undo_mask |= 1UL << i;
 945                         n++;
 946                 }
 947                 pfm_sessions.pfs_sys_sessions += n;
 948         } else {
 949                 if (pfm_sessions.pfs_sys_sessions) goto abort;
 950                 pfm_sessions.pfs_task_sessions++;
 951         }
 952         DBprintk(("task_sessions=%u sys_session[%d]=%d",
 953                   pfm_sessions.pfs_task_sessions,
 954                   smp_processor_id(), pfm_sessions.pfs_sys_session[smp_processor_id()] ? 1 : 0));
 955         UNLOCK_PFS();
 956         return 0;
 957 undo:
 958         DBprintk(("system wide not possible, conflicting session [%d] on CPU%d\n",
 959                 pfm_sessions.pfs_sys_session[i]->pid, i));
 960
 961         for(i=0; undo_mask; i++, undo_mask >>=1) {
 962                 pfm_sessions.pfs_sys_session[i] = NULL;
 963         }
 964 abort:
 965         UNLOCK_PFS();
 966
 967         return -EBUSY;
 968
 969 }
 970
 971 static int
 972 pfm_unreserve_session(struct task_struct *task, int is_syswide, unsigned long cpu_mask)
 973 {
 974         pfm_context_t *ctx;
 975         unsigned long m;
 976         unsigned int n, i;
 977
 978         ctx = task ? task->thread.pfm_context : NULL;
 979
 980         /*
 981          * validy checks on cpu_mask have been done upstream
 982          */
 983         LOCK_PFS();
 984
 985         DBprintk(("[%d] sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu_mask=0x%lx\n",
 986                 task->pid,
 987                 pfm_sessions.pfs_sys_sessions,
 988                 pfm_sessions.pfs_task_sessions,
 989                 pfm_sessions.pfs_sys_use_dbregs,
 990                 is_syswide,
 991                 cpu_mask));
 992
 993
 994         if (is_syswide) {
 995                 m = cpu_mask; n = 0;
 996                 for(i=0; m; i++, m>>=1) {
 997                         if ((m & 0x1) == 0UL) continue;
 998                         pfm_sessions.pfs_sys_session[i] = NULL;
 999                         n++;
1000                 }
1001                 /*
1002                  * would not work with perfmon+more than one bit in cpu_mask
1003                  */
1004                 if (ctx && ctx->ctx_fl_using_dbreg) {
1005                         if (pfm_sessions.pfs_sys_use_dbregs == 0) {
1006                                 printk(KERN_DEBUG "perfmon: invalid release for [%d] "
1007                                        "sys_use_dbregs=0\n", task->pid);
1008                         } else {
1009                                 pfm_sessions.pfs_sys_use_dbregs--;
1010                         }
1011                 }
1012                 pfm_sessions.pfs_sys_sessions -= n;
1013
1014                 DBprintk(("CPU%d sys_sessions=%u\n",
1015                         smp_processor_id(), pfm_sessions.pfs_sys_sessions));
1016         } else {
1017                 pfm_sessions.pfs_task_sessions--;
1018                 DBprintk(("[%d] task_sessions=%u\n",
1019                         task->pid, pfm_sessions.pfs_task_sessions));
1020         }
1021
1022         UNLOCK_PFS();
1023
1024         return 0;
1025 }
1026
1027 /*
1028  * XXX: do something better here
1029  */
1030 static int
1031 pfm_bad_permissions(struct task_struct *task)
1032 {
1033         /* stolen from bad_signal() */
1034         return (current->session != task->session)
1035             && (current->euid ^ task->suid) && (current->euid ^ task->uid)
1036             && (current->uid ^ task->suid) && (current->uid ^ task->uid);
1037 }
1038
1039
1040 static int
1041 pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
1042 {
1043         unsigned long smpl_pmds = pfx->ctx_smpl_regs[0];
1044         int ctx_flags;
1045         int cpu;
1046
1047         /* valid signal */
1048
1049         /* cannot send to process 1, 0 means do not notify */
1050         if (pfx->ctx_notify_pid == 1) {
1051                 DBprintk(("invalid notify_pid %d\n", pfx->ctx_notify_pid));
1052                 return -EINVAL;
1053         }
1054         ctx_flags = pfx->ctx_flags;
1055
1056         if ((ctx_flags & PFM_FL_INHERIT_MASK) == (PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)) {
1057                 DBprintk(("invalid inherit mask 0x%x\n",ctx_flags & PFM_FL_INHERIT_MASK));
1058                 return -EINVAL;
1059         }
1060
1061         if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
1062                 DBprintk(("cpu_mask=0x%lx\n", pfx->ctx_cpu_mask));
1063                 /*
1064                  * cannot block in this mode
1065                  */
1066                 if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
1067                         DBprintk(("cannot use blocking mode when in system wide monitoring\n"));
1068                         return -EINVAL;
1069                 }
1070                 /*
1071                  * must only have one bit set in the CPU mask
1072                  */
1073                 if (hweight64(pfx->ctx_cpu_mask) != 1UL) {
1074                         DBprintk(("invalid CPU mask specified\n"));
1075                         return -EINVAL;
1076                 }
1077                 /*
1078                  * and it must be a valid CPU
1079                  */
1080                 cpu = ffz(~pfx->ctx_cpu_mask);
1081 #ifdef CONFIG_SMP
1082                 if (cpu_online(cpu) == 0) {
1083 #else
1084                 if (cpu != 0) {
1085 #endif
1086                         DBprintk(("CPU%d is not online\n", cpu));
1087                         return -EINVAL;
1088                 }
1089
1090                 /*
1091                  * check for pre-existing pinning, if conflicting reject
1092                  */
1093                 if (task->cpus_allowed != ~0UL && (task->cpus_allowed & (1UL<<cpu)) == 0) {
1094                         DBprintk(("[%d] pinned on 0x%lx, mask for CPU%d \n", task->pid,
1095                                 task->cpus_allowed, cpu));
1096                         return -EINVAL;
1097                 }
1098
1099         } else {
1100                 /*
1101                  * must provide a target for the signal in blocking mode even when
1102                  * no counter is configured with PFM_FL_REG_OVFL_NOTIFY
1103                  */
1104                 if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) {
1105                         DBprintk(("must have notify_pid when blocking for [%d]\n", task->pid));
1106                         return -EINVAL;
1107                 }
1108 #if 0
1109                 if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == task->pid) {
1110                         DBprintk(("cannot notify self when blocking for [%d]\n", task->pid));
1111                         return -EINVAL;
1112                 }
1113 #endif
1114         }
1115         /* verify validity of smpl_regs */
1116         if ((smpl_pmds & pmu_conf.impl_pmds[0]) != smpl_pmds) {
1117                 DBprintk(("invalid smpl_regs 0x%lx\n", smpl_pmds));
1118                 return -EINVAL;
1119         }
1120         /* probably more to add here */
1121
1122         return 0;
1123 }
1124
1125 static int
1126 pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int count,
1127                    struct pt_regs *regs)
1128 {
1129         pfarg_context_t tmp;
1130         void *uaddr = NULL;
1131         int ret;
1132         int ctx_flags;
1133         pid_t notify_pid;
1134
1135         /* a context has already been defined */
1136         if (ctx) return -EBUSY;
1137
1138         /*
1139          * not yet supported
1140          */
1141         if (task != current) return -EINVAL;
1142
1143         if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
1144
1145         ret = pfx_is_sane(task, &tmp);
1146         if (ret < 0) return ret;
1147
1148         ctx_flags = tmp.ctx_flags;
1149
1150         ret = pfm_reserve_session(task, ctx_flags & PFM_FL_SYSTEM_WIDE, tmp.ctx_cpu_mask);
1151         if (ret) goto abort;
1152
1153         ret = -ENOMEM;
1154
1155         ctx = pfm_context_alloc();
1156         if (!ctx) goto error;
1157
1158         /* record the creator (important for inheritance) */
1159         ctx->ctx_owner = current;
1160
1161         notify_pid = tmp.ctx_notify_pid;
1162
1163         spin_lock_init(&ctx->ctx_lock);
1164
1165         if (notify_pid == current->pid) {
1166
1167                 ctx->ctx_notify_task = current;
1168                 task->thread.pfm_context = ctx;
1169
1170         } else if (notify_pid!=0) {
1171                 struct task_struct *notify_task;
1172
1173                 read_lock(&tasklist_lock);
1174
1175                 notify_task = find_task_by_pid(notify_pid);
1176
1177                 if (notify_task) {
1178
1179                         ret = -EPERM;
1180
1181                         /*
1182                          * check if we can send this task a signal
1183                          */
1184                         if (pfm_bad_permissions(notify_task)) {
1185                                 read_unlock(&tasklist_lock);
1186                                 goto buffer_error;
1187                         }
1188
1189                         /*
1190                          * make visible
1191                          * must be done inside critical section
1192                          *
1193                          * if the initialization does not go through it is still
1194                          * okay because child will do the scan for nothing which
1195                          * won't hurt.
1196                          */
1197                         task->thread.pfm_context = ctx;
1198
1199                         /*
1200                          * will cause task to check on exit for monitored
1201                          * processes that would notify it. see release_thread()
1202                          * Note: the scan MUST be done in release thread, once the
1203                          * task has been detached from the tasklist otherwise you are
1204                          * exposed to race conditions.
1205                          */
1206                         atomic_add(1, &ctx->ctx_notify_task->thread.pfm_notifiers_check);
1207
1208                         ctx->ctx_notify_task = notify_task;
1209                 }
1210                 read_unlock(&tasklist_lock);
1211         }
1212
1213         /*
1214          * notification process does not exist
1215          */
1216         if (notify_pid != 0 && ctx->ctx_notify_task == NULL) {
1217                 ret = -EINVAL;
1218                 goto buffer_error;
1219         }
1220
1221         if (tmp.ctx_smpl_entries) {
1222                 DBprintk(("sampling entries=%lu\n",tmp.ctx_smpl_entries));
1223
1224                 ret = pfm_smpl_buffer_alloc(ctx, tmp.ctx_smpl_regs,
1225                                                  tmp.ctx_smpl_entries, &uaddr);
1226                 if (ret<0) goto buffer_error;
1227
1228                 tmp.ctx_smpl_vaddr = uaddr;
1229         }
1230         /* initialization of context's flags */
1231         ctx->ctx_fl_inherit   = ctx_flags & PFM_FL_INHERIT_MASK;
1232         ctx->ctx_fl_block     = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
1233         ctx->ctx_fl_system    = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
1234         ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0;
1235         ctx->ctx_fl_unsecure  = (ctx_flags & PFM_FL_UNSECURE) ? 1: 0;
1236         ctx->ctx_fl_frozen    = 0;
1237         ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
1238
1239         /*
1240          * setting this flag to 0 here means, that the creator or the task that the
1241          * context is being attached are granted access. Given that a context can only
1242          * be created for the calling process this, in effect only allows the creator
1243          * to access the context. See pfm_protect() for more.
1244          */
1245         ctx->ctx_fl_protected = 0;
1246
1247         /* for system wide mode only (only 1 bit set) */
1248         ctx->ctx_cpu = ffz(~tmp.ctx_cpu_mask);
1249
1250         atomic_set(&ctx->ctx_last_cpu,-1); /* SMP only, means no CPU */
1251
1252         sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */
1253
1254         if (__copy_to_user(req, &tmp, sizeof(tmp))) {
1255                 ret = -EFAULT;
1256                 goto buffer_error;
1257         }
1258
1259         DBprintk(("context=%p, pid=%d notify_task=%p\n",
1260                         (void *)ctx, task->pid, ctx->ctx_notify_task));
1261
1262         DBprintk(("context=%p, pid=%d flags=0x%x inherit=%d block=%d system=%d excl_idle=%d unsecure=%d\n",
1263                         (void *)ctx, task->pid, ctx_flags, ctx->ctx_fl_inherit,
1264                         ctx->ctx_fl_block, ctx->ctx_fl_system,
1265                         ctx->ctx_fl_excl_idle,
1266                         ctx->ctx_fl_unsecure));
1267
1268         /*
1269          * when no notification is required, we can make this visible at the last moment
1270          */
1271         if (notify_pid == 0) task->thread.pfm_context = ctx;
1272         /*
1273          * pin task to CPU and force reschedule on exit to ensure
1274          * that when back to user level the task runs on the designated
1275          * CPU.
1276          */
1277         if (ctx->ctx_fl_system) {
1278                 ctx->ctx_saved_cpus_allowed = task->cpus_allowed;
1279                 set_cpus_allowed(task, tmp.ctx_cpu_mask);
1280                 DBprintk(("[%d] rescheduled allowed=0x%lx\n", task->pid, task->cpus_allowed));
1281         }
1282
1283         return 0;
1284
1285 buffer_error:
1286         pfm_context_free(ctx);
1287 error:
1288         pfm_unreserve_session(task, ctx_flags & PFM_FL_SYSTEM_WIDE , tmp.ctx_cpu_mask);
1289 abort:
1290         /* make sure we don't leave anything behind */
1291         task->thread.pfm_context = NULL;
1292
1293         return ret;
1294 }
1295
1296 static inline unsigned long
1297 pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
1298 {
1299         unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
1300         unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
1301         extern unsigned long carta_random32 (unsigned long seed);
1302
1303         if (reg->flags & PFM_REGFL_RANDOM) {
1304                 new_seed = carta_random32(old_seed);
1305                 val -= (old_seed & mask);       /* counter values are negative numbers! */
1306                 if ((mask >> 32) != 0)
1307                         /* construct a full 64-bit random value: */
1308                         new_seed |= carta_random32(old_seed >> 32) << 32;
1309                 reg->seed = new_seed;
1310         }
1311         reg->lval = val;
1312         return val;
1313 }
1314
1315 static void
1316 pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
1317 {
1318         unsigned long mask = ovfl_regs[0];
1319         unsigned long reset_others = 0UL;
1320         unsigned long val;
1321         int i, is_long_reset = (flag == PFM_PMD_LONG_RESET);
1322
1323         /*
1324          * now restore reset value on sampling overflowed counters
1325          */
1326         mask >>= PMU_FIRST_COUNTER;
1327         for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
1328                 if (mask & 0x1) {
1329                         val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);
1330                         reset_others |= ctx->ctx_soft_pmds[i].reset_pmds[0];
1331
1332                         DBprintk_ovfl(("[%d] %s reset soft_pmd[%d]=%lx\n", current->pid,
1333                                   is_long_reset ? "long" : "short", i, val));
1334
1335                         /* upper part is ignored on rval */
1336                         pfm_write_soft_counter(ctx, i, val);
1337                 }
1338         }
1339
1340         /*
1341          * Now take care of resetting the other registers
1342          */
1343         for(i = 0; reset_others; i++, reset_others >>= 1) {
1344
1345                 if ((reset_others & 0x1) == 0) continue;
1346
1347                 val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);
1348
1349                 if (PMD_IS_COUNTING(i)) {
1350                         pfm_write_soft_counter(ctx, i, val);
1351                 } else {
1352                         ia64_set_pmd(i, val);
1353                 }
1354                 DBprintk_ovfl(("[%d] %s reset_others pmd[%d]=%lx\n", current->pid,
1355                           is_long_reset ? "long" : "short", i, val));
1356         }
1357         ia64_srlz_d();
1358 }
1359
1360 static int
1361 pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
1362 {
1363         struct thread_struct *th = &task->thread;
1364         pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
1365         unsigned long value, reset_pmds;
1366         unsigned int cnum, reg_flags, flags;
1367         int is_monitor, is_counting;
1368         int i, ret = -EINVAL;
1369 #define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
1370
1371         /* we don't quite support this right now */
1372         if (task != current) return -EINVAL;
1373
1374         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1375
1376         /* XXX: ctx locking may be required here */
1377
1378         for (i = 0; i < count; i++, req++) {
1379
1380                 if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
1381
1382                 cnum       = tmp.reg_num;
1383                 reg_flags  = tmp.reg_flags;
1384                 value      = tmp.reg_value;
1385                 reset_pmds = tmp.reg_reset_pmds[0];
1386                 flags      = 0;
1387
1388                 is_counting = PMC_IS_COUNTING(cnum);
1389                 is_monitor  = PMC_IS_MONITOR(cnum);
1390
1391                 /*
1392                  * we reject all non implemented PMC as well
1393                  * as attempts to modify PMC[0-3] which are used
1394                  * as status registers by the PMU
1395                  */
1396                 if (!PMC_IS_IMPL(cnum) || cnum < 4) {
1397                         DBprintk(("pmc[%u] is unimplemented or invalid\n", cnum));
1398                         goto error;
1399                 }
1400                 /*
1401                  * If the PMC is a monitor, then if the value is not the default:
1402                  *      - system-wide session: PMCx.pm=1 (privileged monitor)
1403                  *      - per-task           : PMCx.pm=0 (user monitor)
1404                  */
1405                 if ((is_monitor || is_counting) && value != PMC_DFL_VAL(i) && PFM_CHECK_PMC_PM(ctx, cnum, value)) {
1406                         DBprintk(("pmc%u pmc_pm=%ld fl_system=%d\n",
1407                                 cnum,
1408                                 PMC_PM(cnum, value),
1409                                 ctx->ctx_fl_system));
1410                         goto error;
1411                 }
1412
1413                 if (is_counting) {
1414                         pfm_monitor_t *p = (pfm_monitor_t *)&value;
1415                         /*
1416                          * enforce generation of overflow interrupt. Necessary on all
1417                          * CPUs.
1418                          */
1419                         p->pmc_oi = 1;
1420
1421                         if (reg_flags & PFM_REGFL_OVFL_NOTIFY) {
1422                                 /*
1423                                  * must have a target for the signal
1424                                  */
1425                                 if (ctx->ctx_notify_task == NULL) {
1426                                         DBprintk(("cannot set ovfl_notify: no notify_task\n"));
1427                                         goto error;
1428                                 }
1429                                 flags |= PFM_REGFL_OVFL_NOTIFY;
1430                         }
1431
1432                         if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM;
1433
1434                         /* verify validity of reset_pmds */
1435                         if ((reset_pmds & pmu_conf.impl_pmds[0]) != reset_pmds) {
1436                                 DBprintk(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum));
1437                                 goto error;
1438                         }
1439                 } else if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) {
1440                                 DBprintk(("cannot set ovfl_notify or random on pmc%u\n", cnum));
1441                                 goto error;
1442                 }
1443
1444                 /*
1445                  * execute write checker, if any
1446                  */
1447                 if (PMC_WR_FUNC(cnum)) {
1448                         ret = PMC_WR_FUNC(cnum)(task, cnum, &value, regs);
1449                         if (ret) goto error;
1450                         ret = -EINVAL;
1451                 }
1452
1453                 /*
1454                  * no error on this register
1455                  */
1456                 PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
1457
1458                 /*
1459                  * update register return value, abort all if problem during copy.
1460                  * we only modify the reg_flags field. no check mode is fine because
1461                  * access has been verified upfront in sys_perfmonctl().
1462                  *
1463                  * If this fails, then the software state is not modified
1464                  */
1465                 if (__put_user(tmp.reg_flags, &req->reg_flags)) return -EFAULT;
1466
1467                 /*
1468                  * Now we commit the changes to the software state
1469                  */
1470
1471                 /*
1472                  * full flag update each time a register is programmed
1473                  */
1474                 ctx->ctx_soft_pmds[cnum].flags = flags;
1475
1476                 if (is_counting) {
1477                         ctx->ctx_soft_pmds[cnum].reset_pmds[0] = reset_pmds;
1478
1479                         /* mark all PMDS to be accessed as used */
1480                         CTX_USED_PMD(ctx, reset_pmds);
1481                 }
1482
1483                 /*
1484                  * Needed in case the user does not initialize the equivalent
1485                  * PMD. Clearing is done in reset_pmu() so there is no possible
1486                  * leak here.
1487                  */
1488                 CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]);
1489
1490                 /*
1491                  * keep copy the pmc, used for register reload
1492                  */
1493                 th->pmc[cnum] = value;
1494
1495                 ia64_set_pmc(cnum, value);
1496
1497                 DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x used_pmds=0x%lx\n",
1498                           task->pid, cnum, value,
1499                           ctx->ctx_soft_pmds[cnum].flags,
1500                           ctx->ctx_used_pmds[0]));
1501
1502         }
1503
1504         return 0;
1505
1506 error:
1507         PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
1508
1509         if (__put_user(tmp.reg_flags, &req->reg_flags)) ret = -EFAULT;
1510
1511         DBprintk(("[%d] pmc[%u]=0x%lx error %d\n", task->pid, cnum, value, ret));
1512
1513         return ret;
1514 }
1515
     /*
      * Write a batch of PMD registers on behalf of the calling task.
      *
      * task:  must be current, any other task is rejected with -EINVAL
      * ctx:   perfmon context, must already be enabled
      * arg:   user pointer to an array of count pfarg_reg_t requests
      * count: number of requests in the array
      * regs:  passed through to the per-register write checker
      *
      * For each request: validate the register number, run the optional write
      * checker, clear the request's return flag in user memory, then update the
      * software state in ctx and write the hardware PMD.
      *
      * Returns 0 on success, -EINVAL for an unimplemented register, the write
      * checker's error code when it rejects a value, and -EFAULT when the
      * request array cannot be read or written. Requests handled before the
      * failing one remain committed.
      *
      * The loop runs with preemption disabled; every exit taken after
      * preempt_disable() goes through a matching preempt_enable().
      */
1516 static int
1517 pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
1518 {
1519         pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
1520         unsigned long value, hw_value;
1521         unsigned int cnum;
1522         int i;
1523         int ret = -EINVAL;
1524
1525         /* we don't quite support this right now */
1526         if (task != current) return -EINVAL;
1527
1528         /*
1529          * Cannot do anything before PMU is enabled
1530          */
1531         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1532         preempt_disable();
1533
1534         /* XXX: ctx locking may be required here */
             /*
              * NOTE(review): the user accesses below run with preemption disabled;
              * confirm this is acceptable if the user page is not resident.
              */
1535
1536
1537         for (i = 0; i < count; i++, req++) {
1538
                     /* must not return directly: preemption is disabled here */
1539                 if (__copy_from_user(&tmp, req, sizeof(tmp))) goto fault;
1540
1541                 cnum  = tmp.reg_num;
1542                 value = tmp.reg_value;
1543
1544                 if (!PMD_IS_IMPL(cnum)) {
1545                         DBprintk(("pmd[%u] is unimplemented or invalid\n", cnum));
1546                         goto abort_mission;
1547                 }
1548
1549                 /*
1550                  * execute write checker, if any
1551                  */
1552                 if (PMD_WR_FUNC(cnum)) {
1553                         unsigned long v = value;
1554                         ret = PMD_WR_FUNC(cnum)(task, cnum, &v, regs);
1555                         if (ret) goto abort_mission;
1556                         value = v;
                             /* restore the default error code for the next checks */
1557                         ret = -EINVAL;
1558                 }
1559                 hw_value = value;
1560                 /*
1561                  * no error on this register
1562                  */
1563                 PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
1564
                     /* must not return directly: preemption is disabled here */
1565                 if (__put_user(tmp.reg_flags, &req->reg_flags)) goto fault;
1566
1567                 /*
1568                  * now commit changes to software state
1569                  */
1570
1571                 /* update virtualized (64bits) counter */
1572                 if (PMD_IS_COUNTING(cnum)) {
1573                         ctx->ctx_soft_pmds[cnum].lval = value;
                             /* software keeps the bits above ovfl_val, hardware the bits below */
1574                         ctx->ctx_soft_pmds[cnum].val  = value & ~pmu_conf.ovfl_val;
1575
1576                         hw_value = value & pmu_conf.ovfl_val;
1577
1578                         ctx->ctx_soft_pmds[cnum].long_reset  = tmp.reg_long_reset;
1579                         ctx->ctx_soft_pmds[cnum].short_reset = tmp.reg_short_reset;
1580
1581                         ctx->ctx_soft_pmds[cnum].seed = tmp.reg_random_seed;
1582                         ctx->ctx_soft_pmds[cnum].mask = tmp.reg_random_mask;
1583                 }
1584
1585                 /* keep track of what we use */
1586                 CTX_USED_PMD(ctx, pmu_conf.pmd_desc[(cnum)].dep_pmd[0]);
1587
1588                 /* mark this register as used as well */
1589                 CTX_USED_PMD(ctx, RDEP(cnum));
1590
1591                 /* writes to unimplemented part is ignored, so this is safe */
1592                 ia64_set_pmd(cnum, hw_value);
1593
1594                 /* to go away */
1595                 ia64_srlz_d();
1596
1597                 DBprintk(("[%d] pmd[%u]: value=0x%lx hw_value=0x%lx soft_pmd=0x%lx  short_reset=0x%lx "
1598                           "long_reset=0x%lx hw_pmd=%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx\n",
1599                                 task->pid, cnum,
1600                                 value, hw_value,
1601                                 ctx->ctx_soft_pmds[cnum].val,
1602                                 ctx->ctx_soft_pmds[cnum].short_reset,
1603                                 ctx->ctx_soft_pmds[cnum].long_reset,
1604                                 ia64_get_pmd(cnum) & pmu_conf.ovfl_val,
1605                                 PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N',
1606                                 ctx->ctx_used_pmds[0],
1607                                 ctx->ctx_soft_pmds[cnum].reset_pmds[0]));
1608         }
1609         preempt_enable();
1610         return 0;
1611
     fault:
             /*
              * the request array could not be read or written: rebalance the
              * preempt_disable() above before reporting the fault
              */
             preempt_enable();
             return -EFAULT;

1612 abort_mission:
1613         preempt_enable();
1614
1615         /*
1616          * for now, we have only one possibility for error
1617          */
1618         PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
1619
1620         /*
1621          * we change the return value to EFAULT in case we cannot write register return code.
1622          * The caller first must correct this error, then a resubmission of the request will
1623          * eventually yield the EINVAL.
1624          */
1625         if (__put_user(tmp.reg_flags, &req->reg_flags)) ret = -EFAULT;
1626
1627         DBprintk(("[%d] pmd[%u]=0x%lx ret %d\n", task->pid, cnum, value, ret));
1628
1629         return ret;
1630 }
1631
     /*
      * Read a batch of PMD registers and copy the values back to user memory.
      *
      * task:  task whose PMDs are read
      * ctx:   perfmon context, must be enabled (-EINVAL otherwise)
      * arg:   user pointer to an array of count pfarg_reg_t requests
      * count: number of requests in the array
      * regs:  passed through to the per-register read checker
      *
      * Only implemented registers that are marked as used in ctx can be read.
      * The value comes from the hardware when ctx was last loaded on this CPU,
      * otherwise from the copy saved in task->thread. For counting PMDs the
      * software part of the 64-bit counter is added and the last reset value is
      * returned as well.
      *
      * Returns 0 on success, -EFAULT when the request array cannot be accessed
      * and -EINVAL (also flagged in the failing request) for a register that is
      * unimplemented or not in use.
      */
1632 static int
1633 pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
1634 {
1635         struct thread_struct *th = &task->thread;
1636         unsigned long val, lval;
1637         pfarg_reg_t *req = (pfarg_reg_t *)arg;
1638         unsigned int cnum, reg_flags = 0;
1639         int i, ret = 0;
1640
1641         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1642
1643         /*
1644          * XXX: MUST MAKE SURE WE DON'T HAVE ANY PENDING OVERFLOW BEFORE READING
1645          * This is required when the monitoring has been stopped by user or kernel.
1646          * If it is still going on, then that's fine because we are not guaranteed
1647          * to return an accurate value in this case.
1648          */
1649
1650         /* XXX: ctx locking may be required here */
1651
1652         DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), task->pid));
1653
1654         for (i = 0; i < count; i++, req++) {
1655                 int me;
1656
1657                 if (__get_user(cnum, &req->reg_num)) return -EFAULT;
1658                 if (__get_user(reg_flags, &req->reg_flags)) return -EFAULT;
1659                 lval = 0UL;
                     /*
                      * ret is only assigned by a read checker; reset it for every
                      * request so that an error reported for one register is not
                      * carried into the flags of the following ones
                      */
                     ret  = 0;
1660
1661                 if (!PMD_IS_IMPL(cnum)) goto abort_mission;
1662                 /*
1663                  * we can only read the register that we use. That includes
1664                  * the one we explicitly initialize AND the one we want included
1665                  * in the sampling buffer (smpl_regs).
1666                  *
1667                  * Having this restriction allows optimization in the ctxsw routine
1668                  * without compromising security (leaks)
1669                  */
1670                 if (!CTX_IS_USED_PMD(ctx, cnum)) goto abort_mission;
1671
1672                 /*
1673                  * If the task is not the current one, then we check if the
1674                  * PMU state is still in the local live register due to lazy ctxsw.
1675                  * If true, then we read directly from the registers.
1676                  */
                     /* get_cpu() is paired with the put_cpu() further down */
1677                 me = get_cpu();
1678                 if (atomic_read(&ctx->ctx_last_cpu) == me){
1679                         ia64_srlz_d();
1680                         val = ia64_get_pmd(cnum);
1681                         DBprintk(("reading pmd[%u]=0x%lx from hw\n", cnum, val));
1682                 } else {
1683                         val = th->pmd[cnum];
1684                 }
1685
1686
1687                 if (PMD_IS_COUNTING(cnum)) {
1688                         /*
1689                          * XXX: need to check for overflow
1690                          */
                             /* hardware holds the low bits, ctx_soft_pmds the rest */
1691                         val &= pmu_conf.ovfl_val;
1692                         val += ctx->ctx_soft_pmds[cnum].val;
1693
1694                         lval = ctx->ctx_soft_pmds[cnum].lval;
1695                 }
1696
1697                 /*
1698                  * execute read checker, if any
1699                  */
1700                 if (PMD_RD_FUNC(cnum)) {
1701                         unsigned long v = val;
1702                         ret = PMD_RD_FUNC(cnum)(task, cnum, &v, regs);
1703                         val = v;
1704                 }
1705
1706                 PFM_REG_RETFLAG_SET(reg_flags, ret);
1707
1708                 put_cpu();
1709
1710                 DBprintk(("read pmd[%u] ret=%d value=0x%lx pmc=0x%lx\n",
1711                                         cnum, ret, val, ia64_get_pmc(cnum)));
1712
1713                 /*
1714                  * update register return value, abort all if problem during copy.
1715                  * we only modify the reg_flags field. no check mode is fine because
1716                  * access has been verified upfront in sys_perfmonctl().
1717                  */
1718                 if (__put_user(cnum, &req->reg_num)) return -EFAULT;
1719                 if (__put_user(val, &req->reg_value)) return -EFAULT;
1720                 if (__put_user(reg_flags, &req->reg_flags)) return -EFAULT;
1721                 if (__put_user(lval, &req->reg_last_reset_value)) return -EFAULT;
1722         }
1723
1724         return 0;
1725
1726 abort_mission:
1727         PFM_REG_RETFLAG_SET(reg_flags, PFM_REG_RETFL_EINVAL);
1728         /*
1729          * XXX: if this fails, we stick with the original failure, flag not updated!
1730          */
1731         __put_user(reg_flags, &req->reg_flags);
1732
1733         return -EINVAL;
1734 }
1735
1736 #ifdef PFM_PMU_USES_DBR
1737 /*
1738  * Only call this function when a process is trying to
1739  * write the debug registers (reading is always allowed)
      *
      * Registers task as a ptrace() user of the debug registers in the
      * global pfm_sessions accounting.
      *
      * Returns 0 when the task already has IA64_THREAD_DBG_VALID set or when
      * the registration succeeded, and -1 when the task's own perfmon context
      * is using the debug registers or when a system wide session is.
1740  */
1741 int
1742 pfm_use_debug_registers(struct task_struct *task)
1743 {
1744         pfm_context_t *ctx = task->thread.pfm_context;
1745         int ret = 0;
1746
1747         DBprintk(("called for [%d]\n", task->pid));
1748
1749         /*
1750          * do it only once
1751          */
1752         if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0;
1753
1754         /*
1755          * Even on SMP, we do not need to use an atomic here because
1756          * the only way in is via ptrace() and this is possible only when the
1757          * process is stopped. Even in the case where the ctxsw out is not totally
1758          * completed by the time we come here, there is no way the 'stopped' process
1759          * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine.
1760          * So this is always safe.
1761          */
             /* ctx is NULL for a task without a perfmon context */
1762         if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;
1763
1764         LOCK_PFS();
1765
1766         /*
1767          * We cannot allow setting breakpoints when system wide monitoring
1768          * sessions are using the debug registers.
1769          */
1770         if (pfm_sessions.pfs_sys_use_dbregs> 0)
1771                 ret = -1;
1772         else
1773                 pfm_sessions.pfs_ptrace_use_dbregs++;
1774
1775         DBprintk(("ptrace_use_dbregs=%u  sys_use_dbregs=%u by [%d] ret = %d\n",
1776                   pfm_sessions.pfs_ptrace_use_dbregs,
1777                   pfm_sessions.pfs_sys_use_dbregs,
1778                   task->pid, ret));
1779
1780         UNLOCK_PFS();
1781
1782         return ret;
1783 }
1784
1785 /*
1786  * Drop one ptrace() user of the debug registers. Called for every
1787  * exiting task that has IA64_THREAD_DBG_VALID set, i.e. a task that
1788  * got the debug registers for debugging through ptrace() and so was
1789  * not using them for performance monitoring.
1790  * Returns 0 once the ptrace user count has been decremented, or -1
1791  * (after logging) when that count was already zero.
1792  */
1793 int
1794 pfm_release_debug_registers(struct task_struct *task)
1795 {
1796         int status = -1;
1797
1798         LOCK_PFS();
1799         if (pfm_sessions.pfs_ptrace_use_dbregs != 0) {
1800                 /* balanced release: one fewer ptrace user */
1801                 pfm_sessions.pfs_ptrace_use_dbregs--;
1802                 status = 0;
1803         } else {
1804                 printk(KERN_DEBUG "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n",
1805                        task->pid);
1806         }
1807         UNLOCK_PFS();
1808
1809         return status;
1810 }
1811 #else /* PFM_PMU_USES_DBR is not defined */
1812 /*
1813  * in case, the PMU does not use the debug registers, these two functions are nops.
1814  * The first function is called from arch/ia64/kernel/ptrace.c.
1815  * The second function is called from arch/ia64/kernel/process.c.
      *
      * This variant of pfm_use_debug_registers() ignores task and always
      * returns 0.
1816  */
1817 int
1818 pfm_use_debug_registers(struct task_struct *task)
1819 {
1820         return 0;
1821 }
1822
     /*
      * No-op variant used when PFM_PMU_USES_DBR is not defined: task is
      * ignored and 0 is always returned.
      */
1823 int
1824 pfm_release_debug_registers(struct task_struct *task)
1825 {
1826         return 0;
1827 }
1828 #endif /* PFM_PMU_USES_DBR */
1829
     /*
      * Resume monitoring after an overflow.
      *
      * task:  task whose monitoring is restarted
      * ctx:   perfmon context, must be enabled (-EINVAL otherwise)
      * arg, count, regs: not used by this command
      *
      * When task is the caller, the overflowed registers are reset through
      * pfm_reset_regs() in PFM_PMD_LONG_RESET mode, the sampling buffer
      * indexes (if any) are rewound and the PMU is unfrozen right away.
      * For another task nothing is reset here: in blocking mode the restart
      * semaphore is posted, in non-blocking mode the task is flagged so that
      * it handles the reset itself.
      *
      * Returns 0 on success.
      */
1830 static int
1831 pfm_restart(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1832          struct pt_regs *regs)
1833 {
1834         void *sem = &ctx->ctx_restart_sem;
1835
1836         /*
1837          * Cannot do anything before PMU is enabled
1838          */
1839         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1840
1841         if (task == current) {
1842                 DBprintk(("restarting self %d frozen=%d ovfl_regs=0x%lx\n",
1843                         task->pid,
1844                         ctx->ctx_fl_frozen,
1845                         ctx->ctx_ovfl_regs[0]));
1846
1847                 preempt_disable();
1848                 pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
1849
                     /* the pending overflows have been handled */
1850                 ctx->ctx_ovfl_regs[0] = 0UL;
1851
1852                 /*
1853                  * We ignore block/don't block because we never block
1854                  * for a self-monitoring process.
1855                  */
1856                 ctx->ctx_fl_frozen = 0;
1857
                     /* rewind the sampling buffer, if the context has one */
1858                 if (CTX_HAS_SMPL(ctx)) {
1859                         ctx->ctx_psb->psb_hdr->hdr_count = 0;
1860                         ctx->ctx_psb->psb_index = 0;
1861                 }
1862
1863                 /* simply unfreeze */
1864                 pfm_unfreeze_pmu();
1865
1866                 preempt_enable();
1867
1868                 return 0;
1869         }
1870         /* restart on another task */
1871
1872         /*
1873          * if blocking, then post the semaphore.
1874          * if non-blocking, then we ensure that the task will go into
1875          * pfm_overflow_must_block() before returning to user mode.
1876          * We cannot explicitly reset another task, it MUST always
1877          * be done by the task itself. This works for system wide because
1878          * the tool that is controlling the session is doing "self-monitoring".
1879          *
1880          * XXX: what if the task never goes back to user?
1881          *
1882          */
1883         if (CTX_OVFL_NOBLOCK(ctx) == 0) {
1884                 DBprintk(("unblocking %d \n", task->pid));
1885                 up(sem);
1886         } else {
                     /* the task's thread_info is located IA64_TASK_SIZE bytes after its task_struct */
1887                 struct thread_info *info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE);
1888                 task->thread.pfm_ovfl_block_reset = 1;
1889                 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET;
1890                 set_bit(TIF_NOTIFY_RESUME, &info->flags);
1891         }
     /* the block below is compiled out */
1892 #if 0
1893         /*
1894          * in case of non blocking mode, then it's just a matter of
1895          * of reseting the sampling buffer (if any) index. The PMU
1896          * is already active.
1897          */
1898
1899         /*
1900          * must reset the header count first
1901          */
1902         if (CTX_HAS_SMPL(ctx)) {
1903                 DBprintk(("resetting sampling indexes for %d \n", task->pid));
1904                 ctx->ctx_psb->psb_hdr->hdr_count = 0;
1905                 ctx->ctx_psb->psb_index = 0;
1906         }
1907 #endif
1908         return 0;
1909 }
1910
     /*
      * Stop monitoring for the calling task; the context stays enabled.
      *
      * task:  must be current (-EINVAL otherwise)
      * ctx:   perfmon context, must be enabled (-EINVAL otherwise)
      * regs:  saved user state whose psr.pp or psr.up bit is cleared
      * arg, count: not used by this command
      *
      * A system wide session clears dcr.pp and psr.pp, a per-task session
      * clears psr.up. Runs with preemption disabled. Returns 0 on success.
      */
1911 static int
1912 pfm_stop(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1913          struct pt_regs *regs)
1914 {
1915         /* we don't quite support this right now */
1916         if (task != current) return -EINVAL;
1917
1918         /*
1919          * Cannot do anything before PMU is enabled
1920          */
1921         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1922
1923         DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
1924                                 current->pid,
1925                                 ctx->ctx_fl_system, PMU_OWNER(),
1926                                 current));
1927
1928         preempt_disable();
1929         /* simply stop monitoring but not the PMU */
1930         if (ctx->ctx_fl_system) {
1931
1932                 /* disable dcr pp */
1933                 ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
1934
1935                 /* stop monitoring */
1936                 pfm_clear_psr_pp();
1937
1938                 ia64_srlz_i();
1939
                     /* keep the per-CPU perfmon flags in sync with dcr.pp */
1940                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
1941
                     /* clear psr.pp in the saved user state as well */
1942                 ia64_psr(regs)->pp = 0;
1943
1944         } else {
1945
1946                 /* stop monitoring */
1947                 pfm_clear_psr_up();
1948
1949                 ia64_srlz_i();
1950
1951                 /*
1952                  * clear user level psr.up
1953                  */
1954                 ia64_psr(regs)->up = 0;
1955         }
1956         preempt_enable();
1957         return 0;
1958 }
1959
     /*
      * Disable the perfmon context of the calling task.
      *
      * task:  must be current (-EINVAL otherwise)
      * ctx:   perfmon context, must be enabled (-EINVAL otherwise)
      * regs:  saved user state; psr.pp or psr.up is cleared and psr.sp is set
      * arg, count: not used by this command
      *
      * Calls pfm_flush_regs(), then marks the context PFM_CTX_DISABLED.
      * Runs with preemption disabled. Returns 0 on success.
      */
1960 static int
1961 pfm_disable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1962            struct pt_regs *regs)
1963 {
1964         /* we don't quite support this right now */
1965         if (task != current) return -EINVAL;
1966
1967         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
1968
1969         preempt_disable();
1970         /*
1971          * stop monitoring, freeze PMU, and save state in context
1972          * this call will clear IA64_THREAD_PM_VALID for per-task sessions.
1973          */
1974         pfm_flush_regs(task);
1975
             /* make sure monitoring does not resume when returning to user level */
1976         if (ctx->ctx_fl_system) {
1977                 ia64_psr(regs)->pp = 0;
1978         } else {
1979                 ia64_psr(regs)->up = 0;
1980         }
1981         /*
1982          * goes back to default behavior: no user level control
1983          * no need to change live psr.sp because useless at the kernel level
1984          */
1985         ia64_psr(regs)->sp = 1;
1986
1987         DBprintk(("enabling psr.sp for [%d]\n", current->pid));
1988
1989         ctx->ctx_flags.state = PFM_CTX_DISABLED;
1990         preempt_enable();
1991
1992         return 0;
1993 }
1994
     /*
      * Tear down the perfmon context of the calling task.
      *
      * task:  must be current (-EINVAL otherwise)
      * ctx:   perfmon context to destroy
      * arg, count, regs: forwarded to pfm_disable(); regs is also used to
      *                   clear the monitoring bits of the saved user psr
      *
      * An enabled context is disabled first; then the sampling buffer mapping
      * (if any) is removed and pfm_context_exit() releases the context.
      * Returns 0 on success.
      */
1995 static int
1996 pfm_context_destroy(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
1997          struct pt_regs *regs)
1998 {
1999         /* we don't quite support this right now */
2000         if (task != current) return -EINVAL;
2001
2002         /*
2003          * if context was never enabled, then there is not much
2004          * to do
2005          */
2006         if (!CTX_IS_ENABLED(ctx)) goto skipped_stop;
2007
2008         /*
2009          * Disable context: stop monitoring, flush regs to software state (useless here),
2010          * and freeze PMU
2011          *
2012          * The IA64_THREAD_PM_VALID is cleared by pfm_flush_regs() called from pfm_disable()
2013          */
             /* return value ignored: both checks made by pfm_disable() passed above */
2014         pfm_disable(task, ctx, arg, count, regs);
2015
             /* NOTE(review): pfm_disable() already cleared these bits; this looks redundant */
2016         if (ctx->ctx_fl_system) {
2017                 ia64_psr(regs)->pp = 0;
2018         } else {
2019                 ia64_psr(regs)->up = 0;
2020         }
2021
2022 skipped_stop:
2023         /*
2024          * remove sampling buffer mapping, if any
2025          */
2026         if (ctx->ctx_smpl_vaddr) {
2027                 pfm_remove_smpl_mapping(task);
2028                 ctx->ctx_smpl_vaddr = 0UL;
2029         }
2030         /* now free context and related state */
2031         pfm_context_exit(task);
2032
2033         return 0;
2034 }
2035
2036 /*
2037  * does nothing at the moment
      *
      * All arguments are ignored and 0 is always returned.
2038  */
2039 static int
2040 pfm_context_unprotect(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2041          struct pt_regs *regs)
2042 {
2043         return 0;
2044 }
2045
     /*
      * Mark a perfmon context as protected.
      *
      * task:  only used for the debug message
      * ctx:   context to protect
      * regs:  saved user state; psr.sp is set unless the context is unsecure
      * arg, count: not used by this command
      *
      * Always returns 0.
      */
2046 static int
2047 pfm_protect_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2048          struct pt_regs *regs)
2049 {
2050         DBprintk(("context from [%d] is protected\n", task->pid));
2051
2052         /* from here on only the creator of the context may access it */
2053         ctx->ctx_fl_protected = 1;
2054
2055         /* reinforce secure monitoring: set psr.sp so psr.up cannot be toggled */
2056         if (!ctx->ctx_fl_unsecure) {
2057                 ia64_psr(regs)->sp = 1;
2058         }
2059
2060         /* this command cannot fail */
2061         return 0;
2062 }
2063
     /*
      * Turn perfmon debug output on or off.
      *
      * arg:   user pointer to an unsigned int; zero turns debugging off, any
      *        other value turns it on
      * task, ctx, count, regs: not used by this command
      *
      * Returns 0 on success and -EFAULT when the mode cannot be read from
      * user memory.
      */
2064 static int
2065 pfm_debug(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2066          struct pt_regs *regs)
2067 {
2068         unsigned int mode;
2069
             /*
              * arg comes from user level like for the other commands, so it must
              * not be dereferenced directly. get_user() does its own range check,
              * so this does not rely on the caller having validated arg.
              */
             if (get_user(mode, (unsigned int *)arg)) return -EFAULT;
2070         pfm_sysctl.debug = mode == 0 ? 0 : 1;
2071
2072         printk(KERN_INFO "perfmon debugging %s\n", pfm_sysctl.debug ? "on" : "off");
2073
2074         return 0;
2075 }
2076
2077 #ifdef PFM_PMU_USES_DBR
2078
     /*
      * Bit-field view of a 64-bit instruction breakpoint register value.
      * pfm_write_ibr_dbr() uses it to clear ibr_x on odd-numbered registers
      * so that no enabled breakpoint gets installed.
      */
2079 typedef struct {
2080         unsigned long ibr_mask:56;     /* low 56 bits */
2081         unsigned long ibr_plm:4;       /* presumably the privilege level mask -- TODO confirm */
2082         unsigned long ibr_ig:3;        /* presumably ignored bits -- TODO confirm */
2083         unsigned long ibr_x:1;         /* cleared by pfm_write_ibr_dbr() on odd-numbered registers */
2084 } ibr_mask_reg_t;
2085
     /*
      * Bit-field view of a 64-bit data breakpoint register value.
      * pfm_write_ibr_dbr() uses it to clear dbr_r and dbr_w on odd-numbered
      * registers so that no enabled breakpoint gets installed.
      */
2086 typedef struct {
2087         unsigned long dbr_mask:56;     /* low 56 bits */
2088         unsigned long dbr_plm:4;       /* presumably the privilege level mask -- TODO confirm */
2089         unsigned long dbr_ig:2;        /* presumably ignored bits -- TODO confirm */
2090         unsigned long dbr_w:1;         /* cleared by pfm_write_ibr_dbr() on odd-numbered registers */
2091         unsigned long dbr_r:1;         /* cleared by pfm_write_ibr_dbr() on odd-numbered registers */
2092 } dbr_mask_reg_t;
2093
     /*
      * A debug register value, accessible either as the raw 64-bit word
      * or through the ibr/dbr bit-field layouts.
      */
2094 typedef union {
2095         unsigned long  val;    /* raw value, as written to the hardware register */
2096         ibr_mask_reg_t ibr;    /* instruction breakpoint register view */
2097         dbr_mask_reg_t dbr;    /* data breakpoint register view */
2098 } dbreg_t;
2099
     /*
      * Common implementation of the commands writing the instruction (ibr) and
      * data (dbr) debug registers for monitoring purposes.
      *
      * mode:  0 to write ibrs, 1 to write dbrs
      * task:  task owning the perfmon context; its thread.ibr/thread.dbr arrays
      *        receive a copy of every value written
      * arg:   user pointer to an array of count pfarg_dbreg_t requests
      * count: number of requests in the array
      * regs:  not used
      *
      * On the first use by a context the debug registers are claimed (global
      * accounting for system wide sessions, IA64_THREAD_DBG_VALID check for
      * per-task sessions) and all hardware ibrs/dbrs are cleared. Values are
      * then installed with the enable bits of odd-numbered registers forced
      * to 0.
      *
      * Returns 0 on success, -EBUSY when the debug registers are already used
      * for debugging, -EINVAL for an unimplemented register (also flagged in
      * the failing request) and -EFAULT when the request array cannot be
      * accessed. A failing first use gives the claim back.
      */
2100 static int
2101 pfm_write_ibr_dbr(int mode, struct task_struct *task, void *arg, int count, struct pt_regs *regs)
2102 {
2103         struct thread_struct *thread = &task->thread;
2104         pfm_context_t *ctx = task->thread.pfm_context;
2105         pfarg_dbreg_t tmp, *req = (pfarg_dbreg_t *)arg;
2106         dbreg_t dbreg;
2107         unsigned int rnum;
2108         int first_time;
2109         int i, ret = 0;
2110
2111         /*
2112          * we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w
2113          * ensuring that no real breakpoint can be installed via this call.
2114          */
2115
2116         first_time = ctx->ctx_fl_using_dbreg == 0;
2117
2118         /*
2119          * check for debug registers in system wide mode
2120          *
2121          */
2122         LOCK_PFS();
2123         if (ctx->ctx_fl_system && first_time) {
2124                 if (pfm_sessions.pfs_ptrace_use_dbregs)
2125                         ret = -EBUSY;
2126                 else
2127                         pfm_sessions.pfs_sys_use_dbregs++;
2128         }
2129         UNLOCK_PFS();
2130
             /* nothing has been claimed yet, so there is nothing to undo */
2131         if (ret != 0) return ret;
2132
2133         if (ctx->ctx_fl_system) {
2134                 /* we mark ourselves as owner  of the debug registers */
2135                 ctx->ctx_fl_using_dbreg = 1;
2136                 DBprintk(("system-wide setting fl_using_dbreg for [%d]\n", task->pid));
2137         } else if (first_time) {
2138                         ret= -EBUSY;
2139                         if ((thread->flags & IA64_THREAD_DBG_VALID) != 0) {
2140                                 DBprintk(("debug registers already in use for [%d]\n", task->pid));
2141                                 goto abort_mission;
2142                         }
2143                         /* we mark ourselves as owner  of the debug registers */
2144                         ctx->ctx_fl_using_dbreg = 1;
2145
2146                         DBprintk(("setting fl_using_dbreg for [%d]\n", task->pid));
2147                         /*
2148                          * Given debug registers cannot be used for both debugging
2149                          * and performance monitoring at the same time, we reuse
2150                          * the storage area to save and restore the registers on ctxsw.
2151                          */
2152                         memset(task->thread.dbr, 0, sizeof(task->thread.dbr));
2153                         memset(task->thread.ibr, 0, sizeof(task->thread.ibr));
2154         }
2155
2156         if (first_time) {
2157                 DBprintk(("[%d] clearing ibrs,dbrs\n", task->pid));
2158                 /*
2159                  * clear hardware registers to make sure we don't
2160                  * pick up stale state.
2161                  *
2162                  * for a system wide session, we do not use
2163                  * thread.dbr, thread.ibr because this process
2164                  * never leaves the current CPU and the state
2165                  * is shared by all processes running on it
2166                  */
2167                 for (i=0; i < (int) pmu_conf.num_ibrs; i++) {
2168                         ia64_set_ibr(i, 0UL);
2169                 }
2170                 ia64_srlz_i();
2171                 for (i=0; i < (int) pmu_conf.num_dbrs; i++) {
2172                         ia64_set_dbr(i, 0UL);
2173                 }
2174                 ia64_srlz_d();
2175         }
2176
             /* error code reported if the first request cannot be read */
2177         ret = -EFAULT;
2178
2179         /*
2180          * Now install the values into the registers
2181          */
2182         for (i = 0; i < count; i++, req++) {
2183
2184                 if (__copy_from_user(&tmp, req, sizeof(tmp))) goto abort_mission;
2185
2186                 rnum      = tmp.dbreg_num;
2187                 dbreg.val = tmp.dbreg_value;
2188
2189                 ret = -EINVAL;
2190
2191                 if ((mode == 0 && !IBR_IS_IMPL(rnum)) || ((mode == 1) && !DBR_IS_IMPL(rnum))) {
2192                         DBprintk(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n",
2193                                   rnum, dbreg.val, mode, i, count));
2194
2195                         goto abort_mission;
2196                 }
2197
2198                 /*
2199                  * make sure we do not install enabled breakpoint
2200                  */
                     /* only odd-numbered registers have their enable bits cleared */
2201                 if (rnum & 0x1) {
2202                         if (mode == 0)
2203                                 dbreg.ibr.ibr_x = 0;
2204                         else
2205                                 dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0;
2206                 }
2207
2208                 /*
2209                  * clear return flags and copy back to user
2210                  *
2211                  * XXX: fix once EAGAIN is implemented
2212                  */
2213                 ret = -EFAULT;
2214
2215                 PFM_REG_RETFLAG_SET(tmp.dbreg_flags, 0);
2216
2217                 if (__copy_to_user(req, &tmp, sizeof(tmp))) goto abort_mission;
2218
2219                 /*
2220                  * Debug registers, just like PMC, can only be modified
2221                  * by a kernel call. Moreover, perfmon() access to those
2222                  * registers are centralized in this routine. The hardware
2223                  * does not modify the value of these registers, therefore,
2224                  * if we save them as they are written, we can avoid having
2225                  * to save them on context switch out. This is made possible
2226                  * by the fact that when perfmon uses debug registers, ptrace()
2227                  * won't be able to modify them concurrently.
2228                  */
2229                 if (mode == 0) {
2230                         CTX_USED_IBR(ctx, rnum);
2231
2232                         ia64_set_ibr(rnum, dbreg.val);
2233                         ia64_srlz_i();
2234
2235                         thread->ibr[rnum] = dbreg.val;
2236
2237                         DBprintk(("write ibr%u=0x%lx used_ibrs=0x%lx\n", rnum, dbreg.val, ctx->ctx_used_ibrs[0]));
2238                 } else {
2239                         CTX_USED_DBR(ctx, rnum);
2240
2241                         ia64_set_dbr(rnum, dbreg.val);
2242                         ia64_srlz_d();
2243
2244                         thread->dbr[rnum] = dbreg.val;
2245
2246                         DBprintk(("write dbr%u=0x%lx used_dbrs=0x%lx\n", rnum, dbreg.val, ctx->ctx_used_dbrs[0]));
2247                 }
2248         }
2249
2250         return 0;
2251
2252 abort_mission:
2253         /*
2254          * in case it was our first attempt, we undo the global modifications
2255          */
2256         if (first_time) {
2257                 LOCK_PFS();
2258                 if (ctx->ctx_fl_system) {
2259                         pfm_sessions.pfs_sys_use_dbregs--;
2260                 }
2261                 UNLOCK_PFS();
2262                 ctx->ctx_fl_using_dbreg = 0;
2263         }
2264         /*
2265          * install error return flag
2266          */
2267         if (ret == -EINVAL) {
2268                 /*
2269                  * only on EINVAL does tmp hold the failing request; on EBUSY it was never filled in
2270                  */
2271                 PFM_REG_RETFLAG_SET(tmp.dbreg_flags, PFM_REG_RETFL_EINVAL);
2272                 if (__put_user(tmp.dbreg_flags, &req->dbreg_flags)) ret = -EFAULT;
2273         }
2274         return ret;
2275 }
2276
     /*
      * Command handler writing instruction breakpoint registers: checks the
      * request and forwards it to pfm_write_ibr_dbr() in mode 0.
      *
      * Returns -EINVAL when task is not the caller or ctx is not enabled,
      * otherwise the result of pfm_write_ibr_dbr().
      */
2277 static int
2278 pfm_write_ibrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2279          struct pt_regs *regs)
2280 {
2281         /*
2282          * remote tasks are not supported yet, and the context must be enabled
2283          */
2284         if (task != current || !CTX_IS_ENABLED(ctx))
2285                 return -EINVAL;
2286         return pfm_write_ibr_dbr(0, task, arg, count, regs);
2287 }
2288
     /*
      * Command handler writing data breakpoint registers: checks the
      * request and forwards it to pfm_write_ibr_dbr() in mode 1.
      *
      * Returns -EINVAL when task is not the caller or ctx is not enabled,
      * otherwise the result of pfm_write_ibr_dbr().
      */
2289 static int
2290 pfm_write_dbrs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2291          struct pt_regs *regs)
2292 {
2293         /*
2294          * remote tasks are not supported yet, and the context must be enabled
2295          */
2296         if (task != current || !CTX_IS_ENABLED(ctx))
2297                 return -EINVAL;
2298         return pfm_write_ibr_dbr(1, task, arg, count, regs);
2299 }
2300
2301 #endif /* PFM_PMU_USES_DBR */
2302
     /*
      * Report the perfmon and sampling format versions to user level.
      *
      * arg:   user pointer receiving a pfarg_features_t
      * task, ctx, count, regs: not used by this command
      *
      * Returns 0 on success, -EFAULT if the copy to user memory fails.
      */
2303 static int
2304 pfm_get_features(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
2305 {
2306         pfarg_features_t features;
2307
2308         /* zero everything first so fields not set below are copied out as 0 */
2309         memset(&features, 0, sizeof(features));
2310         features.ft_version      = PFM_VERSION;
2311         features.ft_smpl_version = PFM_SMPL_VERSION;
2312
2313         if (__copy_to_user(arg, &features, sizeof(features)) != 0)
2314                 return -EFAULT;
2315         return 0;
2316 }
2317
     /*
      * Start monitoring for the calling task.
      *
      * task:  must be current and the PMU owner (-EINVAL otherwise)
      * ctx:   perfmon context, must be enabled (-EINVAL otherwise)
      * regs:  saved user state whose psr.pp or psr.up bit is set
      * arg, count: not used by this command
      *
      * A system wide session sets psr.pp and dcr.pp, a per-task session sets
      * psr.up and additionally requires IA64_THREAD_PM_VALID to be set on the
      * task (-EINVAL otherwise). Returns 0 on success.
      */
2318 static int
2319 pfm_start(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2320           struct pt_regs *regs)
2321 {
2322         /* we don't quite support this right now */
2323         if (task != current) return -EINVAL;
2324
2325         /*
2326          * Cannot do anything before PMU is enabled
2327          */
2328         if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
2329
2330         DBprintk(("[%d] fl_system=%d owner=%p current=%p\n",
2331                                 current->pid,
2332                                 ctx->ctx_fl_system, PMU_OWNER(),
2333                                 current));
2334
2335         if (PMU_OWNER() != task) {
2336                 printk(KERN_DEBUG "perfmon: pfm_start task [%d] not pmu owner\n", task->pid);
2337                 return -EINVAL;
2338         }
2339
2340         preempt_disable();
2341         if (ctx->ctx_fl_system) {
2342
                     /* keep the per-CPU perfmon flags in sync with dcr.pp */
2343                 PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP);
2344
2345                 /* set user level psr.pp */
2346                 ia64_psr(regs)->pp = 1;
2347
2348                 /* start monitoring at kernel level */
2349                 pfm_set_psr_pp();
2350
2351                 /* enable dcr pp */
2352                 ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);
2353
2354                 ia64_srlz_i();
2355
2356         } else {
2357                 if ((task->thread.flags & IA64_THREAD_PM_VALID) == 0) {
                             /* rebalance preempt_disable() before the early exit */
2358                         preempt_enable();
2359                         printk(KERN_DEBUG "perfmon: pfm_start task flag not set for [%d]\n",
2360                                task->pid);
2361                         return -EINVAL;
2362                 }
2363                 /* set user level psr.up */
2364                 ia64_psr(regs)->up = 1;
2365
2366                 /* start monitoring at kernel level */
2367                 pfm_set_psr_up();
2368
2369                 ia64_srlz_i();
2370         }
2371
2372         preempt_enable();
2373         return 0;
2374 }
2375
     /*
      * Enable the perfmon context of the calling task.
      *
      * task:  must be current (-EINVAL otherwise)
      * ctx:   perfmon context to enable
      * regs:  saved user state; psr.pp and psr.up are cleared so that nothing
      *        is monitored until a later start
      * arg, count: not used by this command
      *
      * Resets the PMU, makes task the PMU owner, marks the context
      * PFM_CTX_ENABLED, records the current CPU in ctx_last_cpu and unfreezes
      * the PMU. Runs between get_cpu() and put_cpu(). Returns 0 on success.
      */
2376 static int
2377 pfm_enable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2378            struct pt_regs *regs)
2379 {
2380         int me;
2381
2382         /* we don't quite support this right now */
2383         if (task != current) return -EINVAL;
2384
2385         me = get_cpu();  /* make sure we're not migrated or preempted */
2386
             /*
              * per-task session while another task still owns the PMU:
              * presumably its live state is saved first -- see pfm_lazy_save_regs()
              */
2387         if (ctx->ctx_fl_system == 0 && PMU_OWNER()  && PMU_OWNER() != current)
2388                 pfm_lazy_save_regs(PMU_OWNER());
2389
2390         /* reset all registers to stable quiet state */
2391         pfm_reset_pmu(task);
2392
2393         /* make sure nothing starts */
2394         if (ctx->ctx_fl_system) {
2395                 ia64_psr(regs)->pp = 0;
2396                 ia64_psr(regs)->up = 0; /* just to make sure! */
2397
2398                 /* make sure monitoring is stopped */
2399                 pfm_clear_psr_pp();
2400                 ia64_srlz_i();
2401
                     /* record the system wide session in the per-CPU perfmon flags */
2402                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
2403                 PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE);
2404                 if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE);
2405         } else {
2406                 /*
2407                  * needed in case the task was a passive task during
2408                  * a system wide session and now wants to have its own
2409                  * session
2410                  */
2411                 ia64_psr(regs)->pp = 0; /* just to make sure! */
2412                 ia64_psr(regs)->up = 0;
2413
2414                 /* make sure monitoring is stopped */
2415                 pfm_clear_psr_up();
2416                 ia64_srlz_i();
2417
2418                 DBprintk(("clearing psr.sp for [%d]\n", current->pid));
2419
2420                 /* allow user level control  */
2421                 ia64_psr(regs)->sp = 0;
2422
2423                 /* PMU state will be saved/restored on ctxsw */
2424                 task->thread.flags |= IA64_THREAD_PM_VALID;
2425         }
2426
2427         SET_PMU_OWNER(task);
2428
2429         ctx->ctx_flags.state = PFM_CTX_ENABLED;
             /* remember which CPU holds the live state; pfm_read_pmds() compares against it */
2430         atomic_set(&ctx->ctx_last_cpu, me);
2431
2432         /* simply unfreeze */
2433         pfm_unfreeze_pmu();
2434
2435         put_cpu();
2436
2437         return 0;
2438 }
2439
/*
 * pfm_get_pmc_reset: perfmonctl command handler (entry 15 of pfm_cmd_tab).
 *
 * arg points to a user array of count pfarg_reg_t requests. For each
 * request, the default value of PMC reg_num (PMC_DFL_VAL) is stored in
 * reg_value, the return flags are cleared, and the request is copied back
 * to user space.
 *
 * The unchecked __copy_from_user()/__copy_to_user() variants are used;
 * sys_perfmonctl() performs access_ok() checks on arg before dispatching.
 *
 * Returns:
 *      0       : all requests processed (also when count <= 0, as the loop
 *                does not execute)
 *      -EFAULT : a user copy failed
 *      -EINVAL : a request named a PMC that is not implemented; that request
 *                is written back with PFM_REG_RETFL_EINVAL set and
 *                processing stops
 *
 * task, ctx and regs are not used.
 */
2440 static int
2441 pfm_get_pmc_reset(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
2442            struct pt_regs *regs)
2443 {
2444         pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
2445         unsigned int cnum;
2446         int i, ret = -EINVAL; /* ret is only returned through abort_mission */
2447
2448         for (i = 0; i < count; i++, req++) {
2449
2450                 if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
2451
2452                 cnum = tmp.reg_num;
2453
2454                 if (!PMC_IS_IMPL(cnum)) goto abort_mission;
2455
2456                 tmp.reg_value = PMC_DFL_VAL(cnum);
2457
2458                 PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
2459
2460                 DBprintk(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, tmp.reg_value));
2461
2462                 if (__copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
2463         }
2464         return 0;
2465 abort_mission:
             /* flag the offending request for the caller; a failed write-back turns -EINVAL into -EFAULT */
2466         PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
2467         if (__copy_to_user(req, &tmp, sizeof(tmp))) ret = -EFAULT;
2468
2469         return ret;
2470 }
2471
2472 /*
2473  * functions MUST be listed in the increasing order of their index (see perfmon.h)
2474  *
      * The table is indexed with PFM_CMD_IDX(cmd) by sys_perfmonctl(), which
      * calls the cmd_func member of the selected entry.
      * NOTE(review): the remaining initializers look like { flags, number of
      * arguments (PFM_CMD_ARG_MANY for a variable count), size of one argument };
      * those field names are not visible here - confirm against pfm_cmd_desc_t.
      */
2475 static pfm_cmd_desc_t pfm_cmd_tab[]={
2476 /* 0  */{ NULL, 0, 0, 0}, /* not used */
2477 /* 1  */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2478 /* 2  */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2479 /* 3  */{ pfm_read_pmds,PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2480 /* 4  */{ pfm_stop, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2481 /* 5  */{ pfm_start, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2482 /* 6  */{ pfm_enable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2483 /* 7  */{ pfm_disable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2484 /* 8  */{ pfm_context_create, PFM_CMD_PID|PFM_CMD_ARG_RW, 1, sizeof(pfarg_context_t)},
2485 /* 9  */{ pfm_context_destroy, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2486 /* 10 */{ pfm_restart, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_NOCHK, 0, 0},
2487 /* 11 */{ pfm_protect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2488 /* 12 */{ pfm_get_features, PFM_CMD_ARG_RW, 0, 0},
2489 /* 13 */{ pfm_debug, 0, 1, sizeof(unsigned int)},
2490 /* 14 */{ pfm_context_unprotect, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
2491 /* 15 */{ pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
2492 /* 16 */{ NULL, 0, 0, 0}, /* not used */
2493 /* 17 */{ NULL, 0, 0, 0}, /* not used */
2494 /* 18 */{ NULL, 0, 0, 0}, /* not used */
2495 /* 19 */{ NULL, 0, 0, 0}, /* not used */
2496 /* 20 */{ NULL, 0, 0, 0}, /* not used */
2497 /* 21 */{ NULL, 0, 0, 0}, /* not used */
2498 /* 22 */{ NULL, 0, 0, 0}, /* not used */
2499 /* 23 */{ NULL, 0, 0, 0}, /* not used */
2500 /* 24 */{ NULL, 0, 0, 0}, /* not used */
2501 /* 25 */{ NULL, 0, 0, 0}, /* not used */
2502 /* 26 */{ NULL, 0, 0, 0}, /* not used */
2503 /* 27 */{ NULL, 0, 0, 0}, /* not used */
2504 /* 28 */{ NULL, 0, 0, 0}, /* not used */
2505 /* 29 */{ NULL, 0, 0, 0}, /* not used */
2506 /* 30 */{ NULL, 0, 0, 0}, /* not used */
2507 /* 31 */{ NULL, 0, 0, 0}, /* not used */
2508 #ifdef PFM_PMU_USES_DBR
2509 /* 32 */{ pfm_write_ibrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)},
2510 /* 33 */{ pfm_write_dbrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)}
2511 #endif
2512 };
     /* number of entries in pfm_cmd_tab (34 with PFM_PMU_USES_DBR, 32 without) */
2513 #define PFM_CMD_COUNT   ARRAY_SIZE(pfm_cmd_tab)
2514
/*
 * check_task_state: verify that task is in a state where it can be operated
 * on by another task.
 *
 * The task must be TASK_ZOMBIE or TASK_STOPPED, otherwise -EBUSY is
 * returned. On SMP, wait_task_inactive() is additionally called before
 * returning 0.
 */
2515 static int
2516 check_task_state(struct task_struct *task)
2517 {
2518         int ret = 0;
2519 #ifdef CONFIG_SMP
2520         /* We must wait until the state has been completely
2521          * saved. There can be situations where the reader arrives
2522          * after the task is marked as STOPPED but before pfm_save_regs()
2523          * is completed.
2524          */
2525         if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) return -EBUSY;
2526         DBprintk(("before wait_task_inactive [%d] state %ld\n", task->pid, task->state));
2527         wait_task_inactive(task);
2528         DBprintk(("after wait_task_inactive [%d] state %ld\n", task->pid, task->state));
2529 #else
2530         if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) {
2531                 DBprintk(("warning [%d] not in stable state %ld\n", task->pid, task->state));
2532                 ret = -EBUSY;
2533         }
2534 #endif
2535         return ret;
2536 }
2537
/*
 * sys_perfmonctl: entry point of the perfmonctl() system call.
 *
 * Validates the command and its argument count, checks user access to the
 * argument area, resolves the target task (for commands that take a pid),
 * checks access rights to the perfmon context (for commands that need one)
 * and finally dispatches to the handler found in pfm_cmd_tab.
 *
 * regs is obtained from the address of the trailing stack parameter;
 * arg5..arg8 are otherwise unused.
 *
 * Returns:
 *      -ENOSYS : perfmon was disabled at initialization time
 *      -EINVAL : invalid command, bad argument count, or no context attached
 *                to the target task for a command that requires one
 *      -EFAULT : argument area fails access_ok()
 *      -EPERM  : pid < 2, pfm_bad_permissions() on the target, or the
 *                context is not accessible to the caller
 *      -ESRCH  : no task with that pid
 *      otherwise the value of check_task_state() if non-zero, or the return
 *      value of the command handler.
 *
 * A reference taken on a foreign task with get_task_struct() is dropped at
 * abort_call on every path.
 */
2538 asmlinkage long
2539 sys_perfmonctl (pid_t pid, int cmd, void *arg, int count, long arg5, long arg6, long arg7,
2540                 long arg8, long stack)
2541 {
2542         struct pt_regs *regs = (struct pt_regs *)&stack;
2543         struct task_struct *task = current;
2544         pfm_context_t *ctx;
2545         size_t sz;
2546         int ret, narg;
2547
2548         /*
2549          * reject any call if perfmon was disabled at initialization time
2550          */
2551         if (PFM_IS_DISABLED()) return -ENOSYS;
2552
2553         DBprintk(("cmd=%d idx=%d valid=%d narg=0x%x\n", cmd, PFM_CMD_IDX(cmd),
2554                   PFM_CMD_IS_VALID(cmd), PFM_CMD_NARG(cmd)));
2555
2556         if (PFM_CMD_IS_VALID(cmd) == 0) return -EINVAL;
2557
2558         /* ignore arguments when command has none */
2559         narg = PFM_CMD_NARG(cmd);
2560         if ((narg == PFM_CMD_ARG_MANY  && count == 0) || (narg > 0 && narg != count)) return -EINVAL;
2561
2562         sz = PFM_CMD_ARG_SIZE(cmd);
2563
2564         if (PFM_CMD_READ_ARG(cmd) && !access_ok(VERIFY_READ, arg, sz*count)) return -EFAULT;
2565
2566         if (PFM_CMD_RW_ARG(cmd) && !access_ok(VERIFY_WRITE, arg, sz*count)) return -EFAULT;
2567
2568         if (PFM_CMD_USE_PID(cmd))  {
2569                 /*
2570                  * XXX: may need to fine tune this one
2571                  */
2572                 if (pid < 2) return -EPERM;
2573
2574                 if (pid != current->pid) {
2575
2576                         ret = -ESRCH;
2577
2578                         read_lock(&tasklist_lock);
2579
2580                         task = find_task_by_pid(pid);
2581
                             /* take a reference while tasklist_lock is still held */
2582                         if (task) get_task_struct(task);
2583
2584                         read_unlock(&tasklist_lock);
2585
2586                         if (!task) goto abort_call;
2587
2588                         ret = -EPERM;
2589
2590                         if (pfm_bad_permissions(task)) goto abort_call;
2591
2592                         if (PFM_CMD_CHK(cmd)) {
2593                                 ret = check_task_state(task);
2594                                 if (ret != 0) goto abort_call;
2595                         }
2596                 }
2597         }
2598
2599         ctx = task->thread.pfm_context;
2600
2601         if (PFM_CMD_USE_CTX(cmd)) {
2602                 ret = -EINVAL;
2603                if (ctx == NULL) {
2604                         DBprintk(("no context for task %d\n", task->pid));
2605                         goto abort_call;
2606                }
2607                ret = -EPERM;
2608                /*
2609                 * we only grant access to the context if:
2610                 *       - the caller is the creator of the context (ctx_owner)
2611                 *  OR   - the context is attached to the caller AND The context IS NOT
2612                 *         in protected mode
2613                 */
2614                if (ctx->ctx_owner != current && (ctx->ctx_fl_protected || task != current)) {
2615                                 DBprintk(("context protected, no access for [%d]\n", task->pid));
2616                                 goto abort_call;
2617                }
2618         }
2619
             /* dispatch to the command handler */
2620         ret = (*pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_func)(task, ctx, arg, count, regs);
2621
2622 abort_call:
             /* drop the reference taken on a foreign task above */
2623         if (task && task != current) put_task_struct(task);
2624
2625         return ret;
2626 }
2627
2628 /*
2629  * send SIGPROF to the registered notify task (ctx_notify_task), must be
2630  * invoked when it is safe to send a signal, e.g., not holding any runqueue
2631  * related locks.
      *
      * The siginfo carries si_code PROF_OVFL, the pid of current as sender and
      * ctx_ovfl_regs[0] in si_pfm_ovfl[0].
      *
      * When the notify task is not current, ctx->ctx_lock and then
      * tasklist_lock (read) are held around send_sig_info() and released in
      * the reverse order.
      *
      * Return:
      *      -EINVAL : no notify task (either on entry, or found cleared once
      *                ctx_lock is held; in the latter case ctx_fl_block is
      *                also cleared)
      *      otherwise the return value of send_sig_info()
2632  */
2633 static int
2634 pfm_notify_user(pfm_context_t *ctx)
2635 {
2636         struct siginfo si;
2637         int ret;
2638
2639         if (ctx->ctx_notify_task == NULL) {
2640                 DBprintk(("[%d] no notifier\n", current->pid));
2641                 return -EINVAL;
2642         }
2643
2644         si.si_errno    = 0;
2645         si.si_addr     = NULL;
2646         si.si_pid      = current->pid; /* who is sending */
2647         si.si_signo    = SIGPROF;
2648         si.si_code     = PROF_OVFL;
2649
2650         si.si_pfm_ovfl[0] = ctx->ctx_ovfl_regs[0];
2651
2652         /*
2653          * when the target of the signal is not ourself, we have to be more
2654          * careful. The notify_task may be cleared by the target task itself
2655          * in release_thread(). We must ensure mutual exclusion here such that
2656          * the signal is delivered (even to a dying task) safely.
2657          */
2658
2659         if (ctx->ctx_notify_task != current) {
2660                 /*
2661                  * grab the notification lock for this task
2662                  * This guarantees that the sequence: test + send_signal
2663                  * is atomic with regards to the ctx_notify_task field.
2664                  *
2665                  * We need a spinlock and not just an atomic variable for this.
2666                  *
2667                  */
2668                 spin_lock(&ctx->ctx_lock);
2669
2670                 /*
2671                  * now notify_task cannot be modified until we're done
2672                  * if NULL, then it got modified while we were in the handler
2673                  */
2674                 if (ctx->ctx_notify_task == NULL) {
2675
2676                         spin_unlock(&ctx->ctx_lock);
2677
2678                         /*
2679                          * If we've lost the notified task, then we will run
2680                          * to completion but keep the PMU frozen. Results
2681                          * will be incorrect anyway. We do not kill task
2682                          * to leave it possible to attach perfmon context
2683                          * to already running task.
2684                          */
2685                         printk("perfmon: pfm_notify_user() lost notify_task\n");
2686                         DBprintk_ovfl(("notification task has disappeared !\n"));
2687
2688                         /* we cannot afford to block now */
2689                         ctx->ctx_fl_block = 0;
2690
2691                         return  -EINVAL;
2692                 }
2693
2694                 /*
2695                  * required by send_sig_info() to make sure the target
2696                  * task does not disappear on us.
2697                  */
2698                 read_lock(&tasklist_lock);
2699         }
2700         /*
2701          * in this case, we don't stop the task, we let it go on. It will
2702          * necessarily go to the signal handler (if any) when it goes back to
2703          * user mode.
2704          */
2705         DBprintk_ovfl(("[%d] sending notification to [%d]\n",
2706                         current->pid, ctx->ctx_notify_task->pid));
2707
2708         /*
2709          * this call is safe in an interrupt handler, so does read_lock() on tasklist_lock
2710          */
2711         ret = send_sig_info(SIGPROF, &si, ctx->ctx_notify_task);
2712         if (ret) {
2713                 printk("perfmon: send_sig_info(process %d, SIGPROF)=%d\n",
2714                                 ctx->ctx_notify_task->pid, ret);
2715         }
2716
2717         /*
2718          * now undo the protections in order
2719          * (same test as the one used to take the locks above)
2720          */
2720         if (ctx->ctx_notify_task != current) {
2721                 read_unlock(&tasklist_lock);
2722                 spin_unlock(&ctx->ctx_lock);
2723         }
2724         return ret;
2725 }
2726
/*
 * pfm_ovfl_block_reset: deferred overflow work executed by the current task.
 *
 * Clears thread.pfm_ovfl_block_reset and TIF_NOTIFY_RESUME, then acts on
 * the trap reason recorded in the context (which is reset to
 * PFM_TRAP_REASON_NONE):
 *      PFM_TRAP_REASON_RESET    : reset the overflowed PMDs and unfreeze
 *      PFM_TRAP_REASON_SIG      : notify the user, nothing else
 *      PFM_TRAP_REASON_BLOCKSIG : notify the user, sleep on ctx_restart_sem,
 *                                 then reset and unfreeze once woken up
 *
 * Nothing is restarted when the notification fails or when the sleep
 * returns a negative value. Any other reason value falls through to the
 * sleep on ctx_restart_sem.
 */
2727 void
2728 pfm_ovfl_block_reset(void)
2729 {
2730         struct thread_struct *th = &current->thread;
2731         pfm_context_t *ctx = current->thread.pfm_context;
2732         unsigned int reason;
2733         int ret;
2734
2735         /*
2736          * clear the flag, to make sure we won't get here
2737          * again
2738          */
2739         th->pfm_ovfl_block_reset = 0;
2740         clear_thread_flag(TIF_NOTIFY_RESUME);
2741
2742         /*
2743          * do some sanity checks first
2744          */
2745         if (!ctx) {
2746                 printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
2747                 return;
2748         }
2749         /*
2750          * extract reason for being here and clear
2751          */
2752         reason = ctx->ctx_fl_trap_reason;
2753         ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
2754
2755         DBprintk(("[%d] reason=%d\n", current->pid, reason));
2756
2757         /*
2758          * just here for a reset (non-blocking context only)
2759          */
2760         if (reason == PFM_TRAP_REASON_RESET) goto non_blocking;
2761
2762         /*
2763          * first notify user. This can fail if notify_task has disappeared.
2764          */
2765         if (reason == PFM_TRAP_REASON_SIG || reason == PFM_TRAP_REASON_BLOCKSIG) {
2766                 ret = pfm_notify_user(ctx);
2767                 if (ret) return;
2768         }
2769
2770         /*
2771          * came here just to signal (non-blocking)
2772          */
2773         if (reason == PFM_TRAP_REASON_SIG) return;
2774
2775         DBprintk(("[%d] before sleeping\n", current->pid));
2776
2777         /*
2778          * may go through without blocking on SMP systems
2779          * if restart has been received already by the time we call down()
2780          */
2781         ret = down_interruptible(&ctx->ctx_restart_sem);
2782
2783         DBprintk(("[%d] after sleeping ret=%d\n", current->pid, ret));
2784
2785         /*
2786          * in case of interruption of down() we don't restart anything
2787          */
2788         if (ret >= 0) {
2789
             /* the PFM_TRAP_REASON_RESET path jumps straight into this block */
2790 non_blocking:
2791                 /* we reactivate on context switch */
2792                 ctx->ctx_fl_frozen = 0;
2793                 /*
2794                  * the ovfl_sem is cleared by the restart task and this is safe because we always
2795                  * use the local reference
2796                  */
2797
2798                 pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET);
2799
2800                 ctx->ctx_ovfl_regs[0] = 0UL;
2801
2802                 /*
2803                  * Unlock sampling buffer and reset index atomically
2804                  * XXX: not really needed when blocking
2805                  */
2806                 if (CTX_HAS_SMPL(ctx)) {
2807                         ctx->ctx_psb->psb_hdr->hdr_count = 0;
2808                         ctx->ctx_psb->psb_index = 0;
2809                 }
2810
2811                 pfm_unfreeze_pmu();
2812
2813                 /* state restored, can go back to work (user mode) */
2814         }
2815 }
2816
2817 /*
2818  * This function will record an entry in the sampling buffer if it is not full already.
      *
      * An entry consists of a perfmon_smpl_entry_t header (pid of current, cpu,
      * last reset value of the lowest-numbered overflowed PMD, instruction
      * pointer, overflow mask, time stamp) followed by the value of each PMD
      * selected in ctx_smpl_regs[0], in increasing index order.
      *
      * The slot is reserved with an atomic fetch-and-add on psb_index and is
      * published by incrementing hdr_count in the buffer header.
      *
      * The task argument is not used; regs may be NULL, in which case the
      * recorded ip is 0.
      *
2819  * Return:
2820  *      0 : buffer is not full (did not BECOME full: still space or was already full)
2821  *      1 : buffer is full (recorded the last entry)
2822  */
2823 static int
2824 pfm_record_sample(struct task_struct *task, pfm_context_t *ctx, unsigned long ovfl_mask, struct pt_regs *regs)
2825 {
2826         pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;
2827         unsigned long *e, m, idx;
2828         perfmon_smpl_entry_t *h;
2829         int j;
2830
2831
2832         idx = ia64_fetch_and_add(1, &psb->psb_index);
2833         DBprintk_ovfl(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));
2834
2835         /*
2836          * XXX: there is a small chance that we could run out on index before resetting
2837          * but index is unsigned long, so it will take some time.....
2838          * We use > instead of == because fetch_and_add() is off by one (see below)
2839          *
2840          * This case can happen in non-blocking mode or with multiple processes.
2841          * For non-blocking, we need to reload and continue.
2842          */
2843         if (idx > psb->psb_entries) return 0;
2844
2845         /* first entry is really entry 0, not 1 caused by fetch_and_add */
2846         idx--;
2847
2848         h = (perfmon_smpl_entry_t *)(((char *)psb->psb_addr) + idx*(psb->psb_entry_size));
2849
2850         /*
2851          * initialize entry header
2852          */
2853         h->pid  = current->pid;
2854         h->cpu  = get_cpu(); /* paired with put_cpu() on both return paths below */
2855         h->last_reset_value = ovfl_mask ? ctx->ctx_soft_pmds[ffz(~ovfl_mask)].lval : 0UL;
2856         h->ip   = regs ? regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3): 0x0UL;
2857         h->regs = ovfl_mask;                    /* which registers overflowed */
2858
2859         /* guaranteed to monotonically increase on each cpu */
2860         h->stamp  = pfm_get_stamp();
2861
2862         /* position for first pmd */
2863         e = (unsigned long *)(h+1);
2864
2865         /*
2866          * selectively store PMDs in increasing index number
2867          */
2868         m = ctx->ctx_smpl_regs[0];
2869         for (j=0; m; m >>=1, j++) {
2870
2871                 if ((m & 0x1) == 0) continue;
2872
2873                 if (PMD_IS_COUNTING(j)) {
2874                         *e  =  pfm_read_soft_counter(ctx, j);
2875                 } else {
2876                         *e = ia64_get_pmd(j); /* slow */
2877                 }
2878                 DBprintk_ovfl(("e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
2879                 e++;
2880         }
2881         pfm_stats[h->cpu].pfm_recorded_samples_count++;
2882
2883         /*
2884          * make the new entry visible to user, needs to be atomic
2885          */
2886         ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count);
2887
2888         DBprintk_ovfl(("index=%ld entries=%ld hdr_count=%ld\n",
2889                                 idx, psb->psb_entries, psb->psb_hdr->hdr_count));
2890         /*
2891          * sampling buffer full ?
2892          */
2893         if (idx == (psb->psb_entries-1)) {
2894                 DBprintk_ovfl(("sampling buffer full\n"));
2895                 /*
2896                  * XXX: must reset buffer in blocking mode and lost notified
2897                  */
2898                 pfm_stats[h->cpu].pfm_full_smpl_buffer_count++;
2899                 put_cpu();
2900                 return 1;
2901         }
2902         put_cpu();
2903         return 0;
2904 }
2905
2906 /*
2907  * main overflow processing routine.
2908  * it can be called from the interrupt path or explicitly during the context switch code
2909  * Arguments:
2910  *      mode: 0=coming from PMU interrupt, 1=coming from ctxsw
      *      task: owner of the PMU state being processed (may differ from current)
      *      ctx : perfmon context of task
      *      pmc0: value of PMC0, bit 0 is the freeze bit, overflow status bits
      *            start at bit PMU_FIRST_COUNTER
      *      regs: interrupted register state, may be NULL
2911  *
2912  * Return:
2913  *      new value of pmc[0]. if 0x0 then unfreeze, else keep frozen
      *      (0x1 is returned for a task not using perfmon and when a user
      *      notification is pending; 0x0 when the freeze bit was unexpectedly
      *      clear and when no notification is needed)
      *
      * The whole routine runs with preemption disabled; every return path
      * re-enables it with preempt_enable_no_resched().
2914  */
2915 static unsigned long
2916 pfm_overflow_handler(int mode, struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
2917 {
2918         struct thread_struct *t;
2919         unsigned long mask;
2920         unsigned long old_val;
2921         unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL;
2922         int i;
2923         int ret = 1;
2924         /*
2925          * It is never safe to access the task for which the overflow interrupt is destined
2926          * using the current variable as the interrupt may occur in the middle of a context switch
2927          * where current does not hold the task that is running yet.
2928          *
2929          * For monitoring, however, we do need to get access to the task which caused the overflow
2930          * to account for overflow on the counters.
2931          *
2932          * We accomplish this by maintaining a current owner of the PMU per CPU. During context
2933          * switch the ownership is changed in a way such that the reflected owner is always the
2934          * valid one, i.e. the one that caused the interrupt.
2935          */
2936
2937         preempt_disable();
2938
2939         t   = &task->thread;
2940
2941         /*
2942          * XXX: debug test
2943          * Don't think this could happen given upfront tests
2944          */
2945         if ((t->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system == 0) {
2946                 printk(KERN_DEBUG "perfmon: Spurious overflow interrupt: process %d not "
2947                        "using perfmon\n", task->pid);
2948                 preempt_enable_no_resched();
2949                 return 0x1;
2950         }
2951         /*
2952          * sanity test. Should never happen
2953          */
2954         if ((pmc0 & 0x1) == 0) {
2955                 printk(KERN_DEBUG "perfmon: pid %d pmc0=0x%lx assumption error for freeze bit\n",
2956                        task->pid, pmc0);
2957                 preempt_enable_no_resched();
2958                 return 0x0;
2959         }
2960
             /* bit 0 of mask now corresponds to counter PMU_FIRST_COUNTER */
2961         mask = pmc0 >> PMU_FIRST_COUNTER;
2962
2963         DBprintk_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
2964                   " mode used_pmds=0x%lx used_pmcs=0x%lx reload_pmcs=0x%lx\n",
2965                         pmc0, task->pid, (regs ? regs->cr_iip : 0),
2966                         CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
2967                         ctx->ctx_used_pmds[0],
2968                         ctx->ctx_used_pmcs[0],
2969                         ctx->ctx_reload_pmcs[0]));
2970
2971         /*
2972          * First we update the virtual counters
2973          */
2974         for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) {
2975
2976                 /* skip pmd which did not overflow */
2977                 if ((mask & 0x1) == 0) continue;
2978
2979                 DBprintk_ovfl(("pmd[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n",
2980                           i, ia64_get_pmd(i), ctx->ctx_soft_pmds[i].val));
2981
2982                 /*
2983                  * Note that the pmd is not necessarily 0 at this point as qualified events
2984                  * may have happened before the PMU was frozen. The residual count is not
2985                  * taken into consideration here but will be with any read of the pmd via
2986                  * pfm_read_pmds().
2987                  */
2988                 old_val = ctx->ctx_soft_pmds[i].val;
2989                 ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.ovfl_val;
2990
2991                 /*
2992                  * check for overflow condition
2993                  * (the software counter wrapped around if it is now smaller than before)
2993                  */
2994                 if (old_val > ctx->ctx_soft_pmds[i].val) {
2995
2996                         ovfl_pmds |= 1UL << i;
2997
2998                         if (PMC_OVFL_NOTIFY(ctx, i)) {
2999                                 ovfl_notify |= 1UL << i;
3000                         }
3001                 }
3002                 DBprintk_ovfl(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n",
3003                           i, ctx->ctx_soft_pmds[i].val, old_val,
3004                           ia64_get_pmd(i) & pmu_conf.ovfl_val, ovfl_pmds, ovfl_notify));
3005         }
3006
3007         /*
3008          * check for sampling buffer
3009          *
3010          * if present, record sample only when a 64-bit counter has overflowed.
3011          * We propagate notification ONLY when buffer becomes full.
3012          */
3013         if(CTX_HAS_SMPL(ctx) && ovfl_pmds) {
3014                 ret = pfm_record_sample(task, ctx, ovfl_pmds, regs);
3015                 if (ret == 1) {
3016                         /*
3017                          * Sampling buffer became full
3018                          * If no notification was requested, then we reset buffer index
3019                          * and reset registers (done below) and resume.
3020                          * If notification requested, then defer reset until pfm_restart()
3021                          */
3022                         if (ovfl_notify == 0UL) {
3023                                 ctx->ctx_psb->psb_hdr->hdr_count = 0UL;
3024                                 ctx->ctx_psb->psb_index          = 0UL;
3025                         }
3026                 } else {
3027                         /*
3028                          * sample recorded in buffer, no need to notify user
3029                          */
3030                         ovfl_notify = 0UL;
3031                 }
3032         }
3033
3034         /*
3035          * No overflow requiring a user level notification
3036          */
3037         if (ovfl_notify == 0UL) {
3038                 if (ovfl_pmds)
3039                         pfm_reset_regs(ctx, &ovfl_pmds, PFM_PMD_SHORT_RESET);
3040                 preempt_enable_no_resched();
3041                 return 0x0UL;
3042         }
3043
3044         /*
3045          * keep track of what to reset when unblocking
3046          */
3047         ctx->ctx_ovfl_regs[0]  = ovfl_pmds;
3048
3049         DBprintk_ovfl(("block=%d notify [%d] current [%d]\n",
3050                 ctx->ctx_fl_block,
3051                 ctx->ctx_notify_task ? ctx->ctx_notify_task->pid: -1,
3052                 current->pid ));
3053
3054         /*
3055          * ctx_notify_task could already be NULL, checked in pfm_notify_user()
3056          */
3057         if (CTX_OVFL_NOBLOCK(ctx) == 0 && ctx->ctx_notify_task != task) {
3058                 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCKSIG;
3059         } else {
3060                 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_SIG;
3061         }
3062         /*
3063          * we cannot block in system wide mode and we do not go
3064          * through the PMU ctxsw code. Therefore we can generate
3065          * the notification here. In system wide mode, the current
3066          * task maybe different from the task controlling the session
3067          * on this CPU, therefore owner can be different from current.
3068          *
3069          * In per-process mode, this function gets called from
3070          * the interrupt handler or pfm_load_regs(). The mode argument
3071          * tells where we are coming from. When coming from the interrupt
3072          * handler, it is safe to notify (send signal) right here because
3073          * we do not hold any runqueue locks needed by send_sig_info().
3074          *
3075          * However when coming from ctxsw, we cannot send the signal here.
3076          * It must be deferred until we are sure we do not hold any runqueue
3077          * related locks. The current task maybe different from the owner
3078          * only in UP mode. The deferral is implemented using the
3079          * TIF_NOTIFY_RESUME mechanism. In this case, the pending work
3080          * is checked when the task is about to leave the kernel (see
3081          * entry.S). As of this version of perfmon, a kernel only
3082          * task cannot be monitored in per-process mode. Therefore,
3083          * when this function gets called from pfm_load_regs(), we know
3084          * we have a user level task which will eventually either exit
3085          * or leave the kernel, and thereby go through the checkpoint
3086          * for TIF_*.
3087          */
3088         if (ctx->ctx_fl_system || mode == 0) {
                     /* the return value of pfm_notify_user() is not checked here */
3089                 pfm_notify_user(ctx);
3090                 ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
3091         } else {
3092                 struct thread_info *info;
3093
3094                 /*
3095                  * given that TIF_NOTIFY_RESUME is not specific to
3096                  * perfmon, we need to have a second level check to
3097                  * verify the source of the notification.
3098                  */
3099                 task->thread.pfm_ovfl_block_reset = 1;
3100                 /*
3101                  * when coming from ctxsw, current still points to the
3102                  * previous task, therefore we must work with task and not current.
3103                  */
3104                 info = ((struct thread_info *) ((char *) task + IA64_TASK_SIZE));
3105                 set_bit(TIF_NOTIFY_RESUME, &info->flags);
3106         }
3107
3108         /*
3109          * keep the PMU frozen until either pfm_restart() or
3110          * task completes (non-blocking or notify_task gone).
3111          */
3112         ctx->ctx_fl_frozen = 1;
3113
3114         DBprintk_ovfl(("current [%d] owner [%d] mode=%d return pmc0=0x%x must_block=%ld reason=%d\n",
3115                 current->pid,
3116                 PMU_OWNER() ? PMU_OWNER()->pid : -1,
3117                 mode,
3118                 ctx->ctx_fl_frozen ? 0x1 : 0x0,
3119                 t->pfm_ovfl_block_reset,
3120                 ctx->ctx_fl_trap_reason));
3121
3122         preempt_enable_no_resched();
3123         return 0x1UL;
3124 }
3125
/*
 * pfm_interrupt_handler: PMU overflow interrupt handler.
 *
 * Counts the interrupt in the per-CPU statistics. If an alternate handler
 * is registered (pfm_alternate_intr_handler), the interrupt is delegated
 * to it. Otherwise PMC0 is read and, when at least one bit other than
 * bit 0 is set and this CPU has a PMU owner, the overflow is processed by
 * pfm_overflow_handler() in mode 0. The value it returns is written to
 * PMC0 when the owner is current or the context is system-wide, and saved
 * in the owner's thread.pmc[0] otherwise. Anything else is counted as a
 * spurious overflow interrupt.
 *
 * get_cpu() taken on entry is released on every exit path
 * (put_cpu() or put_cpu_no_resched()).
 *
 * Always returns IRQ_HANDLED.
 */
3126 static irqreturn_t
3127 pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
3128 {
3129         u64 pmc0;
3130         struct task_struct *task;
3131         pfm_context_t *ctx;
3132
3133         pfm_stats[get_cpu()].pfm_ovfl_intr_count++;
3134
3135         /*
3136          * if an alternate handler is registered, just bypass the default one
3137          */
3138         if (pfm_alternate_intr_handler) {
3139                 (*pfm_alternate_intr_handler->handler)(irq, arg, regs);
3140                 put_cpu();
3141                 return IRQ_HANDLED;
3142         }
3143
3144         /*
3145          * srlz.d done before arriving here
3146          *
3147          * This is slow
3148          */
3149         pmc0 = ia64_get_pmc(0);
3150
3151         /*
3152          * if we have some pending bits set
3153          * assumes : if any PM[0].bit[63-1] is set, then PMC[0].fr = 1
3154          */
3155         if ((pmc0 & ~0x1UL)!=0UL && (task=PMU_OWNER())!= NULL) {
3156                 /*
3157                  * we assume that pmc0.fr is always set here
3158                  */
3159                 ctx = task->thread.pfm_context;
3160
3161                 /* sanity check */
3162                 if (!ctx) {
3163                         printk(KERN_DEBUG "perfmon: Spurious overflow interrupt: process %d has "
3164                                "no PFM context\n", task->pid);
3165                         put_cpu();
3166                         return IRQ_HANDLED;
3167                 }
3168
3169                 /*
3170                  * assume PMC[0].fr = 1 at this point
3171                  */
3172                 pmc0 = pfm_overflow_handler(0, task, ctx, pmc0, regs);
3173                 /*
3174                  * we can only update pmc0 when the overflow
3175                  * is for the current context or we are in system
3176                  * wide mode. In UP (per-task) the current
3177                  * task may not be the one owning the PMU,
3178                  * same thing for system-wide.
3179                  */
3180                 if (task == current || ctx->ctx_fl_system) {
3181                         /*
3182                          * We always clear the overflow status bits and either unfreeze
3183                          * or keep the PMU frozen.
3184                          */
3185                         ia64_set_pmc(0, pmc0);
3186                         ia64_srlz_d();
3187                 } else {
                             /* owner is not current: store the new pmc0 in its saved thread state */
3188                         task->thread.pmc[0] = pmc0;
3189                 }
3190         } else {
3191                 pfm_stats[smp_processor_id()].pfm_spurious_ovfl_intr_count++;
3192         }
3193         put_cpu_no_resched();
3194         return IRQ_HANDLED;
3195 }
3196
3197 /* for debug only */
3198 static int
3199 pfm_proc_info(char *page)
3200 {
3201         char *p = page;
3202         int i;
3203
3204         p += sprintf(p, "fastctxsw              : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No");
3205         p += sprintf(p, "ovfl_mask              : 0x%lx\n", pmu_conf.ovfl_val);
3206
3207         for(i=0; i < NR_CPUS; i++) {
3208                 if (cpu_online(i) == 0) continue;
3209                 p += sprintf(p, "CPU%-2d overflow intrs   : %lu\n", i, pfm_stats[i].pfm_ovfl_intr_count);
3210                 p += sprintf(p, "CPU%-2d spurious intrs   : %lu\n", i, pfm_stats[i].pfm_spurious_ovfl_intr_count);
3211                 p += sprintf(p, "CPU%-2d recorded samples : %lu\n", i, pfm_stats[i].pfm_recorded_samples_count);
3212                 p += sprintf(p, "CPU%-2d smpl buffer full : %lu\n", i, pfm_stats[i].pfm_full_smpl_buffer_count);
3213                 p += sprintf(p, "CPU%-2d syst_wide        : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_SYST_WIDE ? 1 : 0);
3214                 p += sprintf(p, "CPU%-2d dcr_pp           : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_DCR_PP ? 1 : 0);
3215                 p += sprintf(p, "CPU%-2d exclude idle     : %d\n", i, per_cpu(pfm_syst_info, i) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0);
3216                 p += sprintf(p, "CPU%-2d owner            : %d\n", i, pmu_owners[i].owner ? pmu_owners[i].owner->pid: -1);
3217         }
3218
3219         LOCK_PFS();
3220
3221         p += sprintf(p, "proc_sessions          : %u\n"
3222                         "sys_sessions           : %u\n"
3223                         "sys_use_dbregs         : %u\n"
3224                         "ptrace_use_dbregs      : %u\n",
3225                         pfm_sessions.pfs_task_sessions,
3226                         pfm_sessions.pfs_sys_sessions,
3227                         pfm_sessions.pfs_sys_use_dbregs,
3228                         pfm_sessions.pfs_ptrace_use_dbregs);
3229
3230         UNLOCK_PFS();
3231
3232         return p - page;
3233 }
3234
/*
 * /proc interface, for debug only
 *
 * Regenerates the complete report into page on every call, then exposes
 * the window [off, off+count) of it through *start.
 *
 * Sets *eof once the requested window reaches the end of the report.
 * Returns the number of bytes available at *start, never negative and
 * never more than count. The data argument is not used.
 */
static int
perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
{
        int total = pfm_proc_info(page);
        int avail;

        /* the window covers everything that is left: signal end of file */
        if (total <= off+count) {
                *eof = 1;
        }

        *start = page + off;

        /* what remains past the offset, limited to the request size */
        avail = total - off;
        if (avail > count) {
                avail = count;
        }

        return avail < 0 ? 0 : avail;
}
3251
3252 /*
3253  * we come here as soon as PFM_CPUINFO_SYST_WIDE is set. This happens
3254  * during pfm_enable() hence before pfm_start(). We cannot assume monitoring
3255  * is active or inactive based on mode. We must rely on the value in
3256  * cpu_data(i)->pfm_syst_info
3257  */
3258 void
3259 pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin)
3260 {
3261         struct pt_regs *regs;
3262         unsigned long dcr;
3263         unsigned long dcr_pp;
3264
3265         preempt_disable();
3266         dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0;
3267
3268         /*
3269          * pid 0 is guaranteed to be the idle task. There is one such task with pid 0
3270          * on every CPU, so we can rely on the pid to identify the idle task.
3271          */
3272         if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) {
3273                 regs = (struct pt_regs *)((unsigned long) task + IA64_STK_OFFSET);
3274                 regs--;
3275                 ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0;
3276                 preempt_enable();
3277                 return;
3278         }
3279         /*
3280          * if monitoring has started
3281          */
3282         if (dcr_pp) {
3283                 dcr = ia64_get_dcr();
3284                 /*
3285                  * context switching in?
3286                  */
3287                 if (is_ctxswin) {
3288                         /* mask monitoring for the idle task */
3289                         ia64_set_dcr(dcr & ~IA64_DCR_PP);
3290                         pfm_clear_psr_pp();
3291                         ia64_srlz_i();
3292                         preempt_enable();
3293                         return;
3294                 }
3295                 /*
3296                  * context switching out
3297                  * restore monitoring for next task
3298                  *
3299                  * Due to inlining this odd if-then-else construction generates
3300                  * better code.
3301                  */
3302                 ia64_set_dcr(dcr |IA64_DCR_PP);
3303                 pfm_set_psr_pp();
3304                 ia64_srlz_i();
3305         }
3306         preempt_enable();
3307 }
3308
3309 void
3310 pfm_save_regs (struct task_struct *task)
3311 {
3312         pfm_context_t *ctx;
3313         unsigned long mask;
3314         u64 psr;
3315         int i;
3316
3317         preempt_disable();
3318
3319         ctx = task->thread.pfm_context;
3320
3321
3322         /*
3323          * save current PSR: needed because we modify it
3324          */
3325         psr = pfm_get_psr();
3326
3327         /*
3328          * stop monitoring:
3329          * This is the last instruction which can generate an overflow
3330          *
3331          * We do not need to set psr.sp because, it is irrelevant in kernel.
3332          * It will be restored from ipsr when going back to user level
3333          */
3334         pfm_clear_psr_up();
3335         ia64_srlz_i();
3336
3337         ctx->ctx_saved_psr = psr;
3338
3339 #ifdef CONFIG_SMP
3340         /*
3341          * We do not use a lazy scheme in SMP because
3342          * of the new scheduler which masks interrupts
3343          * during low-level context switch. So we save
3344          * all the PMD register we use and restore on
3345          * ctxsw in.
3346          *
3347          * release ownership of this PMU.
3348          * must be done before we save the registers.
3349          */
3350         SET_PMU_OWNER(NULL);
3351
3352         /*
3353          * save PMDs
3354          */
3355         ia64_srlz_d();
3356
3357         mask = ctx->ctx_used_pmds[0];
3358         for (i=0; mask; i++, mask>>=1) {
3359                 if (mask & 0x1) task->thread.pmd[i] =ia64_get_pmd(i);
3360         }
3361
3362         /*
3363          * save pmc0
3364          */
3365         task->thread.pmc[0] = ia64_get_pmc(0);
3366
3367         /*
3368          * force a full reload
3369          */
3370         atomic_set(&ctx->ctx_last_cpu, -1);
3371 #endif
3372         preempt_enable();
3373 }
3374
3375 static void
3376 pfm_lazy_save_regs (struct task_struct *task)
3377 {
3378         pfm_context_t *ctx;
3379         struct thread_struct *t;
3380         unsigned long mask;
3381         int i;
3382
3383         preempt_disable();
3384         DBprintk(("on [%d] by [%d]\n", task->pid, current->pid));
3385
3386         t   = &task->thread;
3387         ctx = task->thread.pfm_context;
3388
3389         /*
3390          * do not own the PMU
3391          */
3392         SET_PMU_OWNER(NULL);
3393
3394         ia64_srlz_d();
3395
3396         /*
3397          * XXX needs further optimization.
3398          * Also must take holes into account
3399          */
3400         mask = ctx->ctx_used_pmds[0];
3401         for (i=0; mask; i++, mask>>=1) {
3402                 if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
3403         }
3404
3405         /* save pmc0 */
3406         t->pmc[0] = ia64_get_pmc(0);
3407
3408         /* not owned by this CPU */
3409         atomic_set(&ctx->ctx_last_cpu, -1);
3410         preempt_enable();
3411 }
3412
3413 void
3414 pfm_load_regs (struct task_struct *task)
3415 {
3416         struct thread_struct *t;
3417         pfm_context_t *ctx;
3418         struct task_struct *owner;
3419         unsigned long mask;
3420         u64 psr;
3421         int i;
3422
3423         preempt_disable();
3424
3425         owner = PMU_OWNER();
3426         ctx   = task->thread.pfm_context;
3427         t     = &task->thread;
3428
3429         if (ctx == NULL) {
3430                 preempt_enable();
3431                 printk("perfmon: pfm_load_regs: null ctx for [%d]\n", task->pid);
3432                 return;
3433         }
3434
3435         /*
3436          * we restore ALL the debug registers to avoid picking up
3437          * stale state.
3438          *
3439          * This must be done even when the task is still the owner
3440          * as the registers may have been modified via ptrace()
3441          * (not perfmon) by the previous task.
3442          *
3443          * XXX: dealing with this in a lazy fashion requires modifications
3444          * to the way the the debug registers are managed. This is will done
3445          * in the next version of perfmon.
3446          */
3447         if (ctx->ctx_fl_using_dbreg) {
3448                 for (i=0; i < (int) pmu_conf.num_ibrs; i++) {
3449                         ia64_set_ibr(i, t->ibr[i]);
3450                 }
3451                 ia64_srlz_i();
3452                 for (i=0; i < (int) pmu_conf.num_dbrs; i++) {
3453                         ia64_set_dbr(i, t->dbr[i]);
3454                 }
3455                 ia64_srlz_d();
3456         }
3457
3458         /*
3459          * if we were the last user, then nothing to do except restore psr
3460          * this path cannot be used in SMP
3461          */
3462         if (owner == task) {
3463                 if ((unsigned int) atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
3464                         DBprintk(("invalid last_cpu=%d for [%d]\n",
3465                                 atomic_read(&ctx->ctx_last_cpu), task->pid));
3466
3467                 psr = ctx->ctx_saved_psr;
3468                 pfm_set_psr_l(psr);
3469                 preempt_enable();
3470                 return;
3471         }
3472
3473         /*
3474          * someone else is still using the PMU, first push it out and
3475          * then we'll be able to install our stuff !
3476          *
3477          * not possible in SMP
3478          */
3479         if (owner) pfm_lazy_save_regs(owner);
3480
3481         /*
3482          * To avoid leaking information to the user level when psr.sp=0,
3483          * we must reload ALL implemented pmds (even the ones we don't use).
3484          * In the kernel we only allow PFM_READ_PMDS on registers which
3485          * we initialized or requested (sampling) so there is no risk there.
3486          *
3487          * As an optimization, we will only reload the PMD that we use when
3488          * the context is in protected mode, i.e. psr.sp=1 because then there
3489          * is no leak possible.
3490          */
3491         mask = pfm_sysctl.fastctxsw || ctx->ctx_fl_protected ?  ctx->ctx_used_pmds[0] : ctx->ctx_reload_pmds[0];
3492         for (i=0; mask; i++, mask>>=1) {
3493                 if (mask & 0x1) ia64_set_pmd(i, t->pmd[i] & pmu_conf.ovfl_val);
3494         }
3495
3496         /*
3497          * PMC0 is never set in the mask because it is always restored
3498          * separately.
3499          *
3500          * ALL PMCs are systematically reloaded, unused registers
3501          * get their default (PAL reset) values to avoid picking up
3502          * stale configuration.
3503          */
3504         mask = ctx->ctx_reload_pmcs[0];
3505         for (i=0; mask; i++, mask>>=1) {
3506                 if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
3507         }
3508
3509         /*
3510          * manually invoke core interrupt handler
3511          * if the task had a pending overflow when it was ctxsw out.
3512          * Side effect on ctx_fl_frozen is possible.
3513          */
3514         if (t->pmc[0] & ~0x1) {
3515                 t->pmc[0] = pfm_overflow_handler(1, task, ctx, t->pmc[0], NULL);
3516         }
3517
3518         /*
3519          * unfreeze PMU if possible
3520          */
3521         if (ctx->ctx_fl_frozen == 0) pfm_unfreeze_pmu();
3522
3523         atomic_set(&ctx->ctx_last_cpu, smp_processor_id());
3524
3525         SET_PMU_OWNER(task);
3526
3527         /*
3528          * restore the psr we changed in pfm_save_regs()
3529          */
3530         psr = ctx->ctx_saved_psr;
3531         preempt_enable();
3532         pfm_set_psr_l(psr);
3533 }
3534
3535 /*
3536  * XXX: make this routine able to work with non current context
3537  */
3538 static void
3539 pfm_reset_pmu(struct task_struct *task)
3540 {
3541         struct thread_struct *t = &task->thread;
3542         pfm_context_t *ctx = t->pfm_context;
3543         int i;
3544
3545         if (task != current) {
3546                 printk("perfmon: invalid task in pfm_reset_pmu()\n");
3547                 return;
3548         }
3549         preempt_disable();
3550
3551         /* Let's make sure the PMU is frozen */
3552         pfm_freeze_pmu();
3553
3554         /*
3555          * install reset values for PMC. We skip PMC0 (done above)
3556          * XX: good up to 64 PMCS
3557          */
3558         for (i=1; (pmu_conf.pmc_desc[i].type & PFM_REG_END) == 0; i++) {
3559                 if ((pmu_conf.pmc_desc[i].type & PFM_REG_IMPL) == 0) continue;
3560                 ia64_set_pmc(i, PMC_DFL_VAL(i));
3561                 /*
3562                  * When restoring context, we must restore ALL pmcs, even the ones
3563                  * that the task does not use to avoid leaks and possibly corruption
3564                  * of the sesion because of configuration conflicts. So here, we
3565                  * initialize the entire set used in the context switch restore routine.
3566                  */
3567                 t->pmc[i] = PMC_DFL_VAL(i);
3568                 DBprintk(("pmc[%d]=0x%lx\n", i, t->pmc[i]));
3569         }
3570
3571         /*
3572          * clear reset values for PMD.
3573          * XXX: good up to 64 PMDS.
3574          */
3575         for (i=0; (pmu_conf.pmd_desc[i].type & PFM_REG_END) == 0; i++) {
3576                 if ((pmu_conf.pmd_desc[i].type & PFM_REG_IMPL) == 0) continue;
3577                 ia64_set_pmd(i, 0UL);
3578                 t->pmd[i] = 0UL;
3579         }
3580
3581         /*
3582          * On context switched restore, we must restore ALL pmc and ALL pmd even
3583          * when they are not actively used by the task. In UP, the incoming process
3584          * may otherwise pick up left over PMC, PMD state from the previous process.
3585          * As opposed to PMD, stale PMC can cause harm to the incoming
3586          * process because they may change what is being measured.
3587          * Therefore, we must systematically reinstall the entire
3588          * PMC state. In SMP, the same thing is possible on the
3589          * same CPU but also on between 2 CPUs.
3590          *
3591          * The problem with PMD is information leaking especially
3592          * to user level when psr.sp=0
3593          *
3594          * There is unfortunately no easy way to avoid this problem
3595          * on either UP or SMP. This definitively slows down the
3596          * pfm_load_regs() function.
3597          */
3598
3599          /*
3600           * We must include all the PMC in this mask to make sure we don't
3601           * see any side effect of a stale state, such as opcode matching
3602           * or range restrictions, for instance.
3603           *
3604           * We never directly restore PMC0 so we do not include it in the mask.
3605           */
3606         ctx->ctx_reload_pmcs[0] = pmu_conf.impl_pmcs[0] & ~0x1;
3607         /*
3608          * We must include all the PMD in this mask to avoid picking
3609          * up stale value and leak information, especially directly
3610          * at the user level when psr.sp=0
3611          */
3612         ctx->ctx_reload_pmds[0] = pmu_conf.impl_pmds[0];
3613
3614         /*
3615          * Keep track of the pmds we want to sample
3616          * XXX: may be we don't need to save/restore the DEAR/IEAR pmds
3617          * but we do need the BTB for sure. This is because of a hardware
3618          * buffer of 1 only for non-BTB pmds.
3619          *
3620          * We ignore the unimplemented pmds specified by the user
3621          */
3622         ctx->ctx_used_pmds[0] = ctx->ctx_smpl_regs[0];
3623         ctx->ctx_used_pmcs[0] = 1; /* always save/restore PMC[0] */
3624
3625         /*
3626          * useful in case of re-enable after disable
3627          */
3628         ctx->ctx_used_ibrs[0] = 0UL;
3629         ctx->ctx_used_dbrs[0] = 0UL;
3630
3631         ia64_srlz_d();
3632         preempt_enable();
3633 }
3634
3635 /*
3636  * This function is called when a thread exits (from exit_thread()).
3637  * This is a simplified pfm_save_regs() that simply flushes the current
3638  * register state into the save area taking into account any pending
3639  * overflow. This time no notification is sent because the task is dying
3640  * anyway. The inline processing of overflows avoids loosing some counts.
3641  * The PMU is frozen on exit from this call and is to never be reenabled
3642  * again for this task.
3643  *
3644  */
3645 void
3646 pfm_flush_regs (struct task_struct *task)
3647 {
3648         pfm_context_t *ctx;
3649         u64 pmc0;
3650         unsigned long mask2, val;
3651         int i;
3652
3653         ctx = task->thread.pfm_context;
3654
3655         if (ctx == NULL) return;
3656
3657         /*
3658          * that's it if context already disabled
3659          */
3660         if (ctx->ctx_flags.state == PFM_CTX_DISABLED) return;
3661
3662         preempt_disable();
3663         /*
3664          * stop monitoring:
3665          * This is the only way to stop monitoring without destroying overflow
3666          * information in PMC[0].
3667          * This is the last instruction which can cause overflow when monitoring
3668          * in kernel.
3669          * By now, we could still have an overflow interrupt in-flight.
3670          */
3671         if (ctx->ctx_fl_system) {
3672
3673
3674                 /* disable dcr pp */
3675                 ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
3676
3677                 /* stop monitoring */
3678                 pfm_clear_psr_pp();
3679
3680                 ia64_srlz_i();
3681
3682                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE);
3683                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP);
3684                 PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE);
3685         } else  {
3686
3687                 /* stop monitoring */
3688                 pfm_clear_psr_up();
3689
3690                 ia64_srlz_i();
3691
3692                 /* no more save/restore on ctxsw */
3693                 current->thread.flags &= ~IA64_THREAD_PM_VALID;
3694         }
3695
3696         /*
3697          * Mark the PMU as not owned
3698          * This will cause the interrupt handler to do nothing in case an overflow
3699          * interrupt was in-flight
3700          * This also guarantees that pmc0 will contain the final state
3701          * It virtually gives us full control on overflow processing from that point
3702          * on.
3703          * It must be an atomic operation.
3704          */
3705         SET_PMU_OWNER(NULL);
3706
3707         /*
3708          * read current overflow status:
3709          *
3710          * we are guaranteed to read the final stable state
3711          */
3712         ia64_srlz_d();
3713         pmc0 = ia64_get_pmc(0); /* slow */
3714
3715         /*
3716          * freeze PMU:
3717          *
3718          * This destroys the overflow information. This is required to make sure
3719          * next process does not start with monitoring on if not requested
3720          */
3721         pfm_freeze_pmu();
3722
3723         /*
3724          * We don't need to restore psr, because we are on our way out
3725          */
3726
3727         /*
3728          * This loop flushes the PMD into the PFM context.
3729          * It also processes overflow inline.
3730          *
3731          * IMPORTANT: No notification is sent at this point as the process is dying.
3732          * The implicit notification will come from a SIGCHILD or a return from a
3733          * waitpid().
3734          *
3735          */
3736
3737         if ((unsigned int) atomic_read(&ctx->ctx_last_cpu) != smp_processor_id())
3738                 printk(KERN_DEBUG "perfmon: [%d] last_cpu=%d\n",
3739                        task->pid, atomic_read(&ctx->ctx_last_cpu));
3740
3741         /*
3742          * we save all the used pmds
3743          * we take care of overflows for pmds used as counters
3744          */
3745         mask2 = ctx->ctx_used_pmds[0];
3746         for (i = 0; mask2; i++, mask2>>=1) {
3747
3748                 /* skip non used pmds */
3749                 if ((mask2 & 0x1) == 0) continue;
3750
3751                 val = ia64_get_pmd(i);
3752
3753                 if (PMD_IS_COUNTING(i)) {
3754                         DBprintk(("[%d] pmd[%d] soft_pmd=0x%lx hw_pmd=0x%lx\n",
3755                                 task->pid,
3756                                 i,
3757                                 ctx->ctx_soft_pmds[i].val,
3758                                 val & pmu_conf.ovfl_val));
3759
3760                         /* collect latest results */
3761                         ctx->ctx_soft_pmds[i].val += val & pmu_conf.ovfl_val;
3762
3763                         /*
3764                          * now everything is in ctx_soft_pmds[] and we need
3765                          * to clear the saved context from save_regs() such that
3766                          * pfm_read_pmds() gets the correct value
3767                          */
3768                         task->thread.pmd[i] = 0;
3769
3770                         /*
3771                          * take care of overflow inline
3772                          */
3773                         if (pmc0 & (1UL << i)) {
3774                                 ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.ovfl_val;
3775                                 DBprintk(("[%d] pmd[%d] overflowed soft_pmd=0x%lx\n",
3776                                         task->pid, i, ctx->ctx_soft_pmds[i].val));
3777                         }
3778                 } else {
3779                         DBprintk(("[%d] pmd[%d] hw_pmd=0x%lx\n", task->pid, i, val));
3780                         /*
3781                          * not a counter, just save value as is
3782                          */
3783                         task->thread.pmd[i] = val;
3784                 }
3785         }
3786         /*
3787          * indicates that context has been saved
3788          */
3789         atomic_set(&ctx->ctx_last_cpu, -1);
3790         preempt_enable();
3791 }
3792
3793
3794 /*
3795  * task is the newly created task, pt_regs for new child
3796  */
3797 int
3798 pfm_inherit(struct task_struct *task, struct pt_regs *regs)
3799 {
3800         pfm_context_t *ctx;
3801         pfm_context_t *nctx;
3802         struct thread_struct *thread;
3803         unsigned long m;
3804         int i;
3805
3806         /*
3807          * the new task was copied from parent and therefore points
3808          * to the parent's context at this point
3809          */
3810         ctx    = task->thread.pfm_context;
3811         thread = &task->thread;
3812
3813         preempt_disable();
3814         /*
3815          * for secure sessions, make sure child cannot mess up
3816          * the monitoring session.
3817          */
3818         if (ctx->ctx_fl_unsecure == 0) {
3819                 ia64_psr(regs)->sp = 1;
3820                 DBprintk(("enabling psr.sp for [%d]\n", task->pid));
3821         } else {
3822                 DBprintk(("psr.sp=%d [%d]\n", ia64_psr(regs)->sp, task->pid));
3823         }
3824
3825         /*
3826          * if there was a virtual mapping for the sampling buffer
3827          * the mapping is NOT inherited across fork() (see VM_DONTCOPY),
3828          * so we don't have to explicitly remove it here.
3829          *
3830          *
3831          * Part of the clearing of fields is also done in
3832          * copy_thread() because the fiels are outside the
3833          * pfm_context structure and can affect tasks not
3834          * using perfmon.
3835          */
3836
3837         /* clear pending notification */
3838         task->thread.pfm_ovfl_block_reset = 0;
3839
3840         /*
3841          * clear cpu pinning restriction for child
3842          */
3843         if (ctx->ctx_fl_system) {
3844                 set_cpus_allowed(task, ctx->ctx_saved_cpus_allowed);
3845
3846                 DBprintk(("setting cpus_allowed for [%d] to 0x%lx from 0x%lx\n",
3847                         task->pid,
3848                         ctx->ctx_saved_cpus_allowed,
3849                         current->cpus_allowed));
3850         }
3851
3852         /*
3853          * takes care of easiest case first
3854          */
3855         if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
3856
3857                 DBprintk(("removing PFM context for [%d]\n", task->pid));
3858
3859                 task->thread.pfm_context = NULL;
3860
3861                 /*
3862                  * we must clear psr.up because the new child does
3863                  * not have a context and the PM_VALID flag is cleared
3864                  * in copy_thread().
3865                  *
3866                  * we do not clear psr.pp because it is always
3867                  * controlled by the system wide logic and we should
3868                  * never be here when system wide is running anyway
3869                  */
3870                 ia64_psr(regs)->up = 0;
3871
3872                 preempt_enable();
3873
3874                 /* copy_thread() clears IA64_THREAD_PM_VALID */
3875                 return 0;
3876         }
3877         nctx = pfm_context_alloc();
3878         if (nctx == NULL) return -ENOMEM;
3879
3880         /* copy content */
3881         *nctx = *ctx;
3882
3883
3884         if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
3885                 nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
3886                 DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));
3887         }
3888         /*
3889          * task is not yet visible in the tasklist, so we do
3890          * not need to lock the newly created context.
3891          * However, we must grab the tasklist_lock to ensure
3892          * that the ctx_owner or ctx_notify_task do not disappear
3893          * while we increment their check counters.
3894          */
3895         read_lock(&tasklist_lock);
3896
3897         if (nctx->ctx_notify_task)
3898                 atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);
3899
3900         if (nctx->ctx_owner)
3901                 atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);
3902
3903         read_unlock(&tasklist_lock);
3904
3905
3906         LOCK_PFS();
3907         pfm_sessions.pfs_task_sessions++;
3908         UNLOCK_PFS();
3909
3910         /* initialize counters in new context */
3911         m = nctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
3912         for(i = PMU_FIRST_COUNTER ; m ; m>>=1, i++) {
3913                 if ((m & 0x1) && pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING) {
3914                         nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].lval & ~pmu_conf.ovfl_val;
3915                         thread->pmd[i]             = nctx->ctx_soft_pmds[i].lval & pmu_conf.ovfl_val;
3916                 } else {
3917                         thread->pmd[i]             = 0UL; /* reset to initial state */
3918                 }
3919         }
3920
3921         nctx->ctx_fl_frozen      = 0;
3922         nctx->ctx_ovfl_regs[0]   = 0UL;
3923         nctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
3924         atomic_set(&nctx->ctx_last_cpu, -1);
3925
3926         /*
3927          * here nctx->ctx_psb == ctx->ctx_psb
3928          *
3929          * increment reference count to sampling
3930          * buffer, if any. Note that this is independent
3931          * from the virtual mapping. The latter is never
3932          * inherited while the former will be if context
3933          * is setup to something different from PFM_FL_INHERIT_NONE
3934          */
3935         if (nctx->ctx_psb) {
3936                 LOCK_PSB(nctx->ctx_psb);
3937
3938                 nctx->ctx_psb->psb_refcnt++;
3939
3940                 DBprintk(("updated smpl @ %p refcnt=%lu psb_flags=0x%x\n",
3941                         ctx->ctx_psb->psb_hdr,
3942                         ctx->ctx_psb->psb_refcnt,
3943                         ctx->ctx_psb->psb_flags));
3944
3945                 UNLOCK_PSB(nctx->ctx_psb);
3946
3947                 /*
3948                  * remove any pointer to sampling buffer mapping
3949                  */
3950                 nctx->ctx_smpl_vaddr = 0;
3951         }
3952
3953         sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */
3954
3955         /*
3956          * propagate kernel psr in new context (used for first ctxsw in
3957          */
3958         nctx->ctx_saved_psr = pfm_get_psr();
3959
3960         /*
3961          * propagate kernel psr in new context (used for first ctxsw in
3962          */
3963         nctx->ctx_saved_psr = pfm_get_psr();
3964
3965         /* link with new task */
3966         thread->pfm_context = nctx;
3967
3968         DBprintk(("nctx=%p for process [%d]\n", (void *)nctx, task->pid));
3969
3970         /*
3971          * the copy_thread routine automatically clears
3972          * IA64_THREAD_PM_VALID, so we need to reenable it, if it was used by the caller
3973          */
3974         if (current->thread.flags & IA64_THREAD_PM_VALID) {
3975                 DBprintk(("setting PM_VALID for [%d]\n", task->pid));
3976                 thread->flags |= IA64_THREAD_PM_VALID;
3977         }
3978
3979         preempt_enable();
3980
3981         return 0;
3982 }
3983
3984 /*
3985  *
3986  * We cannot touch any of the PMU registers at this point as we may
3987  * not be running on the same CPU the task was last run on.  Therefore
3988  * it is assumed that the PMU has been stopped appropriately in
3989  * pfm_flush_regs() called from exit_thread().
3990  *
3991  * The function is called in the context of the parent via a release_thread()
3992  * and wait4(). The task is not in the tasklist anymore.
3993  */
3994 void
3995 pfm_context_exit(struct task_struct *task)
3996 {
3997         pfm_context_t *ctx = task->thread.pfm_context;
3998
3999         /*
4000          * check sampling buffer
4001          */
4002         preempt_disable();
4003         if (ctx->ctx_psb) {
4004                 pfm_smpl_buffer_desc_t *psb = ctx->ctx_psb;
4005
4006                 LOCK_PSB(psb);
4007
4008                 DBprintk(("sampling buffer from [%d] @%p size %ld refcnt=%lu psb_flags=0x%x\n",
4009                         task->pid,
4010                         psb->psb_hdr, psb->psb_size, psb->psb_refcnt, psb->psb_flags));
4011
4012                 /*
4013                  * in the case where we are the last user, we may be able to free
4014                  * the buffer
4015                  */
4016                 psb->psb_refcnt--;
4017
4018                 if (psb->psb_refcnt == 0) {
4019
4020                         /*
4021                          * The flag is cleared in pfm_vm_close(). which gets
4022                          * called from do_exit() via exit_mm().
4023                          * By the time we come here, the task has no more mm context.
4024                          *
4025                          * We can only free the psb and buffer here after the vm area
4026                          * describing the buffer has been removed. This normally happens
4027                          * as part of do_exit() but the entire mm context is ONLY removed
4028                          * once its reference counts goes to zero. This is typically
4029                          * the case except for multi-threaded (several tasks) processes.
4030                          *
4031                          * See pfm_vm_close() and pfm_cleanup_smpl_buf() for more details.
4032                          */
4033                         if ((psb->psb_flags & PSB_HAS_VMA) == 0) {
4034
4035                                 DBprintk(("cleaning sampling buffer from [%d] @%p size %ld\n",
4036                                         task->pid,
4037                                         psb->psb_hdr, psb->psb_size));
4038
4039                                 /*
4040                                  * free the buffer and psb
4041                                  */
4042                                 pfm_rvfree(psb->psb_hdr, psb->psb_size);
4043                                 kfree(psb);
4044                                 psb = NULL;
4045                         }
4046                 }
4047                 /* psb may have been deleted */
4048                 if (psb) UNLOCK_PSB(psb);
4049         }
4050
4051         DBprintk(("cleaning [%d] pfm_context @%p notify_task=%p check=%d mm=%p\n",
4052                 task->pid, ctx,
4053                 ctx->ctx_notify_task,
4054                 atomic_read(&task->thread.pfm_notifiers_check), task->mm));
4055
4056         /*
4057          * To avoid getting the notified task or owner task scan the entire process
4058          * list when they exit, we decrement notifiers_check and owners_check respectively.
4059          *
4060          * Of course, there is race condition between decreasing the value and the
4061          * task exiting. The danger comes from the fact that, in both cases, we have a
4062          * direct pointer to a task structure thereby bypassing the tasklist.
4063          * We must make sure that, if we have task!= NULL, the target task is still
4064          * present and is identical to the initial task specified
4065          * during pfm_context_create(). It may already be detached from the tasklist but
4066          * that's okay. Note that it is okay if we miss the deadline and the task scans
4067          * the list for nothing, it will affect performance but not correctness.
4068          * The correctness is ensured by using the ctx_lock which prevents the
4069          * notify_task from changing the fields in our context.
4070          * Once holdhing this lock, if we see task!= NULL, then it will stay like
4071          * that until we release the lock. If it is NULL already then we came too late.
4072          */
4073         LOCK_CTX(ctx);
4074
4075         if (ctx->ctx_notify_task != NULL) {
4076                 DBprintk(("[%d], [%d] atomic_sub on [%d] notifiers=%u\n", current->pid,
4077                         task->pid,
4078                         ctx->ctx_notify_task->pid,
4079                         atomic_read(&ctx->ctx_notify_task->thread.pfm_notifiers_check)));
4080
4081                 atomic_dec(&ctx->ctx_notify_task->thread.pfm_notifiers_check);
4082         }
4083
4084         if (ctx->ctx_owner != NULL) {
4085                 DBprintk(("[%d], [%d] atomic_sub on [%d] owners=%u\n",
4086                          current->pid,
4087                          task->pid,
4088                          ctx->ctx_owner->pid,
4089                          atomic_read(&ctx->ctx_owner->thread.pfm_owners_check)));
4090
4091                 atomic_dec(&ctx->ctx_owner->thread.pfm_owners_check);
4092         }
4093
4094         UNLOCK_CTX(ctx);
4095         preempt_enable();
4096
4097         pfm_unreserve_session(task, ctx->ctx_fl_system, 1UL << ctx->ctx_cpu);
4098
4099         if (ctx->ctx_fl_system) {
4100                 /*
4101                  * remove any CPU pinning
4102                  */
4103                 set_cpus_allowed(task, ctx->ctx_saved_cpus_allowed);
4104         }
4105
4106         pfm_context_free(ctx);
4107         /*
4108          *  clean pfm state in thread structure,
4109          */
4110         task->thread.pfm_context          = NULL;
4111         task->thread.pfm_ovfl_block_reset = 0;
4112
4113         /* pfm_notifiers is cleaned in pfm_cleanup_notifiers() */
4114 }
4115
4116 /*
4117  * function invoked from release_thread when pfm_smpl_buf_list is not NULL
4118  */
4119 int
4120 pfm_cleanup_smpl_buf(struct task_struct *task)
4121 {
4122         pfm_smpl_buffer_desc_t *tmp, *psb = task->thread.pfm_smpl_buf_list;
4123
4124         if (psb == NULL) {
4125                 printk(KERN_DEBUG "perfmon: psb is null in [%d]\n", current->pid);
4126                 return -1;
4127         }
4128         /*
4129          * Walk through the list and free the sampling buffer and psb
4130          */
4131         while (psb) {
4132                 DBprintk(("[%d] freeing smpl @%p size %ld\n", current->pid, psb->psb_hdr, psb->psb_size));
4133
4134                 pfm_rvfree(psb->psb_hdr, psb->psb_size);
4135                 tmp = psb->psb_next;
4136                 kfree(psb);
4137                 psb = tmp;
4138         }
4139
4140         /* just in case */
4141         task->thread.pfm_smpl_buf_list = NULL;
4142
4143         return 0;
4144 }
4145
4146 /*
4147  * function invoked from release_thread to make sure that the ctx_owner field does not
4148  * point to an unexisting task.
4149  */
4150 void
4151 pfm_cleanup_owners(struct task_struct *task)
4152 {
4153         struct task_struct *g, *p;
4154         pfm_context_t *ctx;
4155
4156         DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));
4157
4158         read_lock(&tasklist_lock);
4159
4160         do_each_thread(g, p) {
4161                 /*
4162                  * It is safe to do the 2-step test here, because thread.ctx
4163                  * is cleaned up only in release_thread() and at that point
4164                  * the task has been detached from the tasklist which is an
4165                  * operation which uses the write_lock() on the tasklist_lock
4166                  * so it cannot run concurrently to this loop. So we have the
4167                  * guarantee that if we find p and it has a perfmon ctx then
4168                  * it is going to stay like this for the entire execution of this
4169                  * loop.
4170                  */
4171                 ctx = p->thread.pfm_context;
4172
4173                 //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));
4174
4175                 if (ctx && ctx->ctx_owner == task) {
4176                         DBprintk(("trying for owner [%d] in [%d]\n", task->pid, p->pid));
4177                         /*
4178                          * the spinlock is required to take care of a race condition
4179                          * with the send_sig_info() call. We must make sure that
4180                          * either the send_sig_info() completes using a valid task,
4181                          * or the notify_task is cleared before the send_sig_info()
4182                          * can pick up a stale value. Note that by the time this
4183                          * function is executed the 'task' is already detached from the
4184                          * tasklist. The problem is that the notifiers have a direct
4185                          * pointer to it. It is okay to send a signal to a task in this
4186                          * stage, it simply will have no effect. But it is better than sending
4187                          * to a completely destroyed task or worse to a new task using the same
4188                          * task_struct address.
4189                          */
4190                         LOCK_CTX(ctx);
4191
4192                         ctx->ctx_owner = NULL;
4193
4194                         UNLOCK_CTX(ctx);
4195
4196                         DBprintk(("done for notifier [%d] in [%d]\n", task->pid, p->pid));
4197                 }
4198         } while_each_thread(g, p);
4199
4200         read_unlock(&tasklist_lock);
4201
4202         atomic_set(&task->thread.pfm_owners_check, 0);
4203 }
4204
4205
4206 /*
4207  * function called from release_thread to make sure that the ctx_notify_task is not pointing
4208  * to an unexisting task
4209  */
4210 void
4211 pfm_cleanup_notifiers(struct task_struct *task)
4212 {
4213         struct task_struct *g, *p;
4214         pfm_context_t *ctx;
4215
4216         DBprintk(("called by [%d] for [%d]\n", current->pid, task->pid));
4217
4218         read_lock(&tasklist_lock);
4219
4220         do_each_thread(g, p) {
4221                 /*
4222                  * It is safe to do the 2-step test here, because thread.ctx is cleaned up
4223                  * only in release_thread() and at that point the task has been detached
4224                  * from the tasklist which is an operation which uses the write_lock() on
4225                  * the tasklist_lock so it cannot run concurrently to this loop. So we
4226                  * have the guarantee that if we find p and it has a perfmon ctx then it
4227                  * is going to stay like this for the entire execution of this loop.
4228                  */
4229                 ctx = p->thread.pfm_context;
4230
4231                 //DBprintk(("[%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));
4232
4233                 if (ctx && ctx->ctx_notify_task == task) {
4234                         DBprintk(("trying for notifier [%d] in [%d]\n", task->pid, p->pid));
4235                         /*
4236                          * the spinlock is required to take care of a race condition
4237                          * with the send_sig_info() call. We must make sure that
4238                          * either the send_sig_info() completes using a valid task,
4239                          * or the notify_task is cleared before the send_sig_info()
4240                          * can pick up a stale value. Note that by the time this
4241                          * function is executed the 'task' is already detached from the
4242                          * tasklist. The problem is that the notifiers have a direct
4243                          * pointer to it. It is okay to send a signal to a task in this
4244                          * stage, it simply will have no effect. But it is better than sending
4245                          * to a completely destroyed task or worse to a new task using the same
4246                          * task_struct address.
4247                          */
4248                         LOCK_CTX(ctx);
4249
4250                         ctx->ctx_notify_task = NULL;
4251
4252                         UNLOCK_CTX(ctx);
4253
4254                         DBprintk(("done for notifier [%d] in [%d]\n", task->pid, p->pid));
4255                 }
4256         } while_each_thread(g, p);
4257
4258         read_unlock(&tasklist_lock);
4259
4260         atomic_set(&task->thread.pfm_notifiers_check, 0);
4261 }
4262
4263 static struct irqaction perfmon_irqaction = {
4264         .handler =      pfm_interrupt_handler,
4265         .flags   =      SA_INTERRUPT,
4266         .name    =      "perfmon"
4267 };
4268
4269 int
4270 pfm_install_alternate_syswide_subsystem(pfm_intr_handler_desc_t *hdl)
4271 {
4272         int ret;
4273
4274
4275         /* some sanity checks */
4276         if (hdl == NULL || hdl->handler == NULL) {
4277                 return -EINVAL;
4278         }
4279
4280         /* do the easy test first */
4281         if (pfm_alternate_intr_handler) {
4282                 return -EBUSY;
4283         }
4284
4285         preempt_disable();
4286         /* reserve our session */
4287         ret = pfm_reserve_session(NULL, 1, cpu_online_map);
4288         if (ret) {
4289                 preempt_enable();
4290                 return ret;
4291         }
4292
4293         if (pfm_alternate_intr_handler) {
4294                 preempt_enable();
4295                 printk(KERN_DEBUG "perfmon: install_alternate, intr_handler not NULL "
4296                        "after reserve\n");
4297                 return -EINVAL;
4298         }
4299
4300         pfm_alternate_intr_handler = hdl;
4301
4302         preempt_enable();
4303         return 0;
4304 }
4305
4306 int
4307 pfm_remove_alternate_syswide_subsystem(pfm_intr_handler_desc_t *hdl)
4308 {
4309         if (hdl == NULL)
4310                 return -EINVAL;
4311
4312         /* cannot remove someone else's handler! */
4313         if (pfm_alternate_intr_handler != hdl)
4314                 return -EINVAL;
4315
4316         preempt_disable();
4317         pfm_alternate_intr_handler = NULL;
4318
4319         /*
4320          * XXX: assume cpu_online_map has not changed since reservation
4321          */
4322         pfm_unreserve_session(NULL, 1, cpu_online_map);
4323
4324         preempt_enable();
4325
4326         return 0;
4327 }
4328
4329 /*
4330  * perfmon initialization routine, called from the initcall() table
4331  */
4332 int __init
4333 pfm_init(void)
4334 {
4335         unsigned int n, n_counters, i;
4336
4337         pmu_conf.disabled = 1;
4338
4339         printk(KERN_INFO "perfmon: version %u.%u IRQ %u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN,
4340                IA64_PERFMON_VECTOR);
4341
4342         /*
4343          * compute the number of implemented PMD/PMC from the
4344          * description tables
4345          */
4346         n = 0;
4347         for (i=0; PMC_IS_LAST(i) == 0;  i++) {
4348                 if (PMC_IS_IMPL(i) == 0) continue;
4349                 pmu_conf.impl_pmcs[i>>6] |= 1UL << (i&63);
4350                 n++;
4351         }
4352         pmu_conf.num_pmcs = n;
4353
4354         n = 0; n_counters = 0;
4355         for (i=0; PMD_IS_LAST(i) == 0;  i++) {
4356                 if (PMD_IS_IMPL(i) == 0) continue;
4357                 pmu_conf.impl_pmds[i>>6] |= 1UL << (i&63);
4358                 n++;
4359                 if (PMD_IS_COUNTING(i)) n_counters++;
4360         }
4361         pmu_conf.num_pmds      = n;
4362         pmu_conf.num_counters  = n_counters;
4363
4364         printk(KERN_INFO "perfmon: %u PMCs, %u PMDs, %u counters (%lu bits)\n",
4365                pmu_conf.num_pmcs,
4366                pmu_conf.num_pmds,
4367                pmu_conf.num_counters,
4368                ffz(pmu_conf.ovfl_val));
4369
4370         /* sanity check */
4371         if (pmu_conf.num_pmds >= IA64_NUM_PMD_REGS || pmu_conf.num_pmcs >= IA64_NUM_PMC_REGS) {
4372                 printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n");
4373                 return -1;
4374         }
4375
4376         /*
4377          * for now here for debug purposes
4378          */
4379         perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL);
4380         if (perfmon_dir == NULL) {
4381                 printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n");
4382                 return -1;
4383         }
4384
4385         /*
4386          * create /proc/perfmon
4387          */
4388         pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
4389
4390         /*
4391          * initialize all our spinlocks
4392          */
4393         spin_lock_init(&pfm_sessions.pfs_lock);
4394
4395         /* we are all set */
4396         pmu_conf.disabled = 0;
4397
4398         return 0;
4399 }
4400 __initcall(pfm_init);
4401
4402 void
4403 pfm_init_percpu(void)
4404 {
4405         int i;
4406         int me = get_cpu();
4407
4408         if (me == 0)
4409                 register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
4410
4411         ia64_set_pmv(IA64_PERFMON_VECTOR);
4412         ia64_srlz_d();
4413
4414         /*
4415          * we first initialize the PMU to a stable state.
4416          * the values may have been changed from their power-up
4417          * values by software executed before the kernel took over.
4418          *
4419          * At this point, pmu_conf has not yet been initialized
4420          *
4421          * On McKinley, this code is ineffective until PMC4 is initialized.
4422          */
4423         for (i=1; PMC_IS_LAST(i) == 0;  i++) {
4424                 if (PMC_IS_IMPL(i) == 0) continue;
4425                 ia64_set_pmc(i, PMC_DFL_VAL(i));
4426         }
4427
4428         for (i=0; PMD_IS_LAST(i); i++) {
4429                 if (PMD_IS_IMPL(i) == 0) continue;
4430                 ia64_set_pmd(i, 0UL);
4431         }
4432         put_cpu();
4433         pfm_freeze_pmu();
4434 }
4435
4436 #else /* !CONFIG_PERFMON */
4437
4438 asmlinkage long
4439 sys_perfmonctl (int pid, int cmd, void *req, int count, long arg5, long arg6,
4440                 long arg7, long arg8, long stack)
4441 {
4442         return -ENOSYS;
4443 }
4444
4445 #endif /* !CONFIG_PERFMON */