- Update Xen patches to 3.3-rc5 and c/s 1157.
[linux-flexiantxendom0-3.2.10.git] drivers/oprofile/cpu_buffer.c
/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Modified by Aravind Menon for Xen
 * These modifications are:
 * Copyright (C) 2005 Hewlett-Packard Co.
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS 0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

#ifndef CONFIG_XEN
#define current_domain COORDINATOR_DOMAIN
#else
static int32_t current_domain = COORDINATOR_DOMAIN;
#endif

unsigned long oprofile_get_cpu_buffer_size(void)
{
        return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
        if (op_ring_buffer)
                ring_buffer_free(op_ring_buffer);
        op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
        int i;

        unsigned long buffer_size = oprofile_cpu_buffer_size;
        unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
                                                 RB_EVENT_HDR_SIZE);

        op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
        if (!op_ring_buffer)
                goto fail;

        for_each_possible_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                b->last_task = NULL;
                b->last_cpu_mode = -1;
                b->tracing = 0;
                b->buffer_size = buffer_size;
                b->sample_received = 0;
                b->sample_lost_overflow = 0;
                b->backtrace_aborted = 0;
                b->sample_invalid_eip = 0;
                b->cpu = i;
                INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
        }
        return 0;

fail:
        free_cpu_buffers();
        return -ENOMEM;
}

void start_cpu_work(void)
{
        int i;

        work_enabled = 1;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                /*
                 * Spread the work by 1 jiffy per cpu so they don't all
                 * fire at once.
                 */
                schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
        }
}

void end_cpu_work(void)
{
        work_enabled = 0;
}

void flush_cpu_work(void)
{
        int i;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                /* these work items are per-cpu, no need for flush_sync */
                flush_delayed_work(&b->work);
        }
}

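/*
 * Illustrative only, not part of the original source: one plausible
 * teardown order for the helpers above, assuming the caller wants to
 * stop sampling cleanly before releasing the buffers. The function
 * name is hypothetical.
 */
#if 0
static void example_stop_profiling(void)
{
        end_cpu_work();         /* keep wq_sync_buffer() from re-arming itself */
        flush_cpu_work();       /* drain the per-cpu work already queued */
        free_cpu_buffers();     /* release the shared ring buffer */
}
#endif
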
/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. Struct op_entry can be uninitialized. The function reserves
 * a data array with the number of slots specified by size. Use
 * op_cpu_buffer_write_commit() after preparing the sample. On error
 * a NULL pointer is returned, otherwise a pointer to the sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
        entry->event = ring_buffer_lock_reserve
                (op_ring_buffer, sizeof(struct op_sample) +
                 size * sizeof(entry->sample->data[0]));
        if (!entry->event)
                return NULL;
        entry->sample = ring_buffer_event_data(entry->event);
        entry->size = size;
        entry->data = entry->sample->data;

        return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
        return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}

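/*
 * Illustrative only, not part of the original source: a minimal sketch
 * of how code in this file uses the reserve/add_data/commit API above
 * to emit one escape record carrying a single data word (compare
 * op_add_code() below). The function and parameter names are
 * hypothetical.
 */
#if 0
static void example_emit_escape(unsigned long flag, unsigned long word)
{
        struct op_entry entry;
        struct op_sample *sample;

        /* reserve the sample header plus one data slot */
        sample = op_cpu_buffer_write_reserve(&entry, 1);
        if (!sample)
                return;                 /* ring buffer full, drop it */

        sample->eip = ESCAPE_CODE;      /* mark this as a control record */
        sample->event = flag;

        op_cpu_buffer_add_data(&entry, word);
        op_cpu_buffer_write_commit(&entry);
}
#endif
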
struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
        struct ring_buffer_event *e;
        e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
        if (!e)
                return NULL;

        entry->event = e;
        entry->sample = ring_buffer_event_data(e);
        entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
                / sizeof(entry->sample->data[0]);
        entry->data = entry->sample->data;
        return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
        return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
            int cpu_mode, struct task_struct *task)
{
        struct op_entry entry;
        struct op_sample *sample;
        unsigned long flags;
        int size;

        flags = 0;

        if (backtrace)
                flags |= TRACE_BEGIN;

        /* notice a switch from user->kernel or vice versa */
        if (cpu_buf->last_cpu_mode != cpu_mode) {
                cpu_buf->last_cpu_mode = cpu_mode;
                flags |= KERNEL_CTX_SWITCH | cpu_mode;
        }

        /*
         * notice a task switch, but only if we are not currently
         * processing samples from another domain
         */
        if (cpu_buf->last_task != task &&
            current_domain == COORDINATOR_DOMAIN) {
                cpu_buf->last_task = task;
                flags |= USER_CTX_SWITCH;
        }

        if (!flags)
                /* nothing to do */
                return 0;

        if (flags & USER_CTX_SWITCH)
                size = 1;
        else
                size = 0;

        sample = op_cpu_buffer_write_reserve(&entry, size);
        if (!sample)
                return -ENOMEM;

        sample->eip = ESCAPE_CODE;
        sample->event = flags;

        if (size)
                op_cpu_buffer_add_data(&entry, (unsigned long)task);

        op_cpu_buffer_write_commit(&entry);

        return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
              unsigned long pc, unsigned long event)
{
        struct op_entry entry;
        struct op_sample *sample;

        sample = op_cpu_buffer_write_reserve(&entry, 0);
        if (!sample)
                return -ENOMEM;

        sample->eip = pc;
        sample->event = event;

        return op_cpu_buffer_write_commit(&entry);
}

/*
 * This must be safe from any context.
 *
 * cpu_mode is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel/user (and
 * Xen) enter events whenever cpu_mode changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
           unsigned long backtrace, int cpu_mode, unsigned long event,
           struct task_struct *task)
{
        struct task_struct *tsk = task ? task : current;

        cpu_buf->sample_received++;

        if (pc == ESCAPE_CODE) {
                cpu_buf->sample_invalid_eip++;
                return 0;
        }

        if (op_add_code(cpu_buf, backtrace, cpu_mode, tsk))
                goto fail;

        if (op_add_sample(cpu_buf, pc, event))
                goto fail;

        return 1;

fail:
        cpu_buf->sample_lost_overflow++;
        return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                          unsigned long event, int is_kernel,
                          struct task_struct *task)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
        unsigned long backtrace = oprofile_backtrace_depth;

        /*
         * if log_sample() fails we can't backtrace since we lost the
         * source of this event
         */
        if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, task))
                /* failed */
                return;

        if (!backtrace)
                return;

        oprofile_begin_trace(cpu_buf);
        oprofile_ops.backtrace(regs, backtrace);
        oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
                                unsigned long event, int is_kernel,
                                struct task_struct *task)
{
        __oprofile_add_ext_sample(pc, regs, event, is_kernel, task);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                             unsigned long event, int is_kernel)
{
        __oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
        int is_kernel;
        unsigned long pc;

        if (likely(regs)) {
                is_kernel = !user_mode(regs);
                pc = profile_pc(regs);
        } else {
                is_kernel = 0;    /* This value will not be used */
                pc = ESCAPE_CODE; /* as this causes an early return. */
        }

        __oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

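/*
 * Illustrative only, not part of the original source: roughly how an
 * architecture's counter-overflow handler is expected to hand samples
 * to the layer above. The handler and parameter names are hypothetical.
 */
#if 0
static void example_counter_overflow(struct pt_regs *regs, int counter)
{
        /*
         * One call per overflowed counter. If regs is NULL,
         * oprofile_add_sample() substitutes ESCAPE_CODE for the pc and
         * log_sample() drops the sample as an invalid eip.
         */
        oprofile_add_sample(regs, counter);
}
#endif
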
/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_write_reserve(&entry, ...) to reserve the sample,
 * oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
                       unsigned long pc, int code, int size)
{
        struct op_sample *sample;
        int is_kernel = !user_mode(regs);
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        cpu_buf->sample_received++;

        /* no backtraces for samples with data */
        if (op_add_code(cpu_buf, 0, is_kernel, current))
                goto fail;

        sample = op_cpu_buffer_write_reserve(entry, size + 2);
        if (!sample)
                goto fail;
        sample->eip = ESCAPE_CODE;
        sample->event = 0;              /* no flags */

        op_cpu_buffer_add_data(entry, code);
        op_cpu_buffer_add_data(entry, pc);

        return;

fail:
        entry->event = NULL;
        cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
        if (!entry->event)
                return 0;
        return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
        if (!entry->event)
                return 0;
        if (op_cpu_buffer_get_size(entry) < 2)
                /*
                 * the function returns 0 to indicate that the buffer
                 * is too small, even if there is some space left
                 */
                return 0;
        if (!op_cpu_buffer_add_data(entry, (u32)val))
                return 0;
        return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
        if (!entry->event)
                return -EINVAL;
        return op_cpu_buffer_write_commit(entry);
}

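/*
 * Illustrative only, not part of the original source: the intended
 * calling sequence for the oprofile_write_reserve() /
 * oprofile_add_data*() / oprofile_write_commit() API above, e.g. from
 * a model driver that wants to attach extra words to a sample. The
 * function name, the 0x42 record code and the payload are hypothetical.
 */
#if 0
static void example_log_extended_sample(struct pt_regs *regs,
                                        unsigned long pc, u64 payload)
{
        struct op_entry entry;

        /* size 2 leaves room for the two 32-bit halves of the payload */
        oprofile_write_reserve(&entry, regs, pc, 0x42, 2);
        /* on failure entry.event is NULL and the helpers above bail out */
        oprofile_add_data64(&entry, payload);
        oprofile_write_commit(&entry);
}
#endif
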
void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
        log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
}

#ifdef CONFIG_XEN
/*
 * This is basically log_sample(b, ESCAPE_CODE, 1, cpu_mode, CPU_TRACE_BEGIN),
 * as was previously accessible through oprofile_add_pc().
 */
void oprofile_add_mode(int cpu_mode)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        if (op_add_code(cpu_buf, 1, cpu_mode, current))
                cpu_buf->sample_lost_overflow++;
}
#endif

void oprofile_add_trace(unsigned long pc)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        if (!cpu_buf->tracing)
                return;

        /*
         * a broken frame can give an eip with the same value as an
         * escape code, abort the trace if we get it
         */
        if (pc == ESCAPE_CODE)
                goto fail;

        if (op_add_sample(cpu_buf, pc, 0))
                goto fail;

        return;
fail:
        cpu_buf->tracing = 0;
        cpu_buf->backtrace_aborted++;
        return;
}

#ifdef CONFIG_XEN
int oprofile_add_domain_switch(int32_t domain_id)
{
        struct op_entry entry;
        struct op_sample *sample;

        sample = op_cpu_buffer_write_reserve(&entry, 1);
        if (!sample)
                return 0;

        sample->eip = ESCAPE_CODE;
        sample->event = DOMAIN_SWITCH;

        op_cpu_buffer_add_data(&entry, domain_id);
        op_cpu_buffer_write_commit(&entry);

        current_domain = domain_id;

        return 1;
}
#endif

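/*
 * Illustrative only, not part of the original source: how a Xen-aware
 * driver might bracket samples it logs on behalf of a passive domain,
 * so that sync_buffer() attributes them to the right domain. The
 * function, loop and variable names are hypothetical.
 */
#if 0
static void example_log_passive_samples(int32_t domid,
                                        const unsigned long *pcs, int n)
{
        int i;

        oprofile_add_domain_switch(domid);      /* samples now belong to domid */
        for (i = 0; i < n; i++)
                oprofile_add_pc(pcs[i], 1 /* is_kernel */, 0 /* event */);
        oprofile_add_domain_switch(COORDINATOR_DOMAIN); /* back to the coordinator */
}
#endif
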
/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu
 */
static void wq_sync_buffer(struct work_struct *work)
{
        struct oprofile_cpu_buffer *b =
                container_of(work, struct oprofile_cpu_buffer, work.work);

        if (b->cpu != smp_processor_id()) {
                printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
                       smp_processor_id(), b->cpu);

                if (!cpu_online(b->cpu)) {
                        cancel_delayed_work(&b->work);
                        return;
                }
        }
        sync_buffer(b->cpu);

        /* don't re-add the work if we're shutting down */
        if (work_enabled)
                schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}