Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 20 Mar 2012 17:29:15 +0000 (10:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 20 Mar 2012 17:29:15 +0000 (10:29 -0700)
Pull perf events changes for v3.4 from Ingo Molnar:

 - New "hardware based branch profiling" feature both on the kernel and
   the tooling side, on CPUs that support it.  (modern x86 Intel CPUs
   with the 'LBR' hardware feature currently.)

   This new feature is basically a sophisticated 'magnifying glass' for
   branch execution - something that is pretty difficult to extract from
   regular, function histogram centric profiles.

   The simplest mode is activated via 'perf record -b', and the result
   looks like this in perf report:

$ perf record -b any_call,u -e cycles:u branchy

$ perf report -b --sort=symbol
    52.34%  [.] main                   [.] f1
    24.04%  [.] f1                     [.] f3
    23.60%  [.] f1                     [.] f2
     0.01%  [k] _IO_new_file_xsputn    [k] _IO_file_overflow
     0.01%  [k] _IO_vfprintf_internal  [k] _IO_new_file_xsputn
     0.01%  [k] _IO_vfprintf_internal  [k] strchrnul
     0.01%  [k] __printf               [k] _IO_vfprintf_internal
     0.01%  [k] main                   [k] __printf

   This output shows from/to branch columns and shows the highest
   percentage (from,to) jump combinations - i.e.  the most likely taken
   branches in the system.  "branches" can also include function calls
   and any other synchronous and asynchronous transitions of the
   instruction pointer that are not 'next instruction' - such as system
   calls, traps, interrupts, etc.

   This feature comes with (hopefully intuitive) flat ascii and TUI
   support in perf report.

 - Various 'perf annotate' visual improvements for us assembly junkies.
   It will now recognize function calls in the TUI and by hitting enter
   you can follow the call (recursively) and back, amongst other
   improvements.

 - Multiple threads/processes recording support in perf record, perf
   stat, perf top - which is activated via a comma-list of PIDs:

perf top -p 21483,21485
perf stat -p 21483,21485 -ddd
perf record -p 21483,21485

 - Support for per UID views, via the --uid parameter to perf top, perf
   report, etc.  For example 'perf top --uid mingo' will only show the
   tasks that I am running, excluding other users, root, etc.

 - Jump label restructurings and improvements - this includes the
   factoring out of the (hopefully much clearer) include/linux/static_key.h
   generic facility:

struct static_key key = STATIC_KEY_INIT_FALSE;

...

if (static_key_false(&key))
        do unlikely code
else
        do likely code

...
static_key_slow_inc();
...
static_key_slow_dec();
...

   The static_key_false() branch will be generated into the code with as
   little impact to the likely code path as possible.  The
   static_key_slow_*() APIs flip the branch via live kernel code patching.

   This facility can now be used more widely within the kernel to
   micro-optimize hot branches whose likelihood matches the static-key
   usage and fast/slow cost patterns.

 - SW function tracer improvements: perf support and filtering support.

 - Various hardenings of the perf.data ABI, to make older perf.data's
   smoother on newer tool versions, to make new features integrate more
   smoothly, to support cross-endian recording/analyzing workflows
   better, etc.

 - Restructuring of the kprobes code, the splitting out of 'optprobes',
   and a corner case bugfix.

 - Allow the tracing of kernel console output (printk).

 - Improvements/fixes to user-space RDPMC support, allowing user-space
   self-profiling code to extract PMU counts without performing any
   system calls, while playing nice with the kernel side.

 - 'perf bench' improvements

 - ... and lots of internal restructurings, cleanups and fixes that made
   these features possible.  And, as usual, this list is incomplete as
   there were also lots of other improvements.

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (120 commits)
  perf report: Fix annotate double quit issue in branch view mode
  perf report: Remove duplicate annotate choice in branch view mode
  perf/x86: Prettify pmu config literals
  perf report: Enable TUI in branch view mode
  perf report: Auto-detect branch stack sampling mode
  perf record: Add HEADER_BRANCH_STACK tag
  perf record: Provide default branch stack sampling mode option
  perf tools: Make perf able to read files from older ABIs
  perf tools: Fix ABI compatibility bug in print_event_desc()
  perf tools: Enable reading of perf.data files from different ABI rev
  perf: Add ABI reference sizes
  perf report: Add support for taken branch sampling
  perf record: Add support for sampling taken branch
  perf tools: Add code to support PERF_SAMPLE_BRANCH_STACK
  x86/kprobes: Split out optprobe related code to kprobes-opt.c
  x86/kprobes: Fix a bug which can modify kernel code permanently
  x86/kprobes: Fix instruction recovery on optimized path
  perf: Add callback to flush branch_stack on context switch
  perf: Disable PERF_SAMPLE_BRANCH_* when not supported
  perf/x86: Add LBR software filter support for Intel CPUs
  ...

1  2 
kernel/irq/chip.c
kernel/softirq.c
lib/Kconfig.debug
tools/perf/Makefile
tools/perf/perf.h
tools/perf/util/header.c
tools/perf/util/hist.c
tools/perf/util/sort.c
tools/perf/util/ui/browsers/hists.c

diff --combined kernel/irq/chip.c
@@@ -16,6 -16,8 +16,8 @@@
  #include <linux/interrupt.h>
  #include <linux/kernel_stat.h>
  
+ #include <trace/events/irq.h>
  #include "internals.h"
  
  /**
@@@ -61,7 -63,8 +63,7 @@@ int irq_set_irq_type(unsigned int irq, 
                return -EINVAL;
  
        type &= IRQ_TYPE_SENSE_MASK;
 -      if (type != IRQ_TYPE_NONE)
 -              ret = __irq_set_trigger(desc, irq, type);
 +      ret = __irq_set_trigger(desc, irq, type);
        irq_put_desc_busunlock(desc, flags);
        return ret;
  }
diff --combined kernel/softirq.c
@@@ -310,21 -310,31 +310,21 @@@ void irq_enter(void
        __irq_enter();
  }
  
 -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
  static inline void invoke_softirq(void)
  {
 -      if (!force_irqthreads)
 +      if (!force_irqthreads) {
 +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
                __do_softirq();
 -      else {
 -              __local_bh_disable((unsigned long)__builtin_return_address(0),
 -                              SOFTIRQ_OFFSET);
 -              wakeup_softirqd();
 -              __local_bh_enable(SOFTIRQ_OFFSET);
 -      }
 -}
  #else
 -static inline void invoke_softirq(void)
 -{
 -      if (!force_irqthreads)
                do_softirq();
 -      else {
 +#endif
 +      } else {
                __local_bh_disable((unsigned long)__builtin_return_address(0),
                                SOFTIRQ_OFFSET);
                wakeup_softirqd();
                __local_bh_enable(SOFTIRQ_OFFSET);
        }
  }
 -#endif
  
  /*
   * Exit an interrupt context. Process softirqs if needed and possible:
@@@ -375,6 -385,12 +375,12 @@@ void raise_softirq(unsigned int nr
        local_irq_restore(flags);
  }
  
+ void __raise_softirq_irqoff(unsigned int nr)
+ {
+       trace_softirq_raise(nr);
+       or_softirq_pending(1UL << nr);
+ }
  void open_softirq(int nr, void (*action)(struct softirq_action *))
  {
        softirq_vec[nr].action = action;
diff --combined lib/Kconfig.debug
@@@ -166,18 -166,21 +166,21 @@@ config LOCKUP_DETECTO
          hard and soft lockups.
  
          Softlockups are bugs that cause the kernel to loop in kernel
-         mode for more than 60 seconds, without giving other tasks a
+         mode for more than 20 seconds, without giving other tasks a
          chance to run.  The current stack trace is displayed upon
          detection and the system will stay locked up.
  
          Hardlockups are bugs that cause the CPU to loop in kernel mode
-         for more than 60 seconds, without letting other interrupts have a
+         for more than 10 seconds, without letting other interrupts have a
          chance to run.  The current stack trace is displayed upon detection
          and the system will stay locked up.
  
          The overhead should be minimal.  A periodic hrtimer runs to
-         generate interrupts and kick the watchdog task every 10-12 seconds.
-         An NMI is generated every 60 seconds or so to check for hardlockups.
+         generate interrupts and kick the watchdog task every 4 seconds.
+         An NMI is generated every 10 seconds or so to check for hardlockups.
+         The frequency of hrtimer and NMI events and the soft and hard lockup
+         thresholds can be controlled through the sysctl watchdog_thresh.
  
  config HARDLOCKUP_DETECTOR
        def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \
@@@ -189,7 -192,8 +192,8 @@@ config BOOTPARAM_HARDLOCKUP_PANI
        help
          Say Y here to enable the kernel to panic on "hard lockups",
          which are bugs that cause the kernel to loop in kernel
-         mode with interrupts disabled for more than 60 seconds.
+         mode with interrupts disabled for more than 10 seconds (configurable
+         using the watchdog_thresh sysctl).
  
          Say N if unsure.
  
@@@ -206,8 -210,8 +210,8 @@@ config BOOTPARAM_SOFTLOCKUP_PANI
        help
          Say Y here to enable the kernel to panic on "soft lockups",
          which are bugs that cause the kernel to loop in kernel
-         mode for more than 60 seconds, without giving other tasks a
-         chance to run.
+         mode for more than 20 seconds (configurable using the watchdog_thresh
+         sysctl), without giving other tasks a chance to run.
  
          The panic can be used in combination with panic_timeout,
          to cause the system to reboot automatically after a
@@@ -927,30 -931,6 +931,30 @@@ config RCU_CPU_STALL_VERBOS
  
          Say Y if you want to enable such checks.
  
 +config RCU_CPU_STALL_INFO
 +      bool "Print additional diagnostics on RCU CPU stall"
 +      depends on (TREE_RCU || TREE_PREEMPT_RCU) && DEBUG_KERNEL
 +      default n
 +      help
 +        For each stalled CPU that is aware of the current RCU grace
 +        period, print out additional per-CPU diagnostic information
 +        regarding scheduling-clock ticks, idle state, and,
 +        for RCU_FAST_NO_HZ kernels, idle-entry state.
 +
 +        Say N if you are unsure.
 +
 +        Say Y if you want to enable such diagnostics.
 +
 +config RCU_TRACE
 +      bool "Enable tracing for RCU"
 +      depends on DEBUG_KERNEL
 +      help
 +        This option provides tracing in RCU which presents stats
 +        in debugfs for debugging RCU implementation.
 +
 +        Say Y here if you want to enable RCU tracing
 +        Say N if you are unsure.
 +
  config KPROBES_SANITY_TEST
        bool "Kprobes sanity tests"
        depends on DEBUG_KERNEL
diff --combined tools/perf/Makefile
@@@ -15,6 -15,16 +15,16 @@@ endi
  
  # Define V to have a more verbose compile.
  #
+ # Define O to save output files in a separate directory.
+ #
+ # Define ARCH as name of target architecture if you want cross-builds.
+ #
+ # Define CROSS_COMPILE as prefix name of compiler if you want cross-builds.
+ #
+ # Define NO_LIBPERL to disable perl script extension.
+ #
+ # Define NO_LIBPYTHON to disable python script extension.
+ #
  # Define PYTHON to point to the python binary if the default
  # `python' is not correct; for example: PYTHON=python2
  #
  # Define NO_DWARF if you do not want debug-info analysis feature at all.
  #
  # Define WERROR=0 to disable treating any warnings as errors.
+ #
+ # Define NO_NEWT if you do not want TUI support.
+ #
+ # Define NO_DEMANGLE if you do not want C++ symbol demangling.
  
  $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
        @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
@@@ -61,7 -75,7 +75,7 @@@ ifeq ($(ARCH),x86_64
        ifeq (${IS_X86_64}, 1)
                RAW_ARCH := x86_64
                ARCH_CFLAGS := -DARCH_X86_64
-               ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
+               ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
        endif
  endif
  
@@@ -183,7 -197,10 +197,10 @@@ SCRIPT_SH += perf-archive.s
  grep-libs = $(filter -l%,$(1))
  strip-libs = $(filter-out -l%,$(1))
  
- $(OUTPUT)python/perf.so: $(PYRF_OBJS)
+ PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
+ PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py
+ $(OUTPUT)python/perf.so: $(PYRF_OBJS) $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
        $(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' $(PYTHON_WORD) util/setup.py \
          --quiet build_ext; \
        mkdir -p $(OUTPUT)python && \
@@@ -249,8 -266,6 +266,8 @@@ LIB_H += util/include/asm/uaccess.
  LIB_H += util/include/dwarf-regs.h
  LIB_H += util/include/asm/dwarf2.h
  LIB_H += util/include/asm/cpufeature.h
 +LIB_H += util/include/asm/unistd_32.h
 +LIB_H += util/include/asm/unistd_64.h
  LIB_H += perf.h
  LIB_H += util/annotate.h
  LIB_H += util/cache.h
@@@ -258,6 -273,7 +275,7 @@@ LIB_H += util/callchain.
  LIB_H += util/build-id.h
  LIB_H += util/debug.h
  LIB_H += util/debugfs.h
+ LIB_H += util/sysfs.h
  LIB_H += util/event.h
  LIB_H += util/evsel.h
  LIB_H += util/evlist.h
@@@ -304,6 -320,7 +322,7 @@@ LIB_OBJS += $(OUTPUT)util/build-id.
  LIB_OBJS += $(OUTPUT)util/config.o
  LIB_OBJS += $(OUTPUT)util/ctype.o
  LIB_OBJS += $(OUTPUT)util/debugfs.o
+ LIB_OBJS += $(OUTPUT)util/sysfs.o
  LIB_OBJS += $(OUTPUT)util/environment.o
  LIB_OBJS += $(OUTPUT)util/event.o
  LIB_OBJS += $(OUTPUT)util/evlist.o
@@@ -361,8 -378,10 +380,10 @@@ BUILTIN_OBJS += $(OUTPUT)bench/sched-me
  BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
  ifeq ($(RAW_ARCH),x86_64)
  BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+ BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o
  endif
  BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
+ BUILTIN_OBJS += $(OUTPUT)bench/mem-memset.o
  
  BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
  BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
@@@ -794,7 -813,6 +815,6 @@@ help
        @echo '  quick-install-html     - install the html documentation quickly'
        @echo ''
        @echo 'Perf maintainer targets:'
-       @echo '  distclean              - alias to clean'
        @echo '  clean                  - clean all binary objects and build output'
  
  doc:
diff --combined tools/perf/perf.h
@@@ -10,9 -10,6 +10,9 @@@ void get_term_dimensions(struct winsiz
  #define rmb()         asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
  #define cpu_relax()   asm volatile("rep; nop" ::: "memory");
  #define CPUINFO_PROC  "model name"
 +#ifndef __NR_perf_event_open
 +# define __NR_perf_event_open 336
 +#endif
  #endif
  
  #if defined(__x86_64__)
@@@ -20,9 -17,6 +20,9 @@@
  #define rmb()         asm volatile("lfence" ::: "memory")
  #define cpu_relax()   asm volatile("rep; nop" ::: "memory");
  #define CPUINFO_PROC  "model name"
 +#ifndef __NR_perf_event_open
 +# define __NR_perf_event_open 298
 +#endif
  #endif
  
  #ifdef __powerpc__
@@@ -173,7 -167,6 +173,6 @@@ sys_perf_event_open(struct perf_event_a
                      pid_t pid, int cpu, int group_fd,
                      unsigned long flags)
  {
-       attr->size = sizeof(*attr);
        return syscall(__NR_perf_event_open, attr, pid, cpu,
                       group_fd, flags);
  }
@@@ -186,14 -179,32 +185,32 @@@ struct ip_callchain 
        u64 ips[0];
  };
  
+ struct branch_flags {
+       u64 mispred:1;
+       u64 predicted:1;
+       u64 reserved:62;
+ };
+ struct branch_entry {
+       u64                             from;
+       u64                             to;
+       struct branch_flags flags;
+ };
+ struct branch_stack {
+       u64                             nr;
+       struct branch_entry     entries[0];
+ };
  extern bool perf_host, perf_guest;
  extern const char perf_version_string[];
  
  void pthread__unblock_sigwinch(void);
  
  struct perf_record_opts {
-       pid_t        target_pid;
-       pid_t        target_tid;
+       const char   *target_pid;
+       const char   *target_tid;
+       uid_t        uid;
        bool         call_graph;
        bool         group;
        bool         inherit_stat;
        bool         raw_samples;
        bool         sample_address;
        bool         sample_time;
-       bool         sample_id_all_avail;
+       bool         sample_id_all_missing;
        bool         exclude_guest_missing;
        bool         system_wide;
        bool         period;
        unsigned int freq;
        unsigned int mmap_pages;
        unsigned int user_freq;
+       int          branch_stack;
        u64          default_interval;
        u64          user_interval;
        const char   *cpu_list;
diff --combined tools/perf/util/header.c
@@@ -63,9 -63,20 +63,20 @@@ char *perf_header__find_event(u64 id
        return NULL;
  }
  
- static const char *__perf_magic = "PERFFILE";
+ /*
+  * magic2 = "PERFILE2"
+  * must be a numerical value to let the endianness
+  * determine the memory layout. That way we are able
+  * to detect endianness when reading the perf.data file
+  * back.
+  *
+  * we check for legacy (PERFFILE) format.
+  */
+ static const char *__perf_magic1 = "PERFFILE";
+ static const u64 __perf_magic2    = 0x32454c4946524550ULL;
+ static const u64 __perf_magic2_sw = 0x50455246494c4532ULL;
  
- #define PERF_MAGIC    (*(u64 *)__perf_magic)
+ #define PERF_MAGIC    __perf_magic2
  
  struct perf_file_attr {
        struct perf_event_attr  attr;
@@@ -280,7 -291,7 +291,7 @@@ int build_id_cache__add_s(const char *s
        if (realname == NULL || filename == NULL || linkname == NULL)
                goto out_free;
  
 -      len = snprintf(filename, size, "%s%s%s",
 +      len = scnprintf(filename, size, "%s%s%s",
                       debugdir, is_kallsyms ? "/" : "", realname);
        if (mkdir_p(filename, 0755))
                goto out_free;
                        goto out_free;
        }
  
 -      len = snprintf(linkname, size, "%s/.build-id/%.2s",
 +      len = scnprintf(linkname, size, "%s/.build-id/%.2s",
                       debugdir, sbuild_id);
  
        if (access(linkname, X_OK) && mkdir_p(linkname, 0755))
@@@ -1012,6 -1023,12 +1023,12 @@@ write_it
        return do_write_string(fd, buffer);
  }
  
+ static int write_branch_stack(int fd __used, struct perf_header *h __used,
+                      struct perf_evlist *evlist __used)
+ {
+       return 0;
+ }
  static void print_hostname(struct perf_header *ph, int fd, FILE *fp)
  {
        char *str = do_read_string(fd, ph);
@@@ -1133,8 -1150,9 +1150,9 @@@ static void print_event_desc(struct per
        uint64_t id;
        void *buf = NULL;
        char *str;
-       u32 nre, sz, nr, i, j, msz;
-       int ret;
+       u32 nre, sz, nr, i, j;
+       ssize_t ret;
+       size_t msz;
  
        /* number of events */
        ret = read(fd, &nre, sizeof(nre));
        if (ph->needs_swap)
                sz = bswap_32(sz);
  
-       /*
-        * ensure it is at least to our ABI rev
-        */
-       if (sz < (u32)sizeof(attr))
-               goto error;
        memset(&attr, 0, sizeof(attr));
  
-       /* read entire region to sync up to next field */
+       /* buffer to hold on file attr struct */
        buf = malloc(sz);
        if (!buf)
                goto error;
  
        msz = sizeof(attr);
-       if (sz < msz)
+       if (sz < (ssize_t)msz)
                msz = sz;
  
        for (i = 0 ; i < nre; i++) {
  
+               /*
+                * must read entire on-file attr struct to
+                * sync up with layout.
+                */
                ret = read(fd, buf, sz);
                if (ret != (ssize_t)sz)
                        goto error;
@@@ -1305,25 -1321,204 +1321,204 @@@ static void print_cpuid(struct perf_hea
        free(str);
  }
  
+ static void print_branch_stack(struct perf_header *ph __used, int fd __used,
+                              FILE *fp)
+ {
+       fprintf(fp, "# contains samples with branch stack\n");
+ }
+ static int __event_process_build_id(struct build_id_event *bev,
+                                   char *filename,
+                                   struct perf_session *session)
+ {
+       int err = -1;
+       struct list_head *head;
+       struct machine *machine;
+       u16 misc;
+       struct dso *dso;
+       enum dso_kernel_type dso_type;
+       machine = perf_session__findnew_machine(session, bev->pid);
+       if (!machine)
+               goto out;
+       misc = bev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+       switch (misc) {
+       case PERF_RECORD_MISC_KERNEL:
+               dso_type = DSO_TYPE_KERNEL;
+               head = &machine->kernel_dsos;
+               break;
+       case PERF_RECORD_MISC_GUEST_KERNEL:
+               dso_type = DSO_TYPE_GUEST_KERNEL;
+               head = &machine->kernel_dsos;
+               break;
+       case PERF_RECORD_MISC_USER:
+       case PERF_RECORD_MISC_GUEST_USER:
+               dso_type = DSO_TYPE_USER;
+               head = &machine->user_dsos;
+               break;
+       default:
+               goto out;
+       }
+       dso = __dsos__findnew(head, filename);
+       if (dso != NULL) {
+               char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+               dso__set_build_id(dso, &bev->build_id);
+               if (filename[0] == '[')
+                       dso->kernel = dso_type;
+               build_id__sprintf(dso->build_id, sizeof(dso->build_id),
+                                 sbuild_id);
+               pr_debug("build id event received for %s: %s\n",
+                        dso->long_name, sbuild_id);
+       }
+       err = 0;
+ out:
+       return err;
+ }
+ static int perf_header__read_build_ids_abi_quirk(struct perf_header *header,
+                                                int input, u64 offset, u64 size)
+ {
+       struct perf_session *session = container_of(header, struct perf_session, header);
+       struct {
+               struct perf_event_header   header;
+               u8                         build_id[ALIGN(BUILD_ID_SIZE, sizeof(u64))];
+               char                       filename[0];
+       } old_bev;
+       struct build_id_event bev;
+       char filename[PATH_MAX];
+       u64 limit = offset + size;
+       while (offset < limit) {
+               ssize_t len;
+               if (read(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev))
+                       return -1;
+               if (header->needs_swap)
+                       perf_event_header__bswap(&old_bev.header);
+               len = old_bev.header.size - sizeof(old_bev);
+               if (read(input, filename, len) != len)
+                       return -1;
+               bev.header = old_bev.header;
+               /*
+                * As the pid is the missing value, we need to fill
+                * it properly. The header.misc value gives us a nice hint.
+                */
+               bev.pid = HOST_KERNEL_ID;
+               if (bev.header.misc == PERF_RECORD_MISC_GUEST_USER ||
+                   bev.header.misc == PERF_RECORD_MISC_GUEST_KERNEL)
+                       bev.pid = DEFAULT_GUEST_KERNEL_ID;
+               memcpy(bev.build_id, old_bev.build_id, sizeof(bev.build_id));
+               __event_process_build_id(&bev, filename, session);
+               offset += bev.header.size;
+       }
+       return 0;
+ }
+ static int perf_header__read_build_ids(struct perf_header *header,
+                                      int input, u64 offset, u64 size)
+ {
+       struct perf_session *session = container_of(header, struct perf_session, header);
+       struct build_id_event bev;
+       char filename[PATH_MAX];
+       u64 limit = offset + size, orig_offset = offset;
+       int err = -1;
+       while (offset < limit) {
+               ssize_t len;
+               if (read(input, &bev, sizeof(bev)) != sizeof(bev))
+                       goto out;
+               if (header->needs_swap)
+                       perf_event_header__bswap(&bev.header);
+               len = bev.header.size - sizeof(bev);
+               if (read(input, filename, len) != len)
+                       goto out;
+               /*
+                * The a1645ce1 changeset:
+                *
+                * "perf: 'perf kvm' tool for monitoring guest performance from host"
+                *
+                * Added a field to struct build_id_event that broke the file
+                * format.
+                *
+                * Since the kernel build-id is the first entry, process the
+                * table using the old format if the well known
+                * '[kernel.kallsyms]' string for the kernel build-id has the
+                * first 4 characters chopped off (where the pid_t sits).
+                */
+               if (memcmp(filename, "nel.kallsyms]", 13) == 0) {
+                       if (lseek(input, orig_offset, SEEK_SET) == (off_t)-1)
+                               return -1;
+                       return perf_header__read_build_ids_abi_quirk(header, input, offset, size);
+               }
+               __event_process_build_id(&bev, filename, session);
+               offset += bev.header.size;
+       }
+       err = 0;
+ out:
+       return err;
+ }
+ static int process_trace_info(struct perf_file_section *section __unused,
+                             struct perf_header *ph __unused,
+                             int feat __unused, int fd)
+ {
+       trace_report(fd, false);
+       return 0;
+ }
+ static int process_build_id(struct perf_file_section *section,
+                           struct perf_header *ph,
+                           int feat __unused, int fd)
+ {
+       if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
+               pr_debug("Failed to read buildids, continuing...\n");
+       return 0;
+ }
  struct feature_ops {
        int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
        void (*print)(struct perf_header *h, int fd, FILE *fp);
+       int (*process)(struct perf_file_section *section,
+                      struct perf_header *h, int feat, int fd);
        const char *name;
        bool full_only;
  };
  
  #define FEAT_OPA(n, func) \
        [n] = { .name = #n, .write = write_##func, .print = print_##func }
+ #define FEAT_OPP(n, func) \
+       [n] = { .name = #n, .write = write_##func, .print = print_##func, \
+               .process = process_##func }
  #define FEAT_OPF(n, func) \
-       [n] = { .name = #n, .write = write_##func, .print = print_##func, .full_only = true }
+       [n] = { .name = #n, .write = write_##func, .print = print_##func, \
+               .full_only = true }
  
  /* feature_ops not implemented: */
  #define print_trace_info              NULL
  #define print_build_id                        NULL
  
  static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
-       FEAT_OPA(HEADER_TRACE_INFO,     trace_info),
-       FEAT_OPA(HEADER_BUILD_ID,       build_id),
+       FEAT_OPP(HEADER_TRACE_INFO,     trace_info),
+       FEAT_OPP(HEADER_BUILD_ID,       build_id),
        FEAT_OPA(HEADER_HOSTNAME,       hostname),
        FEAT_OPA(HEADER_OSRELEASE,      osrelease),
        FEAT_OPA(HEADER_VERSION,        version),
        FEAT_OPA(HEADER_CMDLINE,        cmdline),
        FEAT_OPF(HEADER_CPU_TOPOLOGY,   cpu_topology),
        FEAT_OPF(HEADER_NUMA_TOPOLOGY,  numa_topology),
+       FEAT_OPA(HEADER_BRANCH_STACK,   branch_stack),
  };
  
  struct header_print_data {
@@@ -1620,24 -1816,128 +1816,128 @@@ out_free
        return err;
  }
  
+ static const int attr_file_abi_sizes[] = {
+       [0] = PERF_ATTR_SIZE_VER0,
+       [1] = PERF_ATTR_SIZE_VER1,
+       0,
+ };
+ /*
+  * In the legacy file format, the magic number is not used to encode endianness.
+  * hdr_sz was used to encode endianness. But given that hdr_sz can vary based
+  * on ABI revisions, we need to try all combinations for all endianness to
+  * detect the endianness.
+  */
+ static int try_all_file_abis(uint64_t hdr_sz, struct perf_header *ph)
+ {
+       uint64_t ref_size, attr_size;
+       int i;
+       for (i = 0 ; attr_file_abi_sizes[i]; i++) {
+               ref_size = attr_file_abi_sizes[i]
+                        + sizeof(struct perf_file_section);
+               if (hdr_sz != ref_size) {
+                       attr_size = bswap_64(hdr_sz);
+                       if (attr_size != ref_size)
+                               continue;
+                       ph->needs_swap = true;
+               }
+               pr_debug("ABI%d perf.data file detected, need_swap=%d\n",
+                        i,
+                        ph->needs_swap);
+               return 0;
+       }
+       /* could not determine endianness */
+       return -1;
+ }
+ #define PERF_PIPE_HDR_VER0    16
+ static const size_t attr_pipe_abi_sizes[] = {
+       [0] = PERF_PIPE_HDR_VER0,
+       0,
+ };
+ /*
+  * In the legacy pipe format, there is an implicit assumption that endianness
+  * between host recording the samples, and host parsing the samples is the
+  * same. This is not always the case given that the pipe output may always be
+  * redirected into a file and analyzed on a different machine with possibly a
+  * different endianness and perf_event ABI revisions in the perf tool itself.
+  */
+ static int try_all_pipe_abis(uint64_t hdr_sz, struct perf_header *ph)
+ {
+       u64 attr_size;
+       int i;
+       /* walk the zero-terminated table of known legacy pipe header sizes */
+       for (i = 0 ; attr_pipe_abi_sizes[i]; i++) {
+               if (hdr_sz != attr_pipe_abi_sizes[i]) {
+                       /*
+                        * No native-order match: retry with the bytes
+                        * swapped. The swapped value must be compared with
+                        * the table entry; comparing it with hdr_sz itself
+                        * only matches byte-palindromic values and so misses
+                        * every opposite-endian header.
+                        */
+                       attr_size = bswap_64(hdr_sz);
+                       if (attr_size != attr_pipe_abi_sizes[i])
+                               continue;
+                       /* only the swapped size matched */
+                       ph->needs_swap = true;
+               }
+               pr_debug("Pipe ABI%d perf.data file detected\n", i);
+               return 0;
+       }
+       /* no known header size matched in either byte order */
+       return -1;
+ }
+ static int check_magic_endian(u64 magic, uint64_t hdr_sz,
+                             bool is_pipe, struct perf_header *ph)
+ {
+       int ret;
+       /* check for legacy format */
+       ret = memcmp(&magic, __perf_magic1, sizeof(magic));
+       if (ret == 0) {
+               pr_debug("legacy perf.data format\n");
+               if (is_pipe)
+                       return try_all_pipe_abis(hdr_sz, ph);
+               return try_all_file_abis(hdr_sz, ph);
+       }
+       /*
+        * the new magic number serves two purposes:
+        * - unique number to identify actual perf.data files
+        * - encode endianness of file
+        */
+       /* check magic number with one endianness */
+       if (magic == __perf_magic2)
+               return 0;
+       /* check magic number with opposite endianness */
+       if (magic != __perf_magic2_sw)
+               return -1;
+       ph->needs_swap = true;
+       return 0;
+ }
  int perf_file_header__read(struct perf_file_header *header,
                           struct perf_header *ph, int fd)
  {
+       int ret;
        lseek(fd, 0, SEEK_SET);
  
-       if (readn(fd, header, sizeof(*header)) <= 0 ||
-           memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
+       ret = readn(fd, header, sizeof(*header));
+       if (ret <= 0)
                return -1;
  
-       if (header->attr_size != sizeof(struct perf_file_attr)) {
-               u64 attr_size = bswap_64(header->attr_size);
-               if (attr_size != sizeof(struct perf_file_attr))
-                       return -1;
+       if (check_magic_endian(header->magic,
+                              header->attr_size, false, ph) < 0) {
+               pr_debug("magic/endian check failed\n");
+               return -1;
+       }
  
+       if (ph->needs_swap) {
                mem_bswap_64(header, offsetof(struct perf_file_header,
-                                           adds_features));
-               ph->needs_swap = true;
+                            adds_features));
        }
  
        if (header->size != sizeof(*header)) {
        return 0;
  }
  
- static int __event_process_build_id(struct build_id_event *bev,
-                                   char *filename,
-                                   struct perf_session *session)
- {
-       int err = -1;
-       struct list_head *head;
-       struct machine *machine;
-       u16 misc;
-       struct dso *dso;
-       enum dso_kernel_type dso_type;
-       machine = perf_session__findnew_machine(session, bev->pid);
-       if (!machine)
-               goto out;
-       misc = bev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-       switch (misc) {
-       case PERF_RECORD_MISC_KERNEL:
-               dso_type = DSO_TYPE_KERNEL;
-               head = &machine->kernel_dsos;
-               break;
-       case PERF_RECORD_MISC_GUEST_KERNEL:
-               dso_type = DSO_TYPE_GUEST_KERNEL;
-               head = &machine->kernel_dsos;
-               break;
-       case PERF_RECORD_MISC_USER:
-       case PERF_RECORD_MISC_GUEST_USER:
-               dso_type = DSO_TYPE_USER;
-               head = &machine->user_dsos;
-               break;
-       default:
-               goto out;
-       }
-       dso = __dsos__findnew(head, filename);
-       if (dso != NULL) {
-               char sbuild_id[BUILD_ID_SIZE * 2 + 1];
-               dso__set_build_id(dso, &bev->build_id);
-               if (filename[0] == '[')
-                       dso->kernel = dso_type;
-               build_id__sprintf(dso->build_id, sizeof(dso->build_id),
-                                 sbuild_id);
-               pr_debug("build id event received for %s: %s\n",
-                        dso->long_name, sbuild_id);
-       }
-       err = 0;
- out:
-       return err;
- }
- static int perf_header__read_build_ids_abi_quirk(struct perf_header *header,
-                                                int input, u64 offset, u64 size)
- {
-       struct perf_session *session = container_of(header, struct perf_session, header);
-       struct {
-               struct perf_event_header   header;
-               u8                         build_id[ALIGN(BUILD_ID_SIZE, sizeof(u64))];
-               char                       filename[0];
-       } old_bev;
-       struct build_id_event bev;
-       char filename[PATH_MAX];
-       u64 limit = offset + size;
-       while (offset < limit) {
-               ssize_t len;
-               if (read(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev))
-                       return -1;
-               if (header->needs_swap)
-                       perf_event_header__bswap(&old_bev.header);
-               len = old_bev.header.size - sizeof(old_bev);
-               if (read(input, filename, len) != len)
-                       return -1;
-               bev.header = old_bev.header;
-               /*
-                * As the pid is the missing value, we need to fill
-                * it properly. The header.misc value give us nice hint.
-                */
-               bev.pid = HOST_KERNEL_ID;
-               if (bev.header.misc == PERF_RECORD_MISC_GUEST_USER ||
-                   bev.header.misc == PERF_RECORD_MISC_GUEST_KERNEL)
-                       bev.pid = DEFAULT_GUEST_KERNEL_ID;
-               memcpy(bev.build_id, old_bev.build_id, sizeof(bev.build_id));
-               __event_process_build_id(&bev, filename, session);
-               offset += bev.header.size;
-       }
-       return 0;
- }
- static int perf_header__read_build_ids(struct perf_header *header,
-                                      int input, u64 offset, u64 size)
- {
-       struct perf_session *session = container_of(header, struct perf_session, header);
-       struct build_id_event bev;
-       char filename[PATH_MAX];
-       u64 limit = offset + size, orig_offset = offset;
-       int err = -1;
-       while (offset < limit) {
-               ssize_t len;
-               if (read(input, &bev, sizeof(bev)) != sizeof(bev))
-                       goto out;
-               if (header->needs_swap)
-                       perf_event_header__bswap(&bev.header);
-               len = bev.header.size - sizeof(bev);
-               if (read(input, filename, len) != len)
-                       goto out;
-               /*
-                * The a1645ce1 changeset:
-                *
-                * "perf: 'perf kvm' tool for monitoring guest performance from host"
-                *
-                * Added a field to struct build_id_event that broke the file
-                * format.
-                *
-                * Since the kernel build-id is the first entry, process the
-                * table using the old format if the well known
-                * '[kernel.kallsyms]' string for the kernel build-id has the
-                * first 4 characters chopped off (where the pid_t sits).
-                */
-               if (memcmp(filename, "nel.kallsyms]", 13) == 0) {
-                       if (lseek(input, orig_offset, SEEK_SET) == (off_t)-1)
-                               return -1;
-                       return perf_header__read_build_ids_abi_quirk(header, input, offset, size);
-               }
-               __event_process_build_id(&bev, filename, session);
-               offset += bev.header.size;
-       }
-       err = 0;
- out:
-       return err;
- }
  static int perf_file_section__process(struct perf_file_section *section,
                                      struct perf_header *ph,
                                      int feat, int fd, void *data __used)
                return 0;
        }
  
-       switch (feat) {
-       case HEADER_TRACE_INFO:
-               trace_report(fd, false);
-               break;
-       case HEADER_BUILD_ID:
-               if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
-                       pr_debug("Failed to read buildids, continuing...\n");
-               break;
-       default:
-               break;
-       }
+       if (!feat_ops[feat].process)
+               return 0;
  
-       return 0;
+       return feat_ops[feat].process(section, ph, feat, fd);
  }
  
  static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
                                       struct perf_header *ph, int fd,
                                       bool repipe)
  {
-       if (readn(fd, header, sizeof(*header)) <= 0 ||
-           memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
-               return -1;
+       int ret;
  
-       if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0)
+       ret = readn(fd, header, sizeof(*header));
+       if (ret <= 0)
                return -1;
  
-       if (header->size != sizeof(*header)) {
-               u64 size = bswap_64(header->size);
+       if (check_magic_endian(header->magic, header->size, true, ph) < 0) {
+               pr_debug("endian/magic failed\n");
+               return -1;
+       }
  
-               if (size != sizeof(*header))
-                       return -1;
+       if (ph->needs_swap)
+               header->size = bswap_64(header->size);
  
-               ph->needs_swap = true;
-       }
+       if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0)
+               return -1;
  
        return 0;
  }
@@@ -1908,6 -2050,52 +2050,52 @@@ static int perf_header__read_pipe(struc
        return 0;
  }
  
+ /*
+  * Read one on-file perf_file_attr from fd into f_attr.
+  *
+  * The first PERF_ATTR_SIZE_VER0 bytes of the attr are read first, so that
+  * attr->size (byte-swapped when ph->needs_swap is set) tells how many more
+  * attr bytes the file carries. The perf_file_section for the ids is read
+  * last.
+  *
+  * Returns 0 on success, -1 on a failed read or an unusable attr size.
+  */
+ static int read_attr(int fd, struct perf_header *ph,
+                    struct perf_file_attr *f_attr)
+ {
+       struct perf_event_attr *attr = &f_attr->attr;
+       size_t sz, left;
+       size_t our_sz = sizeof(f_attr->attr);
+       int ret;
+       memset(f_attr, 0, sizeof(*f_attr));
+       /* read minimal guaranteed structure */
+       ret = readn(fd, attr, PERF_ATTR_SIZE_VER0);
+       if (ret <= 0) {
+               pr_debug("cannot read %d bytes of header attr\n",
+                        PERF_ATTR_SIZE_VER0);
+               return -1;
+       }
+       /* on file perf_event_attr size */
+       sz = attr->size;
+       if (ph->needs_swap)
+               sz = bswap_32(sz);
+       if (sz == 0) {
+               /* assume ABI0 */
+               sz =  PERF_ATTR_SIZE_VER0;
+       } else if (sz < PERF_ATTR_SIZE_VER0) {
+               /*
+                * A size below ABI0 would make 'left' wrap around below
+                * and let the second read run past the end of f_attr.
+                */
+               pr_debug("bogus attr size %zu (smaller than ABI0)\n", sz);
+               return -1;
+       } else if (sz > our_sz) {
+               pr_debug("file uses a more recent and unsupported ABI"
+                        " (%zu bytes extra)\n", sz - our_sz);
+               return -1;
+       }
+       /* what we have not yet read and that we know about */
+       left = sz - PERF_ATTR_SIZE_VER0;
+       if (left) {
+               void *ptr = attr;
+               ptr += PERF_ATTR_SIZE_VER0;
+               ret = readn(fd, ptr, left);
+               /* do not continue with a partially filled attr */
+               if (ret <= 0) {
+                       pr_debug("cannot read %zu remaining bytes of header attr\n",
+                                left);
+                       return -1;
+               }
+       }
+       /* read perf_file_section, ids are read in caller */
+       ret = readn(fd, &f_attr->ids, sizeof(f_attr->ids));
+       return ret <= 0 ? -1 : 0;
+ }
  int perf_session__read_header(struct perf_session *session, int fd)
  {
        struct perf_header *header = &session->header;
        if (session->fd_pipe)
                return perf_header__read_pipe(session, fd);
  
-       if (perf_file_header__read(&f_header, header, fd) < 0) {
-               pr_debug("incompatible file format\n");
+       if (perf_file_header__read(&f_header, header, fd) < 0)
                return -EINVAL;
-       }
  
-       nr_attrs = f_header.attrs.size / sizeof(f_attr);
+       nr_attrs = f_header.attrs.size / f_header.attr_size;
        lseek(fd, f_header.attrs.offset, SEEK_SET);
  
        for (i = 0; i < nr_attrs; i++) {
                struct perf_evsel *evsel;
                off_t tmp;
  
-               if (readn(fd, &f_attr, sizeof(f_attr)) <= 0)
+               if (read_attr(fd, header, &f_attr) < 0)
                        goto out_errno;
  
                if (header->needs_swap)
diff --combined tools/perf/util/hist.c
@@@ -50,21 -50,25 +50,25 @@@ static void hists__reset_col_len(struc
                hists__set_col_len(hists, col, 0);
  }
  
+ static void hists__set_unres_dso_col_len(struct hists *hists, int dso)
+ {
+       const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
+       if (hists__col_len(hists, dso) < unresolved_col_width &&
+           !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
+           !symbol_conf.dso_list)
+               hists__set_col_len(hists, dso, unresolved_col_width);
+ }
  static void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
  {
+       const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
        u16 len;
  
        if (h->ms.sym)
-               hists__new_col_len(hists, HISTC_SYMBOL, h->ms.sym->namelen);
-       else {
-               const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
-               if (hists__col_len(hists, HISTC_DSO) < unresolved_col_width &&
-                   !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
-                   !symbol_conf.dso_list)
-                       hists__set_col_len(hists, HISTC_DSO,
-                                          unresolved_col_width);
-       }
+               hists__new_col_len(hists, HISTC_SYMBOL, h->ms.sym->namelen + 4);
+       else
+               hists__set_unres_dso_col_len(hists, HISTC_DSO);
  
        len = thread__comm_len(h->thread);
        if (hists__new_col_len(hists, HISTC_COMM, len))
                len = dso__name_len(h->ms.map->dso);
                hists__new_col_len(hists, HISTC_DSO, len);
        }
+       if (h->branch_info) {
+               int symlen;
+               /*
+                * +4 accounts for '[x] ' priv level info
+                * +2 account of 0x prefix on raw addresses
+                */
+               if (h->branch_info->from.sym) {
+                       symlen = (int)h->branch_info->from.sym->namelen + 4;
+                       hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen);
+                       symlen = dso__name_len(h->branch_info->from.map->dso);
+                       hists__new_col_len(hists, HISTC_DSO_FROM, symlen);
+               } else {
+                       symlen = unresolved_col_width + 4 + 2;
+                       hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen);
+                       hists__set_unres_dso_col_len(hists, HISTC_DSO_FROM);
+               }
+               if (h->branch_info->to.sym) {
+                       symlen = (int)h->branch_info->to.sym->namelen + 4;
+                       hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
+                       symlen = dso__name_len(h->branch_info->to.map->dso);
+                       hists__new_col_len(hists, HISTC_DSO_TO, symlen);
+               } else {
+                       symlen = unresolved_col_width + 4 + 2;
+                       hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
+                       hists__set_unres_dso_col_len(hists, HISTC_DSO_TO);
+               }
+       }
  }
  
  static void hist_entry__add_cpumode_period(struct hist_entry *he,
@@@ -195,26 -230,14 +230,14 @@@ static u8 symbol__parent_filter(const s
        return 0;
  }
  
- struct hist_entry *__hists__add_entry(struct hists *hists,
+ static struct hist_entry *add_hist_entry(struct hists *hists,
+                                     struct hist_entry *entry,
                                      struct addr_location *al,
-                                     struct symbol *sym_parent, u64 period)
+                                     u64 period)
  {
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct hist_entry *he;
-       struct hist_entry entry = {
-               .thread = al->thread,
-               .ms = {
-                       .map    = al->map,
-                       .sym    = al->sym,
-               },
-               .cpu    = al->cpu,
-               .ip     = al->addr,
-               .level  = al->level,
-               .period = period,
-               .parent = sym_parent,
-               .filtered = symbol__parent_filter(sym_parent),
-       };
        int cmp;
  
        pthread_mutex_lock(&hists->lock);
                parent = *p;
                he = rb_entry(parent, struct hist_entry, rb_node_in);
  
-               cmp = hist_entry__cmp(&entry, he);
+               cmp = hist_entry__cmp(entry, he);
  
                if (!cmp) {
                        he->period += period;
                        p = &(*p)->rb_right;
        }
  
-       he = hist_entry__new(&entry);
+       he = hist_entry__new(entry);
        if (!he)
                goto out_unlock;
  
@@@ -252,6 -275,51 +275,51 @@@ out_unlock
        return he;
  }
  
+ struct hist_entry *__hists__add_branch_entry(struct hists *self,
+                                            struct addr_location *al,
+                                            struct symbol *sym_parent,
+                                            struct branch_info *bi,
+                                            u64 period)
+ {
+       struct hist_entry entry = {
+               .thread = al->thread,
+               .ms = {
+                       .map    = bi->to.map,
+                       .sym    = bi->to.sym,
+               },
+               .cpu    = al->cpu,
+               .ip     = bi->to.addr,
+               .level  = al->level,
+               .period = period,
+               .parent = sym_parent,
+               .filtered = symbol__parent_filter(sym_parent),
+               .branch_info = bi,
+       };
+       return add_hist_entry(self, &entry, al, period);
+ }
+ struct hist_entry *__hists__add_entry(struct hists *self,
+                                     struct addr_location *al,
+                                     struct symbol *sym_parent, u64 period)
+ {
+       struct hist_entry entry = {
+               .thread = al->thread,
+               .ms = {
+                       .map    = al->map,
+                       .sym    = al->sym,
+               },
+               .cpu    = al->cpu,
+               .ip     = al->addr,
+               .level  = al->level,
+               .period = period,
+               .parent = sym_parent,
+               .filtered = symbol__parent_filter(sym_parent),
+       };
+       return add_hist_entry(self, &entry, al, period);
+ }
  int64_t
  hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
  {
@@@ -768,7 -836,7 +836,7 @@@ static int hist_entry__pcnt_snprintf(st
                                                     sep ? "%.2f" : "   %6.2f%%",
                                                     (period * 100.0) / total);
                else
 -                      ret = snprintf(s, size, sep ? "%.2f" : "   %6.2f%%",
 +                      ret = scnprintf(s, size, sep ? "%.2f" : "   %6.2f%%",
                                       (period * 100.0) / total);
                if (symbol_conf.show_cpu_utilization) {
                        ret += percent_color_snprintf(s + ret, size - ret,
                        }
                }
        } else
 -              ret = snprintf(s, size, sep ? "%" PRIu64 : "%12" PRIu64 " ", period);
 +              ret = scnprintf(s, size, sep ? "%" PRIu64 : "%12" PRIu64 " ", period);
  
        if (symbol_conf.show_nr_samples) {
                if (sep)
 -                      ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, nr_events);
 +                      ret += scnprintf(s + ret, size - ret, "%c%" PRIu64, *sep, nr_events);
                else
 -                      ret += snprintf(s + ret, size - ret, "%11" PRIu64, nr_events);
 +                      ret += scnprintf(s + ret, size - ret, "%11" PRIu64, nr_events);
        }
  
        if (symbol_conf.show_total_period) {
                if (sep)
 -                      ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period);
 +                      ret += scnprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period);
                else
 -                      ret += snprintf(s + ret, size - ret, " %12" PRIu64, period);
 +                      ret += scnprintf(s + ret, size - ret, " %12" PRIu64, period);
        }
  
        if (pair_hists) {
                diff = new_percent - old_percent;
  
                if (fabs(diff) >= 0.01)
 -                      snprintf(bf, sizeof(bf), "%+4.2F%%", diff);
 +                      ret += scnprintf(bf, sizeof(bf), "%+4.2F%%", diff);
                else
 -                      snprintf(bf, sizeof(bf), " ");
 +                      ret += scnprintf(bf, sizeof(bf), " ");
  
                if (sep)
 -                      ret += snprintf(s + ret, size - ret, "%c%s", *sep, bf);
 +                      ret += scnprintf(s + ret, size - ret, "%c%s", *sep, bf);
                else
 -                      ret += snprintf(s + ret, size - ret, "%11.11s", bf);
 +                      ret += scnprintf(s + ret, size - ret, "%11.11s", bf);
  
                if (show_displacement) {
                        if (displacement)
 -                              snprintf(bf, sizeof(bf), "%+4ld", displacement);
 +                              ret += scnprintf(bf, sizeof(bf), "%+4ld", displacement);
                        else
 -                              snprintf(bf, sizeof(bf), " ");
 +                              ret += scnprintf(bf, sizeof(bf), " ");
  
                        if (sep)
 -                              ret += snprintf(s + ret, size - ret, "%c%s", *sep, bf);
 +                              ret += scnprintf(s + ret, size - ret, "%c%s", *sep, bf);
                        else
 -                              ret += snprintf(s + ret, size - ret, "%6.6s", bf);
 +                              ret += scnprintf(s + ret, size - ret, "%6.6s", bf);
                }
        }
  
@@@ -855,7 -923,7 +923,7 @@@ int hist_entry__snprintf(struct hist_en
                if (se->elide)
                        continue;
  
 -              ret += snprintf(s + ret, size - ret, "%s", sep ?: "  ");
 +              ret += scnprintf(s + ret, size - ret, "%s", sep ?: "  ");
                ret += se->se_snprintf(he, s + ret, size - ret,
                                       hists__col_len(hists, se->se_width_idx));
        }
diff --combined tools/perf/util/sort.c
@@@ -8,6 -8,7 +8,7 @@@ const char       default_sort_order[] = "comm
  const char    *sort_order = default_sort_order;
  int           sort__need_collapse = 0;
  int           sort__has_parent = 0;
+ int           sort__branch_mode = -1; /* -1 = means not set */
  
  enum sort_type        sort__first_dimension;
  
@@@ -33,9 -34,6 +34,9 @@@ static int repsep_snprintf(char *bf, si
                }
        }
        va_end(ap);
 +
 +      if (n >= (int)size)
 +              return size - 1;
        return n;
  }
  
@@@ -97,6 -95,26 +98,26 @@@ static int hist_entry__comm_snprintf(st
        return repsep_snprintf(bf, size, "%*s", width, self->thread->comm);
  }
  
+ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
+ {
+       struct dso *dso_l = map_l ? map_l->dso : NULL;
+       struct dso *dso_r = map_r ? map_r->dso : NULL;
+       const char *dso_name_l, *dso_name_r;
+       if (!dso_l || !dso_r)
+               return cmp_null(dso_l, dso_r);
+       if (verbose) {
+               dso_name_l = dso_l->long_name;
+               dso_name_r = dso_r->long_name;
+       } else {
+               dso_name_l = dso_l->short_name;
+               dso_name_r = dso_r->short_name;
+       }
+       return strcmp(dso_name_l, dso_name_r);
+ }
  struct sort_entry sort_comm = {
        .se_header      = "Command",
        .se_cmp         = sort__comm_cmp,
  static int64_t
  sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
  {
-       struct dso *dso_l = left->ms.map ? left->ms.map->dso : NULL;
-       struct dso *dso_r = right->ms.map ? right->ms.map->dso : NULL;
-       const char *dso_name_l, *dso_name_r;
+       return _sort__dso_cmp(left->ms.map, right->ms.map);
+ }
  
-       if (!dso_l || !dso_r)
-               return cmp_null(dso_l, dso_r);
  
-       if (verbose) {
-               dso_name_l = dso_l->long_name;
-               dso_name_r = dso_r->long_name;
-       } else {
-               dso_name_l = dso_l->short_name;
-               dso_name_r = dso_r->short_name;
+ static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r,
+                             u64 ip_l, u64 ip_r)
+ {
+       if (!sym_l || !sym_r)
+               return cmp_null(sym_l, sym_r);
+       if (sym_l == sym_r)
+               return 0;
+       if (sym_l)
+               ip_l = sym_l->start;
+       if (sym_r)
+               ip_r = sym_r->start;
+       return (int64_t)(ip_r - ip_l);
+ }
+ static int _hist_entry__dso_snprintf(struct map *map, char *bf,
+                                    size_t size, unsigned int width)
+ {
+       if (map && map->dso) {
+               const char *dso_name = !verbose ? map->dso->short_name :
+                       map->dso->long_name;
+               return repsep_snprintf(bf, size, "%-*s", width, dso_name);
        }
  
-       return strcmp(dso_name_l, dso_name_r);
+       return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
  }
  
  static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf,
                                    size_t size, unsigned int width)
  {
-       if (self->ms.map && self->ms.map->dso) {
-               const char *dso_name = !verbose ? self->ms.map->dso->short_name :
-                                                 self->ms.map->dso->long_name;
-               return repsep_snprintf(bf, size, "%-*s", width, dso_name);
+       return _hist_entry__dso_snprintf(self->ms.map, bf, size, width);
+ }
+ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
+                                    u64 ip, char level, char *bf, size_t size,
+                                    unsigned int width __used)
+ {
+       size_t ret = 0;
+       if (verbose) {
+               char o = map ? dso__symtab_origin(map->dso) : '!';
+               ret += repsep_snprintf(bf, size, "%-#*llx %c ",
+                                      BITS_PER_LONG / 4, ip, o);
        }
  
-       return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
+       ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", level);
+       if (sym)
+               ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
+                                      width - ret,
+                                      sym->name);
+       else {
+               size_t len = BITS_PER_LONG / 4;
+               ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx",
+                                      len, ip);
+               ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
+                                      width - ret, "");
+       }
+       return ret;
  }
  
  struct sort_entry sort_dso = {
        .se_header      = "Shared Object",
        .se_cmp         = sort__dso_cmp,
        .se_width_idx   = HISTC_DSO,
  };
  
- /* --sort symbol */
+ static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
+                                   size_t size, unsigned int width __used)
+ {
+       return _hist_entry__sym_snprintf(self->ms.map, self->ms.sym, self->ip,
+                                        self->level, bf, size, width);
+ }
  
+ /* --sort symbol */
  static int64_t
  sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
  {
        ip_l = left->ms.sym->start;
        ip_r = right->ms.sym->start;
  
-       return (int64_t)(ip_r - ip_l);
- }
- static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
-                                   size_t size, unsigned int width __used)
- {
-       size_t ret = 0;
-       if (verbose) {
-               char o = self->ms.map ? dso__symtab_origin(self->ms.map->dso) : '!';
-               ret += repsep_snprintf(bf, size, "%-#*llx %c ",
-                                      BITS_PER_LONG / 4, self->ip, o);
-       }
-       if (!sort_dso.elide)
-               ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", self->level);
-       if (self->ms.sym)
-               ret += repsep_snprintf(bf + ret, size - ret, "%s",
-                                      self->ms.sym->name);
-       else
-               ret += repsep_snprintf(bf + ret, size - ret, "%-#*llx",
-                                      BITS_PER_LONG / 4, self->ip);
-       return ret;
+       return _sort__sym_cmp(left->ms.sym, right->ms.sym, ip_l, ip_r);
  }
  
  struct sort_entry sort_sym = {
@@@ -249,19 -287,155 +290,155 @@@ struct sort_entry sort_cpu = 
        .se_width_idx   = HISTC_CPU,
  };
  
+ /* --sort dso_from: hand both entries' branch-source maps to _sort__dso_cmp() */
+ static int64_t
+ sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right)
+ {
+       struct addr_map_symbol *src_l = &left->branch_info->from;
+       struct addr_map_symbol *src_r = &right->branch_info->from;
+       return _sort__dso_cmp(src_l->map, src_r->map);
+ }
+ /* Print the branch-source map of an entry via _hist_entry__dso_snprintf(). */
+ static int hist_entry__dso_from_snprintf(struct hist_entry *self, char *bf,
+                                   size_t size, unsigned int width)
+ {
+       struct addr_map_symbol *src = &self->branch_info->from;
+       return _hist_entry__dso_snprintf(src->map, bf, size, width);
+ }
+ /* sort_entry registered under the "dso_from" key in sort_dimensions[]. */
+ struct sort_entry sort_dso_from = {
+       .se_width_idx   = HISTC_DSO_FROM,
+       .se_header      = "Source Shared Object",
+       .se_snprintf    = hist_entry__dso_from_snprintf,
+       .se_cmp         = sort__dso_from_cmp,
+ };
+ /* --sort dso_to: hand both entries' branch-target maps to _sort__dso_cmp() */
+ static int64_t
+ sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
+ {
+       struct addr_map_symbol *dst_l = &left->branch_info->to;
+       struct addr_map_symbol *dst_r = &right->branch_info->to;
+       return _sort__dso_cmp(dst_l->map, dst_r->map);
+ }
+ /* Print the branch-target map of an entry via _hist_entry__dso_snprintf(). */
+ static int hist_entry__dso_to_snprintf(struct hist_entry *self, char *bf,
+                                      size_t size, unsigned int width)
+ {
+       struct addr_map_symbol *dst = &self->branch_info->to;
+       return _hist_entry__dso_snprintf(dst->map, bf, size, width);
+ }
+ /*
+  * --sort symbol_from: if at least one side has a resolved source symbol,
+  * defer to _sort__sym_cmp(); otherwise order by the entries' level.
+  */
+ static int64_t
+ sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
+ {
+       struct addr_map_symbol *src_l = &left->branch_info->from;
+       struct addr_map_symbol *src_r = &right->branch_info->from;
+       if (src_l->sym || src_r->sym)
+               return _sort__sym_cmp(src_l->sym, src_r->sym,
+                                     src_l->addr, src_r->addr);
+       return right->level - left->level;
+ }
+ /*
+  * --sort symbol_to: if at least one side has a resolved target symbol,
+  * defer to _sort__sym_cmp(); otherwise order by the entries' level.
+  */
+ static int64_t
+ sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
+ {
+       struct addr_map_symbol *dst_l = &left->branch_info->to;
+       struct addr_map_symbol *dst_r = &right->branch_info->to;
+       if (dst_l->sym || dst_r->sym)
+               return _sort__sym_cmp(dst_l->sym, dst_r->sym,
+                                     dst_l->addr, dst_r->addr);
+       return right->level - left->level;
+ }
+ /* Print the branch source (map, symbol, address) plus the entry's level. */
+ static int hist_entry__sym_from_snprintf(struct hist_entry *self, char *bf,
+                                   size_t size, unsigned int width __used)
+ {
+       return _hist_entry__sym_snprintf(self->branch_info->from.map,
+                                        self->branch_info->from.sym,
+                                        self->branch_info->from.addr,
+                                        self->level, bf, size, width);
+ }
+ /* Print the branch target (map, symbol, address) plus the entry's level. */
+ static int hist_entry__sym_to_snprintf(struct hist_entry *self, char *bf,
+                                   size_t size, unsigned int width __used)
+ {
+       return _hist_entry__sym_snprintf(self->branch_info->to.map,
+                                        self->branch_info->to.sym,
+                                        self->branch_info->to.addr,
+                                        self->level, bf, size, width);
+ }
+ /* sort_entry registered under the "dso_to" key in sort_dimensions[]. */
+ struct sort_entry sort_dso_to = {
+       .se_width_idx   = HISTC_DSO_TO,
+       .se_header      = "Target Shared Object",
+       .se_snprintf    = hist_entry__dso_to_snprintf,
+       .se_cmp         = sort__dso_to_cmp,
+ };
+ /* sort_entry for the branch-source symbol ("symbol_from" sort key). */
+ struct sort_entry sort_sym_from = {
+       .se_width_idx   = HISTC_SYMBOL_FROM,
+       .se_header      = "Source Symbol",
+       .se_snprintf    = hist_entry__sym_from_snprintf,
+       .se_cmp         = sort__sym_from_cmp,
+ };
+ /* sort_entry for the branch-target symbol ("symbol_to" sort key). */
+ struct sort_entry sort_sym_to = {
+       .se_width_idx   = HISTC_SYMBOL_TO,
+       .se_header      = "Target Symbol",
+       .se_snprintf    = hist_entry__sym_to_snprintf,
+       .se_cmp         = sort__sym_to_cmp,
+ };
+ /*
+  * --sort mispredict: returns 0 when both entries carry the same mispred and
+  * predicted flags, 1 otherwise (an equality test, not an ordering).
+  */
+ static int64_t
+ sort__mispredict_cmp(struct hist_entry *left, struct hist_entry *right)
+ {
+       struct branch_info *bi_l = left->branch_info;
+       struct branch_info *bi_r = right->branch_info;
+       if (bi_l->flags.mispred != bi_r->flags.mispred)
+               return 1;
+       return bi_l->flags.predicted != bi_r->flags.predicted;
+ }
+ /*
+  * Format the "Branch Mispredicted" column for one entry: "N" when the
+  * branch was predicted, "Y" when it was mispredicted, "N/A" when neither
+  * flag is set. Returns the value of repsep_snprintf().
+  */
+ static int hist_entry__mispredict_snprintf(struct hist_entry *self, char *bf,
+                                   size_t size, unsigned int width)
+ {
+       /*
+        * Deliberately not static: a static pointer would keep the "N"/"Y"
+        * chosen for an earlier entry and print it for later entries that
+        * have neither flag set, instead of "N/A".
+        */
+       const char *out = "N/A";
+       if (self->branch_info->flags.predicted)
+               out = "N";
+       else if (self->branch_info->flags.mispred)
+               out = "Y";
+       return repsep_snprintf(bf, size, "%-*s", width, out);
+ }
+ /* sort_entry registered under the "mispredict" key in sort_dimensions[]. */
+ struct sort_entry sort_mispredict = {
+       .se_width_idx   = HISTC_MISPREDICT,
+       .se_header      = "Branch Mispredicted",
+       .se_snprintf    = hist_entry__mispredict_snprintf,
+       .se_cmp         = sort__mispredict_cmp,
+ };
  /* One selectable --sort key; sort_dimensions[] holds one element per key. */
  struct sort_dimension {
        const char              *name;  /* key name the user's token is compared against */
        struct sort_entry       *entry; /* sort_entry linked into hist_entry__sort_list when added */
        int                     taken;  /* NOTE(review): presumably marks an already-added key - usage not visible here */
  };
  
+ #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
+ /*
+  * Table of sort keys, placed at their SORT_* index by the DIM() designator.
+  * Each slot's index must agree with its name and entry: "symbol_from" lives
+  * at SORT_SYM_FROM and "symbol_to" at SORT_SYM_TO, matching the name ->
+  * SORT_* mapping used when sort__first_dimension is set.
+  */
  static struct sort_dimension sort_dimensions[] = {
-       { .name = "pid",        .entry = &sort_thread,  },
-       { .name = "comm",       .entry = &sort_comm,    },
-       { .name = "dso",        .entry = &sort_dso,     },
-       { .name = "symbol",     .entry = &sort_sym,     },
-       { .name = "parent",     .entry = &sort_parent,  },
-       { .name = "cpu",        .entry = &sort_cpu,     },
+       DIM(SORT_PID, "pid", sort_thread),
+       DIM(SORT_COMM, "comm", sort_comm),
+       DIM(SORT_DSO, "dso", sort_dso),
+       DIM(SORT_DSO_FROM, "dso_from", sort_dso_from),
+       DIM(SORT_DSO_TO, "dso_to", sort_dso_to),
+       DIM(SORT_SYM, "symbol", sort_sym),
+       DIM(SORT_SYM_FROM, "symbol_from", sort_sym_from),
+       DIM(SORT_SYM_TO, "symbol_to", sort_sym_to),
+       DIM(SORT_PARENT, "parent", sort_parent),
+       DIM(SORT_CPU, "cpu", sort_cpu),
+       DIM(SORT_MISPREDICT, "mispredict", sort_mispredict),
  };
  
  int sort_dimension__add(const char *tok)
  
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
                if (sd->entry == &sort_parent) {
                        int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
                        if (ret) {
                                sort__first_dimension = SORT_PARENT;
                        else if (!strcmp(sd->name, "cpu"))
                                sort__first_dimension = SORT_CPU;
+                       else if (!strcmp(sd->name, "symbol_from"))
+                               sort__first_dimension = SORT_SYM_FROM;
+                       else if (!strcmp(sd->name, "symbol_to"))
+                               sort__first_dimension = SORT_SYM_TO;
+                       else if (!strcmp(sd->name, "dso_from"))
+                               sort__first_dimension = SORT_DSO_FROM;
+                       else if (!strcmp(sd->name, "dso_to"))
+                               sort__first_dimension = SORT_DSO_TO;
+                       else if (!strcmp(sd->name, "mispredict"))
+                               sort__first_dimension = SORT_MISPREDICT;
                }
  
                list_add_tail(&sd->entry->list, &hist_entry__sort_list);
  
                return 0;
        }
        return -ESRCH;
  }
  
@@@ -805,8 -805,11 +805,11 @@@ static struct hist_browser *hist_browse
                self->hists = hists;
                self->b.refresh = hist_browser__refresh;
                self->b.seek = ui_browser__hists_seek;
-               self->b.use_navkeypressed = true,
-               self->has_symbols = sort_sym.list.next != NULL;
+               self->b.use_navkeypressed = true;
+               if (sort__branch_mode == 1)
+                       self->has_symbols = sort_sym_from.list.next != NULL;
+               else
+                       self->has_symbols = sort_sym.list.next != NULL;
        }
  
        return self;
@@@ -837,19 -840,32 +840,32 @@@ static int hists__browser_title(struct 
        unsigned long nr_events = self->stats.nr_events[PERF_RECORD_SAMPLE];
  
        nr_events = convert_unit(nr_events, &unit);
 -      printed = snprintf(bf, size, "Events: %lu%c %s", nr_events, unit, ev_name);
 +      printed = scnprintf(bf, size, "Events: %lu%c %s", nr_events, unit, ev_name);
  
+       /*
+        * scnprintf, like the other title fragments: it returns what was
+        * actually written, so 'printed' can never exceed 'size' and
+        * 'size - printed' cannot wrap after a truncated write.
+        */
+       if (self->uid_filter_str)
+               printed += scnprintf(bf + printed, size - printed,
+                                    ", UID: %s", self->uid_filter_str);
        if (thread)
 -              printed += snprintf(bf + printed, size - printed,
 +              printed += scnprintf(bf + printed, size - printed,
                                    ", Thread: %s(%d)",
                                    (thread->comm_set ? thread->comm : ""),
                                    thread->pid);
        if (dso)
 -              printed += snprintf(bf + printed, size - printed,
 +              printed += scnprintf(bf + printed, size - printed,
                                    ", DSO: %s", dso->short_name);
        return printed;
  }
  
+ /*
+  * Free the first n strings of a popup option array and reset each freed
+  * slot to NULL. Does nothing when n <= 0.
+  */
+ static inline void free_popup_options(char **options, int n)
+ {
+       int remaining = n;
+       while (remaining-- > 0) {
+               free(*options);
+               *options++ = NULL;
+       }
+ }
  static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                                    const char *helpline, const char *ev_name,
                                    bool left_exits,
  {
        struct hists *self = &evsel->hists;
        struct hist_browser *browser = hist_browser__new(self);
+       struct branch_info *bi;
        struct pstack *fstack;
+       char *options[16];
+       int nr_options = 0;
        int key = -1;
  
        if (browser == NULL)
  
        ui_helpline__push(helpline);
  
+       memset(options, 0, sizeof(options));
        while (1) {
                const struct thread *thread = NULL;
                const struct dso *dso = NULL;
-               char *options[16];
-               int nr_options = 0, choice = 0, i,
+               int choice = 0,
                    annotate = -2, zoom_dso = -2, zoom_thread = -2,
-                   browse_map = -2;
+                   annotate_f = -2, annotate_t = -2, browse_map = -2;
+               nr_options = 0;
  
                key = hist_browser__run(browser, ev_name, timer, arg, delay_secs);
  
                        thread = hist_browser__selected_thread(browser);
                        dso = browser->selection->map ? browser->selection->map->dso : NULL;
                }
                switch (key) {
                case K_TAB:
                case K_UNTAB:
                        if (!browser->has_symbols) {
                                ui_browser__warning(&browser->b, delay_secs * 2,
                        "Annotation is only available for symbolic views, "
-                       "include \"sym\" in --sort to use it.");
+                       "include \"sym*\" in --sort to use it.");
                                continue;
                        }
  
                if (!browser->has_symbols)
                        goto add_exit_option;
  
-               if (browser->selection != NULL &&
-                   browser->selection->sym != NULL &&
-                   !browser->selection->map->dso->annotate_warned &&
-                   asprintf(&options[nr_options], "Annotate %s",
-                            browser->selection->sym->name) > 0)
-                       annotate = nr_options++;
+               if (sort__branch_mode == 1) {
+                       /*
+                        * Branch mode offers up to two annotate entries, one
+                        * for the branch source and one for the target.
+                        * he_selection may be NULL (the checks below already
+                        * guard browser->selection), so do not dereference it
+                        * unconditionally.
+                        */
+                       bi = browser->he_selection ?
+                               browser->he_selection->branch_info : NULL;
+                       if (browser->selection != NULL &&
+                           bi &&
+                           bi->from.sym != NULL &&
+                           !bi->from.map->dso->annotate_warned &&
+                               asprintf(&options[nr_options], "Annotate %s",
+                                        bi->from.sym->name) > 0)
+                               annotate_f = nr_options++;
+                       /* skip the target entry when it is the same symbol in the same dso */
+                       if (browser->selection != NULL &&
+                           bi &&
+                           bi->to.sym != NULL &&
+                           !bi->to.map->dso->annotate_warned &&
+                           (bi->to.sym != bi->from.sym ||
+                            bi->to.map->dso != bi->from.map->dso) &&
+                               asprintf(&options[nr_options], "Annotate %s",
+                                        bi->to.sym->name) > 0)
+                               annotate_t = nr_options++;
+               } else {
+                       if (browser->selection != NULL &&
+                           browser->selection->sym != NULL &&
+                           !browser->selection->map->dso->annotate_warned &&
+                               asprintf(&options[nr_options], "Annotate %s",
+                                        browser->selection->sym->name) > 0)
+                               annotate = nr_options++;
+               }
  
                if (thread != NULL &&
                    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
                        browse_map = nr_options++;
  add_exit_option:
                options[nr_options++] = (char *)"Exit";
+ retry_popup_menu:
                choice = ui__popup_menu(nr_options, options);
  
                if (choice == nr_options - 1)
                        break;
  
-               if (choice == -1)
+               if (choice == -1) {
+                       free_popup_options(options, nr_options - 1);
                        continue;
+               }
  
-               if (choice == annotate) {
+               if (choice == annotate || choice == annotate_t || choice == annotate_f) {
                        struct hist_entry *he;
                        int err;
  do_annotate:
                        he = hist_browser__selected_entry(browser);
                        if (he == NULL)
                                continue;
+                       /*
+                        * We stash the branch_info symbol + map into the
+                        * ms so we don't have to rewrite all the annotation
+                        * code to use branch_info.
+                        * In branch mode, the ms struct is not used.
+                        */
+                       if (choice == annotate_f) {
+                               he->ms.sym = he->branch_info->from.sym;
+                               he->ms.map = he->branch_info->from.map;
+                       }  else if (choice == annotate_t) {
+                               he->ms.sym = he->branch_info->to.sym;
+                               he->ms.map = he->branch_info->to.map;
+                       }
                        /*
                         * Don't let this be freed, say, by hists__decay_entry.
                         */
                        err = hist_entry__tui_annotate(he, evsel->idx,
                                                       timer, arg, delay_secs);
                        he->used = false;
+                       /*
+                        * Offer the option to annotate the other branch source or
+                        * target (if they exist) when returning from annotate.
+                        */
+                       if ((err == 'q' || err == CTRL('c'))
+                           && annotate_t != -2 && annotate_f != -2)
+                               goto retry_popup_menu;
                        ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
                        if (err)
                                ui_browser__handle_resize(&browser->b);
                } else if (choice == browse_map)
                        map__browse(browser->selection->map);
                else if (choice == zoom_dso) {
@@@ -1069,6 -1135,7 +1135,7 @@@ out_free_stack
        pstack__delete(fstack);
  out:
        hist_browser__delete(browser);
+       free_popup_options(options, nr_options - 1);
        return key;
  }
  
@@@ -1095,7 -1162,7 +1162,7 @@@ static void perf_evsel_menu__write(stru
                                                       HE_COLORSET_NORMAL);
  
        nr_events = convert_unit(nr_events, &unit);
 -      printed = snprintf(bf, sizeof(bf), "%lu%c%s%s", nr_events,
 +      printed = scnprintf(bf, sizeof(bf), "%lu%c%s%s", nr_events,
                           unit, unit == ' ' ? "" : " ", ev_name);
        slsmg_printf("%s", bf);
  
                if (!current_entry)
                        ui_browser__set_color(browser, HE_COLORSET_TOP);
                nr_events = convert_unit(nr_events, &unit);
 -              snprintf(bf, sizeof(bf), ": %ld%c%schunks LOST!", nr_events,
 -                       unit, unit == ' ' ? "" : " ");
 +              printed += scnprintf(bf, sizeof(bf), ": %ld%c%schunks LOST!",
 +                                   nr_events, unit, unit == ' ' ? "" : " ");
                warn = bf;
        }