core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/
core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/
core-$(CONFIG_KVM) += arch/ia64/kvm/
- core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/
+ core-$(CONFIG_XEN) += arch/ia64/xen/
+drivers-$(CONFIG_KDB) += arch/$(ARCH)/kdb/
drivers-$(CONFIG_PCI) += arch/ia64/pci/
drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
drivers-$(CONFIG_IA64_HP_ZX1) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
boot := arch/s390/boot
- all: image kerntypes.o
-all: image bzImage
++all: image bzImage kerntypes.o
install: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
- image kerntypes.o: vmlinux
-image bzImage: vmlinux
++image bzImage kerntypes.o: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
zfcpdump:
# Makefile for the linux s390-specific parts of the memory manager.
#
-COMPILE_VERSION := __linux_compile_version_id__`hostname | \
- tr -c '[0-9A-Za-z]' '_'`__`date | \
- tr -c '[0-9A-Za-z]' '_'`_t
+COMPILE_VERSION := __linux_compile_version_id__$(shell hostname | \
+ tr -c '[0-9A-Za-z]' '_')__$(shell date | \
+ tr -c '[0-9A-Za-z]' '_')_t
+
+chk-option = $(shell if $(CC) $(CFLAGS) $(1) -S -o /dev/null -xc /dev/null \
+ > /dev/null 2>&1; then echo "$(1)"; fi ;)
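+# Example (illustrative only, not part of the original rule set):
+#   $(call chk-option,-fno-var-tracking)
+# expands to "-fno-var-tracking" if $(CC) accepts that flag, or to nothing.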
+
+# Remove possible '-g' from CFLAGS_KERNEL, since we want to use stabs
+# debug format.
+override CFLAGS_KERNEL := $(shell echo $(CFLAGS_KERNEL) | sed 's/-g//')
EXTRA_CFLAGS := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I.
+# Assume we don't need the flag if the compiler doesn't know about it
+EXTRA_CFLAGS += $(call chk-option,-fno-eliminate-unused-debug-types)
+
- targets := image kerntypes.o
+ targets := image
+ targets += bzImage
+ subdir- := compressed
++targets += kerntypes.o
$(obj)/image: vmlinux FORCE
$(call if_changed,objcopy)
+ $(obj)/bzImage: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,objcopy)
+
+ $(obj)/compressed/vmlinux: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/compressed $@
+
install: $(CONFIGURE) $(obj)/image
sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/image \
- System.map Kerntypes "$(INSTALL_PATH)"
+ System.map "$(INSTALL_PATH)"
If unsure, or if you run an older (pre 4.4) gcc, say N.
+config KDB
+ bool "Built-in Kernel Debugger support"
- depends on DEBUG_KERNEL && !XEN
++ depends on DEBUG_KERNEL
+ select KALLSYMS
+ select KALLSYMS_ALL
+ help
+ This option provides a built-in kernel debugger. The built-in
+ kernel debugger contains commands which allow memory to be examined,
+ instructions to be disassembled and breakpoints to be set. For details,
+ see Documentation/kdb/kdb.mm and the manual pages kdb_bt, kdb_ss, etc.
+ Kdb can also be used via the serial port. Set up the system to
+ have a serial console (see Documentation/serial-console.txt).
+ The key sequence <escape>KDB on the serial port will cause the
+ kernel debugger to be entered with input from the serial port and
+ output to the serial console. If unsure, say N.
+
+config KDB_MODULES
+ tristate "KDB modules"
+ depends on KDB
+ help
+ KDB can be extended by adding your own modules, in directory
+	  kdb/modules. This option selects how these modules should be
+	  compiled: as free-standing modules (select M) or built into the
+	  kernel (select Y). If unsure, say M.
+
+config KDB_OFF
+ bool "KDB off by default"
+ depends on KDB
+ help
+ Normally kdb is activated by default, as long as CONFIG_KDB is set.
+	  If you want to ship a kernel with kdb support but only have kdb
+	  turned on when the user requests it, select this option. When
+	  compiled with CONFIG_KDB_OFF, kdb ignores all events unless you boot
+	  with kdb=on or echo "1" > /proc/sys/kernel/kdb. The option also
+	  works in reverse: if kdb is normally activated, you can boot with
+	  kdb=off or echo "0" > /proc/sys/kernel/kdb to deactivate kdb. If
+	  unsure, say N.
+
+config KDB_CONTINUE_CATASTROPHIC
+ int "KDB continues after catastrophic errors"
+ depends on KDB
+ default "0"
+ help
+ This integer controls the behaviour of kdb when the kernel gets a
+ catastrophic error, i.e. for a panic, oops, NMI or other watchdog
+ tripping. CONFIG_KDB_CONTINUE_CATASTROPHIC interacts with
+ /proc/sys/kernel/kdb and CONFIG_LKCD_DUMP (if your kernel has the
+ LKCD patch).
+ When KDB is active (/proc/sys/kernel/kdb == 1) and a catastrophic
+ error occurs, nothing extra happens until you type 'go'.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 0 (default). The first time
+ you type 'go', kdb warns you. The second time you type 'go', KDB
+ tries to continue - no guarantees that the kernel is still usable.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 1. KDB tries to continue - no
+ guarantees that the kernel is still usable.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 2. If your kernel has the LKCD
+ patch and LKCD is configured to take a dump then KDB forces a dump.
+ Whether or not a dump is taken, KDB forces a reboot.
+ When KDB is not active (/proc/sys/kernel/kdb == 0) and a catastrophic
+	  error occurs, the following steps are automatic; no human
+ intervention is required.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 0 (default) or 1. KDB attempts
+ to continue - no guarantees that the kernel is still usable.
+ CONFIG_KDB_CONTINUE_CATASTROPHIC == 2. If your kernel has the LKCD
+ patch and LKCD is configured to take a dump then KDB automatically
+ forces a dump. Whether or not a dump is taken, KDB forces a
+ reboot.
+ If you are not sure, say 0. Read Documentation/kdb/dump.txt before
+ setting to 2.
+
+config KDB_USB
+ bool "Support for USB Keyboard in KDB"
+ depends on KDB && (USB_OHCI_HCD || USB_EHCI_HCD || USB_UHCI_HCD)
+ help
+ If you want to use kdb from USB keyboards then say Y here. If you
+ say N then kdb can only be used from a PC (AT) keyboard or a serial
+ console.
+
+config KDB_KDUMP
+ bool "Support for Kdump in KDB"
+ depends on KDB
+ select KEXEC
+	default n
+ help
+	  If you want to capture a kdump vmcore of the kernel from KDB, say Y here.
+ If unsure, say N.
+
endmenu
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
- ifeq ($(CONFIG_X86_32),y)
drivers-$(CONFIG_FB) += arch/x86/video/
- endif
+# KDB support
+drivers-$(CONFIG_KDB) += arch/x86/kdb/
+
####
# boot loader support. Several targets are kept for legacy purposes
extern void iounmap(volatile void __iomem *addr);
- #ifdef CONFIG_X86_32
- # include "io_32.h"
+ #ifdef __KERNEL__
+
+ #include <asm-generic/iomap.h>
+
+ #include <linux/vmalloc.h>
+
+ /*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+ #define xlate_dev_kmem_ptr(p) p
+
+ static inline void
+ memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
+ {
+ memset((void __force *)addr, val, count);
+ }
+
+ static inline void
+ memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
+ {
+ memcpy(dst, (const void __force *)src, count);
+ }
+
+ static inline void
+ memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
+ {
+ memcpy((void __force *)dst, src, count);
+ }
+
+ /*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+ /*
+ * Cache management
+ *
+  * This is needed for two cases
+ * 1. Out of order aware processors
+ * 2. Accidentally out of order processors (PPro errata #51)
+ */
+
+ static inline void flush_write_buffers(void)
+ {
+ #if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+ #endif
+ }
+
+ #endif /* __KERNEL__ */
+
+ extern void native_io_delay(void);
+
+ extern int io_delay_type;
+ extern void io_delay_init(void);
+
-#if defined(CONFIG_PARAVIRT)
++#if defined(CONFIG_PARAVIRT_CPU)
+ #include <asm/paravirt.h>
#else
- # include "io_64.h"
+
+ static inline void slow_down_io(void)
+ {
+ native_io_delay();
+ #ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
#endif
+ }
+
+ #endif
+
+ #define BUILDIO(bwl, bw, type) \
+ static inline void out##bwl(unsigned type value, int port) \
+ { \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+ } \
+ \
+ static inline unsigned type in##bwl(int port) \
+ { \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+ } \
+ \
+ static inline void out##bwl##_p(unsigned type value, int port) \
+ { \
+ out##bwl(value, port); \
+ slow_down_io(); \
+ } \
+ \
+ static inline unsigned type in##bwl##_p(int port) \
+ { \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+ } \
+ \
+ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+ { \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) : "d"(port)); \
+ } \
+ \
+ static inline void ins##bwl(int port, void *addr, unsigned long count) \
+ { \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) : "d"(port)); \
+ }
+
+ BUILDIO(b, b, char)
+ BUILDIO(w, w, short)
+ BUILDIO(l, , int)
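+
+ /*
+  * Note (illustrative, added for clarity): each BUILDIO() invocation above
+  * generates one access size of the port I/O family, e.g. BUILDIO(b, b, char)
+  * defines outb(), inb(), outb_p(), inb_p(), outsb() and insb().
+  */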
extern void *xlate_dev_mem_ptr(unsigned long phys);
extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
*/
#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+ #define IA32_SYSCALL_VECTOR 0x80
+ #ifdef CONFIG_X86_32
+ # define SYSCALL_VECTOR 0x80
+ #endif
++#define KDBENTER_VECTOR 0x81
+
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
+ * round up to the next 16-vector boundary
*/
- #define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+ #define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
#endif
#ifdef CONFIG_X86_64
- #if defined(CONFIG_PARAVIRT_MMU) || defined(CONFIG_XEN)
-#ifdef CONFIG_PARAVIRT
++#ifdef CONFIG_PARAVIRT_MMU
/* Paravirtualized systems may not have PSE or PGE available */
#define NEED_PSE 0
#define NEED_PGE 0
--- /dev/null
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2006, 2007-2009 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * Common code for doing accurate backtraces on i386 and x86_64, including
+ * printing the values of arguments.
+ */
+
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/stringify.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/nmi.h>
+#include <asm/asm-offsets.h>
+#include <asm/system.h>
+
+#define KDB_DEBUG_BB(fmt, ...) \
+ {if (KDB_DEBUG(BB)) kdb_printf(fmt, ## __VA_ARGS__);}
+#define KDB_DEBUG_BB_OFFSET_PRINTF(offset, prefix, suffix) \
+	kdb_printf(prefix "%c0x%x" suffix, \
+		   (offset) >= 0 ? '+' : '-', \
+		   (offset) >= 0 ? (offset) : -(offset))
+#define KDB_DEBUG_BB_OFFSET(offset, prefix, suffix) \
+ {if (KDB_DEBUG(BB)) KDB_DEBUG_BB_OFFSET_PRINTF(offset, prefix, suffix);}
+
+#define BB_CHECK(expr, val, ret) \
+({ \
+ if (unlikely(expr)) { \
+ kdb_printf("%s, line %d: BB_CHECK(" #expr ") failed " \
+ #val "=%lx\n", \
+ __FUNCTION__, __LINE__, (long)val); \
+ bb_giveup = 1; \
+ return ret; \
+ } \
+})
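+
+/* Illustrative use (hypothetical caller, not from the original code): abandon
+ * the backtrace and return 0 from an int-returning function when an
+ * impossible register code shows up:
+ *
+ *	BB_CHECK(reg > BBRG_R15, reg, 0);
+ */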
+
+static int bb_giveup;
+
+/* Use BBRG_Rxx for both i386 and x86_64. RAX through R15 must be at the end,
+ * starting with RAX. Some of these codes do not reflect actual registers,
+ * such codes are special cases when parsing the record of register changes.
+ * When updating BBRG_ entries, update bbrg_name as well.
+ */
+
+enum bb_reg_code
+{
+ BBRG_UNDEFINED = 0, /* Register contents are undefined */
+ BBRG_OSP, /* original stack pointer on entry to function */
+ BBRG_RAX,
+ BBRG_RBX,
+ BBRG_RCX,
+ BBRG_RDX,
+ BBRG_RDI,
+ BBRG_RSI,
+ BBRG_RBP,
+ BBRG_RSP,
+ BBRG_R8,
+ BBRG_R9,
+ BBRG_R10,
+ BBRG_R11,
+ BBRG_R12,
+ BBRG_R13,
+ BBRG_R14,
+ BBRG_R15,
+};
+
+static const char *bbrg_name[] = {
+ [BBRG_UNDEFINED] = "undefined",
+ [BBRG_OSP] = "osp",
+ [BBRG_RAX] = "rax",
+ [BBRG_RBX] = "rbx",
+ [BBRG_RCX] = "rcx",
+ [BBRG_RDX] = "rdx",
+ [BBRG_RDI] = "rdi",
+ [BBRG_RSI] = "rsi",
+ [BBRG_RBP] = "rbp",
+ [BBRG_RSP] = "rsp",
+ [BBRG_R8] = "r8",
+ [BBRG_R9] = "r9",
+ [BBRG_R10] = "r10",
+ [BBRG_R11] = "r11",
+ [BBRG_R12] = "r12",
+ [BBRG_R13] = "r13",
+ [BBRG_R14] = "r14",
+ [BBRG_R15] = "r15",
+};
+
+/* Map a register name to its register code. This includes the sub-register
+ * addressable fields, e.g. parts of rax can be addressed as ax, al, ah, eax.
+ * The list is sorted so it can be binary chopped, sort command is:
+ * LANG=C sort -t '"' -k2
+ */
+
+struct bb_reg_code_map {
+ enum bb_reg_code reg;
+ const char *name;
+};
+
+static const struct bb_reg_code_map
+bb_reg_code_map[] = {
+ { BBRG_RAX, "ah" },
+ { BBRG_RAX, "al" },
+ { BBRG_RAX, "ax" },
+ { BBRG_RBX, "bh" },
+ { BBRG_RBX, "bl" },
+ { BBRG_RBP, "bp" },
+ { BBRG_RBP, "bpl" },
+ { BBRG_RBX, "bx" },
+ { BBRG_RCX, "ch" },
+ { BBRG_RCX, "cl" },
+ { BBRG_RCX, "cx" },
+ { BBRG_RDX, "dh" },
+ { BBRG_RDI, "di" },
+ { BBRG_RDI, "dil" },
+ { BBRG_RDX, "dl" },
+ { BBRG_RDX, "dx" },
+ { BBRG_RAX, "eax" },
+ { BBRG_RBP, "ebp" },
+ { BBRG_RBX, "ebx" },
+ { BBRG_RCX, "ecx" },
+ { BBRG_RDI, "edi" },
+ { BBRG_RDX, "edx" },
+ { BBRG_RSI, "esi" },
+ { BBRG_RSP, "esp" },
+ { BBRG_R10, "r10" },
+ { BBRG_R10, "r10d" },
+ { BBRG_R10, "r10l" },
+ { BBRG_R10, "r10w" },
+ { BBRG_R11, "r11" },
+ { BBRG_R11, "r11d" },
+ { BBRG_R11, "r11l" },
+ { BBRG_R11, "r11w" },
+ { BBRG_R12, "r12" },
+ { BBRG_R12, "r12d" },
+ { BBRG_R12, "r12l" },
+ { BBRG_R12, "r12w" },
+ { BBRG_R13, "r13" },
+ { BBRG_R13, "r13d" },
+ { BBRG_R13, "r13l" },
+ { BBRG_R13, "r13w" },
+ { BBRG_R14, "r14" },
+ { BBRG_R14, "r14d" },
+ { BBRG_R14, "r14l" },
+ { BBRG_R14, "r14w" },
+ { BBRG_R15, "r15" },
+ { BBRG_R15, "r15d" },
+ { BBRG_R15, "r15l" },
+ { BBRG_R15, "r15w" },
+ { BBRG_R8, "r8" },
+ { BBRG_R8, "r8d" },
+ { BBRG_R8, "r8l" },
+ { BBRG_R8, "r8w" },
+ { BBRG_R9, "r9" },
+ { BBRG_R9, "r9d" },
+ { BBRG_R9, "r9l" },
+ { BBRG_R9, "r9w" },
+ { BBRG_RAX, "rax" },
+ { BBRG_RBP, "rbp" },
+ { BBRG_RBX, "rbx" },
+ { BBRG_RCX, "rcx" },
+ { BBRG_RDI, "rdi" },
+ { BBRG_RDX, "rdx" },
+ { BBRG_RSI, "rsi" },
+ { BBRG_RSP, "rsp" },
+ { BBRG_RSI, "si" },
+ { BBRG_RSI, "sil" },
+ { BBRG_RSP, "sp" },
+ { BBRG_RSP, "spl" },
+};
+
+/* Record register contents in terms of the values that were passed to this
+ * function, IOW track which registers contain an input value. A register's
+ * contents can be undefined, it can contain an input register value or it can
+ * contain an offset from the original stack pointer.
+ *
+ * This structure is used to represent the current contents of the integer
+ * registers, it is held in an array that is indexed by BBRG_xxx. The element
+ * for BBRG_xxx indicates what input value is currently in BBRG_xxx. When
+ * 'value' is BBRG_OSP then register BBRG_xxx contains a stack pointer,
+ * pointing at 'offset' from the original stack pointer on entry to the
+ * function. When 'value' is not BBRG_OSP then element BBRG_xxx contains the
+ * original contents of an input register and offset is ignored.
+ *
+ * An input register 'value' can be stored in more than one register and/or in
+ * more than one memory location.
+ */
+
+struct bb_reg_contains
+{
+ enum bb_reg_code value: 8;
+ short offset;
+};
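+
+/* Illustrative example (not from the original code): after "mov %rsp,%rbp"
+ * near the top of a function, the tracking entry for RBP would be
+ * { .value = BBRG_OSP, .offset = <RBP's current offset from OSP> }, while
+ * after "mov %rdi,%rbx" the entry for RBX would be
+ * { .value = BBRG_RDI, .offset = 0 } (offset is ignored for non-OSP values).
+ */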
+
+/* Note: the offsets in struct bb_mem_contains in this code are _NOT_ offsets
+ * from OSP, they are offsets from current RSP. It fits better with the way
+ * that struct pt_regs is built, some code pushes extra data before pt_regs so
+ * working with OSP relative offsets gets messy. struct bb_mem_contains
+ * entries must be in descending order of RSP offset.
+ */
+
+typedef struct { DECLARE_BITMAP(bits, BBRG_R15+1); } bbrgmask_t;
+#define BB_SKIP(reg) (1 << (BBRG_ ## reg))
+struct bb_mem_contains {
+ short offset_address;
+ enum bb_reg_code value: 8;
+};
+
+/* Transfer of control to a label outside the current function. If the
+ * transfer is to a known common restore path that expects known registers
+ * and/or a known memory state (e.g. struct pt_regs) then do a sanity check on
+ * the state at this point.
+ */
+
+struct bb_name_state {
+ const char *name; /* target function */
+ bfd_vma address; /* Address of target function */
+ const char *fname; /* optional from function name */
+ const struct bb_mem_contains *mem; /* expected memory state */
+ const struct bb_reg_contains *regs; /* expected register state */
+ const unsigned short mem_size; /* ARRAY_SIZE(mem) */
+ const unsigned short regs_size; /* ARRAY_SIZE(regs) */
+ const short osp_offset; /* RSP in regs == OSP+osp_offset */
+ const bbrgmask_t skip_mem; /* Some slots in mem may be undefined */
+ const bbrgmask_t skip_regs; /* Some slots in regs may be undefined */
+};
+
+/* NS (NAME_STATE) macros define the register and memory state when we transfer
+ * control to or start decoding a special case name. Use NS when the target
+ * label always has the same state. Use NS_FROM and specify the source label
+ * if the target state is slightly different depending on where it is branched
+ * from. This gives better state checking, by isolating the special cases.
+ *
+ * Note: for the same target label, NS_FROM entries must be followed by a
+ * single NS entry.
+ */
+
+#define NS_FROM(iname, ifname, imem, iregs, iskip_mem, iskip_regs, iosp_offset) \
+ { \
+ .name = iname, \
+ .fname = ifname, \
+ .mem = imem, \
+ .regs = iregs, \
+ .mem_size = ARRAY_SIZE(imem), \
+ .regs_size = ARRAY_SIZE(iregs), \
+ .skip_mem.bits[0] = iskip_mem, \
+ .skip_regs.bits[0] = iskip_regs, \
+ .osp_offset = iosp_offset, \
+ .address = 0 \
+ }
+
+/* Shorter forms for the common cases */
+#define NS(iname, imem, iregs, iskip_mem, iskip_regs, iosp_offset) \
+ NS_FROM(iname, NULL, imem, iregs, iskip_mem, iskip_regs, iosp_offset)
+#define NS_MEM(iname, imem, iskip_mem) \
+ NS_FROM(iname, NULL, imem, no_regs, iskip_mem, 0, 0)
+#define NS_MEM_FROM(iname, ifname, imem, iskip_mem) \
+ NS_FROM(iname, ifname, imem, no_regs, iskip_mem, 0, 0)
+#define NS_REG(iname, iregs, iskip_regs) \
+ NS_FROM(iname, NULL, no_memory, iregs, 0, iskip_regs, 0)
+#define NS_REG_FROM(iname, ifname, iregs, iskip_regs) \
+ NS_FROM(iname, ifname, no_memory, iregs, 0, iskip_regs, 0)
+
+static void
+bb_reg_code_set_value(enum bb_reg_code dst, enum bb_reg_code src);
+
+static const char *bb_mod_name, *bb_func_name;
+
+static int
+bb_noret(const char *name)
+{
+ if (strcmp(name, "panic") == 0 ||
+ strcmp(name, "do_exit") == 0 ||
+ strcmp(name, "do_group_exit") == 0 ||
+ strcmp(name, "complete_and_exit") == 0)
+ return 1;
+ return 0;
+}
+
+/*============================================================================*/
+/* */
+/* Most of the basic block code and data is common to x86_64 and i386. This */
+/* large ifdef contains almost all of the differences between the two */
+/* architectures. */
+/* */
+/* Make sure you update the correct section of this ifdef. */
+/* */
+/*============================================================================*/
+
+#ifdef CONFIG_X86_64
+
+/* Registers that can be used to pass parameters, in the order that parameters
+ * are passed.
+ */
+
+static const enum bb_reg_code
+bb_param_reg[] = {
+ BBRG_RDI,
+ BBRG_RSI,
+ BBRG_RDX,
+ BBRG_RCX,
+ BBRG_R8,
+ BBRG_R9,
+};
+
+static const enum bb_reg_code
+bb_preserved_reg[] = {
+ BBRG_RBX,
+ BBRG_RBP,
+ BBRG_RSP,
+ BBRG_R12,
+ BBRG_R13,
+ BBRG_R14,
+ BBRG_R15,
+};
+
+static const struct bb_mem_contains full_pt_regs[] = {
+ { 0x70, BBRG_RDI },
+ { 0x68, BBRG_RSI },
+ { 0x60, BBRG_RDX },
+ { 0x58, BBRG_RCX },
+ { 0x50, BBRG_RAX },
+ { 0x48, BBRG_R8 },
+ { 0x40, BBRG_R9 },
+ { 0x38, BBRG_R10 },
+ { 0x30, BBRG_R11 },
+ { 0x28, BBRG_RBX },
+ { 0x20, BBRG_RBP },
+ { 0x18, BBRG_R12 },
+ { 0x10, BBRG_R13 },
+ { 0x08, BBRG_R14 },
+ { 0x00, BBRG_R15 },
+};
+static const struct bb_mem_contains full_pt_regs_plus_1[] = {
+ { 0x78, BBRG_RDI },
+ { 0x70, BBRG_RSI },
+ { 0x68, BBRG_RDX },
+ { 0x60, BBRG_RCX },
+ { 0x58, BBRG_RAX },
+ { 0x50, BBRG_R8 },
+ { 0x48, BBRG_R9 },
+ { 0x40, BBRG_R10 },
+ { 0x38, BBRG_R11 },
+ { 0x30, BBRG_RBX },
+ { 0x28, BBRG_RBP },
+ { 0x20, BBRG_R12 },
+ { 0x18, BBRG_R13 },
+ { 0x10, BBRG_R14 },
+ { 0x08, BBRG_R15 },
+};
+/*
+ * Going into error_exit we have the hardware pushed error_code on the stack
+ * plus a full pt_regs
+ */
+static const struct bb_mem_contains error_code_full_pt_regs[] = {
+ { 0x78, BBRG_UNDEFINED },
+ { 0x70, BBRG_RDI },
+ { 0x68, BBRG_RSI },
+ { 0x60, BBRG_RDX },
+ { 0x58, BBRG_RCX },
+ { 0x50, BBRG_RAX },
+ { 0x48, BBRG_R8 },
+ { 0x40, BBRG_R9 },
+ { 0x38, BBRG_R10 },
+ { 0x30, BBRG_R11 },
+ { 0x28, BBRG_RBX },
+ { 0x20, BBRG_RBP },
+ { 0x18, BBRG_R12 },
+ { 0x10, BBRG_R13 },
+ { 0x08, BBRG_R14 },
+ { 0x00, BBRG_R15 },
+};
+static const struct bb_mem_contains partial_pt_regs[] = {
+ { 0x40, BBRG_RDI },
+ { 0x38, BBRG_RSI },
+ { 0x30, BBRG_RDX },
+ { 0x28, BBRG_RCX },
+ { 0x20, BBRG_RAX },
+ { 0x18, BBRG_R8 },
+ { 0x10, BBRG_R9 },
+ { 0x08, BBRG_R10 },
+ { 0x00, BBRG_R11 },
+};
+static const struct bb_mem_contains partial_pt_regs_plus_1[] = {
+ { 0x48, BBRG_RDI },
+ { 0x40, BBRG_RSI },
+ { 0x38, BBRG_RDX },
+ { 0x30, BBRG_RCX },
+ { 0x28, BBRG_RAX },
+ { 0x20, BBRG_R8 },
+ { 0x18, BBRG_R9 },
+ { 0x10, BBRG_R10 },
+ { 0x08, BBRG_R11 },
+};
+static const struct bb_mem_contains partial_pt_regs_plus_2[] = {
+ { 0x50, BBRG_RDI },
+ { 0x48, BBRG_RSI },
+ { 0x40, BBRG_RDX },
+ { 0x38, BBRG_RCX },
+ { 0x30, BBRG_RAX },
+ { 0x28, BBRG_R8 },
+ { 0x20, BBRG_R9 },
+ { 0x18, BBRG_R10 },
+ { 0x10, BBRG_R11 },
+};
+static const struct bb_mem_contains no_memory[] = {
+};
+/* Hardware has already pushed an error_code on the stack. Use undefined just
+ * to set the initial stack offset.
+ */
+static const struct bb_mem_contains error_code[] = {
+ { 0x0, BBRG_UNDEFINED },
+};
+/* error_code plus original rax */
+static const struct bb_mem_contains error_code_rax[] = {
+ { 0x8, BBRG_UNDEFINED },
+ { 0x0, BBRG_RAX },
+};
+
+static const struct bb_reg_contains all_regs[] = {
+ [BBRG_RAX] = { BBRG_RAX, 0 },
+ [BBRG_RBX] = { BBRG_RBX, 0 },
+ [BBRG_RCX] = { BBRG_RCX, 0 },
+ [BBRG_RDX] = { BBRG_RDX, 0 },
+ [BBRG_RDI] = { BBRG_RDI, 0 },
+ [BBRG_RSI] = { BBRG_RSI, 0 },
+ [BBRG_RBP] = { BBRG_RBP, 0 },
+ [BBRG_RSP] = { BBRG_OSP, 0 },
+ [BBRG_R8 ] = { BBRG_R8, 0 },
+ [BBRG_R9 ] = { BBRG_R9, 0 },
+ [BBRG_R10] = { BBRG_R10, 0 },
+ [BBRG_R11] = { BBRG_R11, 0 },
+ [BBRG_R12] = { BBRG_R12, 0 },
+ [BBRG_R13] = { BBRG_R13, 0 },
+ [BBRG_R14] = { BBRG_R14, 0 },
+ [BBRG_R15] = { BBRG_R15, 0 },
+};
+static const struct bb_reg_contains no_regs[] = {
+};
+
+static struct bb_name_state bb_special_cases[] = {
+
+ /* First the cases that pass data only in memory. We do not check any
+ * register state for these cases.
+ */
+
+ /* Simple cases, no exceptions */
+ NS_MEM("ia32_ptregs_common", partial_pt_regs_plus_1, 0),
+ NS_MEM("ia32_sysret", partial_pt_regs, 0),
+ NS_MEM("int_careful", partial_pt_regs, 0),
+ NS_MEM("ia32_badarg", partial_pt_regs, 0),
+ NS_MEM("int_restore_rest", full_pt_regs, 0),
+ NS_MEM("int_signal", full_pt_regs, 0),
+ NS_MEM("int_very_careful", partial_pt_regs, 0),
+ NS_MEM("ptregscall_common", full_pt_regs_plus_1, 0),
+ NS_MEM("ret_from_intr", partial_pt_regs_plus_2, 0),
+ NS_MEM("stub32_clone", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_execve", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_fork", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_iopl", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_rt_sigreturn", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_sigaltstack", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_sigreturn", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub32_vfork", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_clone", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_execve", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_fork", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_iopl", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_rt_sigreturn", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_sigaltstack", partial_pt_regs_plus_1, 0),
+ NS_MEM("stub_vfork", partial_pt_regs_plus_1, 0),
+ NS_MEM("sysenter_auditsys", partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11)),
+
+ NS_MEM("paranoid_exit", error_code_full_pt_regs, 0),
+
+ NS_MEM_FROM("ia32_badsys", "ia32_sysenter_target",
+ partial_pt_regs,
+ /* ia32_sysenter_target uses CLEAR_RREGS to clear R8-R11 on
+ * some paths. It also stomps on RAX.
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("ia32_badsys", "ia32_cstar_target",
+ partial_pt_regs,
+ /* ia32_cstar_target uses CLEAR_RREGS to clear R8-R11 on some
+ * paths. It also stomps on RAX. Even more confusing, instead
+ * of storing RCX it stores RBP. WTF?
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("ia32_badsys", "ia32_syscall",
+ partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11)),
+ NS_MEM("ia32_badsys", partial_pt_regs, 0),
+
+#ifdef CONFIG_AUDITSYSCALL
+ NS_MEM_FROM("int_with_check", "sysexit_audit", partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_with_check", "ia32_cstar_target", partial_pt_regs,
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+#endif
+ NS_MEM("int_with_check", no_memory, 0),
+
+ /* Various bits of code branch to int_ret_from_sys_call, with slightly
+ * different missing values in pt_regs.
+ */
+ NS_MEM_FROM("int_ret_from_sys_call", "ret_from_fork",
+ partial_pt_regs,
+ BB_SKIP(R11)),
+ NS_MEM_FROM("int_ret_from_sys_call", "stub_execve",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "stub_rt_sigreturn",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "kernel_execve",
+ partial_pt_regs,
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_syscall",
+ partial_pt_regs,
+ /* ia32_syscall only saves RDI through RCX. */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_sysenter_target",
+ partial_pt_regs,
+ /* ia32_sysenter_target uses CLEAR_RREGS to clear R8-R11 on
+ * some paths. It also stomps on RAX.
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_cstar_target",
+ partial_pt_regs,
+ /* ia32_cstar_target uses CLEAR_RREGS to clear R8-R11 on some
+ * paths. It also stomps on RAX. Even more confusing, instead
+ * of storing RCX it stores RBP. WTF?
+ */
+ BB_SKIP(R8) | BB_SKIP(R9) | BB_SKIP(R10) | BB_SKIP(R11) |
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+ NS_MEM_FROM("int_ret_from_sys_call", "ia32_badsys",
+ partial_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("int_ret_from_sys_call", partial_pt_regs, 0),
+
+#ifdef CONFIG_PREEMPT
+ NS_MEM("retint_kernel", partial_pt_regs, BB_SKIP(RAX)),
+#endif /* CONFIG_PREEMPT */
+
+ NS_MEM("retint_careful", partial_pt_regs, BB_SKIP(RAX)),
+
+ /* Horrible hack: For a brand new x86_64 task, switch_to() branches to
+ * ret_from_fork with a totally different stack state from all the
+ * other tasks that come out of switch_to(). This non-standard state
+ * cannot be represented so just ignore the branch from switch_to() to
+ * ret_from_fork. Due to inlining and linker labels, switch_to() can
+ * appear as several different function labels, including schedule,
+ * context_switch and __sched_text_start.
+ */
+ NS_MEM_FROM("ret_from_fork", "schedule", no_memory, 0),
+ NS_MEM_FROM("ret_from_fork", "__schedule", no_memory, 0),
+ NS_MEM_FROM("ret_from_fork", "__sched_text_start", no_memory, 0),
+ NS_MEM_FROM("ret_from_fork", "context_switch", no_memory, 0),
+ NS_MEM("ret_from_fork", full_pt_regs, 0),
+
+ NS_MEM_FROM("ret_from_sys_call", "ret_from_fork",
+ partial_pt_regs,
+ BB_SKIP(R11)),
+ NS_MEM("ret_from_sys_call", partial_pt_regs, 0),
+
+ NS_MEM("retint_restore_args",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_MEM("retint_swapgs",
+ partial_pt_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ /* Now the cases that pass data in registers. We do not check any
+ * memory state for these cases.
+ */
+
+ NS_REG("bad_put_user",
+ all_regs, BB_SKIP(RBX)),
+
+ NS_REG("bad_get_user",
+ all_regs, BB_SKIP(RAX) | BB_SKIP(RDX)),
+
+ NS_REG("bad_to_user",
+ all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_REG("ia32_ptregs_common",
+ all_regs,
+ 0),
+
+ NS_REG("copy_user_generic_unrolled",
+ all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_REG("copy_user_generic_string",
+ all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RCX)),
+
+ NS_REG("irq_return",
+ all_regs,
+ 0),
+
+ /* Finally the cases that pass data in both registers and memory.
+ */
+
+ NS("invalid_TSS", error_code, all_regs, 0, 0, 0),
+ NS("segment_not_present", error_code, all_regs, 0, 0, 0),
+ NS("alignment_check", error_code, all_regs, 0, 0, 0),
+ NS("page_fault", error_code, all_regs, 0, 0, 0),
+ NS("general_protection", error_code, all_regs, 0, 0, 0),
+ NS("error_entry", error_code_rax, all_regs, 0, BB_SKIP(RAX), -0x10),
+ NS("error_exit", error_code_full_pt_regs, no_regs, 0, 0, 0x30),
+ NS("common_interrupt", error_code, all_regs, 0, 0, -0x8),
+ NS("save_args", error_code, all_regs, 0, 0, -0x50),
+ NS("int3", no_memory, all_regs, 0, 0, -0x80),
+};
+
+static const char *bb_spurious[] = {
+ /* schedule */
+ "thread_return",
+ /* system_call */
+ "system_call_after_swapgs",
+ "system_call_fastpath",
+ "ret_from_sys_call",
+ "sysret_check",
+ "sysret_careful",
+ "sysret_signal",
+ "badsys",
+#ifdef CONFIG_AUDITSYSCALL
+ "auditsys",
+ "sysret_audit",
+#endif
+ "tracesys",
+ "int_ret_from_sys_call",
+ "int_with_check",
+ "int_careful",
+ "int_very_careful",
+ "int_signal",
+ "int_restore_rest",
+ /* common_interrupt */
+ "ret_from_intr",
+ "exit_intr",
+ "retint_with_reschedule",
+ "retint_check",
+ "retint_swapgs",
+ "retint_restore_args",
+ "restore_args",
+ "irq_return",
+ "bad_iret",
+ "retint_careful",
+ "retint_signal",
+#ifdef CONFIG_PREEMPT
+ "retint_kernel",
+#endif /* CONFIG_PREEMPT */
+ /* paranoid_exit */
+ "paranoid_swapgs",
+ "paranoid_restore",
+ "paranoid_userspace",
+ "paranoid_schedule",
+ /* error_entry */
+ "error_swapgs",
+ "error_sti",
+ "error_kernelspace",
+ /* nmi */
+#ifdef CONFIG_TRACE_IRQFLAGS
+ "nmi_swapgs",
+ "nmi_restore",
+ "nmi_userspace",
+ "nmi_schedule",
+#endif
+ /* load_gs_index */
+ "gs_change",
+ "bad_gs",
+ /* ia32_sysenter_target */
+ "sysenter_do_call",
+ "sysenter_dispatch",
+ "sysexit_from_sys_call",
+#ifdef CONFIG_AUDITSYSCALL
+ "sysenter_auditsys",
+ "sysexit_audit",
+#endif
+ "sysenter_tracesys",
+ /* ia32_cstar_target */
+ "cstar_do_call",
+ "cstar_dispatch",
+ "sysretl_from_sys_call",
+#ifdef CONFIG_AUDITSYSCALL
+ "cstar_auditsys",
+ "sysretl_audit",
+#endif
+ "cstar_tracesys",
+ /* ia32_syscall */
+ "ia32_do_call",
+ "ia32_sysret",
+ "ia32_tracesys",
+#ifdef CONFIG_HIBERNATION
+ /* restore_image */
+ "loop",
+ "done",
+#endif /* CONFIG_HIBERNATION */
+#ifdef CONFIG_KPROBES
+ /* jprobe_return */
+ "jprobe_return_end",
+ /* kretprobe_trampoline_holder */
+ "kretprobe_trampoline",
+#endif /* CONFIG_KPROBES */
+#ifdef CONFIG_KEXEC
+ /* relocate_kernel */
+ "relocate_new_kernel",
+#endif /* CONFIG_KEXEC */
- #ifdef CONFIG_PARAVIRT_XEN
++#ifdef CONFIG_XEN
+ /* arch/i386/xen/xen-asm.S */
+ "xen_irq_enable_direct_end",
+ "xen_irq_disable_direct_end",
+ "xen_save_fl_direct_end",
+ "xen_restore_fl_direct_end",
+ "xen_iret_start_crit",
+ "iret_restore_end",
+ "xen_iret_end_crit",
+ "hyper_iret",
+#endif /* CONFIG_XEN */
+};
+
+static const char *bb_hardware_handlers[] = {
+ "system_call",
+ "common_interrupt",
+ "error_entry",
+ "debug",
+ "nmi",
+ "int3",
+ "double_fault",
+ "stack_segment",
+ "machine_check",
+ "kdb_call",
+};
+
+static int
+bb_hardware_pushed_arch(kdb_machreg_t rsp,
+ const struct kdb_activation_record *ar)
+{
+ /* x86_64 interrupt stacks are 16 byte aligned and you must get the
+ * next rsp from stack, it cannot be statically calculated. Do not
+ * include the word at rsp, it is pushed by hardware but is treated as
+ * a normal software return value.
+ *
+ * When an IST switch occurs (e.g. NMI) then the saved rsp points to
+ * another stack entirely. Assume that the IST stack is 16 byte
+ * aligned and just return the size of the hardware data on this stack.
+ * The stack unwind code will take care of the stack switch.
+ */
+ kdb_machreg_t saved_rsp = *((kdb_machreg_t *)rsp + 3);
+ int hardware_pushed = saved_rsp - rsp - KDB_WORD_SIZE;
+ if (hardware_pushed < 4 * KDB_WORD_SIZE ||
+ saved_rsp < ar->stack.logical_start ||
+ saved_rsp >= ar->stack.logical_end)
+ return 4 * KDB_WORD_SIZE;
+ else
+ return hardware_pushed;
+}
+
+static void
+bb_start_block0(void)
+{
+ bb_reg_code_set_value(BBRG_RAX, BBRG_RAX);
+ bb_reg_code_set_value(BBRG_RBX, BBRG_RBX);
+ bb_reg_code_set_value(BBRG_RCX, BBRG_RCX);
+ bb_reg_code_set_value(BBRG_RDX, BBRG_RDX);
+ bb_reg_code_set_value(BBRG_RDI, BBRG_RDI);
+ bb_reg_code_set_value(BBRG_RSI, BBRG_RSI);
+ bb_reg_code_set_value(BBRG_RBP, BBRG_RBP);
+ bb_reg_code_set_value(BBRG_RSP, BBRG_OSP);
+ bb_reg_code_set_value(BBRG_R8, BBRG_R8);
+ bb_reg_code_set_value(BBRG_R9, BBRG_R9);
+ bb_reg_code_set_value(BBRG_R10, BBRG_R10);
+ bb_reg_code_set_value(BBRG_R11, BBRG_R11);
+ bb_reg_code_set_value(BBRG_R12, BBRG_R12);
+ bb_reg_code_set_value(BBRG_R13, BBRG_R13);
+ bb_reg_code_set_value(BBRG_R14, BBRG_R14);
+ bb_reg_code_set_value(BBRG_R15, BBRG_R15);
+}
+
+/* x86_64 does not have a special case for __switch_to */
+
+static void
+bb_fixup_switch_to(char *p)
+{
+}
+
+static int
+bb_asmlinkage_arch(void)
+{
+ return strncmp(bb_func_name, "__down", 6) == 0 ||
+ strncmp(bb_func_name, "__up", 4) == 0 ||
+ strncmp(bb_func_name, "stub_", 5) == 0 ||
+ strcmp(bb_func_name, "ret_from_fork") == 0 ||
+ strcmp(bb_func_name, "ptregscall_common") == 0;
+}
+
+#else /* !CONFIG_X86_64 */
+
+/* Registers that can be used to pass parameters, in the order that parameters
+ * are passed.
+ */
+
+static const enum bb_reg_code
+bb_param_reg[] = {
+ BBRG_RAX,
+ BBRG_RDX,
+ BBRG_RCX,
+};
+
+static const enum bb_reg_code
+bb_preserved_reg[] = {
+ BBRG_RBX,
+ BBRG_RBP,
+ BBRG_RSP,
+ BBRG_RSI,
+ BBRG_RDI,
+};
+
+static const struct bb_mem_contains full_pt_regs[] = {
+ { 0x18, BBRG_RAX },
+ { 0x14, BBRG_RBP },
+ { 0x10, BBRG_RDI },
+ { 0x0c, BBRG_RSI },
+ { 0x08, BBRG_RDX },
+ { 0x04, BBRG_RCX },
+ { 0x00, BBRG_RBX },
+};
+static const struct bb_mem_contains no_memory[] = {
+};
+/* Hardware has already pushed an error_code on the stack. Use undefined just
+ * to set the initial stack offset.
+ */
+static const struct bb_mem_contains error_code[] = {
+ { 0x0, BBRG_UNDEFINED },
+};
+/* rbx already pushed */
+static const struct bb_mem_contains rbx_pushed[] = {
+ { 0x0, BBRG_RBX },
+};
+#ifdef CONFIG_MATH_EMULATION
+static const struct bb_mem_contains mem_fpu_reg_round[] = {
+ { 0xc, BBRG_RBP },
+ { 0x8, BBRG_RSI },
+ { 0x4, BBRG_RDI },
+ { 0x0, BBRG_RBX },
+};
+#endif /* CONFIG_MATH_EMULATION */
+
+static const struct bb_reg_contains all_regs[] = {
+ [BBRG_RAX] = { BBRG_RAX, 0 },
+ [BBRG_RBX] = { BBRG_RBX, 0 },
+ [BBRG_RCX] = { BBRG_RCX, 0 },
+ [BBRG_RDX] = { BBRG_RDX, 0 },
+ [BBRG_RDI] = { BBRG_RDI, 0 },
+ [BBRG_RSI] = { BBRG_RSI, 0 },
+ [BBRG_RBP] = { BBRG_RBP, 0 },
+ [BBRG_RSP] = { BBRG_OSP, 0 },
+};
+static const struct bb_reg_contains no_regs[] = {
+};
+#ifdef CONFIG_MATH_EMULATION
+static const struct bb_reg_contains reg_fpu_reg_round[] = {
+ [BBRG_RBP] = { BBRG_OSP, -0x4 },
+ [BBRG_RSP] = { BBRG_OSP, -0x10 },
+};
+#endif /* CONFIG_MATH_EMULATION */
+
+static struct bb_name_state bb_special_cases[] = {
+
+ /* First the cases that pass data only in memory. We do not check any
+ * register state for these cases.
+ */
+
+ /* Simple cases, no exceptions */
+ NS_MEM("check_userspace", full_pt_regs, 0),
+ NS_MEM("device_not_available_emulate", full_pt_regs, 0),
+ NS_MEM("ldt_ss", full_pt_regs, 0),
+ NS_MEM("no_singlestep", full_pt_regs, 0),
+ NS_MEM("restore_all", full_pt_regs, 0),
+ NS_MEM("restore_nocheck", full_pt_regs, 0),
+ NS_MEM("restore_nocheck_notrace", full_pt_regs, 0),
+ NS_MEM("ret_from_exception", full_pt_regs, 0),
+ NS_MEM("ret_from_fork", full_pt_regs, 0),
+ NS_MEM("ret_from_intr", full_pt_regs, 0),
+ NS_MEM("work_notifysig", full_pt_regs, 0),
+ NS_MEM("work_pending", full_pt_regs, 0),
+
+#ifdef CONFIG_PREEMPT
+ NS_MEM("resume_kernel", full_pt_regs, 0),
+#endif /* CONFIG_PREEMPT */
+
+ NS_MEM("common_interrupt", error_code, 0),
+ NS_MEM("error_code", error_code, 0),
+
+ NS_MEM("bad_put_user", rbx_pushed, 0),
+
+ NS_MEM_FROM("resume_userspace", "syscall_badsys",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM_FROM("resume_userspace", "syscall_fault",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM_FROM("resume_userspace", "syscall_trace_entry",
+ full_pt_regs, BB_SKIP(RAX)),
+ /* Too difficult to trace through the various vm86 functions for now.
+ * They are C functions that start off with some memory state, fiddle
+ * the registers then jmp directly to resume_userspace. For the
+ * moment, just assume that they are valid and do no checks.
+ */
+ NS_FROM("resume_userspace", "do_int",
+ no_memory, no_regs, 0, 0, 0),
+ NS_FROM("resume_userspace", "do_sys_vm86",
+ no_memory, no_regs, 0, 0, 0),
+ NS_FROM("resume_userspace", "handle_vm86_fault",
+ no_memory, no_regs, 0, 0, 0),
+ NS_FROM("resume_userspace", "handle_vm86_trap",
+ no_memory, no_regs, 0, 0, 0),
+ NS_MEM("resume_userspace", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_badsys", "ia32_sysenter_target",
+ full_pt_regs, BB_SKIP(RBP)),
+ NS_MEM("syscall_badsys", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_call", "syscall_trace_entry",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_call", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_exit", "syscall_trace_entry",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_exit", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_exit_work", "ia32_sysenter_target",
+ full_pt_regs, BB_SKIP(RAX) | BB_SKIP(RBP)),
+ NS_MEM_FROM("syscall_exit_work", "system_call",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_exit_work", full_pt_regs, 0),
+
+ NS_MEM_FROM("syscall_trace_entry", "ia32_sysenter_target",
+ full_pt_regs, BB_SKIP(RBP)),
+ NS_MEM_FROM("syscall_trace_entry", "system_call",
+ full_pt_regs, BB_SKIP(RAX)),
+ NS_MEM("syscall_trace_entry", full_pt_regs, 0),
+
+ /* Now the cases that pass data in registers. We do not check any
+ * memory state for these cases.
+ */
+
+ NS_REG("syscall_fault", all_regs, 0),
+
+ NS_REG("bad_get_user", all_regs,
+ BB_SKIP(RAX) | BB_SKIP(RDX)),
+
+ /* Finally the cases that pass data in both registers and memory.
+ */
+
+ /* This entry is redundant now because bb_fixup_switch_to() hides the
+ * jmp __switch_to case, however the entry is left here as
+ * documentation.
+ *
+ * NS("__switch_to", no_memory, no_regs, 0, 0, 0),
+ */
+
+ NS("iret_exc", no_memory, all_regs, 0, 0, 0x20),
+
+#ifdef CONFIG_MATH_EMULATION
+ NS("fpu_reg_round", mem_fpu_reg_round, reg_fpu_reg_round, 0, 0, 0),
+#endif /* CONFIG_MATH_EMULATION */
+};
+
+static const char *bb_spurious[] = {
+ /* ret_from_exception */
+ "ret_from_intr",
+ "check_userspace",
+ "resume_userspace",
+ /* resume_kernel */
+#ifdef CONFIG_PREEMPT
+ "need_resched",
+#endif /* CONFIG_PREEMPT */
+ /* ia32_sysenter_target */
+ "sysenter_past_esp",
+ /* system_call */
+ "no_singlestep",
+ "syscall_call",
+ "syscall_exit",
+ "restore_all",
+ "restore_nocheck",
+ "restore_nocheck_notrace",
+ "ldt_ss",
+ /* do not include iret_exc, it is in a .fixup section */
+ /* work_pending */
+ "work_resched",
+ "work_notifysig",
+#ifdef CONFIG_VM86
+ "work_notifysig_v86",
+#endif /* CONFIG_VM86 */
+ /* page_fault */
+ "error_code",
+ /* device_not_available */
+ "device_not_available_emulate",
+ /* debug */
+ "debug_esp_fix_insn",
+ "debug_stack_correct",
+ /* nmi */
+ "nmi_stack_correct",
+ "nmi_stack_fixup",
+ "nmi_debug_stack_check",
+ "nmi_espfix_stack",
+#ifdef CONFIG_HIBERNATION
+ /* restore_image */
+ "copy_loop",
+ "done",
+#endif /* CONFIG_HIBERNATION */
+#ifdef CONFIG_KPROBES
+ /* jprobe_return */
+ "jprobe_return_end",
+#endif /* CONFIG_KPROBES */
+#ifdef CONFIG_KEXEC
+ /* relocate_kernel */
+ "relocate_new_kernel",
+#endif /* CONFIG_KEXEC */
+#ifdef CONFIG_MATH_EMULATION
+ /* assorted *.S files in arch/i386/math_emu */
+ "Denorm_done",
+ "Denorm_shift_more_than_32",
+ "Denorm_shift_more_than_63",
+ "Denorm_shift_more_than_64",
+ "Do_unmasked_underflow",
+ "Exp_not_underflow",
+ "fpu_Arith_exit",
+ "fpu_reg_round",
+ "fpu_reg_round_signed_special_exit",
+ "fpu_reg_round_special_exit",
+ "L_accum_done",
+ "L_accum_loaded",
+ "L_accum_loop",
+ "L_arg1_larger",
+ "L_bugged",
+ "L_bugged_1",
+ "L_bugged_2",
+ "L_bugged_3",
+ "L_bugged_4",
+ "L_bugged_denorm_486",
+ "L_bugged_round24",
+ "L_bugged_round53",
+ "L_bugged_round64",
+ "LCheck_24_round_up",
+ "LCheck_53_round_up",
+ "LCheck_Round_Overflow",
+ "LCheck_truncate_24",
+ "LCheck_truncate_53",
+ "LCheck_truncate_64",
+ "LDenormal_adj_exponent",
+ "L_deNormalised",
+ "LDo_24_round_up",
+ "LDo_2nd_32_bits",
+ "LDo_2nd_div",
+ "LDo_3rd_32_bits",
+ "LDo_3rd_div",
+ "LDo_53_round_up",
+ "LDo_64_round_up",
+ "L_done",
+ "LDo_truncate_24",
+ "LDown_24",
+ "LDown_53",
+ "LDown_64",
+ "L_entry_bugged",
+ "L_error_exit",
+ "L_exactly_32",
+ "L_exception_exit",
+ "L_exit",
+ "L_exit_nuo_valid",
+ "L_exit_nuo_zero",
+ "L_exit_valid",
+ "L_extent_zero",
+ "LFirst_div_done",
+ "LFirst_div_not_1",
+ "L_Full_Division",
+ "LGreater_Half_24",
+ "LGreater_Half_53",
+ "LGreater_than_1",
+ "LLess_than_1",
+ "L_Make_denorm",
+ "L_more_31_no_low",
+ "L_more_63_no_low",
+ "L_more_than_31",
+ "L_more_than_63",
+ "L_more_than_64",
+ "L_more_than_65",
+ "L_more_than_95",
+ "L_must_be_zero",
+ "L_n_exit",
+ "L_no_adjust",
+ "L_no_bit_lost",
+ "L_no_overflow",
+ "L_no_precision_loss",
+ "L_Normalised",
+ "L_norm_bugged",
+ "L_n_shift_1",
+ "L_nuo_shift_1",
+ "L_overflow",
+ "L_precision_lost_down",
+ "L_precision_lost_up",
+ "LPrevent_2nd_overflow",
+ "LPrevent_3rd_overflow",
+ "LPseudoDenormal",
+ "L_Re_normalise",
+ "LResult_Normalised",
+ "L_round",
+ "LRound_large",
+ "LRound_nearest_24",
+ "LRound_nearest_53",
+ "LRound_nearest_64",
+ "LRound_not_small",
+ "LRound_ovfl",
+ "LRound_precision",
+ "LRound_prep",
+ "L_round_the_result",
+ "LRound_To_24",
+ "LRound_To_53",
+ "LRound_To_64",
+ "LSecond_div_done",
+ "LSecond_div_not_1",
+ "L_shift_1",
+ "L_shift_32",
+ "L_shift_65_nc",
+ "L_shift_done",
+ "Ls_less_than_32",
+ "Ls_more_than_63",
+ "Ls_more_than_95",
+ "L_Store_significand",
+ "L_subtr",
+ "LTest_over",
+ "LTruncate_53",
+ "LTruncate_64",
+ "L_underflow",
+ "L_underflow_to_zero",
+ "LUp_24",
+ "LUp_53",
+ "LUp_64",
+ "L_zero",
+ "Normalise_result",
+ "Signal_underflow",
+ "sqrt_arg_ge_2",
+ "sqrt_get_more_precision",
+ "sqrt_more_prec_large",
+ "sqrt_more_prec_ok",
+ "sqrt_more_prec_small",
+ "sqrt_near_exact",
+ "sqrt_near_exact_large",
+ "sqrt_near_exact_ok",
+ "sqrt_near_exact_small",
+ "sqrt_near_exact_x",
+ "sqrt_prelim_no_adjust",
+ "sqrt_round_result",
+ "sqrt_stage_2_done",
+ "sqrt_stage_2_error",
+ "sqrt_stage_2_finish",
+ "sqrt_stage_2_positive",
+ "sqrt_stage_3_error",
+ "sqrt_stage_3_finished",
+ "sqrt_stage_3_no_error",
+ "sqrt_stage_3_positive",
+ "Unmasked_underflow",
+ "xExp_not_underflow",
+#endif /* CONFIG_MATH_EMULATION */
+};
+
+static const char *bb_hardware_handlers[] = {
+ "ret_from_exception",
+ "system_call",
+ "work_pending",
+ "syscall_fault",
+ "page_fault",
+ "coprocessor_error",
+ "simd_coprocessor_error",
+ "device_not_available",
+ "debug",
+ "nmi",
+ "int3",
+ "overflow",
+ "bounds",
+ "invalid_op",
+ "coprocessor_segment_overrun",
+ "invalid_TSS",
+ "segment_not_present",
+ "stack_segment",
+ "general_protection",
+ "alignment_check",
+ "kdb_call",
+ "divide_error",
+ "machine_check",
+ "spurious_interrupt_bug",
+};
+
+static int
+bb_hardware_pushed_arch(kdb_machreg_t rsp,
+ const struct kdb_activation_record *ar)
+{
+ return (2 * KDB_WORD_SIZE);
+}
+
+static void
+bb_start_block0(void)
+{
+ bb_reg_code_set_value(BBRG_RAX, BBRG_RAX);
+ bb_reg_code_set_value(BBRG_RBX, BBRG_RBX);
+ bb_reg_code_set_value(BBRG_RCX, BBRG_RCX);
+ bb_reg_code_set_value(BBRG_RDX, BBRG_RDX);
+ bb_reg_code_set_value(BBRG_RDI, BBRG_RDI);
+ bb_reg_code_set_value(BBRG_RSI, BBRG_RSI);
+ bb_reg_code_set_value(BBRG_RBP, BBRG_RBP);
+ bb_reg_code_set_value(BBRG_RSP, BBRG_OSP);
+}
+
+/* The i386 code that switches stack in a context switch is an extremely
+ * special case. It saves the rip pointing to a label that is not otherwise
+ * referenced, saves the current rsp then pushes a word. The magic code that
+ * resumes the new task picks up the saved rip and rsp, effectively referencing
+ * a label that otherwise is not used and ignoring the pushed word.
+ *
+ * The simplest way to handle this very strange case is to recognise jmp
+ * address <__switch_to> and treat it as a popfl instruction. This avoids
+ * terminating the block on this jmp and removes one word from the stack state,
+ * which is the end effect of all the magic code.
+ *
+ * Called with the instruction line, starting after the first ':'.
+ */
+
+static void
+bb_fixup_switch_to(char *p)
+{
+ char *p1 = p;
+ p += strspn(p, " \t"); /* start of instruction */
+ if (strncmp(p, "jmp", 3))
+ return;
+ p += strcspn(p, " \t"); /* end of instruction */
+ p += strspn(p, " \t"); /* start of address */
+ p += strcspn(p, " \t"); /* end of address */
+ p += strspn(p, " \t"); /* start of comment */
+ if (strcmp(p, "<__switch_to>") == 0)
+ strcpy(p1, "popfl");
+}
+
+static int
+bb_asmlinkage_arch(void)
+{
+ return strcmp(bb_func_name, "ret_from_exception") == 0 ||
+ strcmp(bb_func_name, "syscall_trace_entry") == 0;
+}
+
+#endif /* CONFIG_X86_64 */
+
+
+/*============================================================================*/
+/* */
+/* Common code and data. */
+/* */
+/*============================================================================*/
+
+
+/* Tracking registers by decoding the instructions is quite a bit harder than
+ * doing the same tracking using compiler generated information. Register
+ * contents can remain in the same register, they can be copied to other
+ * registers, they can be stored on stack or they can be modified/overwritten.
+ * At any one time, there are 0 or more copies of the original value that was
+ * supplied in each register on input to the current function. If a register
+ * exists in multiple places, one copy of that register is the master version,
+ * the others are temporary copies which may or may not be destroyed before the
+ * end of the function.
+ *
+ * The compiler knows which copy of a register is the master and which are
+ * temporary copies, which makes it relatively easy to track register contents
+ * as they are saved and restored. Without that compiler based knowledge, this
+ * code has to track _every_ possible copy of each register, simply because we
+ * do not know which is the master copy and which are temporary copies which
+ * may be destroyed later.
+ *
+ * It gets worse: registers that contain parameters can be copied to other
+ * registers which are then saved on stack in a lower level function. Also the
+ * stack pointer may be held in multiple registers (typically RSP and RBP)
+ * which contain different offsets from the base of the stack on entry to this
+ * function. All of which means that we have to track _all_ register
+ * movements, or at least as much as possible.
+ *
+ * Start with the basic block that contains the start of the function, by
+ * definition all registers contain their initial value. Track each
+ * instruction's effect on register contents, this includes reading from a
+ * parameter register before any write to that register, IOW the register
+ * really does contain a parameter. The register state is represented by a
+ * dynamically sized array with each entry containing :-
+ *
+ * Register name
+ * Location it is copied to (another register or stack + offset)
+ *
+ * Besides the register tracking array, we track which parameter registers are
+ * read before being written, to determine how many parameters are passed in
+ * registers. We also track which registers contain stack pointers, including
+ * their offset from the original stack pointer on entry to the function.
+ *
+ * At each exit from the current basic block (via JMP instruction or drop
+ * through), the register state is cloned to form the state on input to the
+ * target basic block and the target is marked for processing using this state.
+ * When there are multiple ways to enter a basic block (e.g. several JMP
+ * instructions referencing the same target) then there will be multiple sets
+ * of register state to form the "input" for that basic block, there is no
+ * guarantee that all paths to that block will have the same register state.
+ *
+ * As each target block is processed, all the known sets of register state are
+ * merged to form a suitable subset of the state which agrees with all the
+ * inputs. The most common case is where one path to this block copies a
+ * register to another register but another path does not, therefore the copy
+ * is only a temporary and should not be propagated into this block.
+ *
+ * If the target block already has an input state from the current transfer
+ * point and the new input state is identical to the previous input state then
+ * we have reached a steady state for the arc from the current location to the
+ * target block. Therefore there is no need to process the target block again.
+ *
+ * The steps of "process a block, create state for target block(s), pick a new
+ * target block, merge state for target block, process target block" will
+ * continue until all the state changes have propagated all the way down the
+ * basic block tree, including round any cycles in the tree. The merge step
+ * only deletes tracking entries from the input state(s), it never adds a
+ * tracking entry. Therefore the overall algorithm is guaranteed to converge
+ * to a steady state, the worst possible case is that every tracking entry into
+ * a block is deleted, which will result in an empty output state.
+ *
+ * As each instruction is decoded, it is checked to see if this is the point at
+ * which execution left this function. This can be a call to another function
+ * (actually the return address to this function) or is the instruction which
+ * was about to be executed when an interrupt occurred (including an oops).
+ * Save the register state at this point.
+ *
+ * We always know what the registers contain when execution left this function.
+ * For an interrupt, the registers are in struct pt_regs. For a call to
+ * another function, we have already deduced the register state on entry to the
+ * other function by unwinding to the start of that function. Given the
+ * register state on exit from this function plus the known register contents
+ * on entry to the next function, we can determine the stack pointer value on
+ * input to this function. That in turn lets us calculate the address of input
+ * registers that have been stored on stack, giving us the input parameters.
+ * Finally the stack pointer gives us the return address which is the exit
+ * point from the calling function, repeat the unwind process on that function.
+ *
+ * The data that tracks which registers contain input parameters is function
+ * global, not local to any basic block. To determine which input registers
+ * contain parameters, we have to decode the entire function. Otherwise an
+ * exit early in the function might not have read any parameters yet.
+ */
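+
+/* Worked sketch (illustrative, not from the original code): for the prologue
+ *
+ *	push   %rbp		# memory at OSP-0x8 now holds the input RBP
+ *	mov    %rsp,%rbp	# RBP now holds a stack pointer, OSP-0x8
+ *	mov    %rdi,%rbx	# RBX now holds the input value of RDI
+ *
+ * the tracker records that RDI was read before being written (so it carries a
+ * parameter) and that both RSP and RBP hold stack pointers at known offsets
+ * from OSP, which is exactly the state needed to unwind this frame later.
+ */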
+
+/* Record memory contents in terms of the values that were passed to this
+ * function, IOW track which memory locations contain an input value. A memory
+ * location's contents can be undefined, it can contain an input register value
+ * or it can contain an offset from the original stack pointer.
+ *
+ * This structure is used to record register contents that have been stored in
+ * memory. Location (BBRG_OSP + 'offset_address') contains the input value
+ * from register 'value'. When 'value' is BBRG_OSP then offset_value contains
+ * the offset from the original stack pointer that was stored in this memory
+ * location. When 'value' is not BBRG_OSP then the memory location contains
+ * the original contents of an input register and offset_value is ignored.
+ *
+ * An input register 'value' can be stored in more than one register and/or in
+ * more than one memory location.
+ */
+
+struct bb_memory_contains
+{
+ short offset_address;
+ enum bb_reg_code value: 8;
+ short offset_value;
+};
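+
+/* Illustrative example (not from the original code): a "push %rbp" at function
+ * entry would be recorded as { .offset_address = -0x8, .value = BBRG_RBP },
+ * while a slot holding a saved stack pointer would instead use
+ * .value = BBRG_OSP with .offset_value giving that pointer's offset from OSP.
+ */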
+
+/* Track the register state in each basic block. */
+
+struct bb_reg_state
+{
+ /* Indexed by register value 'reg - BBRG_RAX' */
+ struct bb_reg_contains contains[KDB_INT_REGISTERS];
+ int ref_count;
+ int mem_count;
+ /* dynamic size for memory locations, see mem_count */
+ struct bb_memory_contains memory[0];
+};
+
+static struct bb_reg_state *bb_reg_state, *bb_exit_state;
+static int bb_reg_state_max, bb_reg_params, bb_memory_params;
+
+struct bb_actual
+{
+ bfd_vma value;
+ int valid;
+};
+
+/* Contains the actual hex value of a register, plus a valid bit. Indexed by
+ * register value 'reg - BBRG_RAX'
+ */
+static struct bb_actual bb_actual[KDB_INT_REGISTERS];
+
+static bfd_vma bb_func_start, bb_func_end;
+static bfd_vma bb_common_interrupt, bb_error_entry, bb_ret_from_intr,
+ bb_thread_return, bb_sync_regs, bb_save_v86_state,
+ bb__sched_text_start, bb__sched_text_end,
+ bb_save_args, bb_save_rest, bb_save_paranoid;
+
+/* Record jmp instructions, both conditional and unconditional. These form the
+ * arcs between the basic blocks. This is also used to record the state when
+ * one block drops through into the next.
+ *
+ * A bb can have multiple associated bb_jmp entries, one for each jcc
+ * instruction plus at most one bb_jmp for the drop through case. If a bb
+ * drops through to the next bb then the drop through bb_jmp entry will be the
+ * last entry in the set of bb_jmp's that are associated with the bb. This is
+ * enforced by the fact that jcc entries are added during the disassembly phase
+ * of pass 1, the drop through entries are added near the end of pass 1.
+ *
+ * At address 'from' in this block, we have a jump to address 'to'. The
+ * register state at 'from' is copied to the target block.
+ */
+
+struct bb_jmp
+{
+ bfd_vma from;
+ bfd_vma to;
+ struct bb_reg_state *state;
+ unsigned int drop_through: 1;
+};
+
+struct bb
+{
+ bfd_vma start;
+ /* The end address of a basic block is sloppy. It can be the first
+ * byte of the last instruction in the block or it can be the last byte
+ * of the block.
+ */
+ bfd_vma end;
+ unsigned int changed: 1;
+ unsigned int drop_through: 1;
+};
+
+static struct bb **bb_list, *bb_curr;
+static int bb_max, bb_count;
+
+static struct bb_jmp *bb_jmp_list;
+static int bb_jmp_max, bb_jmp_count;
+
+/* Add a new bb entry to the list. This does an insert sort. */
+
+static struct bb *
+bb_new(bfd_vma order)
+{
+ int i, j;
+ struct bb *bb, *p;
+ if (bb_giveup)
+ return NULL;
+ if (bb_count == bb_max) {
+ struct bb **bb_list_new;
+ bb_max += 10;
+ bb_list_new = debug_kmalloc(bb_max*sizeof(*bb_list_new),
+ GFP_ATOMIC);
+ if (!bb_list_new) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return NULL;
+ }
+ memcpy(bb_list_new, bb_list, bb_count*sizeof(*bb_list));
+ debug_kfree(bb_list);
+ bb_list = bb_list_new;
+ }
+ bb = debug_kmalloc(sizeof(*bb), GFP_ATOMIC);
+ if (!bb) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return NULL;
+ }
+ memset(bb, 0, sizeof(*bb));
+ for (i = 0; i < bb_count; ++i) {
+ p = bb_list[i];
+ if ((p->start && p->start > order) ||
+ (p->end && p->end > order))
+ break;
+ }
+ for (j = bb_count-1; j >= i; --j)
+ bb_list[j+1] = bb_list[j];
+ bb_list[i] = bb;
+ ++bb_count;
+ return bb;
+}
+
+/* Add a new bb_jmp entry to the list. This list is not sorted. */
+
+static struct bb_jmp *
+bb_jmp_new(bfd_vma from, bfd_vma to, unsigned int drop_through)
+{
+ struct bb_jmp *bb_jmp;
+ if (bb_giveup)
+ return NULL;
+ if (bb_jmp_count == bb_jmp_max) {
+ struct bb_jmp *bb_jmp_list_new;
+ bb_jmp_max += 10;
+ bb_jmp_list_new =
+ debug_kmalloc(bb_jmp_max*sizeof(*bb_jmp_list_new),
+ GFP_ATOMIC);
+ if (!bb_jmp_list_new) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return NULL;
+ }
+ memcpy(bb_jmp_list_new, bb_jmp_list,
+ bb_jmp_count*sizeof(*bb_jmp_list));
+ debug_kfree(bb_jmp_list);
+ bb_jmp_list = bb_jmp_list_new;
+ }
+ bb_jmp = bb_jmp_list + bb_jmp_count++;
+ bb_jmp->from = from;
+ bb_jmp->to = to;
+ bb_jmp->drop_through = drop_through;
+ bb_jmp->state = NULL;
+ return bb_jmp;
+}
+
+static void
+bb_delete(int i)
+{
+ struct bb *bb = bb_list[i];
+ memcpy(bb_list+i, bb_list+i+1, (bb_count-i-1)*sizeof(*bb_list));
+ bb_list[--bb_count] = NULL;
+ debug_kfree(bb);
+}
+
+static struct bb *
+bb_add(bfd_vma start, bfd_vma end)
+{
+ int i;
+ struct bb *bb;
+ /* Ignore basic blocks whose start address is outside the current
+ * function. These occur for call instructions and for tail recursion.
+ */
+ if (start &&
+ (start < bb_func_start || start >= bb_func_end))
+ return NULL;
+ for (i = 0; i < bb_count; ++i) {
+ bb = bb_list[i];
+ if ((start && bb->start == start) ||
+ (end && bb->end == end))
+ return bb;
+ }
+ bb = bb_new(start ? start : end);
+ if (bb) {
+ bb->start = start;
+ bb->end = end;
+ }
+ return bb;
+}
+
+static struct bb_jmp *
+bb_jmp_add(bfd_vma from, bfd_vma to, unsigned int drop_through)
+{
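+	/* Reuse an existing arc with the same from/to/drop_through values,
+	 * otherwise create a new one.
+	 */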
+ int i;
+ struct bb_jmp *bb_jmp;
+ for (i = 0, bb_jmp = bb_jmp_list; i < bb_jmp_count; ++i, ++bb_jmp) {
+ if (bb_jmp->from == from &&
+ bb_jmp->to == to &&
+ bb_jmp->drop_through == drop_through)
+ return bb_jmp;
+ }
+ bb_jmp = bb_jmp_new(from, to, drop_through);
+ return bb_jmp;
+}
+
+static unsigned long bb_curr_addr, bb_exit_addr;
+static char bb_buffer[256]; /* A bit too big to go on stack */
+
+/* Computed jmp uses 'jmp *addr(,%reg,[48])' where 'addr' is the start of a
+ * table of addresses that point into the current function. Run the table and
+ * generate bb starts for each target address plus a bb_jmp from this address
+ * to the target address.
+ *
+ * Only called for 'jmp' instructions, with the pointer starting at 'jmp'.
+ */
+
+static void
+bb_pass1_computed_jmp(char *p)
+{
+ unsigned long table, scale;
+ kdb_machreg_t addr;
+	struct bb *bb;
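+	/* At this point p points at text of the form "jmp *0x<table>(,%reg,8)"
+	 * (scale 8 on x86_64, 4 on i386).  Skip the mnemonic, then insist on
+	 * the leading '*' before reading the table address.
+	 */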
+ p += strcspn(p, " \t"); /* end of instruction */
+ p += strspn(p, " \t"); /* start of address */
+ if (*p++ != '*')
+ return;
+ table = simple_strtoul(p, &p, 0);
+ if (strncmp(p, "(,%", 3) != 0)
+ return;
+ p += 3;
+ p += strcspn(p, ","); /* end of reg */
+ if (*p++ != ',')
+ return;
+ scale = simple_strtoul(p, &p, 0);
+ if (scale != KDB_WORD_SIZE || strcmp(p, ")"))
+ return;
+ while (!bb_giveup) {
+ if (kdb_getword(&addr, table, sizeof(addr)))
+ return;
+ if (addr < bb_func_start || addr >= bb_func_end)
+ return;
+ bb = bb_add(addr, 0);
+ if (bb)
+ bb_jmp_add(bb_curr_addr, addr, 0);
+ table += KDB_WORD_SIZE;
+ }
+}
+
+/* Pass 1, identify the start and end of each basic block */
+
+static int
+bb_dis_pass1(PTR file, const char *fmt, ...)
+{
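+	/* The disassembler emits each line of output as a series of fprintf
+	 * calls; accumulate the fragments in bb_buffer and only parse the
+	 * line once a '\n' has been seen.
+	 */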
+ int l = strlen(bb_buffer);
+ char *p;
+ va_list ap;
+ va_start(ap, fmt);
+ vsnprintf(bb_buffer + l, sizeof(bb_buffer) - l, fmt, ap);
+ va_end(ap);
+ if ((p = strchr(bb_buffer, '\n'))) {
+ *p = '\0';
+ /* ret[q], iret[q], sysexit, sysret, ud2a or jmp[q] end a
+ * block. As does a call to a function marked noret.
+ */
+ p = bb_buffer;
+ p += strcspn(p, ":");
+ if (*p++ == ':') {
+ bb_fixup_switch_to(p);
+ p += strspn(p, " \t"); /* start of instruction */
+ if (strncmp(p, "ret", 3) == 0 ||
+ strncmp(p, "iret", 4) == 0 ||
+ strncmp(p, "sysexit", 7) == 0 ||
+ strncmp(p, "sysret", 6) == 0 ||
+ strncmp(p, "ud2a", 4) == 0 ||
+ strncmp(p, "jmp", 3) == 0) {
+ if (strncmp(p, "jmp", 3) == 0)
+ bb_pass1_computed_jmp(p);
+ bb_add(0, bb_curr_addr);
+			}
+ if (strncmp(p, "call", 4) == 0) {
+ strsep(&p, " \t"); /* end of opcode */
+ if (p)
+ p += strspn(p, " \t"); /* operand(s) */
+ if (p && strchr(p, '<')) {
+ p = strchr(p, '<') + 1;
+ *strchr(p, '>') = '\0';
+ if (bb_noret(p))
+ bb_add(0, bb_curr_addr);
+ }
+			}
+ }
+ bb_buffer[0] = '\0';
+ }
+ return 0;
+}
+
+static void
+bb_printaddr_pass1(bfd_vma addr, disassemble_info *dip)
+{
+ kdb_symtab_t symtab;
+ unsigned int offset;
+	struct bb *bb;
+ /* disasm only calls the printaddr routine for the target of jmp, loop
+ * or call instructions, i.e. the start of a basic block. call is
+ * ignored by bb_add because the target address is outside the current
+ * function.
+ */
+ dip->fprintf_func(dip->stream, "0x%lx", addr);
+ kdbnearsym(addr, &symtab);
+ if (symtab.sym_name) {
+ dip->fprintf_func(dip->stream, " <%s", symtab.sym_name);
+ if ((offset = addr - symtab.sym_start))
+ dip->fprintf_func(dip->stream, "+0x%x", offset);
+ dip->fprintf_func(dip->stream, ">");
+ }
+ bb = bb_add(addr, 0);
+ if (bb)
+ bb_jmp_add(bb_curr_addr, addr, 0);
+}
+
+static void
+bb_pass1(void)
+{
+ int i;
+ unsigned long addr;
+ struct bb *bb;
+ struct bb_jmp *bb_jmp;
+
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: func_name %s func_start " kdb_bfd_vma_fmt0
+ " func_end " kdb_bfd_vma_fmt0 "\n",
+ __FUNCTION__,
+ bb_func_name,
+ bb_func_start,
+ bb_func_end);
+ kdb_di.fprintf_func = bb_dis_pass1;
+ kdb_di.print_address_func = bb_printaddr_pass1;
+
+ bb_add(bb_func_start, 0);
+ for (bb_curr_addr = bb_func_start;
+ bb_curr_addr < bb_func_end;
+ ++bb_curr_addr) {
+ unsigned char c;
+ if (kdb_getarea(c, bb_curr_addr)) {
+ kdb_printf("%s: unreadable function code at ",
+ __FUNCTION__);
+ kdb_symbol_print(bb_curr_addr, NULL, KDB_SP_DEFAULT);
+ kdb_printf(", giving up\n");
+ bb_giveup = 1;
+ return;
+ }
+ }
+ for (addr = bb_func_start; addr < bb_func_end; ) {
+ bb_curr_addr = addr;
+ addr += kdba_id_printinsn(addr, &kdb_di);
+ kdb_di.fprintf_func(NULL, "\n");
+ }
+ if (bb_giveup)
+ goto out;
+
+ /* Special case: a block consisting of a single instruction which is
+ * both the target of a jmp and is also an ending instruction, so we
+ * add two blocks using the same address, one as a start and one as an
+ * end, in no guaranteed order. The end must be ordered after the
+ * start.
+ */
+ for (i = 0; i < bb_count-1; ++i) {
+ struct bb *bb1 = bb_list[i], *bb2 = bb_list[i+1];
+ if (bb1->end && bb1->end == bb2->start) {
+ bb = bb_list[i+1];
+ bb_list[i+1] = bb_list[i];
+ bb_list[i] = bb;
+ }
+ }
+
+ /* Some bb have a start address, some have an end address. Collapse
+ * them into entries that have both start and end addresses. The first
+ * entry is guaranteed to have a start address.
+ */
+ for (i = 0; i < bb_count-1; ++i) {
+ struct bb *bb1 = bb_list[i], *bb2 = bb_list[i+1];
+ if (bb1->end)
+ continue;
+ if (bb2->start) {
+ bb1->end = bb2->start - 1;
+ bb1->drop_through = 1;
+ bb_jmp_add(bb1->end, bb2->start, 1);
+ } else {
+ bb1->end = bb2->end;
+ bb_delete(i+1);
+ }
+ }
+ bb = bb_list[bb_count-1];
+ if (!bb->end)
+ bb->end = bb_func_end - 1;
+
+ /* It would be nice to check that all bb have a valid start and end
+ * address but there is just too much garbage code in the kernel to do
+ * that check. Aligned functions in assembler code mean that there is
+ * space between the end of one function and the start of the next and
+ * that space contains previous code from the assembler's buffers. It
+ * looks like dead code with nothing that branches to it, so no start
+ * address. do_sys_vm86() ends with 'jmp resume_userspace' which the C
+ * compiler does not know about so gcc appends the normal exit code,
+ * again nothing branches to this dangling code.
+ *
+ * The best we can do is delete bb entries with no start address.
+ */
+ for (i = 0; i < bb_count; ++i) {
+ struct bb *bb = bb_list[i];
+ if (!bb->start)
+ bb_delete(i--);
+ }
+ for (i = 0; i < bb_count; ++i) {
+ struct bb *bb = bb_list[i];
+ if (!bb->end) {
+ kdb_printf("%s: incomplete bb state\n", __FUNCTION__);
+ bb_giveup = 1;
+ goto debug;
+ }
+ }
+
+out:
+ if (!KDB_DEBUG(BB))
+ return;
+debug:
+ kdb_printf("%s: end\n", __FUNCTION__);
+ for (i = 0; i < bb_count; ++i) {
+ bb = bb_list[i];
+ kdb_printf(" bb[%d] start "
+ kdb_bfd_vma_fmt0
+ " end " kdb_bfd_vma_fmt0
+ " drop_through %d",
+ i, bb->start, bb->end, bb->drop_through);
+ kdb_printf("\n");
+ }
+ for (i = 0; i < bb_jmp_count; ++i) {
+ bb_jmp = bb_jmp_list + i;
+ kdb_printf(" bb_jmp[%d] from "
+ kdb_bfd_vma_fmt0
+ " to " kdb_bfd_vma_fmt0
+ " drop_through %d\n",
+ i, bb_jmp->from, bb_jmp->to, bb_jmp->drop_through);
+ }
+}
+
+/* Pass 2, record register changes in each basic block */
+
+/* For each opcode that we care about, indicate how it uses its operands. Most
+ * opcodes can be handled generically because they completely specify their
+ * operands in the instruction; however, many opcodes have side effects such as
+ * reading or writing rax or updating rsp. Instructions that change registers
+ * that are not listed in the operands must be handled as special cases. In
+ * addition, instructions that copy registers while preserving their contents
+ * (push, pop, mov) or change the contents in a well defined way (add with an
+ * immediate, lea) must be handled as special cases in order to track the
+ * register contents.
+ *
+ * The tables below only list opcodes that are actually used in the Linux
+ * kernel, so they omit most of the floating point and all of the SSE type
+ * instructions. The operand usage entries only cater for accesses to memory
+ * and to the integer registers, accesses to floating point registers and flags
+ * are not relevant for kernel backtraces.
+ */
+
+enum bb_operand_usage {
+ BBOU_UNKNOWN = 0,
+	/* Generic entries.  Because xchg can do any combination of
+	 * read src, write src, read dst and write dst we need to
+	 * define all 16 possibilities.  These are ordered by rs = 1,
+	 * rd = 2, ws = 4, wd = 8; the bb_usage_x*() functions rely on
+	 * this order.
+ */
+ BBOU_RS = 1, /* read src */ /* 1 */
+ BBOU_RD, /* read dst */ /* 2 */
+ BBOU_RSRD, /* 3 */
+ BBOU_WS, /* write src */ /* 4 */
+ BBOU_RSWS, /* 5 */
+ BBOU_RDWS, /* 6 */
+ BBOU_RSRDWS, /* 7 */
+ BBOU_WD, /* write dst */ /* 8 */
+ BBOU_RSWD, /* 9 */
+ BBOU_RDWD, /* 10 */
+ BBOU_RSRDWD, /* 11 */
+ BBOU_WSWD, /* 12 */
+ BBOU_RSWSWD, /* 13 */
+ BBOU_RDWSWD, /* 14 */
+ BBOU_RSRDWSWD, /* 15 */
+ /* opcode specific entries */
+ BBOU_ADD,
+ BBOU_AND,
+ BBOU_CALL,
+ BBOU_CBW,
+ BBOU_CMOV,
+ BBOU_CMPXCHG,
+ BBOU_CMPXCHGD,
+ BBOU_CPUID,
+ BBOU_CWD,
+ BBOU_DIV,
+ BBOU_IDIV,
+ BBOU_IMUL,
+ BBOU_IRET,
+ BBOU_JMP,
+ BBOU_LAHF,
+ BBOU_LEA,
+ BBOU_LEAVE,
+ BBOU_LODS,
+ BBOU_LOOP,
+ BBOU_LSS,
+ BBOU_MONITOR,
+ BBOU_MOV,
+ BBOU_MOVS,
+ BBOU_MUL,
+ BBOU_MWAIT,
+ BBOU_NOP,
+ BBOU_OUTS,
+ BBOU_POP,
+ BBOU_POPF,
+ BBOU_PUSH,
+ BBOU_PUSHF,
+ BBOU_RDMSR,
+ BBOU_RDTSC,
+ BBOU_RET,
+ BBOU_SAHF,
+ BBOU_SCAS,
+ BBOU_SUB,
+ BBOU_SYSEXIT,
+ BBOU_SYSRET,
+ BBOU_WRMSR,
+ BBOU_XADD,
+ BBOU_XCHG,
+ BBOU_XOR,
+};
+
+struct bb_opcode_usage {
+ int length;
+ enum bb_operand_usage usage;
+ const char *opcode;
+};
+
+/* This table is sorted in alphabetical order of opcode, except that the
+ * trailing '"' is treated as a high value. For example, 'in' sorts after
+ * 'inc', 'bt' after 'btc'. This modified sort order ensures that shorter
+ * opcodes come after long ones. A normal sort would put 'in' first, so 'in'
+ * would match both 'inc' and 'in'. When adding any new entries to this table,
+ * be careful to put shorter entries last in their group.
+ *
+ * To automatically sort the table (in vi)
+ * Mark the first and last opcode line with 'a and 'b
+ * 'a
+ * !'bsed -e 's/"}/}}/' | LANG=C sort -t '"' -k2 | sed -e 's/}}/"}/'
+ *
+ * If a new instruction has to be added, first consider if it affects registers
+ * other than those listed in the operands. Also consider if you want to track
+ * the results of issuing the instruction, IOW can you extract useful
+ * information by looking in detail at the modified registers or memory. If
+ * either test is true then you need a special case to handle the instruction.
+ *
+ * The generic entries at the start of enum bb_operand_usage all have one thing
+ * in common, if a register or memory location is updated then that location
+ * becomes undefined, i.e. we lose track of anything that was previously saved
+ * in that location. So only use a generic BBOU_* value when the result of the
+ * instruction cannot be calculated exactly _and_ when all the affected
+ * registers are listed in the operands.
+ *
+ * Examples:
+ *
+ * 'call' does not generate a known result, but as a side effect of call,
+ * several scratch registers become undefined, so it needs a special BBOU_CALL
+ * entry.
+ *
+ * 'adc' generates a variable result, it depends on the carry flag, so 'adc'
+ * gets a generic entry. 'add' can generate an exact result (add with
+ * immediate on a register that points to the stack) or it can generate an
+ * unknown result (add a variable, or add immediate to a register that does not
+ * contain a stack pointer) so 'add' has its own BBOU_ADD entry.
+ */
+
+static const struct bb_opcode_usage
+bb_opcode_usage_all[] = {
+ {3, BBOU_RSRDWD, "adc"},
+ {3, BBOU_ADD, "add"},
+ {3, BBOU_AND, "and"},
+ {3, BBOU_RSWD, "bsf"},
+ {3, BBOU_RSWD, "bsr"},
+ {5, BBOU_RSWS, "bswap"},
+ {3, BBOU_RSRDWD, "btc"},
+ {3, BBOU_RSRDWD, "btr"},
+ {3, BBOU_RSRDWD, "bts"},
+ {2, BBOU_RSRD, "bt"},
+ {4, BBOU_CALL, "call"},
+ {4, BBOU_CBW, "cbtw"}, /* Intel cbw */
+ {3, BBOU_NOP, "clc"},
+ {3, BBOU_NOP, "cld"},
+ {7, BBOU_RS, "clflush"},
+ {4, BBOU_NOP, "clgi"},
+ {3, BBOU_NOP, "cli"},
+ {4, BBOU_CWD, "cltd"}, /* Intel cdq */
+ {4, BBOU_CBW, "cltq"}, /* Intel cdqe */
+ {4, BBOU_NOP, "clts"},
+ {4, BBOU_CMOV, "cmov"},
+ {9, BBOU_CMPXCHGD,"cmpxchg16"},
+ {8, BBOU_CMPXCHGD,"cmpxchg8"},
+ {7, BBOU_CMPXCHG, "cmpxchg"},
+ {3, BBOU_RSRD, "cmp"},
+ {5, BBOU_CPUID, "cpuid"},
+	{4, BBOU_CWD, "cqto"}, /* Intel cqo */
+ {4, BBOU_CWD, "cwtd"}, /* Intel cwd */
+ {4, BBOU_CBW, "cwtl"}, /* Intel cwde */
+ {4, BBOU_NOP, "data"}, /* alternative ASM_NOP<n> generates data16 on x86_64 */
+ {3, BBOU_RSWS, "dec"},
+ {3, BBOU_DIV, "div"},
+ {5, BBOU_RS, "fdivl"},
+ {5, BBOU_NOP, "finit"},
+ {6, BBOU_RS, "fistpl"},
+ {4, BBOU_RS, "fldl"},
+ {4, BBOU_RS, "fmul"},
+ {6, BBOU_NOP, "fnclex"},
+ {6, BBOU_NOP, "fninit"},
+ {6, BBOU_RS, "fnsave"},
+ {7, BBOU_NOP, "fnsetpm"},
+ {6, BBOU_RS, "frstor"},
+ {5, BBOU_WS, "fstsw"},
+ {5, BBOU_RS, "fsubp"},
+ {5, BBOU_NOP, "fwait"},
+ {7, BBOU_RS, "fxrstor"},
+ {6, BBOU_RS, "fxsave"},
+ {3, BBOU_NOP, "hlt"},
+ {4, BBOU_IDIV, "idiv"},
+ {4, BBOU_IMUL, "imul"},
+ {3, BBOU_RSWS, "inc"},
+ {3, BBOU_NOP, "int"},
+ {7, BBOU_RSRD, "invlpga"},
+ {6, BBOU_RS, "invlpg"},
+ {2, BBOU_RSWD, "in"},
+ {4, BBOU_IRET, "iret"},
+ {1, BBOU_JMP, "j"},
+ {4, BBOU_LAHF, "lahf"},
+ {3, BBOU_RSWD, "lar"},
+ {5, BBOU_RS, "lcall"},
+ {5, BBOU_LEAVE, "leave"},
+ {3, BBOU_LEA, "lea"},
+ {6, BBOU_NOP, "lfence"},
+ {4, BBOU_RS, "lgdt"},
+ {4, BBOU_RS, "lidt"},
+ {4, BBOU_RS, "ljmp"},
+ {4, BBOU_RS, "lldt"},
+ {4, BBOU_RS, "lmsw"},
+ {4, BBOU_LODS, "lods"},
+ {4, BBOU_LOOP, "loop"},
+ {4, BBOU_NOP, "lret"},
+ {3, BBOU_RSWD, "lsl"},
+ {3, BBOU_LSS, "lss"},
+ {3, BBOU_RS, "ltr"},
+ {6, BBOU_NOP, "mfence"},
+ {7, BBOU_MONITOR, "monitor"},
+ {4, BBOU_MOVS, "movs"},
+ {3, BBOU_MOV, "mov"},
+ {3, BBOU_MUL, "mul"},
+ {5, BBOU_MWAIT, "mwait"},
+ {3, BBOU_RSWS, "neg"},
+ {3, BBOU_NOP, "nop"},
+ {3, BBOU_RSWS, "not"},
+ {2, BBOU_RSRDWD, "or"},
+ {4, BBOU_OUTS, "outs"},
+ {3, BBOU_RSRD, "out"},
+ {5, BBOU_NOP, "pause"},
+ {4, BBOU_POPF, "popf"},
+ {3, BBOU_POP, "pop"},
+ {8, BBOU_RS, "prefetch"},
+ {5, BBOU_PUSHF, "pushf"},
+ {4, BBOU_PUSH, "push"},
+ {3, BBOU_RSRDWD, "rcl"},
+ {3, BBOU_RSRDWD, "rcr"},
+ {5, BBOU_RDMSR, "rdmsr"},
+ {5, BBOU_RDMSR, "rdpmc"}, /* same side effects as rdmsr */
+ {5, BBOU_RDTSC, "rdtsc"},
+ {3, BBOU_RET, "ret"},
+ {3, BBOU_RSRDWD, "rol"},
+ {3, BBOU_RSRDWD, "ror"},
+ {4, BBOU_SAHF, "sahf"},
+ {3, BBOU_RSRDWD, "sar"},
+ {3, BBOU_RSRDWD, "sbb"},
+ {4, BBOU_SCAS, "scas"},
+ {3, BBOU_WS, "set"},
+ {6, BBOU_NOP, "sfence"},
+ {4, BBOU_WS, "sgdt"},
+ {3, BBOU_RSRDWD, "shl"},
+ {3, BBOU_RSRDWD, "shr"},
+ {4, BBOU_WS, "sidt"},
+ {4, BBOU_WS, "sldt"},
+ {3, BBOU_NOP, "stc"},
+ {3, BBOU_NOP, "std"},
+ {4, BBOU_NOP, "stgi"},
+ {3, BBOU_NOP, "sti"},
+ {4, BBOU_SCAS, "stos"},
+ {4, BBOU_WS, "strl"},
+ {3, BBOU_WS, "str"},
+ {3, BBOU_SUB, "sub"},
+ {6, BBOU_NOP, "swapgs"},
+ {7, BBOU_SYSEXIT, "sysexit"},
+ {6, BBOU_SYSRET, "sysret"},
+ {4, BBOU_NOP, "test"},
+ {4, BBOU_NOP, "ud2a"},
+ {7, BBOU_RS, "vmclear"},
+ {8, BBOU_NOP, "vmlaunch"},
+ {6, BBOU_RS, "vmload"},
+ {7, BBOU_RS, "vmptrld"},
+ {6, BBOU_WD, "vmread"}, /* vmread src is an encoding, not a register */
+ {8, BBOU_NOP, "vmresume"},
+ {5, BBOU_RS, "vmrun"},
+ {6, BBOU_RS, "vmsave"},
+ {7, BBOU_WD, "vmwrite"}, /* vmwrite src is an encoding, not a register */
+ {3, BBOU_NOP, "vmxoff"},
+ {6, BBOU_NOP, "wbinvd"},
+ {5, BBOU_WRMSR, "wrmsr"},
+ {4, BBOU_XADD, "xadd"},
+ {4, BBOU_XCHG, "xchg"},
+ {3, BBOU_XOR, "xor"},
+ {4, BBOU_NOP, "xrstor"},
+ {4, BBOU_NOP, "xsave"},
+ {10, BBOU_WS, "xstore-rng"},
+};
+
+/* To speed up searching, index bb_opcode_usage_all by the first letter of each
+ * opcode.
+ */
+static struct {
+ const struct bb_opcode_usage *opcode;
+ int size;
+} bb_opcode_usage[26];
+
+struct bb_operand {
+ char *base;
+ char *index;
+ char *segment;
+ long disp;
+ unsigned int scale;
+ enum bb_reg_code base_rc; /* UNDEFINED or RAX through R15 */
+ enum bb_reg_code index_rc; /* UNDEFINED or RAX through R15 */
+ unsigned int present :1;
+ unsigned int disp_present :1;
+ unsigned int indirect :1; /* must be combined with reg or memory */
+ unsigned int immediate :1; /* exactly one of these 3 must be set */
+ unsigned int reg :1;
+ unsigned int memory :1;
+};
+
+struct bb_decode {
+ char *prefix;
+ char *opcode;
+ const struct bb_opcode_usage *match;
+ struct bb_operand src;
+ struct bb_operand dst;
+ struct bb_operand dst2;
+};
+
+static struct bb_decode bb_decode;
+
+static enum bb_reg_code
+bb_reg_map(const char *reg)
+{
+ int lo, hi, c;
+ const struct bb_reg_code_map *p;
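+	/* Binary search of bb_reg_code_map by name.  reg+1 skips the leading
+	 * '%' that the disassembler prints before register names.
+	 */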
+ lo = 0;
+ hi = ARRAY_SIZE(bb_reg_code_map) - 1;
+ while (lo <= hi) {
+ int mid = (hi + lo) / 2;
+ p = bb_reg_code_map + mid;
+ c = strcmp(p->name, reg+1);
+ if (c == 0)
+ return p->reg;
+ else if (c > 0)
+ hi = mid - 1;
+ else
+ lo = mid + 1;
+ }
+ return BBRG_UNDEFINED;
+}
+
+static void
+bb_parse_operand(char *str, struct bb_operand *operand)
+{
+ char *p = str;
+ int sign = 1;
+ operand->present = 1;
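+	/* Operands arrive in AT&T syntax, e.g. "$0x10" (immediate), "%rax"
+	 * (register), "-0x8(%rbp)" or "0x4(,%rax,8)" (memory), "*%rax"
+	 * (indirect), optionally preceded by a "%gs:" style segment prefix.
+	 * Classify the operand and extract any displacement, base, index and
+	 * scale.
+	 */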
+ /* extract any segment prefix */
+ if (p[0] == '%' && p[1] && p[2] == 's' && p[3] == ':') {
+ operand->memory = 1;
+ operand->segment = p;
+ p[3] = '\0';
+ p += 4;
+ }
+ /* extract displacement, base, index, scale */
+ if (*p == '*') {
+ /* jmp/call *disp(%reg), *%reg or *0xnnn */
+ operand->indirect = 1;
+ ++p;
+ }
+ if (*p == '-') {
+ sign = -1;
+ ++p;
+ }
+ if (*p == '$') {
+ operand->immediate = 1;
+ operand->disp_present = 1;
+ operand->disp = simple_strtoul(p+1, &p, 0);
+ } else if (isdigit(*p)) {
+ operand->memory = 1;
+ operand->disp_present = 1;
+ operand->disp = simple_strtoul(p, &p, 0) * sign;
+ }
+ if (*p == '%') {
+ operand->reg = 1;
+ operand->base = p;
+ } else if (*p == '(') {
+ operand->memory = 1;
+ operand->base = ++p;
+ p += strcspn(p, ",)");
+ if (p == operand->base)
+ operand->base = NULL;
+ if (*p == ',') {
+ *p = '\0';
+ operand->index = ++p;
+ p += strcspn(p, ",)");
+ if (p == operand->index)
+ operand->index = NULL;
+ }
+ if (*p == ',') {
+ *p = '\0';
+ operand->scale = simple_strtoul(p+1, &p, 0);
+ }
+ *p = '\0';
+ } else if (*p) {
+ kdb_printf("%s: unexpected token '%c' after disp '%s'\n",
+ __FUNCTION__, *p, str);
+ bb_giveup = 1;
+ }
+ if ((operand->immediate + operand->reg + operand->memory != 1) ||
+ (operand->indirect && operand->immediate)) {
+ kdb_printf("%s: incorrect decode '%s' N %d I %d R %d M %d\n",
+ __FUNCTION__, str,
+ operand->indirect, operand->immediate, operand->reg,
+ operand->memory);
+ bb_giveup = 1;
+ }
+ if (operand->base)
+ operand->base_rc = bb_reg_map(operand->base);
+ if (operand->index)
+ operand->index_rc = bb_reg_map(operand->index);
+}
+
+static void
+bb_print_operand(const char *type, const struct bb_operand *operand)
+{
+ if (!operand->present)
+ return;
+ kdb_printf(" %s %c%c: ",
+ type,
+ operand->indirect ? 'N' : ' ',
+ operand->immediate ? 'I' :
+ operand->reg ? 'R' :
+ operand->memory ? 'M' :
+ '?'
+ );
+ if (operand->segment)
+ kdb_printf("%s:", operand->segment);
+ if (operand->immediate) {
+ kdb_printf("$0x%lx", operand->disp);
+ } else if (operand->reg) {
+ if (operand->indirect)
+ kdb_printf("*");
+ kdb_printf("%s", operand->base);
+ } else if (operand->memory) {
+ if (operand->indirect && (operand->base || operand->index))
+ kdb_printf("*");
+ if (operand->disp_present) {
+ kdb_printf("0x%lx", operand->disp);
+ }
+ if (operand->base || operand->index || operand->scale) {
+ kdb_printf("(");
+ if (operand->base)
+ kdb_printf("%s", operand->base);
+ if (operand->index || operand->scale)
+ kdb_printf(",");
+ if (operand->index)
+ kdb_printf("%s", operand->index);
+ if (operand->scale)
+ kdb_printf(",%d", operand->scale);
+ kdb_printf(")");
+ }
+ }
+ if (operand->base_rc)
+ kdb_printf(" base_rc %d (%s)",
+ operand->base_rc, bbrg_name[operand->base_rc]);
+ if (operand->index_rc)
+ kdb_printf(" index_rc %d (%s)",
+ operand->index_rc,
+ bbrg_name[operand->index_rc]);
+ kdb_printf("\n");
+}
+
+static void
+bb_print_opcode(void)
+{
+ const struct bb_opcode_usage *o = bb_decode.match;
+ kdb_printf(" ");
+ if (bb_decode.prefix)
+ kdb_printf("%s ", bb_decode.prefix);
+ kdb_printf("opcode '%s' matched by '%s', usage %d\n",
+ bb_decode.opcode, o->opcode, o->usage);
+}
+
+static int
+bb_parse_opcode(void)
+{
+ int c, i;
+ const struct bb_opcode_usage *o;
+ static int bb_parse_opcode_error_limit = 5;
+ c = bb_decode.opcode[0] - 'a';
+ if (c < 0 || c >= ARRAY_SIZE(bb_opcode_usage))
+ goto nomatch;
+ o = bb_opcode_usage[c].opcode;
+ if (!o)
+ goto nomatch;
+ for (i = 0; i < bb_opcode_usage[c].size; ++i, ++o) {
+ if (strncmp(bb_decode.opcode, o->opcode, o->length) == 0) {
+ bb_decode.match = o;
+ if (KDB_DEBUG(BB))
+ bb_print_opcode();
+ return 0;
+ }
+ }
+nomatch:
+ if (!bb_parse_opcode_error_limit)
+ return 1;
+ --bb_parse_opcode_error_limit;
+ kdb_printf("%s: no match at [%s]%s " kdb_bfd_vma_fmt0 " - '%s'\n",
+ __FUNCTION__,
+ bb_mod_name, bb_func_name, bb_curr_addr,
+ bb_decode.opcode);
+ return 1;
+}
+
+static bool
+bb_is_int_reg(enum bb_reg_code reg)
+{
+ return reg >= BBRG_RAX && reg < (BBRG_RAX + KDB_INT_REGISTERS);
+}
+
+static bool
+bb_is_simple_memory(const struct bb_operand *operand)
+{
+ return operand->memory &&
+ bb_is_int_reg(operand->base_rc) &&
+ !operand->index_rc &&
+ operand->scale == 0 &&
+ !operand->segment;
+}
+
+static bool
+bb_is_static_disp(const struct bb_operand *operand)
+{
+ return operand->memory &&
+ !operand->base_rc &&
+ !operand->index_rc &&
+ operand->scale == 0 &&
+ !operand->segment &&
+ !operand->indirect;
+}
+
+static enum bb_reg_code
+bb_reg_code_value(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_reg_state->contains[reg - BBRG_RAX].value;
+}
+
+static short
+bb_reg_code_offset(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_reg_state->contains[reg - BBRG_RAX].offset;
+}
+
+static void
+bb_reg_code_set_value(enum bb_reg_code dst, enum bb_reg_code src)
+{
+ BB_CHECK(!bb_is_int_reg(dst), dst, );
+ bb_reg_state->contains[dst - BBRG_RAX].value = src;
+}
+
+static void
+bb_reg_code_set_offset(enum bb_reg_code dst, short offset)
+{
+ BB_CHECK(!bb_is_int_reg(dst), dst, );
+ bb_reg_state->contains[dst - BBRG_RAX].offset = offset;
+}
+
+static bool
+bb_is_osp_defined(enum bb_reg_code reg)
+{
+ if (bb_is_int_reg(reg))
+ return bb_reg_code_value(reg) == BBRG_OSP;
+ else
+ return 0;
+}
+
+static bfd_vma
+bb_actual_value(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_actual[reg - BBRG_RAX].value;
+}
+
+static int
+bb_actual_valid(enum bb_reg_code reg)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, 0);
+ return bb_actual[reg - BBRG_RAX].valid;
+}
+
+static void
+bb_actual_set_value(enum bb_reg_code reg, bfd_vma value)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, );
+ bb_actual[reg - BBRG_RAX].value = value;
+}
+
+static void
+bb_actual_set_valid(enum bb_reg_code reg, int valid)
+{
+ BB_CHECK(!bb_is_int_reg(reg), reg, );
+ bb_actual[reg - BBRG_RAX].valid = valid;
+}
+
+/* The scheduler code switches RSP then does PUSH; it is not an error for RSP
+ * to be undefined in this area of the code.
+ */
+static bool
+bb_is_scheduler_address(void)
+{
+ return bb_curr_addr >= bb__sched_text_start &&
+ bb_curr_addr < bb__sched_text_end;
+}
+
+static void
+bb_reg_read(enum bb_reg_code reg)
+{
+ int i, r = 0;
+ if (!bb_is_int_reg(reg) ||
+ bb_reg_code_value(reg) != reg)
+ return;
+ for (i = 0;
+ i < min_t(unsigned int, REGPARM, ARRAY_SIZE(bb_param_reg));
+ ++i) {
+ if (reg == bb_param_reg[i]) {
+ r = i + 1;
+ break;
+ }
+ }
+ bb_reg_params = max(bb_reg_params, r);
+}
+
+static void
+bb_do_reg_state_print(const struct bb_reg_state *s)
+{
+ int i, offset_address, offset_value;
+ const struct bb_memory_contains *c;
+ enum bb_reg_code value;
+ kdb_printf(" bb_reg_state %p\n", s);
+ for (i = 0; i < ARRAY_SIZE(s->contains); ++i) {
+ value = s->contains[i].value;
+ offset_value = s->contains[i].offset;
+ kdb_printf(" %s = %s",
+ bbrg_name[i + BBRG_RAX], bbrg_name[value]);
+ if (value == BBRG_OSP)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset_value, "", "");
+ kdb_printf("\n");
+ }
+ for (i = 0, c = s->memory; i < s->mem_count; ++i, ++c) {
+ offset_address = c->offset_address;
+ value = c->value;
+ offset_value = c->offset_value;
+ kdb_printf(" slot %d offset_address %c0x%x %s",
+ i,
+ offset_address >= 0 ? '+' : '-',
+ offset_address >= 0 ? offset_address : -offset_address,
+ bbrg_name[value]);
+ if (value == BBRG_OSP)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset_value, "", "");
+ kdb_printf("\n");
+ }
+}
+
+static void
+bb_reg_state_print(const struct bb_reg_state *s)
+{
+ if (KDB_DEBUG(BB))
+ bb_do_reg_state_print(s);
+}
+
+/* Set register 'dst' to contain the value from 'src'. This includes reading
+ * from 'src' and writing to 'dst'. The offset value is copied iff 'src'
+ * contains a stack pointer.
+ *
+ * Be very careful about the context here. 'dst' and 'src' reflect integer
+ * registers by name, _not_ by the value of their contents. "mov %rax,%rsi"
+ * will call this function as bb_reg_set_reg(BBRG_RSI, BBRG_RAX), which
+ * reflects what the assembler code is doing. However we need to track the
+ * _values_ in the registers, not their names. IOW, we really care about "what
+ * value does rax contain when it is copied into rsi?", so we can record the
+ * fact that we now have two copies of that value, one in rax and one in rsi.
+ */
+
+static void
+bb_reg_set_reg(enum bb_reg_code dst, enum bb_reg_code src)
+{
+ enum bb_reg_code src_value = BBRG_UNDEFINED;
+ short offset_value = 0;
+ KDB_DEBUG_BB(" %s = %s", bbrg_name[dst], bbrg_name[src]);
+ if (bb_is_int_reg(src)) {
+ bb_reg_read(src);
+ src_value = bb_reg_code_value(src);
+ KDB_DEBUG_BB(" (%s", bbrg_name[src_value]);
+ if (bb_is_osp_defined(src)) {
+ offset_value = bb_reg_code_offset(src);
+ KDB_DEBUG_BB_OFFSET(offset_value, "", "");
+ }
+ KDB_DEBUG_BB(")");
+ }
+ if (bb_is_int_reg(dst)) {
+ bb_reg_code_set_value(dst, src_value);
+ bb_reg_code_set_offset(dst, offset_value);
+ }
+ KDB_DEBUG_BB("\n");
+}
+
+static void
+bb_reg_set_undef(enum bb_reg_code dst)
+{
+ bb_reg_set_reg(dst, BBRG_UNDEFINED);
+}
+
+/* Delete any record of a stored register held in osp + 'offset' */
+
+static void
+bb_delete_memory(short offset)
+{
+ int i;
+ struct bb_memory_contains *c;
+ for (i = 0, c = bb_reg_state->memory;
+ i < bb_reg_state->mem_count;
+ ++i, ++c) {
+ if (c->offset_address == offset &&
+ c->value != BBRG_UNDEFINED) {
+ KDB_DEBUG_BB(" delete %s from ",
+ bbrg_name[c->value]);
+ KDB_DEBUG_BB_OFFSET(offset, "osp", "");
+ KDB_DEBUG_BB(" slot %d\n",
+ (int)(c - bb_reg_state->memory));
+ memset(c, BBRG_UNDEFINED, sizeof(*c));
+ if (i == bb_reg_state->mem_count - 1)
+ --bb_reg_state->mem_count;
+ }
+ }
+}
+
+/* Set memory location *('dst' + 'offset_address') to contain the supplied
+ * value and offset. 'dst' is assumed to be a register that contains a stack
+ * pointer.
+ */
+
+static void
+bb_memory_set_reg_value(enum bb_reg_code dst, short offset_address,
+ enum bb_reg_code value, short offset_value)
+{
+ int i;
+ struct bb_memory_contains *c, *free = NULL;
+ BB_CHECK(!bb_is_osp_defined(dst), dst, );
+ KDB_DEBUG_BB(" *(%s", bbrg_name[dst]);
+ KDB_DEBUG_BB_OFFSET(offset_address, "", "");
+ offset_address += bb_reg_code_offset(dst);
+ KDB_DEBUG_BB_OFFSET(offset_address, " osp", ") = ");
+ KDB_DEBUG_BB("%s", bbrg_name[value]);
+ if (value == BBRG_OSP)
+ KDB_DEBUG_BB_OFFSET(offset_value, "", "");
+ for (i = 0, c = bb_reg_state->memory;
+ i < bb_reg_state_max;
+ ++i, ++c) {
+ if (c->offset_address == offset_address)
+ free = c;
+ else if (c->value == BBRG_UNDEFINED && !free)
+ free = c;
+ }
+ if (!free) {
+ struct bb_reg_state *new, *old = bb_reg_state;
+ size_t old_size, new_size;
+ int slot;
+ old_size = sizeof(*old) + bb_reg_state_max *
+ sizeof(old->memory[0]);
+ slot = bb_reg_state_max;
+ bb_reg_state_max += 5;
+ new_size = sizeof(*new) + bb_reg_state_max *
+ sizeof(new->memory[0]);
+ new = debug_kmalloc(new_size, GFP_ATOMIC);
+ if (!new) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ } else {
+ memcpy(new, old, old_size);
+ memset((char *)new + old_size, BBRG_UNDEFINED,
+ new_size - old_size);
+ bb_reg_state = new;
+ debug_kfree(old);
+ free = bb_reg_state->memory + slot;
+ }
+ }
+ if (free) {
+ int slot = free - bb_reg_state->memory;
+ free->offset_address = offset_address;
+ free->value = value;
+ free->offset_value = offset_value;
+ KDB_DEBUG_BB(" slot %d", slot);
+ bb_reg_state->mem_count = max(bb_reg_state->mem_count, slot+1);
+ }
+ KDB_DEBUG_BB("\n");
+}
+
+/* Set memory location *('dst' + 'offset') to contain the value from register
+ * 'src'. 'dst' is assumed to be a register that contains a stack pointer.
+ * This differs from bb_memory_set_reg_value because it takes a src register
+ * which contains a value and possibly an offset; bb_memory_set_reg_value is
+ * passed the value and offset directly.
+ */
+
+static void
+bb_memory_set_reg(enum bb_reg_code dst, enum bb_reg_code src,
+ short offset_address)
+{
+ int offset_value;
+ enum bb_reg_code value;
+ BB_CHECK(!bb_is_osp_defined(dst), dst, );
+ if (!bb_is_int_reg(src))
+ return;
+ value = bb_reg_code_value(src);
+ if (value == BBRG_UNDEFINED) {
+ bb_delete_memory(offset_address + bb_reg_code_offset(dst));
+ return;
+ }
+ offset_value = bb_reg_code_offset(src);
+ bb_reg_read(src);
+ bb_memory_set_reg_value(dst, offset_address, value, offset_value);
+}
+
+/* Set register 'dst' to contain the value from memory *('src' + offset_address).
+ * 'src' is assumed to be a register that contains a stack pointer.
+ */
+
+static void
+bb_reg_set_memory(enum bb_reg_code dst, enum bb_reg_code src, short offset_address)
+{
+ int i, defined = 0;
+ struct bb_memory_contains *s;
+ BB_CHECK(!bb_is_osp_defined(src), src, );
+ KDB_DEBUG_BB(" %s = *(%s",
+ bbrg_name[dst], bbrg_name[src]);
+ KDB_DEBUG_BB_OFFSET(offset_address, "", ")");
+ offset_address += bb_reg_code_offset(src);
+ KDB_DEBUG_BB_OFFSET(offset_address, " (osp", ")");
+ for (i = 0, s = bb_reg_state->memory;
+ i < bb_reg_state->mem_count;
+ ++i, ++s) {
+ if (s->offset_address == offset_address && bb_is_int_reg(dst)) {
+ bb_reg_code_set_value(dst, s->value);
+ KDB_DEBUG_BB(" value %s", bbrg_name[s->value]);
+ if (s->value == BBRG_OSP) {
+ bb_reg_code_set_offset(dst, s->offset_value);
+ KDB_DEBUG_BB_OFFSET(s->offset_value, "", "");
+ } else {
+ bb_reg_code_set_offset(dst, 0);
+ }
+ defined = 1;
+ }
+ }
+ if (!defined)
+ bb_reg_set_reg(dst, BBRG_UNDEFINED);
+ else
+ KDB_DEBUG_BB("\n");
+}
+
+/* A generic read from an operand. */
+
+static void
+bb_read_operand(const struct bb_operand *operand)
+{
+ int m = 0;
+ if (operand->base_rc)
+ bb_reg_read(operand->base_rc);
+ if (operand->index_rc)
+ bb_reg_read(operand->index_rc);
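+	/* For a simple memory read through a register that maps the stack,
+	 * record how many words above the original stack pointer the access
+	 * reaches; bb_memory_params keeps the maximum seen.  lea is excluded
+	 * because it only computes an address and never reads the slot.
+	 */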
+ if (bb_is_simple_memory(operand) &&
+ bb_is_osp_defined(operand->base_rc) &&
+ bb_decode.match->usage != BBOU_LEA) {
+ m = (bb_reg_code_offset(operand->base_rc) + operand->disp +
+ KDB_WORD_SIZE - 1) / KDB_WORD_SIZE;
+ bb_memory_params = max(bb_memory_params, m);
+ }
+}
+
+/* A generic write to an operand, resulting in an undefined value in that
+ * location. All well defined operands are handled separately, this function
+ * only handles the opcodes where the result is undefined.
+ */
+
+static void
+bb_write_operand(const struct bb_operand *operand)
+{
+ enum bb_reg_code base_rc = operand->base_rc;
+ if (operand->memory) {
+ if (base_rc)
+ bb_reg_read(base_rc);
+ if (operand->index_rc)
+ bb_reg_read(operand->index_rc);
+ } else if (operand->reg && base_rc) {
+ bb_reg_set_undef(base_rc);
+ }
+ if (bb_is_simple_memory(operand) && bb_is_osp_defined(base_rc)) {
+ int offset;
+ offset = bb_reg_code_offset(base_rc) + operand->disp;
+ offset = ALIGN(offset - KDB_WORD_SIZE + 1, KDB_WORD_SIZE);
+ bb_delete_memory(offset);
+ }
+}
+
+/* Adjust a register that contains a stack pointer */
+
+static void
+bb_adjust_osp(enum bb_reg_code reg, int adjust)
+{
+ int offset = bb_reg_code_offset(reg), old_offset = offset;
+ KDB_DEBUG_BB(" %s osp offset ", bbrg_name[reg]);
+ KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(reg), "", " -> ");
+ offset += adjust;
+ bb_reg_code_set_offset(reg, offset);
+ KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(reg), "", "\n");
+ /* When RSP is adjusted upwards, it invalidates any memory
+ * stored between the old and current stack offsets.
+ */
+ if (reg == BBRG_RSP) {
+ while (old_offset < bb_reg_code_offset(reg)) {
+ bb_delete_memory(old_offset);
+ old_offset += KDB_WORD_SIZE;
+ }
+ }
+}
+
+/* The current instruction adjusts a register that contains a stack pointer.
+ * Direction is 1 or -1, depending on whether the instruction is add/lea or
+ * sub.
+ */
+
+static void
+bb_adjust_osp_instruction(int direction)
+{
+ enum bb_reg_code dst_reg = bb_decode.dst.base_rc;
+ if (bb_decode.src.immediate ||
+ bb_decode.match->usage == BBOU_LEA /* lea has its own checks */) {
+ int adjust = direction * bb_decode.src.disp;
+ bb_adjust_osp(dst_reg, adjust);
+ } else {
+ /* variable stack adjustment, osp offset is not well defined */
+ KDB_DEBUG_BB(" %s osp offset ", bbrg_name[dst_reg]);
+ KDB_DEBUG_BB_OFFSET(bb_reg_code_offset(dst_reg), "", " -> undefined\n");
+ bb_reg_code_set_value(dst_reg, BBRG_UNDEFINED);
+ bb_reg_code_set_offset(dst_reg, 0);
+ }
+}
+
+/* Some instructions using memory have an explicit length suffix (b, w, l, q).
+ * The equivalent instructions using a register imply the length from the
+ * register name. Deduce the operand length.
+ */
+
+static int
+bb_operand_length(const struct bb_operand *operand, char opcode_suffix)
+{
+ int l = 0;
+ switch (opcode_suffix) {
+ case 'b':
+ l = 8;
+ break;
+ case 'w':
+ l = 16;
+ break;
+ case 'l':
+ l = 32;
+ break;
+ case 'q':
+ l = 64;
+ break;
+ }
+ if (l == 0 && operand->reg) {
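+		/* No length suffix, so deduce the width from the register
+		 * name (which includes the leading '%'): three character
+		 * names are 8 bit (%al, %ah) or 16 bit (%ax), four character
+		 * names are 64 bit if they start with 'r' (%rax) else 32 bit
+		 * (%eax).
+		 */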
+ switch (strlen(operand->base)) {
+ case 3:
+ switch (operand->base[2]) {
+ case 'h':
+ case 'l':
+ l = 8;
+ break;
+ default:
+ l = 16;
+ break;
+			}
+			break;
+		case 4:
+ if (operand->base[1] == 'r')
+ l = 64;
+ else
+ l = 32;
+ break;
+ }
+ }
+ return l;
+}
+
+static int
+bb_reg_state_size(const struct bb_reg_state *state)
+{
+ return sizeof(*state) +
+ state->mem_count * sizeof(state->memory[0]);
+}
+
+/* Canonicalize the current bb_reg_state so it can be compared against
+ * previously created states. Sort the memory entries in descending order of
+ * offset_address (stack grows down). Empty slots are moved to the end of the
+ * list and trimmed.
+ */
+
+static void
+bb_reg_state_canonicalize(void)
+{
+ int i, order, changed;
+ struct bb_memory_contains *p1, *p2, temp;
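+	/* Simple exchange sort; the memory array is small.  Defined slots
+	 * sort by descending offset_address, undefined slots sink to the end.
+	 */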
+ do {
+ changed = 0;
+ for (i = 0, p1 = bb_reg_state->memory;
+ i < bb_reg_state->mem_count-1;
+ ++i, ++p1) {
+ p2 = p1 + 1;
+ if (p2->value == BBRG_UNDEFINED) {
+ order = 0;
+ } else if (p1->value == BBRG_UNDEFINED) {
+ order = 1;
+ } else if (p1->offset_address < p2->offset_address) {
+ order = 1;
+ } else if (p1->offset_address > p2->offset_address) {
+ order = -1;
+ } else {
+ order = 0;
+ }
+ if (order > 0) {
+ temp = *p2;
+ *p2 = *p1;
+ *p1 = temp;
+ changed = 1;
+ }
+ }
+	} while (changed);
+ for (i = 0, p1 = bb_reg_state->memory;
+ i < bb_reg_state_max;
+ ++i, ++p1) {
+ if (p1->value != BBRG_UNDEFINED)
+ bb_reg_state->mem_count = i + 1;
+ }
+ bb_reg_state_print(bb_reg_state);
+}
+
+static int
+bb_special_case(bfd_vma to)
+{
+ int i, j, rsp_offset, expect_offset, offset, errors = 0, max_errors = 40;
+ enum bb_reg_code reg, expect_value, value;
+ struct bb_name_state *r;
+
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ if (to == r->address &&
+ (r->fname == NULL || strcmp(bb_func_name, r->fname) == 0))
+ goto match;
+ }
+ /* Some inline assembler code has jumps to .fixup sections which result
+ * in out of line transfers with undefined state, ignore them.
+	 * in out of line transfers with undefined state; ignore them.
+ if (strcmp(bb_func_name, "strnlen_user") == 0 ||
+ strcmp(bb_func_name, "copy_from_user") == 0)
+ return 1;
+ return 0;
+
+match:
+ /* Check the running registers match */
+ for (reg = BBRG_RAX; reg < r->regs_size; ++reg) {
+ expect_value = r->regs[reg].value;
+ if (test_bit(expect_value, r->skip_regs.bits)) {
+ /* this regs entry is not defined for this label */
+ continue;
+ }
+ if (expect_value == BBRG_UNDEFINED)
+ continue;
+ expect_offset = r->regs[reg].offset;
+ value = bb_reg_code_value(reg);
+ offset = bb_reg_code_offset(reg);
+ if (expect_value == value &&
+ (value != BBRG_OSP || r->osp_offset == offset))
+ continue;
+ kdb_printf("%s: Expected %s to contain %s",
+ __FUNCTION__,
+ bbrg_name[reg],
+ bbrg_name[expect_value]);
+ if (r->osp_offset)
+ KDB_DEBUG_BB_OFFSET_PRINTF(r->osp_offset, "", "");
+ kdb_printf(". It actually contains %s", bbrg_name[value]);
+ if (offset)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset, "", "");
+ kdb_printf("\n");
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ }
+ /* Check that any memory data on stack matches */
+ i = j = 0;
+ while (i < bb_reg_state->mem_count &&
+ j < r->mem_size) {
+ expect_value = r->mem[j].value;
+ if (test_bit(expect_value, r->skip_mem.bits) ||
+ expect_value == BBRG_UNDEFINED) {
+ /* this memory slot is not defined for this label */
+ ++j;
+ continue;
+ }
+ rsp_offset = bb_reg_state->memory[i].offset_address -
+ bb_reg_code_offset(BBRG_RSP);
+ if (rsp_offset >
+ r->mem[j].offset_address) {
+ /* extra slots in memory are OK */
+ ++i;
+ } else if (rsp_offset <
+ r->mem[j].offset_address) {
+ /* Required memory slot is missing */
+ kdb_printf("%s: Invalid bb_reg_state.memory, "
+ "missing memory entry[%d] %s\n",
+ __FUNCTION__, j, bbrg_name[expect_value]);
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ ++j;
+ } else {
+ if (bb_reg_state->memory[i].offset_value ||
+ bb_reg_state->memory[i].value != expect_value) {
+ /* memory slot is present but contains wrong
+ * value.
+ */
+ kdb_printf("%s: Invalid bb_reg_state.memory, "
+ "wrong value in slot %d, "
+ "should be %s, it is %s\n",
+ __FUNCTION__, i,
+ bbrg_name[expect_value],
+ bbrg_name[bb_reg_state->memory[i].value]);
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ }
+ ++i;
+ ++j;
+ }
+ }
+ while (j < r->mem_size) {
+ expect_value = r->mem[j].value;
+ if (test_bit(expect_value, r->skip_mem.bits) ||
+ expect_value == BBRG_UNDEFINED)
+ ++j;
+ else
+ break;
+ }
+ if (j != r->mem_size) {
+ /* Hit end of memory before testing all the pt_reg slots */
+ kdb_printf("%s: Invalid bb_reg_state.memory, "
+ "missing trailing entries\n",
+ __FUNCTION__);
+ ++errors;
+ if (max_errors-- == 0)
+ goto fail;
+ }
+ if (errors)
+ goto fail;
+ return 1;
+fail:
+ kdb_printf("%s: on transfer to %s\n", __FUNCTION__, r->name);
+ bb_giveup = 1;
+ return 1;
+}
+
+/* Transfer of control to a label outside the current function. If the
+ * transfer is to a known common code path then do a sanity check on the state
+ * at this point.
+ */
+
+static void
+bb_sanity_check(int type)
+{
+ enum bb_reg_code expect, actual;
+ int i, offset, error = 0;
+
+ for (i = 0; i < ARRAY_SIZE(bb_preserved_reg); ++i) {
+ expect = bb_preserved_reg[i];
+ actual = bb_reg_code_value(expect);
+ offset = bb_reg_code_offset(expect);
+ if (expect == actual)
+ continue;
+ /* type == 1 is sysret/sysexit, ignore RSP */
+ if (type && expect == BBRG_RSP)
+ continue;
+ /* type == 1 is sysret/sysexit, ignore RBP for i386 */
+ /* We used to have "#ifndef CONFIG_X86_64" for the type=1 RBP
+ * test; however, x86_64 can run ia32 compatible mode and
+ * hit this problem. Perform the following test anyway!
+ */
+ if (type && expect == BBRG_RBP)
+ continue;
+ /* RSP should contain OSP+0. Except for ptregscall_common and
+ * ia32_ptregs_common, they get a partial pt_regs, fudge the
+ * stack to make it a full pt_regs then reverse the effect on
+ * exit, so the offset is -0x50 on exit.
+ */
+ if (expect == BBRG_RSP &&
+ bb_is_osp_defined(expect) &&
+ (offset == 0 ||
+ (offset == -0x50 &&
+ (strcmp(bb_func_name, "ptregscall_common") == 0 ||
+ strcmp(bb_func_name, "ia32_ptregs_common") == 0))))
+ continue;
+ /* The put_user and save_paranoid functions are special.
+ * %rbx gets clobbered */
+ if (expect == BBRG_RBX &&
+ (strncmp(bb_func_name, "__put_user_", 11) == 0 ||
+ strcmp(bb_func_name, "save_paranoid") == 0))
+ continue;
+ /* Ignore rbp and rsp for error_entry */
+ if ((strcmp(bb_func_name, "error_entry") == 0) &&
+ (expect == BBRG_RBX ||
+ (expect == BBRG_RSP && bb_is_osp_defined(expect) && offset == -0x10)))
+ continue;
+ kdb_printf("%s: Expected %s, got %s",
+ __FUNCTION__,
+ bbrg_name[expect], bbrg_name[actual]);
+ if (offset)
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset, "", "");
+ kdb_printf("\n");
+ error = 1;
+ }
+ BB_CHECK(error, error, );
+}
+
+/* Transfer of control. Follow the arc and save the current state as input to
+ * another basic block.
+ */
+
+static void
+bb_transfer(bfd_vma from, bfd_vma to, unsigned int drop_through)
+{
+ int i, found;
+ size_t size;
+	struct bb *bb = NULL; /* stupid gcc */
+ struct bb_jmp *bb_jmp;
+ struct bb_reg_state *state;
+ bb_reg_state_canonicalize();
+ found = 0;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ bb_jmp = bb_jmp_list + i;
+ if (bb_jmp->from == from &&
+ bb_jmp->to == to &&
+ bb_jmp->drop_through == drop_through) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ /* Transfer outside the current function. Check the special
+ * cases (mainly in entry.S) first. If it is not a known
+ * special case then check if the target address is the start
+ * of a function or not. If it is the start of a function then
+ * assume tail recursion and require that the state be the same
+ * as on entry. Otherwise assume out of line code (e.g.
+ * spinlock contention path) and ignore it, the state can be
+ * anything.
+ */
+ kdb_symtab_t symtab;
+ if (bb_special_case(to))
+ return;
+ kdbnearsym(to, &symtab);
+ if (symtab.sym_start != to)
+ return;
+ bb_sanity_check(0);
+ if (bb_giveup)
+ return;
+#ifdef NO_SIBLINGS
+ /* Only print this message when the kernel is compiled with
+ * -fno-optimize-sibling-calls. Otherwise it would print a
+ * message for every tail recursion call. If you see the
+ * message below then you probably have an assembler label that
+ * is not listed in the special cases.
+ */
+ kdb_printf(" not matched: from "
+ kdb_bfd_vma_fmt0
+ " to " kdb_bfd_vma_fmt0
+ " drop_through %d bb_jmp[%d]\n",
+ from, to, drop_through, i);
+#endif /* NO_SIBLINGS */
+ return;
+ }
+ KDB_DEBUG_BB(" matched: from " kdb_bfd_vma_fmt0
+ " to " kdb_bfd_vma_fmt0
+ " drop_through %d bb_jmp[%d]\n",
+ from, to, drop_through, i);
+ found = 0;
+ for (i = 0; i < bb_count; ++i) {
+ bb = bb_list[i];
+ if (bb->start == to) {
+ found = 1;
+ break;
+ }
+ }
+ BB_CHECK(!found, to, );
+ /* If the register state for this arc has already been set (we are
+ * rescanning the block that originates the arc) and the state is the
+ * same as the previous state for this arc then this input to the
+ * target block is the same as last time, so there is no need to rescan
+ * the target block.
+ */
+ state = bb_jmp->state;
+ size = bb_reg_state_size(bb_reg_state);
+ if (state) {
+ bb_reg_state->ref_count = state->ref_count;
+ if (memcmp(state, bb_reg_state, size) == 0) {
+ KDB_DEBUG_BB(" no state change\n");
+ return;
+ }
+ if (--state->ref_count == 0)
+ debug_kfree(state);
+ bb_jmp->state = NULL;
+ }
+ /* New input state is required. To save space, check if any other arcs
+ * have the same state and reuse them where possible. The overall set
+ * of inputs to the target block is now different so the target block
+ * must be rescanned.
+ */
+ bb->changed = 1;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ state = bb_jmp_list[i].state;
+ if (!state)
+ continue;
+ bb_reg_state->ref_count = state->ref_count;
+ if (memcmp(state, bb_reg_state, size) == 0) {
+ KDB_DEBUG_BB(" reuse bb_jmp[%d]\n", i);
+ bb_jmp->state = state;
+ ++state->ref_count;
+ return;
+ }
+ }
+ state = debug_kmalloc(size, GFP_ATOMIC);
+ if (!state) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ memcpy(state, bb_reg_state, size);
+ state->ref_count = 1;
+ bb_jmp->state = state;
+ KDB_DEBUG_BB(" new state %p\n", state);
+}
+
+/* Isolate the processing for 'mov' so it can be used for 'xadd'/'xchg' as
+ * well.
+ *
+ * xadd/xchg expect this function to return BBOU_NOP for special cases,
+ * otherwise it returns BBOU_RSWD. All special cases must be handled entirely
+ * within this function, including doing bb_read_operand or bb_write_operand
+ * where necessary.
+ */
+
+static enum bb_operand_usage
+bb_usage_mov(const struct bb_operand *src, const struct bb_operand *dst, int l)
+{
+ int full_register_src, full_register_dst;
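+	/* 'l' is the length of the base mnemonic (e.g. 4 for "xadd"), so
+	 * bb_decode.opcode[l] is the size suffix, if any (e.g. the 'q' in
+	 * "xaddq").
+	 */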
+ full_register_src = bb_operand_length(src, bb_decode.opcode[l])
+ == KDB_WORD_SIZE * 8;
+ full_register_dst = bb_operand_length(dst, bb_decode.opcode[l])
+ == KDB_WORD_SIZE * 8;
+ /* If both src and dst are full integer registers then record the
+ * register change.
+ */
+ if (src->reg &&
+ bb_is_int_reg(src->base_rc) &&
+ dst->reg &&
+ bb_is_int_reg(dst->base_rc) &&
+ full_register_src &&
+ full_register_dst) {
+ /* Special case for the code that switches stacks in
+ * jprobe_return. That code must modify RSP but it does it in
+ * a well defined manner. Do not invalidate RSP.
+ */
+ if (src->base_rc == BBRG_RBX &&
+ dst->base_rc == BBRG_RSP &&
+ strcmp(bb_func_name, "jprobe_return") == 0) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ /* math_abort takes the equivalent of a longjmp structure and
+		 * resets the stack. Ignore this; it leaves RSP well defined.
+ */
+ if (dst->base_rc == BBRG_RSP &&
+ strcmp(bb_func_name, "math_abort") == 0) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ bb_reg_set_reg(dst->base_rc, src->base_rc);
+ return BBOU_NOP;
+ }
+ /* If the move is from a full integer register to stack then record it.
+ */
+ if (src->reg &&
+ bb_is_simple_memory(dst) &&
+ bb_is_osp_defined(dst->base_rc) &&
+ full_register_src) {
+ /* Ugly special case. Initializing list heads on stack causes
+ * false references to stack variables when the list head is
+ * used. Static code analysis cannot detect that the list head
+ * has been changed by a previous execution loop and that a
+ * basic block is only executed after the list head has been
+ * changed.
+ *
+ * These false references can result in valid stack variables
+ * being incorrectly cleared on some logic paths. Ignore
+ * stores to stack variables which point to themselves or to
+ * the previous word so the list head initialization is not
+ * recorded.
+ */
+ if (bb_is_osp_defined(src->base_rc)) {
+ int stack1 = bb_reg_code_offset(src->base_rc);
+ int stack2 = bb_reg_code_offset(dst->base_rc) +
+ dst->disp;
+ if (stack1 == stack2 ||
+ stack1 == stack2 - KDB_WORD_SIZE)
+ return BBOU_NOP;
+ }
+ bb_memory_set_reg(dst->base_rc, src->base_rc, dst->disp);
+ return BBOU_NOP;
+ }
+ /* If the move is from stack to a full integer register then record it.
+ */
+ if (bb_is_simple_memory(src) &&
+ bb_is_osp_defined(src->base_rc) &&
+ dst->reg &&
+ bb_is_int_reg(dst->base_rc) &&
+ full_register_dst) {
+#ifdef CONFIG_X86_32
- #ifndef TSS_sysenter_sp0
- #define TSS_sysenter_sp0 SYSENTER_stack_sp0
- #endif
+ /* mov from TSS_sysenter_sp0+offset to esp to fix up the
+	 * sysenter stack; it leaves esp well defined. mov
+	 * TSS_sysenter_sp0+offset(%esp),%esp is followed by up to 5
+ * push instructions to mimic the hardware stack push. If
+	 * the load is from TSS_sysenter_sp0 plus an extra offset then
+	 * only 3 words will be pushed.
+ */
+ if (dst->base_rc == BBRG_RSP &&
+ src->disp >= TSS_sysenter_sp0 &&
+ bb_is_osp_defined(BBRG_RSP)) {
+ int pushes;
+ pushes = src->disp == TSS_sysenter_sp0 ? 5 : 3;
+ bb_reg_code_set_offset(BBRG_RSP,
+ bb_reg_code_offset(BBRG_RSP) +
+ pushes * KDB_WORD_SIZE);
+ KDB_DEBUG_BB_OFFSET(
+ bb_reg_code_offset(BBRG_RSP),
+ " sysenter fixup, RSP",
+ "\n");
+ return BBOU_NOP;
+ }
+#endif /* CONFIG_X86_32 */
+ bb_read_operand(src);
+ bb_reg_set_memory(dst->base_rc, src->base_rc, src->disp);
+ return BBOU_NOP;
+ }
+ /* move %gs:0x<nn>,%rsp is used to unconditionally switch to another
+	 * stack. Ignore this special case; it is handled by the stack
+ * unwinding code.
+ */
+ if (src->segment &&
+ strcmp(src->segment, "%gs") == 0 &&
+ dst->reg &&
+ dst->base_rc == BBRG_RSP)
+ return BBOU_NOP;
+ /* move %reg,%reg is a nop */
+ if (src->reg &&
+ dst->reg &&
+ !src->segment &&
+ !dst->segment &&
+ strcmp(src->base, dst->base) == 0)
+ return BBOU_NOP;
+ /* Special case for the code that switches stacks in the scheduler
+ * (switch_to()). That code must modify RSP but it does it in a well
+ * defined manner. Do not invalidate RSP.
+ */
+ if (dst->reg &&
+ dst->base_rc == BBRG_RSP &&
+ full_register_dst &&
+ bb_is_scheduler_address()) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ /* Special case for the code that switches stacks in resume from
+ * hibernation code. That code must modify RSP but it does it in a
+ * well defined manner. Do not invalidate RSP.
+ */
+ if (src->memory &&
+ dst->reg &&
+ dst->base_rc == BBRG_RSP &&
+ full_register_dst &&
+ strcmp(bb_func_name, "restore_image") == 0) {
+ bb_read_operand(src);
+ return BBOU_NOP;
+ }
+ return BBOU_RSWD;
+}
+
+static enum bb_operand_usage
+bb_usage_xadd(const struct bb_operand *src, const struct bb_operand *dst)
+{
+ /* Simulate xadd as a series of instructions including mov, that way we
+ * get the benefit of all the special cases already handled by
+ * BBOU_MOV.
+ *
+ * tmp = src + dst, src = dst, dst = tmp.
+ *
+ * For tmp, pick a register that is undefined. If all registers are
+ * defined then pick one that is not being used by xadd.
+ */
+ enum bb_reg_code reg = BBRG_UNDEFINED;
+ struct bb_operand tmp;
+ struct bb_reg_contains save_tmp;
+ enum bb_operand_usage usage;
+ int undefined = 0;
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (bb_reg_code_value(reg) == BBRG_UNDEFINED) {
+ undefined = 1;
+ break;
+ }
+ }
+ if (!undefined) {
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (reg != src->base_rc &&
+ reg != src->index_rc &&
+ reg != dst->base_rc &&
+ reg != dst->index_rc &&
+ reg != BBRG_RSP)
+ break;
+ }
+ }
+ KDB_DEBUG_BB(" %s saving tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ save_tmp = bb_reg_state->contains[reg - BBRG_RAX];
+ bb_reg_set_undef(reg);
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.present = 1;
+ tmp.reg = 1;
+ tmp.base = debug_kmalloc(strlen(bbrg_name[reg]) + 2, GFP_ATOMIC);
+ if (tmp.base) {
+ tmp.base[0] = '%';
+ strcpy(tmp.base + 1, bbrg_name[reg]);
+ }
+ tmp.base_rc = reg;
+ bb_read_operand(src);
+ bb_read_operand(dst);
+ if (bb_usage_mov(src, dst, sizeof("xadd")-1) == BBOU_NOP)
+ usage = BBOU_RSRD;
+ else
+ usage = BBOU_RSRDWS;
+ bb_usage_mov(&tmp, dst, sizeof("xadd")-1);
+ KDB_DEBUG_BB(" %s restoring tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ bb_reg_state->contains[reg - BBRG_RAX] = save_tmp;
+ debug_kfree(tmp.base);
+ return usage;
+}
+
+static enum bb_operand_usage
+bb_usage_xchg(const struct bb_operand *src, const struct bb_operand *dst)
+{
+ /* Simulate xchg as a series of mov instructions, that way we get the
+ * benefit of all the special cases already handled by BBOU_MOV.
+ *
+ * mov dst,tmp; mov src,dst; mov tmp,src;
+ *
+ * For tmp, pick a register that is undefined. If all registers are
+ * defined then pick one that is not being used by xchg.
+ */
+ enum bb_reg_code reg = BBRG_UNDEFINED;
+ int rs = BBOU_RS, rd = BBOU_RD, ws = BBOU_WS, wd = BBOU_WD;
+ struct bb_operand tmp;
+ struct bb_reg_contains save_tmp;
+ int undefined = 0;
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (bb_reg_code_value(reg) == BBRG_UNDEFINED) {
+ undefined = 1;
+ break;
+ }
+ }
+ if (!undefined) {
+ for (reg = BBRG_RAX; reg < BBRG_RAX + KDB_INT_REGISTERS; ++reg) {
+ if (reg != src->base_rc &&
+ reg != src->index_rc &&
+ reg != dst->base_rc &&
+ reg != dst->index_rc &&
+ reg != BBRG_RSP)
+ break;
+ }
+ }
+ KDB_DEBUG_BB(" %s saving tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ save_tmp = bb_reg_state->contains[reg - BBRG_RAX];
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.present = 1;
+ tmp.reg = 1;
+ tmp.base = debug_kmalloc(strlen(bbrg_name[reg]) + 2, GFP_ATOMIC);
+ if (tmp.base) {
+ tmp.base[0] = '%';
+ strcpy(tmp.base + 1, bbrg_name[reg]);
+ }
+ tmp.base_rc = reg;
+ if (bb_usage_mov(dst, &tmp, sizeof("xchg")-1) == BBOU_NOP)
+ rd = 0;
+ if (bb_usage_mov(src, dst, sizeof("xchg")-1) == BBOU_NOP) {
+ rs = 0;
+ wd = 0;
+ }
+ if (bb_usage_mov(&tmp, src, sizeof("xchg")-1) == BBOU_NOP)
+ ws = 0;
+ KDB_DEBUG_BB(" %s restoring tmp %s\n", __FUNCTION__, bbrg_name[reg]);
+ bb_reg_state->contains[reg - BBRG_RAX] = save_tmp;
+ debug_kfree(tmp.base);
+ return rs | rd | ws | wd;
+}
+
+/* Invalidate all the scratch registers */
+
+static void
+bb_invalidate_scratch_reg(void)
+{
+ int i, j;
+ for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
+ for (j = 0; j < ARRAY_SIZE(bb_preserved_reg); ++j) {
+ if (i == bb_preserved_reg[j])
+ goto preserved;
+ }
+ bb_reg_set_undef(i);
+preserved:
+ continue;
+ }
+}
+
+static void
+bb_pass2_computed_jmp(const struct bb_operand *src)
+{
+ unsigned long table = src->disp;
+ kdb_machreg_t addr;
+ while (!bb_giveup) {
+ if (kdb_getword(&addr, table, sizeof(addr)))
+ return;
+ if (addr < bb_func_start || addr >= bb_func_end)
+ return;
+ bb_transfer(bb_curr_addr, addr, 0);
+ table += KDB_WORD_SIZE;
+ }
+}
+
+/* The current instruction has been decoded and all the information is in
+ * bb_decode. Based on the opcode, track any operand usage that we care about.
+ */
+
+static void
+bb_usage(void)
+{
+ enum bb_operand_usage usage = bb_decode.match->usage;
+ struct bb_operand *src = &bb_decode.src;
+ struct bb_operand *dst = &bb_decode.dst;
+ struct bb_operand *dst2 = &bb_decode.dst2;
+ int opcode_suffix, operand_length;
+
+ /* First handle all the special usage cases, and map them to a generic
+ * case after catering for the side effects.
+ */
+
+ if (usage == BBOU_IMUL &&
+ src->present && !dst->present && !dst2->present) {
+ /* single operand imul, same effects as mul */
+ usage = BBOU_MUL;
+ }
+
+ /* AT&T syntax uses movs<l1><l2> for move with sign extension, instead
+ * of the Intel movsx. The AT&T syntax causes problems for the opcode
+ * mapping; movs with sign extension needs to be treated as a generic
+ * read src, write dst, but instead it falls under the movs I/O
+ * instruction. Fix it.
+ */
+ if (usage == BBOU_MOVS && strlen(bb_decode.opcode) > 5)
+ usage = BBOU_RSWD;
+
+ /* This switch statement deliberately does not use 'default' at the top
+ * level. That way the compiler will complain if a new BBOU_ enum is
+ * added above and not explicitly handled here.
+ */
+ switch (usage) {
+ case BBOU_UNKNOWN: /* drop through */
+ case BBOU_RS: /* drop through */
+ case BBOU_RD: /* drop through */
+ case BBOU_RSRD: /* drop through */
+ case BBOU_WS: /* drop through */
+ case BBOU_RSWS: /* drop through */
+ case BBOU_RDWS: /* drop through */
+ case BBOU_RSRDWS: /* drop through */
+ case BBOU_WD: /* drop through */
+ case BBOU_RSWD: /* drop through */
+ case BBOU_RDWD: /* drop through */
+ case BBOU_RSRDWD: /* drop through */
+ case BBOU_WSWD: /* drop through */
+ case BBOU_RSWSWD: /* drop through */
+ case BBOU_RDWSWD: /* drop through */
+ case BBOU_RSRDWSWD:
+ break; /* ignore generic usage for now */
+ case BBOU_ADD:
+ /* Special case for add instructions that adjust registers
+ * which are mapping the stack.
+ */
+ if (dst->reg && bb_is_osp_defined(dst->base_rc)) {
+ bb_adjust_osp_instruction(1);
+ usage = BBOU_RS;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_AND:
+ /* Special case when trying to round the stack pointer
+ * to achieve byte alignment
+ */
+ if (dst->reg && dst->base_rc == BBRG_RSP &&
+ src->immediate && strncmp(bb_func_name, "efi_call", 8) == 0) {
+ usage = BBOU_NOP;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_CALL:
+ bb_reg_state_print(bb_reg_state);
+ usage = BBOU_NOP;
+ if (bb_is_static_disp(src)) {
+ /* save_args is special. It saves
+ * a partial pt_regs onto the stack and switches
+ * to the interrupt stack.
+ */
+ if (src->disp == bb_save_args) {
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDI, 0x48);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RSI, 0x40);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDX, 0x38);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RCX, 0x30);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RAX, 0x28);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R8, 0x20);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R9, 0x18);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R10, 0x10);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R11, 0x08);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0);
+ /* This is actually on the interrupt stack,
+ * but we fudge it so the unwind works.
+ */
+ bb_memory_set_reg_value(BBRG_RSP, -0x8, BBRG_RBP, 0);
+ bb_reg_set_reg(BBRG_RBP, BBRG_RSP);
+ bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
+ }
+ /* save_rest juggles the stack frame to append the
+ * rest of the pt_regs onto a stack where SAVE_ARGS
+ * or save_args has already been done.
+ */
+ else if (src->disp == bb_save_rest) {
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBX, 0x30);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0x28);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R12, 0x20);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R13, 0x18);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R14, 0x10);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R15, 0x08);
+ }
+ /* error_entry and save_paranoid save a full pt_regs.
+ * Break out so the scratch registers aren't invalidated.
+ */
+ else if (src->disp == bb_error_entry || src->disp == bb_save_paranoid) {
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDI, 0x70);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RSI, 0x68);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RDX, 0x60);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RCX, 0x58);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RAX, 0x50);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R8, 0x48);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R9, 0x40);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R10, 0x38);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R11, 0x30);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBX, 0x28);
+ bb_memory_set_reg(BBRG_RSP, BBRG_RBP, 0x20);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R12, 0x18);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R13, 0x10);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R14, 0x08);
+ bb_memory_set_reg(BBRG_RSP, BBRG_R15, 0);
+ break;
+ }
+ }
+ /* Invalidate the scratch registers */
+ bb_invalidate_scratch_reg();
+
+ /* These special cases need scratch registers invalidated first */
+ if (bb_is_static_disp(src)) {
+ /* Function sync_regs and save_v86_state are special.
+ * Their return value is the new stack pointer
+ */
+ if (src->disp == bb_sync_regs) {
+ bb_reg_set_reg(BBRG_RAX, BBRG_RSP);
+ } else if (src->disp == bb_save_v86_state) {
+ bb_reg_set_reg(BBRG_RAX, BBRG_RSP);
+ bb_adjust_osp(BBRG_RAX, +KDB_WORD_SIZE);
+ }
+ }
+ break;
+ case BBOU_CBW:
+ /* Convert word in RAX. Read RAX, write RAX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_CMOV:
+ /* cmove %gs:0x<nn>,%rsp is used to conditionally switch to
+ * another stack. Ignore this special case, it is handled by
+ * the stack unwinding code.
+ */
+ if (src->segment &&
+ strcmp(src->segment, "%gs") == 0 &&
+ dst->reg &&
+ dst->base_rc == BBRG_RSP)
+ usage = BBOU_NOP;
+ else
+ usage = BBOU_RSWD;
+ break;
+ case BBOU_CMPXCHG:
+ /* Read RAX, write RAX plus src read, dst write */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ usage = BBOU_RSWD;
+ break;
+ case BBOU_CMPXCHGD:
+ /* Read RAX, RBX, RCX, RDX, write RAX, RDX plus src read/write */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RBX);
+ bb_reg_read(BBRG_RCX);
+ bb_reg_read(BBRG_RDX);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_RSWS;
+ break;
+ case BBOU_CPUID:
+ /* Read RAX, write RAX, RBX, RCX, RDX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RBX);
+ bb_reg_set_undef(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_CWD:
+ /* Convert word in RAX, RDX. Read RAX, write RDX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_DIV: /* drop through */
+ case BBOU_IDIV:
+ /* The 8 bit variants only affect RAX, the 16, 32 and 64 bit
+ * variants affect RDX as well.
+ */
+ switch (usage) {
+ case BBOU_DIV:
+ opcode_suffix = bb_decode.opcode[3];
+ break;
+ case BBOU_IDIV:
+ opcode_suffix = bb_decode.opcode[4];
+ break;
+ default:
+ opcode_suffix = 'q';
+ break;
+ }
+ operand_length = bb_operand_length(src, opcode_suffix);
+ bb_reg_read(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RAX);
+ if (operand_length != 8) {
+ bb_reg_read(BBRG_RDX);
+ bb_reg_set_undef(BBRG_RDX);
+ }
+ usage = BBOU_RS;
+ break;
+ case BBOU_IMUL:
+ /* Only the two and three operand forms get here. The one
+ * operand form is treated as mul.
+ */
+ if (dst2->present) {
+ /* The three operand form is a special case, read the first two
+ * operands, write the third.
+ */
+ bb_read_operand(src);
+ bb_read_operand(dst);
+ bb_write_operand(dst2);
+ usage = BBOU_NOP;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_IRET:
+ bb_sanity_check(0);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_JMP:
+ if (bb_is_static_disp(src))
+ bb_transfer(bb_curr_addr, src->disp, 0);
+ else if (src->indirect &&
+ src->disp &&
+ src->base == NULL &&
+ src->index &&
+ src->scale == KDB_WORD_SIZE)
+ bb_pass2_computed_jmp(src);
+ usage = BBOU_RS;
+ break;
+ case BBOU_LAHF:
+ /* Write RAX */
+ bb_reg_set_undef(BBRG_RAX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LEA:
+ /* dst = src + disp. Often used to calculate offsets into the
+ * stack, so check if it uses a stack pointer.
+ */
+ usage = BBOU_RSWD;
+ if (bb_is_simple_memory(src)) {
+ if (bb_is_osp_defined(src->base_rc)) {
+ bb_reg_set_reg(dst->base_rc, src->base_rc);
+ bb_adjust_osp_instruction(1);
+ usage = BBOU_RS;
+ } else if (src->disp == 0 &&
+ src->base_rc == dst->base_rc) {
+ /* lea 0(%reg),%reg is generated by i386
+ * GENERIC_NOP7.
+ */
+ usage = BBOU_NOP;
+ } else if (src->disp == 4096 &&
+ (src->base_rc == BBRG_R8 ||
+ src->base_rc == BBRG_RDI) &&
+ strcmp(bb_func_name, "relocate_kernel") == 0) {
+ /* relocate_kernel: setup a new stack at the
+ * end of the physical control page, using
+ * (x86_64) lea 4096(%r8),%rsp or (i386) lea
+ * 4096(%edi),%esp
+ */
+ usage = BBOU_NOP;
+ }
+ }
+ break;
+ case BBOU_LEAVE:
+ /* RSP = RBP; RBP = *(RSP); RSP += KDB_WORD_SIZE; */
+ bb_reg_set_reg(BBRG_RSP, BBRG_RBP);
+ if (bb_is_osp_defined(BBRG_RSP))
+ bb_reg_set_memory(BBRG_RBP, BBRG_RSP, 0);
+ else
+ bb_reg_set_undef(BBRG_RBP);
+ if (bb_is_osp_defined(BBRG_RSP))
+ bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
+ /* common_interrupt uses leave in a non-standard manner */
+ if (strcmp(bb_func_name, "common_interrupt") != 0)
+ bb_sanity_check(0);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LODS:
+ /* Read RSI, write RAX, RSI */
+ bb_reg_read(BBRG_RSI);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RSI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LOOP:
+ /* Read and write RCX */
+ bb_reg_read(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RCX);
+ if (bb_is_static_disp(src))
+ bb_transfer(bb_curr_addr, src->disp, 0);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_LSS:
+ /* lss offset(%esp),%esp leaves esp well defined */
+ if (dst->reg &&
+ dst->base_rc == BBRG_RSP &&
+ bb_is_simple_memory(src) &&
+ src->base_rc == BBRG_RSP) {
+ bb_adjust_osp(BBRG_RSP, 2*KDB_WORD_SIZE + src->disp);
+ usage = BBOU_NOP;
+ } else {
+ usage = BBOU_RSWD;
+ }
+ break;
+ case BBOU_MONITOR:
+ /* Read RAX, RCX, RDX */
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_MOV:
+ usage = bb_usage_mov(src, dst, sizeof("mov")-1);
+ break;
+ case BBOU_MOVS:
+ /* Read RSI, RDI, write RSI, RDI */
+ bb_reg_read(BBRG_RSI);
+ bb_reg_read(BBRG_RDI);
+ bb_reg_set_undef(BBRG_RSI);
+ bb_reg_set_undef(BBRG_RDI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_MUL:
+ /* imul (one operand form only) or mul. Read RAX. If the
+ * operand length is not 8 then write RDX.
+ */
+ if (bb_decode.opcode[0] == 'i')
+ opcode_suffix = bb_decode.opcode[4];
+ else
+ opcode_suffix = bb_decode.opcode[3];
+ operand_length = bb_operand_length(src, opcode_suffix);
+ bb_reg_read(BBRG_RAX);
+ if (operand_length != 8)
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_MWAIT:
+ /* Read RAX, RCX */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RCX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_NOP:
+ break;
+ case BBOU_OUTS:
+ /* Read RSI, RDX, write RSI */
+ bb_reg_read(BBRG_RSI);
+ bb_reg_read(BBRG_RDX);
+ bb_reg_set_undef(BBRG_RSI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_POP:
+ /* Complicated by the fact that you can pop from the top of the stack
+ * to a stack location; in that case the destination location
+ * is calculated after adjusting RSP. Analysis of the kernel
+ * code shows that gcc only uses this strange format to get the
+ * flags into a local variable, e.g. pushf; popl 0x10(%esp); so
+ * I am going to ignore this special case.
+ */
+ usage = BBOU_WS;
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("pop when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ if (src->reg) {
+ bb_reg_set_memory(src->base_rc, BBRG_RSP, 0);
+ usage = BBOU_NOP;
+ }
+ /* pop %rsp does not adjust rsp */
+ if (!src->reg ||
+ src->base_rc != BBRG_RSP)
+ bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
+ }
+ break;
+ case BBOU_POPF:
+ /* Do not care about flags, just adjust RSP */
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("popf when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ bb_adjust_osp(BBRG_RSP, KDB_WORD_SIZE);
+ }
+ usage = BBOU_WS;
+ break;
+ case BBOU_PUSH:
+ /* Complicated by the fact that you can push from a stack
+ * location to the top of the stack; in that case the source location
+ * is calculated before adjusting RSP. Analysis of the kernel code shows
+ * that gcc only uses this strange format to restore the flags
+ * from a local variable, e.g. pushl 0x10(%esp); popf; so I am
+ * going to ignore this special case.
+ */
+ usage = BBOU_RS;
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("push when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
+ if (src->reg &&
+ bb_reg_code_offset(BBRG_RSP) <= 0)
+ bb_memory_set_reg(BBRG_RSP, src->base_rc, 0);
+ }
+ break;
+ case BBOU_PUSHF:
+ /* Do not care about flags, just adjust RSP */
+ if (!bb_is_osp_defined(BBRG_RSP)) {
+ if (!bb_is_scheduler_address()) {
+ kdb_printf("pushf when BBRG_RSP is undefined?\n");
+ bb_giveup = 1;
+ }
+ } else {
+ bb_adjust_osp(BBRG_RSP, -KDB_WORD_SIZE);
+ }
+ usage = BBOU_WS;
+ break;
+ case BBOU_RDMSR:
+ /* Read RCX, write RAX, RDX */
+ bb_reg_read(BBRG_RCX);
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_RDTSC:
+ /* Write RAX, RDX */
+ bb_reg_set_undef(BBRG_RAX);
+ bb_reg_set_undef(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_RET:
+ usage = BBOU_NOP;
+ if (src->immediate && bb_is_osp_defined(BBRG_RSP)) {
+ bb_adjust_osp(BBRG_RSP, src->disp);
+ }
+ /* Functions that restore state which was saved by another
+ * function or build new kernel stacks. We cannot verify what
+ * is being restored so skip the sanity check.
+ */
+ if (strcmp(bb_func_name, "restore_image") == 0 ||
+ strcmp(bb_func_name, "relocate_kernel") == 0 ||
+ strcmp(bb_func_name, "identity_mapped") == 0 ||
+ strcmp(bb_func_name, "xen_iret_crit_fixup") == 0 ||
+ strcmp(bb_func_name, "math_abort") == 0 ||
+ strcmp(bb_func_name, "save_args") == 0 ||
+ strcmp(bb_func_name, "kretprobe_trampoline_holder") == 0)
+ break;
+ bb_sanity_check(0);
+ break;
+ case BBOU_SAHF:
+ /* Read RAX */
+ bb_reg_read(BBRG_RAX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_SCAS:
+ /* Read RAX, RDI, write RDI */
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RDI);
+ bb_reg_set_undef(BBRG_RDI);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_SUB:
+ /* Special case for sub instructions that adjust registers
+ * which are mapping the stack.
+ */
+ if (dst->reg && bb_is_osp_defined(dst->base_rc)) {
+ bb_adjust_osp_instruction(-1);
+ usage = BBOU_RS;
+ } else {
+ usage = BBOU_RSRDWD;
+ }
+ break;
+ case BBOU_SYSEXIT:
+ bb_sanity_check(1);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_SYSRET:
+ bb_sanity_check(1);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_WRMSR:
+ /* Read RCX, RAX, RDX */
+ bb_reg_read(BBRG_RCX);
+ bb_reg_read(BBRG_RAX);
+ bb_reg_read(BBRG_RDX);
+ usage = BBOU_NOP;
+ break;
+ case BBOU_XADD:
+ usage = bb_usage_xadd(src, dst);
+ break;
+ case BBOU_XCHG:
+ /* i386 do_IRQ with 4K stacks does xchg %ebx,%esp; call
+ * irq_handler; mov %ebx,%esp; to switch stacks. Ignore this
+ * stack switch when tracking registers, it is handled by
+ * higher level backtrace code. Convert xchg %ebx,%esp to mov
+ * %esp,%ebx so the later mov %ebx,%esp becomes a NOP and the
+ * stack remains defined so we can backtrace through do_IRQ's
+ * stack switch.
+ *
+ * Ditto for do_softirq.
+ */
+ if (src->reg &&
+ dst->reg &&
+ src->base_rc == BBRG_RBX &&
+ dst->base_rc == BBRG_RSP &&
+ (strcmp(bb_func_name, "do_IRQ") == 0 ||
+ strcmp(bb_func_name, "do_softirq") == 0)) {
+ strcpy(bb_decode.opcode, "mov");
+ usage = bb_usage_mov(dst, src, sizeof("mov")-1);
+ } else {
+ usage = bb_usage_xchg(src, dst);
+ }
+ break;
+ case BBOU_XOR:
+ /* xor %reg,%reg only counts as a register write, the original
+ * contents of reg are irrelevant.
+ */
+ if (src->reg && dst->reg && src->base_rc == dst->base_rc)
+ usage = BBOU_WS;
+ else
+ usage = BBOU_RSRDWD;
+ break;
+ }
+
+ /* The switch statement above handled all the special cases. Every
+ * opcode should now have a usage of NOP or one of the generic cases.
+ */
+ if (usage == BBOU_UNKNOWN || usage == BBOU_NOP) {
+ /* nothing to do */
+ } else if (usage >= BBOU_RS && usage <= BBOU_RSRDWSWD) {
+ if (usage & BBOU_RS)
+ bb_read_operand(src);
+ if (usage & BBOU_RD)
+ bb_read_operand(dst);
+ if (usage & BBOU_WS)
+ bb_write_operand(src);
+ if (usage & BBOU_WD)
+ bb_write_operand(dst);
+ } else {
+ kdb_printf("%s: opcode not fully handled\n", __FUNCTION__);
+ if (!KDB_DEBUG(BB)) {
+ bb_print_opcode();
+ if (bb_decode.src.present)
+ bb_print_operand("src", &bb_decode.src);
+ if (bb_decode.dst.present)
+ bb_print_operand("dst", &bb_decode.dst);
+ if (bb_decode.dst2.present)
+ bb_print_operand("dst2", &bb_decode.dst2);
+ }
+ bb_giveup = 1;
+ }
+}
+
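+/* Parse one line of disassembler output that has accumulated in bb_buffer.
+ * The line looks like "address <function+offset>: [prefix] opcode [operands]"
+ * (illustrative); everything up to the first ':' is skipped, the opcode and
+ * optional prefix are isolated, and the operands are split at commas that are
+ * not inside parentheses, giving src, dst and dst2.
+ */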
+static void
+bb_parse_buffer(void)
+{
+ char *p, *src, *dst = NULL, *dst2 = NULL;
+ int paren = 0;
+ p = bb_buffer;
+ memset(&bb_decode, 0, sizeof(bb_decode));
+ KDB_DEBUG_BB(" '%s'\n", p);
+ p += strcspn(p, ":"); /* skip address and function name+offset: */
+ if (*p++ != ':') {
+ kdb_printf("%s: cannot find ':' in buffer '%s'\n",
+ __FUNCTION__, bb_buffer);
+ bb_giveup = 1;
+ return;
+ }
+ p += strspn(p, " \t"); /* step to opcode */
+ if (strncmp(p, "(bad)", 5) == 0)
+ strcpy(p, "nop");
+ /* separate any opcode prefix */
+ if (strncmp(p, "lock", 4) == 0 ||
+ strncmp(p, "rep", 3) == 0 ||
+ strncmp(p, "rex", 3) == 0 ||
+ strncmp(p, "addr", 4) == 0) {
+ bb_decode.prefix = p;
+ p += strcspn(p, " \t");
+ *p++ = '\0';
+ p += strspn(p, " \t");
+ }
+ bb_decode.opcode = p;
+ strsep(&p, " \t"); /* step to end of opcode */
+ if (bb_parse_opcode())
+ return;
+ if (!p)
+ goto no_operands;
+ p += strspn(p, " \t"); /* step to operand(s) */
+ if (!*p)
+ goto no_operands;
+ src = p;
+ p = strsep(&p, " \t"); /* strip comments after operands */
+ /* split 'src','dst' but ignore ',' inside '(' ')' */
+ while (*p) {
+ if (*p == '(') {
+ ++paren;
+ } else if (*p == ')') {
+ --paren;
+ } else if (*p == ',' && paren == 0) {
+ *p = '\0';
+ if (dst)
+ dst2 = p+1;
+ else
+ dst = p+1;
+ }
+ ++p;
+ }
+ bb_parse_operand(src, &bb_decode.src);
+ if (KDB_DEBUG(BB))
+ bb_print_operand("src", &bb_decode.src);
+ if (dst && !bb_giveup) {
+ bb_parse_operand(dst, &bb_decode.dst);
+ if (KDB_DEBUG(BB))
+ bb_print_operand("dst", &bb_decode.dst);
+ }
+ if (dst2 && !bb_giveup) {
+ bb_parse_operand(dst2, &bb_decode.dst2);
+ if (KDB_DEBUG(BB))
+ bb_print_operand("dst2", &bb_decode.dst2);
+ }
+no_operands:
+ if (!bb_giveup)
+ bb_usage();
+}
+
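+/* fprintf callback used while disassembling in pass 2.  The disassembler
+ * emits each line as several calls, so append every fragment to bb_buffer and
+ * only hand the completed line to bb_parse_buffer() once a newline is seen.
+ */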
+static int
+bb_dis_pass2(PTR file, const char *fmt, ...)
+{
+ char *p;
+ int l = strlen(bb_buffer);
+ va_list ap;
+ va_start(ap, fmt);
+ vsnprintf(bb_buffer + l, sizeof(bb_buffer) - l, fmt, ap);
+ va_end(ap);
+ if ((p = strchr(bb_buffer, '\n'))) {
+ *p = '\0';
+ p = bb_buffer;
+ p += strcspn(p, ":");
+ if (*p++ == ':')
+ bb_fixup_switch_to(p);
+ bb_parse_buffer();
+ bb_buffer[0] = '\0';
+ }
+ return 0;
+}
+
+static void
+bb_printaddr_pass2(bfd_vma addr, disassemble_info *dip)
+{
+ kdb_symtab_t symtab;
+ unsigned int offset;
+ dip->fprintf_func(dip->stream, "0x%lx", addr);
+ kdbnearsym(addr, &symtab);
+ if (symtab.sym_name) {
+ dip->fprintf_func(dip->stream, " <%s", symtab.sym_name);
+ if ((offset = addr - symtab.sym_start))
+ dip->fprintf_func(dip->stream, "+0x%x", offset);
+ dip->fprintf_func(dip->stream, ">");
+ }
+}
+
+/* Set the starting register and memory state for the current bb */
+
+static void
+bb_start_block0_special(void)
+{
+ int i;
+ short offset_address;
+ enum bb_reg_code reg, value;
+ struct bb_name_state *r;
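+ /* Only special case entries with fname == NULL and whose address
+ * matches the start of this function supply a non-standard starting
+ * state for block 0.
+ */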
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ if (bb_func_start == r->address && r->fname == NULL)
+ goto match;
+ }
+ return;
+match:
+ /* Set the running registers */
+ for (reg = BBRG_RAX; reg < r->regs_size; ++reg) {
+ value = r->regs[reg].value;
+ if (test_bit(value, r->skip_regs.bits)) {
+ /* this regs entry is not defined for this label */
+ continue;
+ }
+ bb_reg_code_set_value(reg, value);
+ bb_reg_code_set_offset(reg, r->regs[reg].offset);
+ }
+ /* Set any memory contents, e.g. pt_regs. Adjust RSP as required. */
+ offset_address = 0;
+ for (i = 0; i < r->mem_size; ++i) {
+ offset_address = max_t(int,
+ r->mem[i].offset_address + KDB_WORD_SIZE,
+ offset_address);
+ }
+ if (bb_reg_code_offset(BBRG_RSP) > -offset_address)
+ bb_adjust_osp(BBRG_RSP, -offset_address - bb_reg_code_offset(BBRG_RSP));
+ for (i = 0; i < r->mem_size; ++i) {
+ value = r->mem[i].value;
+ if (test_bit(value, r->skip_mem.bits)) {
+ /* this memory entry is not defined for this label */
+ continue;
+ }
+ bb_memory_set_reg_value(BBRG_RSP, r->mem[i].offset_address,
+ value, 0);
+ bb_reg_set_undef(value);
+ }
+ return;
+}
+
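+/* Set the register and memory state at the start of basic block 'number'.
+ * Block 0 gets the well defined function entry state, every other block gets
+ * the merge of the exit states of all the blocks that jump to it.
+ */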
+static void
+bb_pass2_start_block(int number)
+{
+ int i, j, k, first, changed;
+ size_t size;
+ struct bb_jmp *bb_jmp;
+ struct bb_reg_state *state;
+ struct bb_memory_contains *c1, *c2;
+ bb_reg_state->mem_count = bb_reg_state_max;
+ size = bb_reg_state_size(bb_reg_state);
+ memset(bb_reg_state, 0, size);
+
+ if (number == 0) {
+ /* The first block is assumed to have well defined inputs */
+ bb_start_block0();
+ /* Some assembler labels have non-standard entry
+ * states.
+ */
+ bb_start_block0_special();
+ bb_reg_state_print(bb_reg_state);
+ return;
+ }
+
+ /* Merge all the input states for the current bb together */
+ first = 1;
+ changed = 0;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ bb_jmp = bb_jmp_list + i;
+ if (bb_jmp->to != bb_curr->start)
+ continue;
+ state = bb_jmp->state;
+ if (!state)
+ continue;
+ if (first) {
+ size = bb_reg_state_size(state);
+ memcpy(bb_reg_state, state, size);
+ KDB_DEBUG_BB(" first state %p\n", state);
+ bb_reg_state_print(bb_reg_state);
+ first = 0;
+ continue;
+ }
+
+ KDB_DEBUG_BB(" merging state %p\n", state);
+ /* Merge the register states */
+ for (j = 0; j < ARRAY_SIZE(state->contains); ++j) {
+ if (memcmp(bb_reg_state->contains + j,
+ state->contains + j,
+ sizeof(bb_reg_state->contains[0]))) {
+ /* Different states for this register from two
+ * or more inputs, make it undefined.
+ */
+ if (bb_reg_state->contains[j].value ==
+ BBRG_UNDEFINED) {
+ KDB_DEBUG_BB(" ignoring %s\n",
+ bbrg_name[j + BBRG_RAX]);
+ } else {
+ bb_reg_set_undef(BBRG_RAX + j);
+ changed = 1;
+ }
+ }
+ }
+
+ /* Merge the memory states. This relies on both
+ * bb_reg_state->memory and state->memory being sorted in
+ * descending order, with undefined entries at the end.
+ */
+ c1 = bb_reg_state->memory;
+ c2 = state->memory;
+ j = k = 0;
+ while (j < bb_reg_state->mem_count &&
+ k < state->mem_count) {
+ if (c1->offset_address < c2->offset_address) {
+ KDB_DEBUG_BB_OFFSET(c2->offset_address,
+ " ignoring c2->offset_address ",
+ "\n");
+ ++c2;
+ ++k;
+ continue;
+ }
+ if (c1->offset_address > c2->offset_address) {
+ /* Memory location is not in all input states,
+ * delete the memory location.
+ */
+ bb_delete_memory(c1->offset_address);
+ changed = 1;
+ ++c1;
+ ++j;
+ continue;
+ }
+ if (memcmp(c1, c2, sizeof(*c1))) {
+ /* Same location, different contents, delete
+ * the memory location.
+ */
+ bb_delete_memory(c1->offset_address);
+ KDB_DEBUG_BB_OFFSET(c2->offset_address,
+ " ignoring c2->offset_address ",
+ "\n");
+ changed = 1;
+ }
+ ++c1;
+ ++c2;
+ ++j;
+ ++k;
+ }
+ while (j < bb_reg_state->mem_count) {
+ bb_delete_memory(c1->offset_address);
+ changed = 1;
+ ++c1;
+ ++j;
+ }
+ }
+ if (changed) {
+ KDB_DEBUG_BB(" final state\n");
+ bb_reg_state_print(bb_reg_state);
+ }
+}
+
+/* We have reached the exit point from the current function, either a call to
+ * the next function or the instruction that was about to be executed when an
+ * interrupt occurred. Save the current register state in bb_exit_state.
+ */
+
+static void
+bb_save_exit_state(void)
+{
+ size_t size;
+ debug_kfree(bb_exit_state);
+ bb_exit_state = NULL;
+ bb_reg_state_canonicalize();
+ size = bb_reg_state_size(bb_reg_state);
+ bb_exit_state = debug_kmalloc(size, GFP_ATOMIC);
+ if (!bb_exit_state) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ memcpy(bb_exit_state, bb_reg_state, size);
+}
+
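+/* Process every basic block whose input state has changed, skipping any block
+ * that still has more than allow_missing unresolved input states.  Returns 0
+ * when all blocks have been processed, 1 if some blocks are still waiting on
+ * missing inputs and the caller should retry with a larger allow_missing.
+ */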
+static int
+bb_pass2_do_changed_blocks(int allow_missing)
+{
+ int i, j, missing, changed, maxloops;
+ unsigned long addr;
+ struct bb_jmp *bb_jmp;
+ KDB_DEBUG_BB("\n %s: allow_missing %d\n", __FUNCTION__, allow_missing);
+ /* Absolute worst case is we have to iterate over all the basic blocks
+ * in an "out of order" state, each iteration losing one register or
+ * memory state. Any more loops than that is a bug. "out of order"
+ * means that the layout of blocks in memory does not match the logic
+ * flow through those blocks so (for example) block 27 comes before
+ * block 2. To allow for out of order blocks, multiply maxloops by the
+ * number of blocks.
+ */
+ maxloops = (KDB_INT_REGISTERS + bb_reg_state_max) * bb_count;
+ changed = 1;
+ do {
+ changed = 0;
+ for (i = 0; i < bb_count; ++i) {
+ bb_curr = bb_list[i];
+ if (!bb_curr->changed)
+ continue;
+ missing = 0;
+ for (j = 0, bb_jmp = bb_jmp_list;
+ j < bb_jmp_count;
+ ++j, ++bb_jmp) {
+ if (bb_jmp->to == bb_curr->start &&
+ !bb_jmp->state)
+ ++missing;
+ }
+ if (missing > allow_missing)
+ continue;
+ bb_curr->changed = 0;
+ changed = 1;
+ KDB_DEBUG_BB("\n bb[%d]\n", i);
+ bb_pass2_start_block(i);
+ for (addr = bb_curr->start;
+ addr <= bb_curr->end; ) {
+ bb_curr_addr = addr;
+ if (addr == bb_exit_addr)
+ bb_save_exit_state();
+ addr += kdba_id_printinsn(addr, &kdb_di);
+ kdb_di.fprintf_func(NULL, "\n");
+ if (bb_giveup)
+ goto done;
+ }
+ if (!bb_exit_state) {
+ /* ATTRIB_NORET functions are a problem with
+ * the current gcc. Allow the trailing address
+ * a bit of leeway.
+ */
+ if (addr == bb_exit_addr ||
+ addr == bb_exit_addr + 1)
+ bb_save_exit_state();
+ }
+ if (bb_curr->drop_through)
+ bb_transfer(bb_curr->end,
+ bb_list[i+1]->start, 1);
+ }
+ if (maxloops-- == 0) {
+ kdb_printf("\n\n%s maxloops reached\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ goto done;
+ }
+ } while(changed);
+done:
+ for (i = 0; i < bb_count; ++i) {
+ bb_curr = bb_list[i];
+ if (bb_curr->changed)
+ return 1; /* more to do, increase allow_missing */
+ }
+ return 0; /* all blocks done */
+}
+
+/* Assume that the current function is a pass through function that does not
+ * refer to its register parameters. Exclude known asmlinkage functions and
+ * assume the other functions actually use their registers.
+ */
+
+static void
+bb_assume_pass_through(void)
+{
+ static int first_time = 1;
+ if (strncmp(bb_func_name, "sys_", 4) == 0 ||
+ strncmp(bb_func_name, "compat_sys_", 11) == 0 ||
+ strcmp(bb_func_name, "schedule") == 0 ||
+ strcmp(bb_func_name, "do_softirq") == 0 ||
+ strcmp(bb_func_name, "printk") == 0 ||
+ strcmp(bb_func_name, "vprintk") == 0 ||
+ strcmp(bb_func_name, "preempt_schedule") == 0 ||
+ strcmp(bb_func_name, "start_kernel") == 0 ||
+ strcmp(bb_func_name, "csum_partial") == 0 ||
+ strcmp(bb_func_name, "csum_partial_copy_generic") == 0 ||
+ strcmp(bb_func_name, "math_state_restore") == 0 ||
+ strcmp(bb_func_name, "panic") == 0 ||
+ strcmp(bb_func_name, "kdb_printf") == 0 ||
+ strcmp(bb_func_name, "kdb_interrupt") == 0)
+ return;
+ if (bb_asmlinkage_arch())
+ return;
+ bb_reg_params = REGPARM;
+ if (first_time) {
+ kdb_printf(" %s has memory parameters but no register "
+ "parameters.\n Assuming it is a 'pass "
+ "through' function that does not refer to "
+ "its register\n parameters and setting %d "
+ "register parameters\n",
+ bb_func_name, REGPARM);
+ first_time = 0;
+ return;
+ }
+ kdb_printf(" Assuming %s is 'pass through' with %d register "
+ "parameters\n",
+ bb_func_name, REGPARM);
+}
+
+static void
+bb_pass2(void)
+{
+ int allow_missing;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: start\n", __FUNCTION__);
+
+ kdb_di.fprintf_func = bb_dis_pass2;
+ kdb_di.print_address_func = bb_printaddr_pass2;
+
+ bb_reg_state = debug_kmalloc(sizeof(*bb_reg_state), GFP_ATOMIC);
+ if (!bb_reg_state) {
+ kdb_printf("\n\n%s: out of debug_kmalloc\n", __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ bb_list[0]->changed = 1;
+
+ /* If a block does not have all its input states available then it is
+ * possible for a register to initially appear to hold a known value,
+ * but when other inputs are available then it becomes a variable
+ * value. The initial false state of "known" can generate false values
+ * for other registers and can even make it look like stack locations
+ * are being changed.
+ *
+ * To avoid these false positives, only process blocks which have all
+ * their inputs defined. That gives a clean depth first traversal of
+ * the tree, except for loops. If there are any loops, then start
+ * processing blocks with one missing input, then two missing inputs
+ * etc.
+ *
+ * Absolute worst case is we have to iterate over all the jmp entries,
+ * each iteration allowing one more missing input. Any more loops than
+ * that is a bug. Watch out for the corner case of 0 jmp entries.
+ */
+ for (allow_missing = 0; allow_missing <= bb_jmp_count; ++allow_missing) {
+ if (!bb_pass2_do_changed_blocks(allow_missing))
+ break;
+ if (bb_giveup)
+ break;
+ }
+ if (allow_missing > bb_jmp_count) {
+ kdb_printf("\n\n%s maxloops reached\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+
+ if (bb_memory_params && bb_reg_params)
+ bb_reg_params = REGPARM;
+ if (REGPARM &&
+ bb_memory_params &&
+ !bb_reg_params)
+ bb_assume_pass_through();
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
+ kdb_printf("%s: end bb_reg_params %d bb_memory_params %d\n",
+ __FUNCTION__, bb_reg_params, bb_memory_params);
+ if (bb_exit_state) {
+ kdb_printf("%s: bb_exit_state at " kdb_bfd_vma_fmt0 "\n",
+ __FUNCTION__, bb_exit_addr);
+ bb_do_reg_state_print(bb_exit_state);
+ }
+ }
+}
+
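+/* Release all the state built up by the basic block analysis: the block list,
+ * the jmp list with its reference counted register states, and the working
+ * and exit register states.
+ */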
+static void
+bb_cleanup(void)
+{
+ int i;
+ struct bb* bb;
+ struct bb_reg_state *state;
+ while (bb_count) {
+ bb = bb_list[0];
+ bb_delete(0);
+ }
+ debug_kfree(bb_list);
+ bb_list = NULL;
+ bb_count = bb_max = 0;
+ for (i = 0; i < bb_jmp_count; ++i) {
+ state = bb_jmp_list[i].state;
+ if (state && --state->ref_count == 0)
+ debug_kfree(state);
+ }
+ debug_kfree(bb_jmp_list);
+ bb_jmp_list = NULL;
+ bb_jmp_count = bb_jmp_max = 0;
+ debug_kfree(bb_reg_state);
+ bb_reg_state = NULL;
+ bb_reg_state_max = 0;
+ debug_kfree(bb_exit_state);
+ bb_exit_state = NULL;
+ bb_reg_params = bb_memory_params = 0;
+ bb_giveup = 0;
+}
+
+static int
+bb_spurious_global_label(const char *func_name)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(bb_spurious); ++i) {
+ if (strcmp(bb_spurious[i], func_name) == 0)
+ return 1;
+ }
+ return 0;
+}
+
+/* Given the current actual register contents plus the exit state deduced from
+ * a basic block analysis of the current function, roll back the actual register
+ * contents to the values they had on entry to this function.
+ */
+
+static void
+bb_actual_rollback(const struct kdb_activation_record *ar)
+{
+ int i, offset_address;
+ struct bb_memory_contains *c;
+ enum bb_reg_code reg;
+ unsigned long address, osp = 0;
+ struct bb_actual new[ARRAY_SIZE(bb_actual)];
+
+
+ if (!bb_exit_state) {
+ kdb_printf("%s: no bb_exit_state, cannot rollback\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ memcpy(bb_reg_state, bb_exit_state, bb_reg_state_size(bb_exit_state));
+ memset(new, 0, sizeof(new));
+
+ /* The most important register for obtaining saved state is rsp so get
+ * its new value first. Prefer rsp if it is valid, then other
+ * registers. Saved values of rsp in memory are unusable without a
+ * register that points to memory.
+ */
+ if (!bb_actual_valid(BBRG_RSP)) {
+ kdb_printf("%s: no starting value for RSP, cannot rollback\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: rsp " kdb_bfd_vma_fmt0,
+ __FUNCTION__, bb_actual_value(BBRG_RSP));
+ i = BBRG_RSP;
+ if (!bb_is_osp_defined(i)) {
+ for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
+ if (bb_is_osp_defined(i) && bb_actual_valid(i))
+ break;
+ }
+ }
+ if (bb_is_osp_defined(i) && bb_actual_valid(i)) {
+ osp = new[BBRG_RSP - BBRG_RAX].value =
+ bb_actual_value(i) - bb_reg_code_offset(i);
+ new[BBRG_RSP - BBRG_RAX].valid = 1;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf(" -> osp " kdb_bfd_vma_fmt0 "\n", osp);
+ } else {
+ bb_actual_set_valid(BBRG_RSP, 0);
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf(" -> undefined\n");
+ kdb_printf("%s: no ending value for RSP, cannot rollback\n",
+ __FUNCTION__);
+ bb_giveup = 1;
+ return;
+ }
+
+ /* Now the other registers. First look at register values that have
+ * been copied to other registers.
+ */
+ for (i = BBRG_RAX; i < BBRG_RAX + KDB_INT_REGISTERS; ++i) {
+ reg = bb_reg_code_value(i);
+ if (bb_is_int_reg(reg)) {
+ new[reg - BBRG_RAX] = bb_actual[i - BBRG_RAX];
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
+ kdb_printf("%s: %s is in %s ",
+ __FUNCTION__,
+ bbrg_name[reg],
+ bbrg_name[i]);
+ if (bb_actual_valid(i))
+ kdb_printf(" -> " kdb_bfd_vma_fmt0 "\n",
+ bb_actual_value(i));
+ else
+ kdb_printf("(invalid)\n");
+ }
+ }
+ }
+
+ /* Finally register values that have been saved on stack */
+ for (i = 0, c = bb_reg_state->memory;
+ i < bb_reg_state->mem_count;
+ ++i, ++c) {
+ offset_address = c->offset_address;
+ reg = c->value;
+ if (!bb_is_int_reg(reg))
+ continue;
+ address = osp + offset_address;
+ if (address < ar->stack.logical_start ||
+ address >= ar->stack.logical_end) {
+ new[reg - BBRG_RAX].value = 0;
+ new[reg - BBRG_RAX].valid = 0;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf("%s: %s -> undefined\n",
+ __FUNCTION__,
+ bbrg_name[reg]);
+ } else {
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM)) {
+ kdb_printf("%s: %s -> *(osp",
+ __FUNCTION__,
+ bbrg_name[reg]);
+ KDB_DEBUG_BB_OFFSET_PRINTF(offset_address, "", " ");
+ kdb_printf(kdb_bfd_vma_fmt0, address);
+ }
+ new[reg - BBRG_RAX].value = *(bfd_vma *)address;
+ new[reg - BBRG_RAX].valid = 1;
+ if (KDB_DEBUG(BB) | KDB_DEBUG(BB_SUMM))
+ kdb_printf(") = " kdb_bfd_vma_fmt0 "\n",
+ new[reg - BBRG_RAX].value);
+ }
+ }
+
+ memcpy(bb_actual, new, sizeof(bb_actual));
+}
+
+/* Return true if the current function is an interrupt handler */
+
+static bool
+bb_interrupt_handler(kdb_machreg_t rip)
+{
+ unsigned long disp8, disp32, target, addr = (unsigned long)rip;
+ unsigned char code[5];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(bb_hardware_handlers); ++i)
+ if (strcmp(bb_func_name, bb_hardware_handlers[i]) == 0)
+ return 1;
+
+ /* Given the large number of interrupt handlers, it is easiest to look
+ * at the next instruction and see if it is a jmp to the common exit
+ * routines.
+ */
+ if (kdb_getarea(code, addr) ||
+ kdb_getword(&disp32, addr+1, 4) ||
+ kdb_getword(&disp8, addr+1, 1))
+ return 0; /* not a valid code address */
+ if (code[0] == 0xe9) {
+ target = addr + (s32) disp32 + 5; /* jmp disp32 */
+ if (target == bb_ret_from_intr ||
+ target == bb_common_interrupt ||
+ target == bb_error_entry)
+ return 1;
+ }
+ if (code[0] == 0xeb) {
+ target = addr + (s8) disp8 + 2; /* jmp disp8 */
+ if (target == bb_ret_from_intr ||
+ target == bb_common_interrupt ||
+ target == bb_error_entry)
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Copy argument information that was deduced by the basic block analysis and
+ * rollback into the kdb stack activation record.
+ */
+
+static void
+bb_arguments(struct kdb_activation_record *ar)
+{
+ int i;
+ enum bb_reg_code reg;
+ kdb_machreg_t rsp;
+ ar->args = bb_reg_params + bb_memory_params;
+ bitmap_zero(ar->valid.bits, KDBA_MAXARGS);
+ for (i = 0; i < bb_reg_params; ++i) {
+ reg = bb_param_reg[i];
+ if (bb_actual_valid(reg)) {
+ ar->arg[i] = bb_actual_value(reg);
+ set_bit(i, ar->valid.bits);
+ }
+ }
+ if (!bb_actual_valid(BBRG_RSP))
+ return;
+ rsp = bb_actual_value(BBRG_RSP);
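+ /* Memory parameters are read from the words immediately above the
+ * rolled back entry stack pointer, i.e. just above the return address.
+ */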
+ for (i = bb_reg_params; i < ar->args; ++i) {
+ rsp += KDB_WORD_SIZE;
+ if (kdb_getarea(ar->arg[i], rsp) == 0)
+ set_bit(i, ar->valid.bits);
+ }
+}
+
+/* Given an exit address from a function, decompose the entire function into
+ * basic blocks and determine the register state at the exit point.
+ */
+
+static void
+kdb_bb(unsigned long exit)
+{
+ kdb_symtab_t symtab;
+ if (!kdbnearsym(exit, &symtab)) {
+ kdb_printf("%s: address " kdb_bfd_vma_fmt0 " not recognised\n",
+ __FUNCTION__, exit);
+ bb_giveup = 1;
+ return;
+ }
+ bb_exit_addr = exit;
+ bb_mod_name = symtab.mod_name;
+ bb_func_name = symtab.sym_name;
+ bb_func_start = symtab.sym_start;
+ bb_func_end = symtab.sym_end;
+ /* Various global labels exist in the middle of assembler code and have
+ * a non-standard state. Ignore these labels and use the start of the
+ * previous label instead.
+ */
+ while (bb_spurious_global_label(symtab.sym_name)) {
+ if (!kdbnearsym(symtab.sym_start - 1, &symtab))
+ break;
+ bb_func_start = symtab.sym_start;
+ }
+ bb_mod_name = symtab.mod_name;
+ bb_func_name = symtab.sym_name;
+ bb_func_start = symtab.sym_start;
+ /* Ignore spurious labels past this point and use the next non-spurious
+ * label as the end point.
+ */
+ if (kdbnearsym(bb_func_end, &symtab)) {
+ while (bb_spurious_global_label(symtab.sym_name)) {
+ bb_func_end = symtab.sym_end;
+ if (!kdbnearsym(symtab.sym_end + 1, &symtab))
+ break;
+ }
+ }
+ bb_pass1();
+ if (!bb_giveup)
+ bb_pass2();
+ if (bb_giveup)
+ kdb_printf("%s: " kdb_bfd_vma_fmt0
+ " [%s]%s failed at " kdb_bfd_vma_fmt0 "\n\n",
+ __FUNCTION__, exit,
+ bb_mod_name, bb_func_name, bb_curr_addr);
+}
+
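+/* Handler for the kdb 'bb1' command: run the basic block analysis on a single
+ * address with the BB debug flag temporarily set so that the analysis trace
+ * is printed.
+ */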
+static int
+kdb_bb1(int argc, const char **argv)
+{
+ int diag, nextarg = 1;
+ kdb_machreg_t addr;
+ unsigned long offset;
+
+ bb_cleanup(); /* in case previous command was interrupted */
+ kdba_id_init(&kdb_di);
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ if (!addr)
+ return KDB_BADADDR;
+ kdb_save_flags();
+ kdb_flags |= KDB_DEBUG_FLAG_BB << KDB_DEBUG_FLAG_SHIFT;
+ kdb_bb(addr);
+ bb_cleanup();
+ kdb_restore_flags();
+ kdbnearsym_cleanup();
+ return 0;
+}
+
+/* Run a basic block analysis on every function in the base kernel. Used as a
+ * global sanity check to find errors in the basic block code.
+ */
+
+static int
+kdb_bb_all(int argc, const char **argv)
+{
+ loff_t pos = 0;
+ const char *symname;
+ unsigned long addr;
+ int i, max_errors = 20;
+ struct bb_name_state *r;
+ kdb_printf("%s: build variables:"
+ " CCVERSION \"" __stringify(CCVERSION) "\""
+#ifdef CONFIG_X86_64
+ " CONFIG_X86_64"
+#endif
+#ifdef CONFIG_4KSTACKS
+ " CONFIG_4KSTACKS"
+#endif
+#ifdef CONFIG_PREEMPT
+ " CONFIG_PREEMPT"
+#endif
+#ifdef CONFIG_VM86
+ " CONFIG_VM86"
+#endif
+#ifdef CONFIG_FRAME_POINTER
+ " CONFIG_FRAME_POINTER"
+#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+ " CONFIG_TRACE_IRQFLAGS"
+#endif
+#ifdef CONFIG_HIBERNATION
+ " CONFIG_HIBERNATION"
+#endif
+#ifdef CONFIG_KPROBES
+ " CONFIG_KPROBES"
+#endif
+#ifdef CONFIG_KEXEC
+ " CONFIG_KEXEC"
+#endif
+#ifdef CONFIG_MATH_EMULATION
+ " CONFIG_MATH_EMULATION"
+#endif
- #ifdef CONFIG_PARAVIRT_XEN
++#ifdef CONFIG_XEN
+ " CONFIG_XEN"
+#endif
+#ifdef CONFIG_DEBUG_INFO
+ " CONFIG_DEBUG_INFO"
+#endif
+#ifdef NO_SIBLINGS
+ " NO_SIBLINGS"
+#endif
+ " REGPARM=" __stringify(REGPARM)
+ "\n\n", __FUNCTION__);
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ if (!r->address)
+ kdb_printf("%s: cannot find special_case name %s\n",
+ __FUNCTION__, r->name);
+ }
+ for (i = 0; i < ARRAY_SIZE(bb_spurious); ++i) {
+ if (!kallsyms_lookup_name(bb_spurious[i]))
+ kdb_printf("%s: cannot find spurious label %s\n",
+ __FUNCTION__, bb_spurious[i]);
+ }
+ while ((symname = kdb_walk_kallsyms(&pos))) {
+ if (strcmp(symname, "_stext") == 0 ||
+ strcmp(symname, "stext") == 0)
+ break;
+ }
+ if (!symname) {
+ kdb_printf("%s: cannot find _stext\n", __FUNCTION__);
+ return 0;
+ }
+ kdba_id_init(&kdb_di);
+ i = 0;
+ while ((symname = kdb_walk_kallsyms(&pos))) {
+ if (strcmp(symname, "_etext") == 0)
+ break;
+ if (i++ % 100 == 0)
+ kdb_printf(".");
+ /* x86_64 has some 16 bit functions that appear between stext
+ * and _etext. Skip them.
+ */
+ if (strcmp(symname, "verify_cpu") == 0 ||
+ strcmp(symname, "verify_cpu_noamd") == 0 ||
+ strcmp(symname, "verify_cpu_sse_test") == 0 ||
+ strcmp(symname, "verify_cpu_no_longmode") == 0 ||
+ strcmp(symname, "verify_cpu_sse_ok") == 0 ||
+ strcmp(symname, "mode_seta") == 0 ||
+ strcmp(symname, "bad_address") == 0 ||
+ strcmp(symname, "wakeup_code") == 0 ||
+ strcmp(symname, "wakeup_code_start") == 0 ||
+ strcmp(symname, "wakeup_start") == 0 ||
+ strcmp(symname, "wakeup_32_vector") == 0 ||
+ strcmp(symname, "wakeup_32") == 0 ||
+ strcmp(symname, "wakeup_long64_vector") == 0 ||
+ strcmp(symname, "wakeup_long64") == 0 ||
+ strcmp(symname, "gdta") == 0 ||
+ strcmp(symname, "idt_48a") == 0 ||
+ strcmp(symname, "gdt_48a") == 0 ||
+ strcmp(symname, "bogus_real_magic") == 0 ||
+ strcmp(symname, "bogus_64_magic") == 0 ||
+ strcmp(symname, "no_longmode") == 0 ||
+ strcmp(symname, "mode_set") == 0 ||
+ strcmp(symname, "mode_seta") == 0 ||
+ strcmp(symname, "setbada") == 0 ||
+ strcmp(symname, "check_vesa") == 0 ||
+ strcmp(symname, "check_vesaa") == 0 ||
+ strcmp(symname, "_setbada") == 0 ||
+ strcmp(symname, "wakeup_stack_begin") == 0 ||
+ strcmp(symname, "wakeup_stack") == 0 ||
+ strcmp(symname, "wakeup_level4_pgt") == 0 ||
+ strcmp(symname, "acpi_copy_wakeup_routine") == 0 ||
+ strcmp(symname, "wakeup_end") == 0 ||
+ strcmp(symname, "do_suspend_lowlevel_s4bios") == 0 ||
+ strcmp(symname, "do_suspend_lowlevel") == 0 ||
+ strcmp(symname, "wakeup_pmode_return") == 0 ||
+ strcmp(symname, "restore_registers") == 0)
+ continue;
+ /* __kprobes_text_end contains branches to the middle of code,
+ * with undefined states.
+ */
+ if (strcmp(symname, "__kprobes_text_end") == 0)
+ continue;
+ /* Data in the middle of the text segment :( */
+ if (strcmp(symname, "level2_kernel_pgt") == 0 ||
+ strcmp(symname, "level3_kernel_pgt") == 0)
+ continue;
+ if (bb_spurious_global_label(symname))
+ continue;
+ if ((addr = kallsyms_lookup_name(symname)) == 0)
+ continue;
+ // kdb_printf("BB " kdb_bfd_vma_fmt0 " %s\n", addr, symname);
+ bb_cleanup(); /* in case previous command was interrupted */
+ kdbnearsym_cleanup();
+ kdb_bb(addr);
+ touch_nmi_watchdog();
+ if (bb_giveup) {
+ if (max_errors-- == 0) {
+ kdb_printf("%s: max_errors reached, giving up\n",
+ __FUNCTION__);
+ break;
+ } else {
+ bb_giveup = 0;
+ }
+ }
+ }
+ kdb_printf("\n");
+ bb_cleanup();
+ kdbnearsym_cleanup();
+ return 0;
+}
+
+/*
+ *=============================================================================
+ *
+ * Everything above this line is doing basic block analysis, function by
+ * function. Everything below this line uses the basic block data to do a
+ * complete backtrace over all functions that are used by a process.
+ *
+ *=============================================================================
+ */
+
+
+/*============================================================================*/
+/* */
+/* Most of the backtrace code and data is common to x86_64 and i386. This */
+/* large ifdef contains all of the differences between the two architectures. */
+/* */
+/* Make sure you update the correct section of this ifdef. */
+/* */
+/*============================================================================*/
+#define XCS "cs"
+#define RSP "sp"
+#define RIP "ip"
+#define ARCH_RSP sp
+#define ARCH_RIP ip
+
+#ifdef CONFIG_X86_64
+
+#define ARCH_NORMAL_PADDING (16 * 8)
+
+/* x86_64 has multiple alternate stacks, with different sizes and different
+ * offsets to get the link from one stack to the next. All of the stacks are
+ * in the per_cpu area: either in the orig_ist or irq_stack_ptr. Debug events
+ * can even have multiple nested stacks within the single physical stack,
+ * each nested stack has its own link and some of those links are wrong.
+ *
+ * Consistent it's not!
+ *
+ * Do not assume that these stacks are aligned on their size.
+ */
+#define INTERRUPT_STACK (N_EXCEPTION_STACKS + 1)
+void
+kdba_get_stack_info_alternate(kdb_machreg_t addr, int cpu,
+ struct kdb_activation_record *ar)
+{
+ static struct {
+ const char *id;
+ unsigned int total_size;
+ unsigned int nested_size;
+ unsigned int next;
+ } *sdp, stack_data[] = {
+ [STACKFAULT_STACK - 1] = { "stackfault", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [DOUBLEFAULT_STACK - 1] = { "doublefault", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [NMI_STACK - 1] = { "nmi", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [DEBUG_STACK - 1] = { "debug", DEBUG_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [MCE_STACK - 1] = { "machine check", EXCEPTION_STKSZ, EXCEPTION_STKSZ, EXCEPTION_STKSZ - 2*sizeof(void *) },
+ [INTERRUPT_STACK - 1] = { "interrupt", IRQ_STACK_SIZE, IRQ_STACK_SIZE, IRQ_STACK_SIZE - sizeof(void *) },
+ };
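+ /* total_size is the size of the whole per-cpu stack, nested_size the
+ * size of one nested stack within it, and next is the offset within a
+ * nested stack of the word holding the link to the next stack.
+ */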
+ unsigned long total_start = 0, total_size, total_end;
+ int sd, found = 0;
+ extern unsigned long kdba_orig_ist(int, int);
+
+ for (sd = 0, sdp = stack_data;
+ sd < ARRAY_SIZE(stack_data);
+ ++sd, ++sdp) {
+ total_size = sdp->total_size;
+ if (!total_size)
+ continue; /* in case stack_data[] has any holes */
+ if (cpu < 0) {
+ /* Arbitrary address which can be on any cpu, see if it
+ * falls within any of the alternate stacks
+ */
+ int c;
+ for_each_online_cpu(c) {
+ if (sd == INTERRUPT_STACK - 1)
+ total_end = (unsigned long)per_cpu(irq_stack_ptr, c);
+ else
+ total_end = per_cpu(orig_ist, c).ist[sd];
+ total_start = total_end - total_size;
+ if (addr >= total_start && addr < total_end) {
+ found = 1;
+ cpu = c;
+ break;
+ }
+ }
+ if (!found)
+ continue;
+ }
+ /* Only check the supplied or found cpu */
+ if (sd == INTERRUPT_STACK - 1)
+ total_end = (unsigned long)per_cpu(irq_stack_ptr, cpu);
+ else
+ total_end = per_cpu(orig_ist, cpu).ist[sd];
+ total_start = total_end - total_size;
+ if (addr >= total_start && addr < total_end) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return;
+ /* find which nested stack the address is in */
+ while (addr > total_start + sdp->nested_size)
+ total_start += sdp->nested_size;
+ ar->stack.physical_start = total_start;
+ ar->stack.physical_end = total_start + sdp->nested_size;
+ ar->stack.logical_start = total_start;
+ ar->stack.logical_end = total_start + sdp->next;
+ ar->stack.next = *(unsigned long *)ar->stack.logical_end;
+ ar->stack.id = sdp->id;
+
+ /* Nasty: when switching to the interrupt stack, the stack state of the
+ * caller is split over two stacks, the original stack and the
+ * interrupt stack. One word (the previous frame pointer) is stored on
+ * the interrupt stack, the rest of the interrupt data is in the old
+ * frame. To make the interrupted stack state look as though it is
+ * contiguous, copy the missing word from the interrupt stack to the
+ * original stack and adjust the new stack pointer accordingly.
+ */
+
+ if (sd == INTERRUPT_STACK - 1) {
+ *(unsigned long *)(ar->stack.next - KDB_WORD_SIZE) =
+ ar->stack.next;
+ ar->stack.next -= KDB_WORD_SIZE;
+ }
+}
+
+/* rip is not in the thread struct for x86_64. We know that the stack value
+ * was saved in schedule near the label thread_return. Setting rip to
+ * thread_return lets the stack trace find that we are in schedule and
+ * correctly decode its prologue.
+ */
+
+static kdb_machreg_t
+kdba_bt_stack_rip(const struct task_struct *p)
+{
+ return bb_thread_return;
+}
+
+#else /* !CONFIG_X86_64 */
+
+#define ARCH_NORMAL_PADDING (19 * 4)
+
+#ifdef CONFIG_4KSTACKS
+static struct thread_info **kdba_hardirq_ctx, **kdba_softirq_ctx;
+#endif /* CONFIG_4KSTACKS */
+
+/* On a 4K stack kernel, hardirq_ctx and softirq_ctx are [NR_CPUS] arrays. The
+ * first element of each per-cpu stack is a struct thread_info.
+ */
+void
+kdba_get_stack_info_alternate(kdb_machreg_t addr, int cpu,
+ struct kdb_activation_record *ar)
+{
+#ifdef CONFIG_4KSTACKS
+ struct thread_info *tinfo;
+ tinfo = (struct thread_info *)(addr & -THREAD_SIZE);
+ if (cpu < 0) {
+ /* Arbitrary address, see if it falls within any of the irq
+ * stacks
+ */
+ int found = 0;
+ for_each_online_cpu(cpu) {
+ if (tinfo == kdba_hardirq_ctx[cpu] ||
+ tinfo == kdba_softirq_ctx[cpu]) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return;
+ }
+ if (tinfo == kdba_hardirq_ctx[cpu] ||
+ tinfo == kdba_softirq_ctx[cpu]) {
+ ar->stack.physical_start = (kdb_machreg_t)tinfo;
+ ar->stack.physical_end = ar->stack.physical_start + THREAD_SIZE;
+ ar->stack.logical_start = ar->stack.physical_start +
+ sizeof(struct thread_info);
+ ar->stack.logical_end = ar->stack.physical_end;
+ ar->stack.next = tinfo->previous_esp;
+ if (tinfo == kdba_hardirq_ctx[cpu])
+ ar->stack.id = "hardirq_ctx";
+ else
+ ar->stack.id = "softirq_ctx";
+ }
+#endif /* CONFIG_4KSTACKS */
+}
+
+/* rip is in the thread struct for i386 */
+
+static kdb_machreg_t
+kdba_bt_stack_rip(const struct task_struct *p)
+{
+ return p->thread.ip;
+}
+
+#endif /* CONFIG_X86_64 */
+
+/* Given an address which claims to be on a stack, an optional cpu number and
+ * an optional task address, get information about the stack.
+ *
+ * t == NULL, cpu < 0 indicates an arbitrary stack address with no associated
+ * struct task, the address can be in an alternate stack or any task's normal
+ * stack.
+ *
+ * t != NULL, cpu >= 0 indicates a running task, the address can be in an
+ * alternate stack or that task's normal stack.
+ *
+ * t != NULL, cpu < 0 indicates a blocked task, the address can only be in that
+ * task's normal stack.
+ *
+ * t == NULL, cpu >= 0 is not a valid combination.
+ */
+
+static void
+kdba_get_stack_info(kdb_machreg_t rsp, int cpu,
+ struct kdb_activation_record *ar,
+ const struct task_struct *t)
+{
+ struct thread_info *tinfo;
+ struct task_struct *g, *p;
+ memset(&ar->stack, 0, sizeof(ar->stack));
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: " RSP "=0x%lx cpu=%d task=%p\n",
+ __FUNCTION__, rsp, cpu, t);
+ if (t == NULL || cpu >= 0) {
+ kdba_get_stack_info_alternate(rsp, cpu, ar);
+ if (ar->stack.logical_start)
+ goto out;
+ }
+ rsp &= -THREAD_SIZE;
+ tinfo = (struct thread_info *)rsp;
+ if (t == NULL) {
+ /* Arbitrary stack address without an associated task, see if
+ * it falls within any normal process stack, including the idle
+ * tasks.
+ */
+ kdb_do_each_thread(g, p) {
+ if (tinfo == task_thread_info(p)) {
+ t = p;
+ goto found;
+ }
+ } kdb_while_each_thread(g, p);
+ for_each_online_cpu(cpu) {
+ p = idle_task(cpu);
+ if (tinfo == task_thread_info(p)) {
+ t = p;
+ goto found;
+ }
+ }
+ found:
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: found task %p\n", __FUNCTION__, t);
+ } else if (cpu >= 0) {
+ /* running task */
+ struct kdb_running_process *krp = kdb_running_process + cpu;
+ if (krp->p != t || tinfo != task_thread_info(t))
+ t = NULL;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: running task %p\n", __FUNCTION__, t);
+ } else {
+ /* blocked task */
+ if (tinfo != task_thread_info(t))
+ t = NULL;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: blocked task %p\n", __FUNCTION__, t);
+ }
+ if (t) {
+ ar->stack.physical_start = rsp;
+ ar->stack.physical_end = rsp + THREAD_SIZE;
+ ar->stack.logical_start = rsp + sizeof(struct thread_info);
+ ar->stack.logical_end = ar->stack.physical_end - ARCH_NORMAL_PADDING;
+ ar->stack.next = 0;
+ ar->stack.id = "normal";
+ }
+out:
+ if (ar->stack.physical_start && KDB_DEBUG(ARA)) {
+ kdb_printf("%s: ar->stack\n", __FUNCTION__);
+ kdb_printf(" physical_start=0x%lx\n", ar->stack.physical_start);
+ kdb_printf(" physical_end=0x%lx\n", ar->stack.physical_end);
+ kdb_printf(" logical_start=0x%lx\n", ar->stack.logical_start);
+ kdb_printf(" logical_end=0x%lx\n", ar->stack.logical_end);
+ kdb_printf(" next=0x%lx\n", ar->stack.next);
+ kdb_printf(" id=%s\n", ar->stack.id);
+ kdb_printf(" set MDCOUNT %ld\n",
+ (ar->stack.physical_end - ar->stack.physical_start) /
+ KDB_WORD_SIZE);
+ kdb_printf(" mds " kdb_machreg_fmt0 "\n",
+ ar->stack.physical_start);
+ }
+}
+
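+/* Print one backtrace entry: the stack pointer, the symbolic instruction
+ * pointer and, when requested, the decoded argument values; arguments whose
+ * values could not be recovered are printed as "invalid".
+ */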
+static void
+bt_print_one(kdb_machreg_t rip, kdb_machreg_t rsp,
+ const struct kdb_activation_record *ar,
+ const kdb_symtab_t *symtab, int argcount)
+{
+ int btsymarg = 0;
+ int nosect = 0;
+
+ kdbgetintenv("BTSYMARG", &btsymarg);
+ kdbgetintenv("NOSECT", &nosect);
+
+ kdb_printf(kdb_machreg_fmt0, rsp);
+ kdb_symbol_print(rip, symtab,
+ KDB_SP_SPACEB|KDB_SP_VALUE);
+ if (argcount && ar->args) {
+ int i, argc = ar->args;
+ kdb_printf(" (");
+ if (argc > argcount)
+ argc = argcount;
+ for (i = 0; i < argc; i++) {
+ if (i)
+ kdb_printf(", ");
+ if (test_bit(i, ar->valid.bits))
+ kdb_printf("0x%lx", ar->arg[i]);
+ else
+ kdb_printf("invalid");
+ }
+ kdb_printf(")");
+ }
+ kdb_printf("\n");
+ if (symtab->sym_name) {
+ if (!nosect) {
+ kdb_printf(" %s",
+ symtab->mod_name);
+ if (symtab->sec_name && symtab->sec_start)
+ kdb_printf(" 0x%lx 0x%lx",
+ symtab->sec_start, symtab->sec_end);
+ kdb_printf(" 0x%lx 0x%lx\n",
+ symtab->sym_start, symtab->sym_end);
+ }
+ }
+ if (argcount && ar->args && btsymarg) {
+ int i, argc = ar->args;
+ kdb_symtab_t arg_symtab;
+ for (i = 0; i < argc; i++) {
+ kdb_machreg_t arg = ar->arg[i];
+ if (test_bit(i, ar->valid.bits) &&
+ kdbnearsym(arg, &arg_symtab)) {
+ kdb_printf(" ARG %2d ", i);
+ kdb_symbol_print(arg, &arg_symtab,
+ KDB_SP_DEFAULT|KDB_SP_NEWLINE);
+ }
+ }
+ }
+}
+
+static void
+kdba_bt_new_stack(struct kdb_activation_record *ar, kdb_machreg_t *rsp,
+ int *count, int *suppress)
+{
+ /* Nasty: save_args builds a partial pt_regs, with r15 through
+ * rbx not being filled in. It passes struct pt_regs* to do_IRQ (in
+ * rdi) but the stack pointer is not adjusted to account for r15
+ * through rbx. This has two effects :-
+ *
+ * (1) struct pt_regs on an external interrupt actually overlaps with
+ * the local stack area used by do_IRQ. Not only are r15-rbx
+ * undefined, the area that claims to hold their values can even
+ * change as the irq is processed.
+ *
+ * (2) The back stack pointer saved for the new frame is not pointing
+ * at pt_regs, it is pointing at rbx within the pt_regs passed to
+ * do_IRQ.
+ *
+ * There is nothing that I can do about (1) but I have to fix (2)
+ * because kdb backtrace looks for the "start" address of pt_regs as it
+ * walks back through the stacks. When switching from the interrupt
+ * stack to another stack, we have to assume that pt_regs has been
+ * seen and turn off backtrace suppression.
+ */
+ int probable_pt_regs = strcmp(ar->stack.id, "interrupt") == 0;
+ *rsp = ar->stack.next;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("new " RSP "=" kdb_machreg_fmt0 "\n", *rsp);
+ bb_actual_set_value(BBRG_RSP, *rsp);
+ kdba_get_stack_info(*rsp, -1, ar, NULL);
+ if (!ar->stack.physical_start) {
+ kdb_printf("+++ Cannot resolve next stack\n");
+ } else if (!*suppress) {
+ kdb_printf(" ======================= <%s>\n",
+ ar->stack.id);
+ ++*count;
+ }
+ if (probable_pt_regs)
+ *suppress = 0;
+}
+
+/*
+ * kdba_bt_stack
+ *
+ * Inputs:
+ * addr Address provided to 'bt' command, if any.
+ * argcount
+ * p Pointer to task for 'btp' command.
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * Ultimately all the bt* commands come through this routine. If
+ * old_style is 0 then it uses the basic block analysis to get an accurate
+ * backtrace with arguments, otherwise it falls back to the old method of
+ * printing anything on the stack that looks like a kernel address.
+ *
+ * Allowing for the stack data pushed by the hardware is tricky. We
+ * deduce the presence of hardware pushed data by looking for interrupt
+ * handlers, either by name or by the code that they contain. This
+ * information must be applied to the next function up the stack, because
+ * the hardware data is above the saved rip for the interrupted (next)
+ * function.
+ *
+ * To make things worse, the amount of data pushed is arch specific and
+ * may depend on the rsp for the next function, not the current function.
+ * The number of bytes pushed by hardware cannot be calculated until we
+ * are actually processing the stack for the interrupted function and have
+ * its rsp.
+ *
+ * It is also possible for an interrupt to occur in user space and for the
+ * interrupt handler to also be interrupted. Check the code selector
+ * whenever the previous function is an interrupt handler and stop
+ * backtracing if the interrupt was not in kernel space.
+ */
+
+static int
+kdba_bt_stack(kdb_machreg_t addr, int argcount, const struct task_struct *p,
+ int old_style)
+{
+ struct kdb_activation_record ar;
+ kdb_machreg_t rip = 0, rsp = 0, prev_rsp, cs;
+ kdb_symtab_t symtab;
+ int rip_at_rsp = 0, count = 0, btsp = 0, suppress,
+ interrupt_handler = 0, prev_interrupt_handler = 0, hardware_pushed,
+ prev_noret = 0;
+ struct pt_regs *regs = NULL;
+
+ kdbgetintenv("BTSP", &btsp);
+ suppress = !btsp;
+ memset(&ar, 0, sizeof(ar));
+ if (old_style)
+ kdb_printf("Using old style backtrace, unreliable with no arguments\n");
+
+ /*
+ * The caller may have supplied an address at which the stack traceback
+ * operation should begin. This address is assumed by this code to
+ * point to a return address on the stack to be traced back.
+ *
+ * Warning: type in the wrong address and you will get garbage in the
+ * backtrace.
+ */
+ if (addr) {
+ rsp = addr;
+ kdb_getword(&rip, rsp, sizeof(rip));
+ rip_at_rsp = 1;
+ suppress = 0;
+ kdba_get_stack_info(rsp, -1, &ar, NULL);
+ } else {
+ if (task_curr(p)) {
+ struct kdb_running_process *krp =
+ kdb_running_process + task_cpu(p);
+ kdb_machreg_t cs;
+ regs = krp->regs;
+ if (krp->seqno &&
+ krp->p == p &&
+ krp->seqno >= kdb_seqno - 1 &&
+ !KDB_NULL_REGS(regs)) {
+ /* valid saved state, continue processing */
+ } else {
+ kdb_printf
+ ("Process did not save state, cannot backtrace\n");
+ kdb_ps1(p);
+ return 0;
+ }
+ kdba_getregcontents(XCS, regs, &cs);
+ if ((cs & 0xffff) != __KERNEL_CS) {
+ kdb_printf("Stack is not in kernel space, backtrace not available\n");
+ return 0;
+ }
+ rip = krp->arch.ARCH_RIP;
+ rsp = krp->arch.ARCH_RSP;
+ kdba_get_stack_info(rsp, kdb_process_cpu(p), &ar, p);
+ } else {
+ /* Not on cpu, assume blocked. Blocked tasks do not
+ * have pt_regs. p->thread contains some data, alas
+ * what it contains differs between i386 and x86_64.
+ */
+ rip = kdba_bt_stack_rip(p);
+ rsp = p->thread.sp;
+ suppress = 0;
+ kdba_get_stack_info(rsp, -1, &ar, p);
+ }
+ }
+ if (!ar.stack.physical_start) {
+ kdb_printf(RSP "=0x%lx is not in a valid kernel stack, backtrace not available\n",
+ rsp);
+ return 0;
+ }
+ memset(&bb_actual, 0, sizeof(bb_actual));
+ bb_actual_set_value(BBRG_RSP, rsp);
+ bb_actual_set_valid(BBRG_RSP, 1);
+
+ kdb_printf(RSP "%*s" RIP "%*sFunction (args)\n",
+ 2*KDB_WORD_SIZE, " ",
+ 2*KDB_WORD_SIZE, " ");
+ if (ar.stack.next && !suppress)
+ kdb_printf(" ======================= <%s>\n",
+ ar.stack.id);
+
+ bb_cleanup();
+ /* Run through all the stacks */
+ while (ar.stack.physical_start) {
+ if (rip_at_rsp) {
+ rip = *(kdb_machreg_t *)rsp;
+ /* I wish that gcc were fixed to include a nop
+ * instruction after ATTRIB_NORET functions. The lack
+ * of a nop means that the return address points to the
+ * start of the next function, so fudge it to point to one
+ * byte earlier.
+ *
+ * No, we cannot just decrement all rip values.
+ * Sometimes an rip legally points to the start of a
+ * function, e.g. interrupted code or hand crafted
+ * assembler.
+ */
+ if (prev_noret) {
+ kdbnearsym(rip, &symtab);
+ if (rip == symtab.sym_start) {
+ --rip;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("\tprev_noret, " RIP
+ "=0x%lx\n", rip);
+ }
+ }
+ }
+ kdbnearsym(rip, &symtab);
+ if (old_style) {
+ if (__kernel_text_address(rip) && !suppress) {
+ bt_print_one(rip, rsp, &ar, &symtab, 0);
+ ++count;
+ }
+ if (rsp == (unsigned long)regs) {
+ if (ar.stack.next && suppress)
+ kdb_printf(" ======================= <%s>\n",
+ ar.stack.id);
+ ++count;
+ suppress = 0;
+ }
+ rsp += sizeof(rip);
+ rip_at_rsp = 1;
+ if (rsp >= ar.stack.logical_end) {
+ if (!ar.stack.next)
+ break;
+ kdba_bt_new_stack(&ar, &rsp, &count, &suppress);
+ rip_at_rsp = 0;
+ continue;
+ }
+ } else {
+ /* Start each analysis with no dynamic data from the
+ * previous kdb_bb() run.
+ */
+ bb_cleanup();
+ kdb_bb(rip);
+ if (bb_giveup)
+ break;
+ prev_interrupt_handler = interrupt_handler;
+ interrupt_handler = bb_interrupt_handler(rip);
+ prev_rsp = rsp;
+ if (rip_at_rsp) {
+ if (prev_interrupt_handler) {
+ cs = *((kdb_machreg_t *)rsp + 1) & 0xffff;
+ hardware_pushed =
+ bb_hardware_pushed_arch(rsp, &ar);
+ } else {
+ cs = __KERNEL_CS;
+ hardware_pushed = 0;
+ }
+ rsp += sizeof(rip) + hardware_pushed;
+ if (KDB_DEBUG(ARA))
+ kdb_printf("%s: " RSP " "
+ kdb_machreg_fmt0
+ " -> " kdb_machreg_fmt0
+ " hardware_pushed %d"
+ " prev_interrupt_handler %d"
+ " cs 0x%lx\n",
+ __FUNCTION__,
+ prev_rsp,
+ rsp,
+ hardware_pushed,
+ prev_interrupt_handler,
+ cs);
+ if (rsp >= ar.stack.logical_end &&
+ ar.stack.next) {
+ kdba_bt_new_stack(&ar, &rsp, &count,
+ &suppress);
+ rip_at_rsp = 0;
+ continue;
+ }
+ bb_actual_set_value(BBRG_RSP, rsp);
+ } else {
+ cs = __KERNEL_CS;
+ }
+ rip_at_rsp = 1;
+ bb_actual_rollback(&ar);
+ if (bb_giveup)
+ break;
+ if (bb_actual_value(BBRG_RSP) < rsp) {
+ kdb_printf("%s: " RSP " is going backwards, "
+ kdb_machreg_fmt0 " -> "
+ kdb_machreg_fmt0 "\n",
+ __FUNCTION__,
+ rsp,
+ bb_actual_value(BBRG_RSP));
+ bb_giveup = 1;
+ break;
+ }
+ bb_arguments(&ar);
+ if (!suppress) {
+ bt_print_one(rip, prev_rsp, &ar, &symtab, argcount);
+ ++count;
+ }
+ /* Functions that terminate the backtrace */
+ if (strcmp(bb_func_name, "cpu_idle") == 0 ||
+ strcmp(bb_func_name, "child_rip") == 0)
+ break;
+ if (rsp >= ar.stack.logical_end &&
+ !ar.stack.next)
+ break;
+ if (rsp <= (unsigned long)regs &&
+ bb_actual_value(BBRG_RSP) > (unsigned long)regs) {
+ if (ar.stack.next && suppress)
+ kdb_printf(" ======================= <%s>\n",
+ ar.stack.id);
+ ++count;
+ suppress = 0;
+ }
+ if (cs != __KERNEL_CS) {
+ kdb_printf("Reached user space\n");
+ break;
+ }
+ rsp = bb_actual_value(BBRG_RSP);
+ }
+ prev_noret = bb_noret(bb_func_name);
+ if (count > 200)
+ break;
+ }
+ if (bb_giveup)
+ return 1;
+ bb_cleanup();
+ kdbnearsym_cleanup();
+
+ if (count > 200) {
+ kdb_printf("bt truncated, count limit reached\n");
+ return 1;
+ } else if (suppress) {
+ kdb_printf
+ ("bt did not find pt_regs - no trace produced. Suggest 'set BTSP 1'\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * kdba_bt_address
+ *
+ * Do a backtrace starting at a specified stack address. Use this if the
+ * heuristics get the stack decode wrong.
+ *
+ * Inputs:
+ * addr Address provided to 'bt' command.
+ * argcount
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * mds %rsp comes in handy when examining the stack to do a manual
+ * traceback.
+ */
+
+int kdba_bt_address(kdb_machreg_t addr, int argcount)
+{
+ int ret;
+ kdba_id_init(&kdb_di); /* kdb_bb needs this done once */
+ ret = kdba_bt_stack(addr, argcount, NULL, 0);
+ if (ret == 1)
+ ret = kdba_bt_stack(addr, argcount, NULL, 1);
+ return ret;
+}
+
+/*
+ * kdba_bt_process
+ *
+ * Do a backtrace for a specified process.
+ *
+ * Inputs:
+ * p Struct task pointer extracted by 'bt' command.
+ * argcount
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ */
+
+int kdba_bt_process(const struct task_struct *p, int argcount)
+{
+ int ret;
+ kdba_id_init(&kdb_di); /* kdb_bb needs this done once */
+ ret = kdba_bt_stack(0, argcount, p, 0);
+ if (ret == 1)
+ ret = kdba_bt_stack(0, argcount, p, 1);
+ return ret;
+}
+
+static int __init kdba_bt_x86_init(void)
+{
+ int i, c, cp = -1;
+ struct bb_name_state *r;
+
+ kdb_register_repeat("bb1", kdb_bb1, "<vaddr>", "Analyse one basic block", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bb_all", kdb_bb_all, "", "Backtrace check on all built in functions", 0, KDB_REPEAT_NONE);
+
+ /* Split the opcode usage table by the first letter of each set of
+ * opcodes, for faster mapping of opcode to its operand usage.
+ */
+ for (i = 0; i < ARRAY_SIZE(bb_opcode_usage_all); ++i) {
+ c = bb_opcode_usage_all[i].opcode[0] - 'a';
+ if (c != cp) {
+ cp = c;
+ bb_opcode_usage[c].opcode = bb_opcode_usage_all + i;
+ }
+ ++bb_opcode_usage[c].size;
+ }
+
+ bb_common_interrupt = kallsyms_lookup_name("common_interrupt");
+ bb_error_entry = kallsyms_lookup_name("error_entry");
+ bb_ret_from_intr = kallsyms_lookup_name("ret_from_intr");
+ bb_thread_return = kallsyms_lookup_name("thread_return");
+ bb_sync_regs = kallsyms_lookup_name("sync_regs");
+ bb_save_v86_state = kallsyms_lookup_name("save_v86_state");
+ bb__sched_text_start = kallsyms_lookup_name("__sched_text_start");
+ bb__sched_text_end = kallsyms_lookup_name("__sched_text_end");
+ bb_save_args = kallsyms_lookup_name("save_args");
+ bb_save_rest = kallsyms_lookup_name("save_rest");
+ bb_save_paranoid = kallsyms_lookup_name("save_paranoid");
+ for (i = 0, r = bb_special_cases;
+ i < ARRAY_SIZE(bb_special_cases);
+ ++i, ++r) {
+ r->address = kallsyms_lookup_name(r->name);
+ }
+
+#ifdef CONFIG_4KSTACKS
+ kdba_hardirq_ctx = (struct thread_info **)kallsyms_lookup_name("hardirq_ctx");
+ kdba_softirq_ctx = (struct thread_info **)kallsyms_lookup_name("softirq_ctx");
+#endif /* CONFIG_4KSTACKS */
+
+ return 0;
+}
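/*
 * A minimal sketch of how the per-letter index built in kdba_bt_x86_init()
 * is consumed, assuming an exact-match lookup; the field layout follows the
 * init loop above, but the matching done by the real kdb_bb() analyser may
 * differ (e.g. it may strip operand-size suffixes):
 *
 *	int i, c = name[0] - 'a';
 *
 *	for (i = 0; i < bb_opcode_usage[c].size; ++i)
 *		if (strcmp(bb_opcode_usage[c].opcode[i].opcode, name) == 0)
 *			return bb_opcode_usage[c].opcode + i;
 *	return NULL;
 */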
+
+static void __exit kdba_bt_x86_exit(void)
+{
+ kdb_unregister("bb1");
+ kdb_unregister("bb_all");
+}
+
+module_init(kdba_bt_x86_init)
+module_exit(kdba_bt_x86_exit)
--- /dev/null
+/*
+ * Kernel Debugger Architecture Independent Support Functions
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2008 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+#include <linux/string.h>
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/ptrace.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/cpumask.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+
+static kdb_machreg_t
+kdba_getcr(int regnum)
+{
+ kdb_machreg_t contents = 0;
+ switch(regnum) {
+ case 0:
+ __asm__ (_ASM_MOV " %%cr0,%0\n\t":"=r"(contents));
+ break;
+ case 1:
+ break;
+ case 2:
+ __asm__ (_ASM_MOV " %%cr2,%0\n\t":"=r"(contents));
+ break;
+ case 3:
+ __asm__ (_ASM_MOV " %%cr3,%0\n\t":"=r"(contents));
+ break;
+ case 4:
+ __asm__ (_ASM_MOV " %%cr4,%0\n\t":"=r"(contents));
+ break;
+ default:
+ break;
+ }
+
+ return contents;
+}
+
+void
+kdba_putdr(int regnum, kdb_machreg_t contents)
+{
+ switch(regnum) {
+ case 0:
+ __asm__ (_ASM_MOV " %0,%%db0\n\t"::"r"(contents));
+ break;
+ case 1:
+ __asm__ (_ASM_MOV " %0,%%db1\n\t"::"r"(contents));
+ break;
+ case 2:
+ __asm__ (_ASM_MOV " %0,%%db2\n\t"::"r"(contents));
+ break;
+ case 3:
+ __asm__ (_ASM_MOV " %0,%%db3\n\t"::"r"(contents));
+ break;
+ case 4:
+ case 5:
+ break;
+ case 6:
+ __asm__ (_ASM_MOV " %0,%%db6\n\t"::"r"(contents));
+ break;
+ case 7:
+ __asm__ (_ASM_MOV " %0,%%db7\n\t"::"r"(contents));
+ break;
+ default:
+ break;
+ }
+}
+
+kdb_machreg_t
+kdba_getdr(int regnum)
+{
+ kdb_machreg_t contents = 0;
+ switch(regnum) {
+ case 0:
+ __asm__ (_ASM_MOV " %%db0,%0\n\t":"=r"(contents));
+ break;
+ case 1:
+ __asm__ (_ASM_MOV " %%db1,%0\n\t":"=r"(contents));
+ break;
+ case 2:
+ __asm__ (_ASM_MOV " %%db2,%0\n\t":"=r"(contents));
+ break;
+ case 3:
+ __asm__ (_ASM_MOV " %%db3,%0\n\t":"=r"(contents));
+ break;
+ case 4:
+ case 5:
+ break;
+ case 6:
+ __asm__ (_ASM_MOV " %%db6,%0\n\t":"=r"(contents));
+ break;
+ case 7:
+ __asm__ (_ASM_MOV " %%db7,%0\n\t":"=r"(contents));
+ break;
+ default:
+ break;
+ }
+
+ return contents;
+}
+
+kdb_machreg_t
+kdba_getdr6(void)
+{
+ return kdba_getdr(6);
+}
+
+kdb_machreg_t
+kdba_getdr7(void)
+{
+ return kdba_getdr(7);
+}
+
+void
+kdba_putdr6(kdb_machreg_t contents)
+{
+ kdba_putdr(6, contents);
+}
+
+static void
+kdba_putdr7(kdb_machreg_t contents)
+{
+ kdba_putdr(7, contents);
+}
+
+void
+kdba_installdbreg(kdb_bp_t *bp)
+{
+ int cpu = smp_processor_id();
+
+ kdb_machreg_t dr7;
+
+ dr7 = kdba_getdr7();
+
+ kdba_putdr(bp->bp_hard[cpu]->bph_reg, bp->bp_addr);
+
+ dr7 |= DR7_GE;
+ if (cpu_has_de)
+ set_in_cr4(X86_CR4_DE);
+
+ switch (bp->bp_hard[cpu]->bph_reg){
+ case 0:
+ DR7_RW0SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN0SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G0SET(dr7);
+ break;
+ case 1:
+ DR7_RW1SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN1SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G1SET(dr7);
+ break;
+ case 2:
+ DR7_RW2SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN2SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G2SET(dr7);
+ break;
+ case 3:
+ DR7_RW3SET(dr7,bp->bp_hard[cpu]->bph_mode);
+ DR7_LEN3SET(dr7,bp->bp_hard[cpu]->bph_length);
+ DR7_G3SET(dr7);
+ break;
+ default:
+ kdb_printf("kdb: Bad debug register!! %ld\n",
+ bp->bp_hard[cpu]->bph_reg);
+ break;
+ }
+
+ kdba_putdr7(dr7);
+ return;
+}
+
+void
+kdba_removedbreg(kdb_bp_t *bp)
+{
+ int regnum;
+ kdb_machreg_t dr7;
+ int cpu = smp_processor_id();
+
+ if (!bp->bp_hard[cpu])
+ return;
+
+ regnum = bp->bp_hard[cpu]->bph_reg;
+
+ dr7 = kdba_getdr7();
+
+ kdba_putdr(regnum, 0);
+
+ switch (regnum) {
+ case 0:
+ DR7_G0CLR(dr7);
+ DR7_L0CLR(dr7);
+ break;
+ case 1:
+ DR7_G1CLR(dr7);
+ DR7_L1CLR(dr7);
+ break;
+ case 2:
+ DR7_G2CLR(dr7);
+ DR7_L2CLR(dr7);
+ break;
+ case 3:
+ DR7_G3CLR(dr7);
+ DR7_L3CLR(dr7);
+ break;
+ default:
+ kdb_printf("kdb: Bad debug register!! %d\n", regnum);
+ break;
+ }
+
+ kdba_putdr7(dr7);
+}
+
+struct kdbregs {
+ char *reg_name;
+ size_t reg_offset;
+};
+
+static struct kdbregs dbreglist[] = {
+ { "dr0", 0 },
+ { "dr1", 1 },
+ { "dr2", 2 },
+ { "dr3", 3 },
+ { "dr6", 6 },
+ { "dr7", 7 },
+};
+
+static const int ndbreglist = sizeof(dbreglist) / sizeof(struct kdbregs);
+
+#ifdef CONFIG_X86_32
+static struct kdbregs kdbreglist[] = {
+ { "ax", offsetof(struct pt_regs, ax) },
+ { "bx", offsetof(struct pt_regs, bx) },
+ { "cx", offsetof(struct pt_regs, cx) },
+ { "dx", offsetof(struct pt_regs, dx) },
+
+ { "si", offsetof(struct pt_regs, si) },
+ { "di", offsetof(struct pt_regs, di) },
+ { "sp", offsetof(struct pt_regs, sp) },
+ { "ip", offsetof(struct pt_regs, ip) },
+
+ { "bp", offsetof(struct pt_regs, bp) },
+ { "ss", offsetof(struct pt_regs, ss) },
+ { "cs", offsetof(struct pt_regs, cs) },
+ { "flags", offsetof(struct pt_regs, flags) },
+
+ { "ds", offsetof(struct pt_regs, ds) },
+ { "es", offsetof(struct pt_regs, es) },
+ { "origax", offsetof(struct pt_regs, orig_ax) },
+
+};
+
+static const int nkdbreglist = sizeof(kdbreglist) / sizeof(struct kdbregs);
+
+
+/*
+ * kdba_getregcontents
+ *
+ * Return the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * The following pseudo register names are supported:
+ * &regs - Prints address of exception frame
+ * kesp - Prints kernel stack pointer at time of fault
+ * cesp - Prints current kernel stack pointer, inside kdb
+ * ceflags - Prints current flags, inside kdb
+ * %<regname> - Uses the value of the registers at the
+ * last time the user process entered kernel
+ * mode, instead of the registers at the time
+ * kdb was entered.
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * Outputs:
+ * *contents Pointer to unsigned long to receive register contents
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ * If kdb was entered via an interrupt from the kernel itself then
+ * ss and sp are *not* on the stack.
+ */
+
+int
+kdba_getregcontents(const char *regname,
+ struct pt_regs *regs,
+ kdb_machreg_t *contents)
+{
+ int i;
+
+ if (strcmp(regname, "cesp") == 0) {
+ asm volatile("movl %%esp,%0":"=m" (*contents));
+ return 0;
+ }
+
+ if (strcmp(regname, "ceflags") == 0) {
+ unsigned long flags;
+ local_save_flags(flags);
+ *contents = flags;
+ return 0;
+ }
+
+ if (regname[0] == '%') {
+ /* User registers: %%e[a-c]x, etc */
+ regname++;
+ regs = (struct pt_regs *)
+ (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ *contents = kdba_getdr(dbreglist[i].reg_offset);
+ return 0;
+ }
+
+ if (!regs) {
+ kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
+ return KDB_BADREG;
+ }
+
+ if (strcmp(regname, "&regs") == 0) {
+ *contents = (unsigned long)regs;
+ return 0;
+ }
+
+ if (strcmp(regname, "kesp") == 0) {
+ *contents = (unsigned long)regs + sizeof(struct pt_regs);
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* sp and ss are not on stack */
+ *contents -= 2*4;
+ }
+ return 0;
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* No cpl switch, sp and ss are not on stack */
+ if (strcmp(kdbreglist[i].reg_name, "sp") == 0) {
+ *contents = (kdb_machreg_t)regs +
+ sizeof(struct pt_regs) - 2*4;
+ return(0);
+ }
+ if (strcmp(kdbreglist[i].reg_name, "xss") == 0) {
+ asm volatile(
+ "pushl %%ss\n"
+ "popl %0\n"
+ :"=m" (*contents));
+ return(0);
+ }
+ }
+ *contents = *(unsigned long *)((unsigned long)regs +
+ kdbreglist[i].reg_offset);
+ return(0);
+ }
+
+ return KDB_BADREG;
+}
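/*
 * A minimal usage sketch for kdba_getregcontents(), assuming a caller that
 * already holds a valid pt_regs pointer; machine register names from
 * kdbreglist and the pseudo registers documented above are looked up the
 * same way:
 *
 *	kdb_machreg_t ip, frame;
 *
 *	if (kdba_getregcontents("ip", regs, &ip) == 0)
 *		kdb_printf("ip = 0x%lx\n", ip);
 *	if (kdba_getregcontents("&regs", regs, &frame) == 0)
 *		kdb_printf("exception frame at 0x%lx\n", frame);
 */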
+
+/*
+ * kdba_setregcontents
+ *
+ * Set the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * Supports modification of user-mode registers via
+ * %<register-name>
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * contents Unsigned long containing new register contents
+ * Outputs:
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+int
+kdba_setregcontents(const char *regname,
+ struct pt_regs *regs,
+ unsigned long contents)
+{
+ int i;
+
+ if (regname[0] == '%') {
+ regname++;
+ regs = (struct pt_regs *)
+ (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ kdba_putdr(dbreglist[i].reg_offset, contents);
+ return 0;
+ }
+
+ if (!regs) {
+ kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
+ return KDB_BADREG;
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ *(unsigned long *)((unsigned long)regs
+ + kdbreglist[i].reg_offset) = contents;
+ return 0;
+ }
+
+ return KDB_BADREG;
+}
+
+/*
+ * kdba_pt_regs
+ *
+ * Format a struct pt_regs
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * If no address is supplied, it uses the last irq pt_regs.
+ */
+
+static int
+kdba_pt_regs(int argc, const char **argv)
+{
+ int diag;
+ kdb_machreg_t addr;
+ long offset = 0;
+ int nextarg;
+ struct pt_regs *p;
+ static const char *fmt = " %-11.11s 0x%lx\n";
+
+ if (argc == 0) {
+ addr = (kdb_machreg_t) get_irq_regs();
+ } else if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ p = (struct pt_regs *) addr;
+ kdb_printf("struct pt_regs 0x%p-0x%p\n", p, (unsigned char *)p + sizeof(*p) - 1);
+ kdb_print_nameval("bx", p->bx);
+ kdb_print_nameval("cx", p->cx);
+ kdb_print_nameval("dx", p->dx);
+ kdb_print_nameval("si", p->si);
+ kdb_print_nameval("di", p->di);
+ kdb_print_nameval("bp", p->bp);
+ kdb_print_nameval("ax", p->ax);
+ kdb_printf(fmt, "ds", p->ds);
+ kdb_printf(fmt, "es", p->es);
+ kdb_print_nameval("orig_ax", p->orig_ax);
+ kdb_print_nameval("ip", p->ip);
+ kdb_printf(fmt, "cs", p->cs);
+ kdb_printf(fmt, "flags", p->flags);
+ kdb_printf(fmt, "sp", p->sp);
+ kdb_printf(fmt, "ss", p->ss);
+ return 0;
+}
+
+#else /* CONFIG_X86_32 */
+
+static struct kdbregs kdbreglist[] = {
+ { "r15", offsetof(struct pt_regs, r15) },
+ { "r14", offsetof(struct pt_regs, r14) },
+ { "r13", offsetof(struct pt_regs, r13) },
+ { "r12", offsetof(struct pt_regs, r12) },
+ { "bp", offsetof(struct pt_regs, bp) },
+ { "bx", offsetof(struct pt_regs, bx) },
+ { "r11", offsetof(struct pt_regs, r11) },
+ { "r10", offsetof(struct pt_regs, r10) },
+ { "r9", offsetof(struct pt_regs, r9) },
+ { "r8", offsetof(struct pt_regs, r8) },
+ { "ax", offsetof(struct pt_regs, ax) },
+ { "cx", offsetof(struct pt_regs, cx) },
+ { "dx", offsetof(struct pt_regs, dx) },
+ { "si", offsetof(struct pt_regs, si) },
+ { "di", offsetof(struct pt_regs, di) },
+ { "orig_ax", offsetof(struct pt_regs, orig_ax) },
+ { "ip", offsetof(struct pt_regs, ip) },
+ { "cs", offsetof(struct pt_regs, cs) },
+ { "flags", offsetof(struct pt_regs, flags) },
+ { "sp", offsetof(struct pt_regs, sp) },
+ { "ss", offsetof(struct pt_regs, ss) },
+};
+
+static const int nkdbreglist = sizeof(kdbreglist) / sizeof(struct kdbregs);
+
+
+/*
+ * kdba_getregcontents
+ *
+ * Return the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * The following pseudo register names are supported:
+ * &regs - Prints address of exception frame
+ * krsp - Prints kernel stack pointer at time of fault
+ * crsp - Prints current kernel stack pointer, inside kdb
+ * ceflags - Prints current flags, inside kdb
+ * %<regname> - Uses the value of the registers at the
+ * last time the user process entered kernel
+ * mode, instead of the registers at the time
+ * kdb was entered.
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * Outputs:
+ * *contents Pointer to unsigned long to receive register contents
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ * If kdb was entered via an interrupt from the kernel itself then
+ * ss and sp are *not* on the stack.
+ */
+int
+kdba_getregcontents(const char *regname,
+ struct pt_regs *regs,
+ kdb_machreg_t *contents)
+{
+ int i;
+
+ if (strcmp(regname, "&regs") == 0) {
+ *contents = (unsigned long)regs;
+ return 0;
+ }
+
+ if (strcmp(regname, "krsp") == 0) {
+ *contents = (unsigned long)regs + sizeof(struct pt_regs);
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* sp and ss are not on stack */
+ *contents -= 2*4;
+ }
+ return 0;
+ }
+
+ if (strcmp(regname, "crsp") == 0) {
+ asm volatile("movq %%rsp,%0":"=m" (*contents));
+ return 0;
+ }
+
+ if (strcmp(regname, "ceflags") == 0) {
+ unsigned long flags;
+ local_save_flags(flags);
+ *contents = flags;
+ return 0;
+ }
+
+ if (regname[0] == '%') {
+ /* User registers: %%r[a-c]x, etc */
+ regname++;
+ regs = (struct pt_regs *)
+ (current->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ if ((regs->cs & 0xffff) == __KERNEL_CS) {
+ /* No cpl switch, sp is not on stack */
+ if (strcmp(kdbreglist[i].reg_name, "sp") == 0) {
+ *contents = (kdb_machreg_t)regs +
+ sizeof(struct pt_regs) - 2*8;
+ return(0);
+ }
+#if 0 /* FIXME */
+ if (strcmp(kdbreglist[i].reg_name, "ss") == 0) {
+ kdb_machreg_t r;
+
+ r = (kdb_machreg_t)regs +
+ sizeof(struct pt_regs) - 2*8;
+ *contents = (kdb_machreg_t)SS(r); /* XXX */
+ return(0);
+ }
+#endif
+ }
+ *contents = *(unsigned long *)((unsigned long)regs +
+ kdbreglist[i].reg_offset);
+ return(0);
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ *contents = kdba_getdr(dbreglist[i].reg_offset);
+ return 0;
+ }
+ return KDB_BADREG;
+}
+
+/*
+ * kdba_setregcontents
+ *
+ * Set the contents of the register specified by the
+ * input string argument. Return an error if the string
+ * does not match a machine register.
+ *
+ * Supports modification of user-mode registers via
+ * %<register-name>
+ *
+ * Parameters:
+ * regname Pointer to string naming register
+ * regs Pointer to structure containing registers.
+ * contents Unsigned long containing new register contents
+ * Outputs:
+ * Returns:
+ * 0 Success
+ * KDB_BADREG Invalid register name
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+int
+kdba_setregcontents(const char *regname,
+ struct pt_regs *regs,
+ unsigned long contents)
+{
+ int i;
+
+ if (regname[0] == '%') {
+ regname++;
+ regs = (struct pt_regs *)
+ (current->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ for (i=0; i<nkdbreglist; i++) {
+ if (strnicmp(kdbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < nkdbreglist)
+ && (strlen(kdbreglist[i].reg_name) == strlen(regname))) {
+ *(unsigned long *)((unsigned long)regs
+ + kdbreglist[i].reg_offset) = contents;
+ return 0;
+ }
+
+ for (i=0; i<ndbreglist; i++) {
+ if (strnicmp(dbreglist[i].reg_name,
+ regname,
+ strlen(regname)) == 0)
+ break;
+ }
+
+ if ((i < ndbreglist)
+ && (strlen(dbreglist[i].reg_name) == strlen(regname))) {
+ kdba_putdr(dbreglist[i].reg_offset, contents);
+ return 0;
+ }
+
+ return KDB_BADREG;
+}
+
+/*
+ * kdba_pt_regs
+ *
+ * Format a struct pt_regs
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * If no address is supplied, it uses the last irq pt_regs.
+ */
+
+static int
+kdba_pt_regs(int argc, const char **argv)
+{
+ int diag;
+ kdb_machreg_t addr;
+ long offset = 0;
+ int nextarg;
+ struct pt_regs *p;
+ static const char *fmt = " %-11.11s 0x%lx\n";
+ static int first_time = 1;
+
+ if (argc == 0) {
+ addr = (kdb_machreg_t) get_irq_regs();
+ } else if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ p = (struct pt_regs *) addr;
+ if (first_time) {
+ first_time = 0;
+ kdb_printf("\n+++ Warning: x86_64 pt_regs are not always "
+ "completely defined, r15-bx may be invalid\n\n");
+ }
+ kdb_printf("struct pt_regs 0x%p-0x%p\n", p, (unsigned char *)p + sizeof(*p) - 1);
+ kdb_print_nameval("r15", p->r15);
+ kdb_print_nameval("r14", p->r14);
+ kdb_print_nameval("r13", p->r13);
+ kdb_print_nameval("r12", p->r12);
+ kdb_print_nameval("bp", p->bp);
+ kdb_print_nameval("bx", p->bx);
+ kdb_print_nameval("r11", p->r11);
+ kdb_print_nameval("r10", p->r10);
+ kdb_print_nameval("r9", p->r9);
+ kdb_print_nameval("r8", p->r8);
+ kdb_print_nameval("ax", p->ax);
+ kdb_print_nameval("cx", p->cx);
+ kdb_print_nameval("dx", p->dx);
+ kdb_print_nameval("si", p->si);
+ kdb_print_nameval("di", p->di);
+ kdb_print_nameval("orig_ax", p->orig_ax);
+ kdb_print_nameval("ip", p->ip);
+ kdb_printf(fmt, "cs", p->cs);
+ kdb_printf(fmt, "flags", p->flags);
+ kdb_printf(fmt, "sp", p->sp);
+ kdb_printf(fmt, "ss", p->ss);
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+/*
+ * kdba_dumpregs
+ *
+ * Dump the specified register set to the display.
+ *
+ * Parameters:
+ * regs Pointer to structure containing registers.
+ * type Character string identifying register set to dump
+ * extra string further identifying register (optional)
+ * Outputs:
+ * Returns:
+ * 0 Success
+ * Locking:
+ * None.
+ * Remarks:
+ * This function will dump the general register set if the type
+ * argument is NULL (struct pt_regs). The alternate register
+ * set types supported by this function are:
+ *
+ * d Debug registers
+ * c Control registers
+ * u User registers at most recent entry to kernel
+ * for the process currently selected with the "pid" command.
+ * The following is not yet implemented:
+ * r Memory Type Range Registers (extra defines register)
+ *
+ * MSRs on i386/x86_64 are handled by the rdmsr/wrmsr commands.
+ */
+
+int
+kdba_dumpregs(struct pt_regs *regs,
+ const char *type,
+ const char *extra)
+{
+ int i;
+ int count = 0;
+
+ if (type
+ && (type[0] == 'u')) {
+ type = NULL;
+ regs = (struct pt_regs *)
+ (kdb_current_task->thread.sp0 - sizeof(struct pt_regs));
+ }
+
+ if (type == NULL) {
+ struct kdbregs *rlp;
+ kdb_machreg_t contents;
+
+ if (!regs) {
+ kdb_printf("%s: pt_regs not available, use bt* or pid to select a different task\n", __FUNCTION__);
+ return KDB_BADREG;
+ }
+
+#ifdef CONFIG_X86_32
+ for (i=0, rlp=kdbreglist; i<nkdbreglist; i++,rlp++) {
+ kdb_printf("%s = ", rlp->reg_name);
+ kdba_getregcontents(rlp->reg_name, regs, &contents);
+ kdb_printf("0x%08lx ", contents);
+ if ((++count % 4) == 0)
+ kdb_printf("\n");
+ }
+#else
+ for (i=0, rlp=kdbreglist; i<nkdbreglist; i++,rlp++) {
+ kdb_printf("%8s = ", rlp->reg_name);
+ kdba_getregcontents(rlp->reg_name, regs, &contents);
+ kdb_printf("0x%016lx ", contents);
+ if ((++count % 2) == 0)
+ kdb_printf("\n");
+ }
+#endif
+
+ kdb_printf("®s = 0x%p\n", regs);
+
+ return 0;
+ }
+
+ switch (type[0]) {
+ case 'd':
+ {
+ unsigned long dr[8];
+
+ for(i=0; i<8; i++) {
+ if ((i == 4) || (i == 5)) continue;
+ dr[i] = kdba_getdr(i);
+ }
+ kdb_printf("dr0 = 0x%08lx dr1 = 0x%08lx dr2 = 0x%08lx dr3 = 0x%08lx\n",
+ dr[0], dr[1], dr[2], dr[3]);
+ kdb_printf("dr6 = 0x%08lx dr7 = 0x%08lx\n",
+ dr[6], dr[7]);
+ return 0;
+ }
+ case 'c':
+ {
+ unsigned long cr[5];
+
+ for (i=0; i<5; i++) {
+ cr[i] = kdba_getcr(i);
+ }
+ kdb_printf("cr0 = 0x%08lx cr1 = 0x%08lx cr2 = 0x%08lx cr3 = 0x%08lx\ncr4 = 0x%08lx\n",
+ cr[0], cr[1], cr[2], cr[3], cr[4]);
+ return 0;
+ }
+ case 'r':
+ break;
+ default:
+ return KDB_BADREG;
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
+EXPORT_SYMBOL(kdba_dumpregs);
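/*
 * A minimal sketch of the type selector handled above, assuming a valid
 * pt_regs pointer where one is required:
 *
 *	kdba_dumpregs(regs, NULL, NULL);    general registers from regs
 *	kdba_dumpregs(regs, "d", NULL);     dr0-dr3, dr6 and dr7
 *	kdba_dumpregs(regs, "c", NULL);     cr0-cr4
 *	kdba_dumpregs(NULL, "u", NULL);     user registers of kdb_current_task
 */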
+
+kdb_machreg_t
+kdba_getpc(struct pt_regs *regs)
+{
+ return regs ? regs->ip : 0;
+}
+
+int
+kdba_setpc(struct pt_regs *regs, kdb_machreg_t newpc)
+{
+ if (KDB_NULL_REGS(regs))
+ return KDB_BADREG;
+ regs->ip = newpc;
+ KDB_STATE_SET(IP_ADJUSTED);
+ return 0;
+}
+
+/*
+ * kdba_main_loop
+ *
+ * Do any architecture specific set up before entering the main kdb loop.
+ * The primary function of this routine is to make all processes look the
+ * same to kdb; kdb must be able to list a process without worrying whether
+ * the process is running or blocked, so make all processes look as though
+ * they are blocked.
+ *
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * reason2 kdb's current reason code. Initially error but can change
+ * according to kdb state.
+ * db_result Result from break or debug point.
+ * regs The exception frame at time of fault/breakpoint. If reason
+ * is SILENT or CPU_UP then regs is NULL, otherwise it should
+ * always be valid.
+ * Returns:
+ * 0 KDB was invoked for an event for which it was not responsible
+ * 1 KDB handled the event for which it was invoked.
+ * Outputs:
+ * Sets ip and sp in current->thread.
+ * Locking:
+ * None.
+ * Remarks:
+ * none.
+ */
+
+int
+kdba_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
+ kdb_dbtrap_t db_result, struct pt_regs *regs)
+{
+ int ret;
+
+#ifdef CONFIG_X86_64
+ if (regs)
+ kdba_getregcontents("sp", regs, &(current->thread.sp));
+#endif
+ ret = kdb_save_running(regs, reason, reason2, error, db_result);
+ kdb_unsave_running(regs);
+ return ret;
+}
+
+void
+kdba_disableint(kdb_intstate_t *state)
+{
+ unsigned long *fp = (unsigned long *)state;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ *fp = flags;
+}
+
+void
+kdba_restoreint(kdb_intstate_t *state)
+{
+ unsigned long flags = *(unsigned long *)state;
+ local_irq_restore(flags);
+}
+
+void
+kdba_setsinglestep(struct pt_regs *regs)
+{
+ if (KDB_NULL_REGS(regs))
+ return;
+ if (regs->flags & X86_EFLAGS_IF)
+ KDB_STATE_SET(A_IF);
+ else
+ KDB_STATE_CLEAR(A_IF);
+ regs->flags = (regs->flags | X86_EFLAGS_TF) & ~X86_EFLAGS_IF;
+}
+
+void
+kdba_clearsinglestep(struct pt_regs *regs)
+{
+ if (KDB_NULL_REGS(regs))
+ return;
+ if (KDB_STATE(A_IF))
+ regs->flags |= X86_EFLAGS_IF;
+ else
+ regs->flags &= ~X86_EFLAGS_IF;
+}
+
+#ifdef CONFIG_X86_32
+int asmlinkage
+kdba_setjmp(kdb_jmp_buf *jb)
+{
+#ifdef CONFIG_FRAME_POINTER
+ __asm__ ("movl 8(%esp), %eax\n\t"
+ "movl %ebx, 0(%eax)\n\t"
+ "movl %esi, 4(%eax)\n\t"
+ "movl %edi, 8(%eax)\n\t"
+ "movl (%esp), %ecx\n\t"
+ "movl %ecx, 12(%eax)\n\t"
+ "leal 8(%esp), %ecx\n\t"
+ "movl %ecx, 16(%eax)\n\t"
+ "movl 4(%esp), %ecx\n\t"
+ "movl %ecx, 20(%eax)\n\t");
+#else /* CONFIG_FRAME_POINTER */
+ __asm__ ("movl 4(%esp), %eax\n\t"
+ "movl %ebx, 0(%eax)\n\t"
+ "movl %esi, 4(%eax)\n\t"
+ "movl %edi, 8(%eax)\n\t"
+ "movl %ebp, 12(%eax)\n\t"
+ "leal 4(%esp), %ecx\n\t"
+ "movl %ecx, 16(%eax)\n\t"
+ "movl 0(%esp), %ecx\n\t"
+ "movl %ecx, 20(%eax)\n\t");
+#endif /* CONFIG_FRAME_POINTER */
+ return 0;
+}
+
+void asmlinkage
+kdba_longjmp(kdb_jmp_buf *jb, int reason)
+{
+#ifdef CONFIG_FRAME_POINTER
+ __asm__("movl 8(%esp), %ecx\n\t"
+ "movl 12(%esp), %eax\n\t"
+ "movl 20(%ecx), %edx\n\t"
+ "movl 0(%ecx), %ebx\n\t"
+ "movl 4(%ecx), %esi\n\t"
+ "movl 8(%ecx), %edi\n\t"
+ "movl 12(%ecx), %ebp\n\t"
+ "movl 16(%ecx), %esp\n\t"
+ "jmp *%edx\n");
+#else /* CONFIG_FRAME_POINTER */
+ __asm__("movl 4(%esp), %ecx\n\t"
+ "movl 8(%esp), %eax\n\t"
+ "movl 20(%ecx), %edx\n\t"
+ "movl 0(%ecx), %ebx\n\t"
+ "movl 4(%ecx), %esi\n\t"
+ "movl 8(%ecx), %edi\n\t"
+ "movl 12(%ecx), %ebp\n\t"
+ "movl 16(%ecx), %esp\n\t"
+ "jmp *%edx\n");
+#endif /* CONFIG_FRAME_POINTER */
+}
+
+#else /* CONFIG_X86_32 */
+
+int asmlinkage
+kdba_setjmp(kdb_jmp_buf *jb)
+{
+#ifdef CONFIG_FRAME_POINTER
+ __asm__ __volatile__
+ ("movq %%rbx, (0*8)(%%rdi);"
+ "movq %%rcx, (1*8)(%%rdi);"
+ "movq %%r12, (2*8)(%%rdi);"
+ "movq %%r13, (3*8)(%%rdi);"
+ "movq %%r14, (4*8)(%%rdi);"
+ "movq %%r15, (5*8)(%%rdi);"
+ "leaq 16(%%rsp), %%rdx;"
+ "movq %%rdx, (6*8)(%%rdi);"
+ "movq %%rax, (7*8)(%%rdi)"
+ :
+ : "a" (__builtin_return_address(0)),
+ "c" (__builtin_frame_address(1))
+ );
+#else /* !CONFIG_FRAME_POINTER */
+ __asm__ __volatile__
+ ("movq %%rbx, (0*8)(%%rdi);"
+ "movq %%rbp, (1*8)(%%rdi);"
+ "movq %%r12, (2*8)(%%rdi);"
+ "movq %%r13, (3*8)(%%rdi);"
+ "movq %%r14, (4*8)(%%rdi);"
+ "movq %%r15, (5*8)(%%rdi);"
+ "leaq 8(%%rsp), %%rdx;"
+ "movq %%rdx, (6*8)(%%rdi);"
+ "movq %%rax, (7*8)(%%rdi)"
+ :
+ : "a" (__builtin_return_address(0))
+ );
+#endif /* CONFIG_FRAME_POINTER */
+ return 0;
+}
+
+void asmlinkage
+kdba_longjmp(kdb_jmp_buf *jb, int reason)
+{
+ __asm__("movq (0*8)(%rdi),%rbx;"
+ "movq (1*8)(%rdi),%rbp;"
+ "movq (2*8)(%rdi),%r12;"
+ "movq (3*8)(%rdi),%r13;"
+ "movq (4*8)(%rdi),%r14;"
+ "movq (5*8)(%rdi),%r15;"
+ "movq (7*8)(%rdi),%rdx;"
+ "movq (6*8)(%rdi),%rsp;"
+ "mov %rsi, %rax;"
+ "jmpq *%rdx");
+}
+#endif /* CONFIG_X86_32 */
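/*
 * A minimal sketch of the intended kdba_setjmp()/kdba_longjmp() pairing;
 * the buffer name and the fault-recovery hook are assumptions, only the
 * setjmp/longjmp semantics are taken from the code above:
 *
 *	static kdb_jmp_buf recover;
 *
 *	if (kdba_setjmp(&recover) == 0) {
 *		... attempt an access that may fault ...
 *	} else {
 *		... a fault handler called kdba_longjmp(&recover, 1) ...
 *		kdb_printf("bad address\n");
 *	}
 */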
+
+#ifdef CONFIG_X86_32
+/*
+ * kdba_stackdepth
+ *
+ * Print processes that are using more than a specific percentage of their
+ * stack.
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * If no percentage is supplied, it uses 60.
+ */
+
+static void
+kdba_stackdepth1(struct task_struct *p, unsigned long sp)
+{
+ struct thread_info *tinfo;
+ int used;
+ const char *type;
+ kdb_ps1(p);
+ do {
+ tinfo = (struct thread_info *)(sp & -THREAD_SIZE);
+ used = sizeof(*tinfo) + THREAD_SIZE - (sp & (THREAD_SIZE-1));
+ type = NULL;
+ if (kdb_task_has_cpu(p)) {
+ struct kdb_activation_record ar;
+ memset(&ar, 0, sizeof(ar));
+ kdba_get_stack_info_alternate(sp, -1, &ar);
+ type = ar.stack.id;
+ }
+ if (!type)
+ type = "process";
+ kdb_printf(" %s stack %p sp %lx used %d\n", type, tinfo, sp, used);
+ sp = tinfo->previous_esp;
+ } while (sp);
+}
+
+static int
+kdba_stackdepth(int argc, const char **argv)
+{
+ int diag, cpu, threshold, used, over;
+ unsigned long percentage;
+ unsigned long esp;
+ long offset = 0;
+ int nextarg;
+ struct task_struct *p, *g;
+ struct kdb_running_process *krp;
+ struct thread_info *tinfo;
+
+ if (argc == 0) {
+ percentage = 60;
+ } else if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &percentage, &offset, NULL);
+ if (diag)
+ return diag;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+ percentage = max_t(int, percentage, 1);
+ percentage = min_t(int, percentage, 100);
+ threshold = ((2 * THREAD_SIZE * percentage) / 100 + 1) >> 1;
+ kdb_printf("stackdepth: processes using more than %ld%% (%d bytes) of stack\n",
+ percentage, threshold);
+
+ /* Run the active tasks first, they can have multiple stacks */
+ for (cpu = 0, krp = kdb_running_process; cpu < NR_CPUS; ++cpu, ++krp) {
+ if (!cpu_online(cpu))
+ continue;
+ p = krp->p;
+ esp = krp->arch.sp;
+ over = 0;
+ do {
+ tinfo = (struct thread_info *)(esp & -THREAD_SIZE);
+ used = sizeof(*tinfo) + THREAD_SIZE - (esp & (THREAD_SIZE-1));
+ if (used >= threshold)
+ over = 1;
+ esp = tinfo->previous_esp;
+ } while (esp);
+ if (over)
+ kdba_stackdepth1(p, krp->arch.sp);
+ }
+ /* Now the tasks that are not on cpus */
+ kdb_do_each_thread(g, p) {
+ if (kdb_task_has_cpu(p))
+ continue;
+ esp = p->thread.sp;
+ used = sizeof(*tinfo) + THREAD_SIZE - (esp & (THREAD_SIZE-1));
+ over = used >= threshold;
+ if (over)
+ kdba_stackdepth1(p, esp);
+ } kdb_while_each_thread(g, p);
+
+ return 0;
+}
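/*
 * A worked example of the threshold rounding above, assuming the common
 * THREAD_SIZE of 8192 bytes and the default 60 percent:
 *
 *	threshold = ((2 * 8192 * 60) / 100 + 1) >> 1
 *	          = (9830 + 1) >> 1
 *	          = 4915 bytes
 *
 * i.e. the percentage of the stack rounded to the nearest byte rather than
 * truncated.
 */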
+#else /* CONFIG_X86_32 */
+
+
+/*
+ * kdba_entry
+ *
+ * This is the interface routine between
+ * the notifier die_chain and kdb
+ */
+static int kdba_entry( struct notifier_block *b, unsigned long val, void *v)
+{
+ struct die_args *args = v;
+ int err, trap, ret = 0;
+ struct pt_regs *regs;
+
+ regs = args->regs;
+ err = args->err;
+ trap = args->trapnr;
+ switch (val){
+#ifdef CONFIG_SMP
+ case DIE_NMI_IPI:
+ ret = kdb_ipi(regs, NULL);
+ break;
+#endif /* CONFIG_SMP */
+ case DIE_OOPS:
+ ret = kdb(KDB_REASON_OOPS, err, regs);
+ break;
+ case DIE_CALL:
+ ret = kdb(KDB_REASON_ENTER, err, regs);
+ break;
+ case DIE_DEBUG:
+ ret = kdb(KDB_REASON_DEBUG, err, regs);
+ break;
+ case DIE_NMIWATCHDOG:
+ ret = kdb(KDB_REASON_NMI, err, regs);
+ break;
+ case DIE_INT3:
+ ret = kdb(KDB_REASON_BREAK, err, regs);
+ /* fall through */
+ default:
+ break;
+ }
+ return (ret ? NOTIFY_STOP : NOTIFY_DONE);
+}
+
+/*
+ * notifier block for kdb entry
+ */
+static struct notifier_block kdba_notifier = {
+ .notifier_call = kdba_entry
+};
+#endif /* CONFIG_X86_32 */
+
+asmlinkage int kdb_call(void);
+
+/* Executed once on each cpu at startup. */
+void
+kdba_cpu_up(void)
+{
+}
+
+static int __init
+kdba_arch_init(void)
+{
+ set_intr_gate(KDBENTER_VECTOR, kdb_call);
+ return 0;
+}
+
+arch_initcall(kdba_arch_init);
+
+/*
+ * kdba_init
+ *
+ * Architecture specific initialization.
+ *
+ * Parameters:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ * None.
+ */
+
+void __init
+kdba_init(void)
+{
+ kdba_arch_init(); /* Need to register KDBENTER_VECTOR early */
+ kdb_register("pt_regs", kdba_pt_regs, "address", "Format struct pt_regs", 0);
+#ifdef CONFIG_X86_32
+ kdb_register("stackdepth", kdba_stackdepth, "[percentage]", "Print processes using >= stack percentage", 0);
+#else
+ register_die_notifier(&kdba_notifier);
+#endif
+ return;
+}
+
+/*
+ * kdba_adjust_ip
+ *
+ * Architecture specific adjustment of instruction pointer before leaving
+ * kdb.
+ *
+ * Parameters:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * regs The exception frame at time of fault/breakpoint. If reason
+ * is SILENT or CPU_UP then regs is NULL, otherwise it should
+ * always be valid.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ * noop on ix86.
+ */
+
+void
+kdba_adjust_ip(kdb_reason_t reason, int error, struct pt_regs *regs)
+{
+ return;
+}
+
+void
+kdba_set_current_task(const struct task_struct *p)
+{
+ kdb_current_task = p;
+ if (kdb_task_has_cpu(p)) {
+ struct kdb_running_process *krp = kdb_running_process + kdb_process_cpu(p);
+ kdb_current_regs = krp->regs;
+ return;
+ }
+ kdb_current_regs = NULL;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * asm-i386 uaccess.h supplies __copy_to_user which relies on the MMU to
+ * trap invalid addresses in the _xxx fields. Verify the other address
+ * of the pair is valid by accessing the first and last byte ourselves,
+ * then any access violations should only be caused by the _xxx
+ * addresses.
+ */
+
+int
+kdba_putarea_size(unsigned long to_xxx, void *from, size_t size)
+{
+ mm_segment_t oldfs = get_fs();
+ int r;
+ char c;
+ c = *((volatile char *)from);
+ c = *((volatile char *)from + size - 1);
+
+ if (to_xxx < PAGE_OFFSET) {
+ return kdb_putuserarea_size(to_xxx, from, size);
+ }
+
+ set_fs(KERNEL_DS);
+ r = __copy_to_user_inatomic((void __user *)to_xxx, from, size);
+ set_fs(oldfs);
+ return r;
+}
+
+int
+kdba_getarea_size(void *to, unsigned long from_xxx, size_t size)
+{
+ mm_segment_t oldfs = get_fs();
+ int r;
+ *((volatile char *)to) = '\0';
+ *((volatile char *)to + size - 1) = '\0';
+
+ if (from_xxx < PAGE_OFFSET) {
+ return kdb_getuserarea_size(to, from_xxx, size);
+ }
+
+ set_fs(KERNEL_DS);
+ switch (size) {
+ case 1:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 1);
+ break;
+ case 2:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 2);
+ break;
+ case 4:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 4);
+ break;
+ case 8:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, 8);
+ break;
+ default:
+ r = __copy_to_user_inatomic((void __user *)to, (void *)from_xxx, size);
+ break;
+ }
+ set_fs(oldfs);
+ return r;
+}
+
+int
+kdba_verify_rw(unsigned long addr, size_t size)
+{
+ unsigned char data[size];
+ return(kdba_getarea_size(data, addr, size) || kdba_putarea_size(addr, data, size));
+}
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_SMP
+
+#include <asm/ipi.h>
+
+gate_desc save_idt[NR_VECTORS];
+
+void kdba_takeover_vector(int vector)
+{
+ memcpy(&save_idt[vector], &idt_table[vector], sizeof(gate_desc));
+ set_intr_gate(KDB_VECTOR, kdb_interrupt);
+ return;
+}
+
+void kdba_giveback_vector(int vector)
+{
+ native_write_idt_entry(idt_table, vector, &save_idt[vector]);
+ return;
+}
+
+/* When first entering KDB, try a normal IPI. That reduces backtrace problems
+ * on the other cpus.
+ */
+void
+smp_kdb_stop(void)
+{
+ if (!KDB_FLAG(NOIPI)) {
+ kdba_takeover_vector(KDB_VECTOR);
+ apic->send_IPI_allbutself(KDB_VECTOR);
+ }
+}
+
+/* The normal KDB IPI handler */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void
+smp_kdb_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ ack_APIC_irq();
+ irq_enter();
+ kdb_ipi(regs, NULL);
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+/* Invoked once from kdb_wait_for_cpus when waiting for cpus. For those cpus
+ * that have not responded to the normal KDB interrupt yet, hit them with an
+ * NMI event.
+ */
+void
+kdba_wait_for_cpus(void)
+{
+ int c;
+ if (KDB_FLAG(CATASTROPHIC))
+ return;
+ kdb_printf(" Sending NMI to non-responding cpus: ");
+ for_each_online_cpu(c) {
+ if (kdb_running_process[c].seqno < kdb_seqno - 1) {
+ kdb_printf(" %d", c);
+ apic->send_IPI_mask(cpumask_of(c), NMI_VECTOR);
+ }
+ }
+ kdb_printf(".\n");
+}
+
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_KDB_KDUMP
+void kdba_kdump_prepare(struct pt_regs *regs)
+{
+ int i;
+ struct pt_regs r;
+ if (regs == NULL)
+ regs = &r;
+
+ for (i = 1; i < NR_CPUS; ++i) {
+ if (!cpu_online(i))
+ continue;
+
+ KDB_STATE_SET_CPU(KEXEC, i);
+ }
+
+ machine_crash_shutdown(regs);
+}
+
+extern void halt_current_cpu(struct pt_regs *);
+
+void kdba_kdump_shutdown_slave(struct pt_regs *regs)
+{
- #ifndef CONFIG_PARAVIRT_XEN
++#ifndef CONFIG_XEN
+ halt_current_cpu(regs);
+#endif /* CONFIG_XEN */
+}
+
+#endif /* CONFIG_KDB_KDUMP */
}
return 0;
}
- #endif
+static int __init force_acpi_rsdt(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: force use of acpi=rsdt\n",
+ d->ident);
+ acpi_rsdt_forced = 1;
+ } else {
+ printk(KERN_NOTICE
+ "Warning: acpi=force overrules DMI blacklist: "
+ "acpi=rsdt\n");
+ }
+ return 0;
+
+}
+
/*
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact linux-acpi@vger.kernel.org
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_XEON75XX) += mce-xeon75xx.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
- obj-$(CONFIG_X86_XEN_MCE) += mce_dom0.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
CFI_ENDPROC
END(call_softirq)
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+ CFI_STARTPROC
+ movq %r15, R15(%rdi)
+ movq %r14, R14(%rdi)
+ xchgq %rsi, %rdx
+ movq %r13, R13(%rdi)
+ movq %r12, R12(%rdi)
+ xorl %eax, %eax
+ movq %rbp, RBP(%rdi)
+ movq %rbx, RBX(%rdi)
+ movq (%rsp), %r9
+ xchgq %rdx, %rcx
+ movq %rax, R11(%rdi)
+ movq %rax, R10(%rdi)
+ movq %rax, R9(%rdi)
+ movq %rax, R8(%rdi)
+ movq %rax, RAX(%rdi)
+ movq %rax, RCX(%rdi)
+ movq %rax, RDX(%rdi)
+ movq %rax, RSI(%rdi)
+ movq %rax, RDI(%rdi)
+ movq %rax, ORIG_RAX(%rdi)
+ movq %r9, RIP(%rdi)
+ leaq 8(%rsp), %r9
+ movq $__KERNEL_CS, CS(%rdi)
+ movq %rax, EFLAGS(%rdi)
+ movq %r9, RSP(%rdi)
+ movq $__KERNEL_DS, SS(%rdi)
+ jmpq *%rcx
+ CFI_ENDPROC
+END(arch_unwind_init_running)
+#endif
+
- #ifdef CONFIG_PARAVIRT_XEN
+ #ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
CFI_ENDPROC
END(xen_failsafe_callback)
- #endif /* CONFIG_PARAVIRT_XEN */
+ #endif /* CONFIG_XEN */
+#ifdef CONFIG_KDB
+
+#ifdef CONFIG_SMP
+apicinterrupt KDB_VECTOR \
+ kdb_interrupt, smp_kdb_interrupt
+#endif /* CONFIG_SMP */
+
+ENTRY(kdb_call)
+ INTR_FRAME
+ cld
+ pushq $-1 # orig_eax
+ CFI_ADJUST_CFA_OFFSET 8
+ SAVE_ALL
+ movq $1,%rdi # KDB_REASON_ENTER
+ movq $0,%rsi # error_code
+ movq %rsp,%rdx # struct pt_regs
+ call kdb
+ RESTORE_ALL
+ addq $8,%rsp # forget orig_eax
+ CFI_ADJUST_CFA_OFFSET -8
+ iretq
+ CFI_ENDPROC
+END(kdb_call)
+
+#endif /* CONFIG_KDB */
+
+
/*
* Some functions should be protected against kprobes
*/
#include <asm/cacheflush.h>
#include <asm/debugreg.h>
- #ifdef CONFIG_XEN
- #include <xen/interface/kexec.h>
- #endif
-
-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
-
- /* ia32 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- load_idt(&curidt);
-}
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- load_gdt(&curgdt);
-}
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
- __asm__ __volatile__ (
- "\tljmp $"STR(__KERNEL_CS)",$1f\n"
- "\t1:\n"
- "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
- "\tmovl %%eax,%%ds\n"
- "\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
- "\tmovl %%eax,%%ss\n"
- : : : "eax", "memory");
-#undef STR
-#undef __STR
-}
-
static void machine_kexec_free_page_tables(struct kimage *image)
{
free_page((unsigned long)image->arch.pgd);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
- #if !defined(CONFIG_PARAVIRT_CPU) && !defined(CONFIG_XEN)
-#ifndef CONFIG_PARAVIRT
++#ifndef CONFIG_PARAVIRT_CPU
EXPORT_SYMBOL(native_load_gs_index);
#endif
# This Kconfig describes xen options
#
- config PARAVIRT_XEN
+ config XEN
bool "Xen guest support"
- select PARAVIRT
+ select PARAVIRT_ALL
select PARAVIRT_CLOCK
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_CMPXCHG && X86_TSC
obj-y += base/ block/ misc/ mfd/
obj-$(CONFIG_NUBUS) += nubus/
obj-y += macintosh/
- obj-$(CONFIG_XEN) += xen/
-obj-$(CONFIG_IDE) += ide/
obj-$(CONFIG_SCSI) += scsi/
obj-$(CONFIG_ATA) += ata/
obj-$(CONFIG_MTD) += mtd/
obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o
- obj-$(CONFIG_XEN_BLKFRONT) += xen-blkfront.o
+ obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_CIPHER_TWOFISH) += loop_fish2.o
+
swim_mod-objs := swim.o swim_asm.o
{}
};
- static struct of_platform_driver of_fsl_dma_driver = {
- .owner = THIS_MODULE,
- .name = "fsl-elo-dma",
- .match_table = of_fsl_dma_ids,
- .probe = of_fsl_dma_probe,
- .remove = of_fsl_dma_remove,
+ static struct of_platform_driver fsldma_of_driver = {
++ .owner = THIS_MODULE,
+ .name = "fsl-elo-dma",
+ .match_table = fsldma_of_ids,
+ .probe = fsldma_of_probe,
+ .remove = fsldma_of_remove,
};
- static __init int of_fsl_dma_init(void)
+ /*----------------------------------------------------------------------------*/
+ /* Module Init / Exit */
+ /*----------------------------------------------------------------------------*/
+
+ static __init int fsldma_init(void)
{
int ret;
if (!p)
return ERR_PTR(-ENOMEM);
- r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
+ path = shift(as);
- r = dm_get_device(ti, path, ti->begin, ti->len,
- dm_table_get_mode(ti->table), &p->path.dev);
++ r = dm_get_device(ti, path, dm_table_get_mode(ti->table),
+ &p->path.dev);
if (r) {
- ti->error = "error getting device";
- goto bad;
+ unsigned major, minor;
+
+ /* Try to add a failed device */
+ if (r == -ENXIO && sscanf(path, "%u:%u", &major, &minor) == 2) {
+ dev_t dev;
+
+ /* Extract the major/minor numbers */
+ dev = MKDEV(major, minor);
+ if (MAJOR(dev) != major || MINOR(dev) != minor) {
+ /* Nice try, didn't work */
+ DMWARN("Invalid device path %s", path);
+ ti->error = "error converting devnum";
+ goto bad;
+ }
+ DMWARN("adding disabled device %d:%d", major, minor);
+ p->path.dev = NULL;
+ format_dev_t(p->path.pdev, dev);
+ p->is_active = 0;
+ } else {
+ ti->error = "error getting device";
+ goto bad;
+ }
+ } else {
+ memcpy(p->path.pdev, p->path.dev->name, 16);
}
- if (m->hw_handler_name) {
+ if (p->path.dev) {
struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
- r = scsi_dh_attach(q, m->hw_handler_name);
- if (r == -EBUSY) {
- /*
- * Already attached to different hw_handler,
- * try to reattach with correct one.
- */
- scsi_dh_detach(q);
+ if (m->hw_handler_name) {
r = scsi_dh_attach(q, m->hw_handler_name);
- }
-
- if (r < 0) {
- ti->error = "error attaching hardware handler";
- dm_put_device(ti, p->path.dev);
- goto bad;
+ if (r == -EBUSY) {
+ /*
+ * Already attached to different hw_handler,
+ * try to reattach with correct one.
+ */
+ scsi_dh_detach(q);
+ r = scsi_dh_attach(q, m->hw_handler_name);
+ }
+ if (r < 0) {
+ ti->error = "error attaching hardware handler";
+ dm_put_device(ti, p->path.dev);
+ goto bad;
+ }
+ } else {
+ /* Play safe and detach hardware handler */
+ scsi_dh_detach(q);
}
if (m->hw_handler_params) {
errors = 0;
break;
}
- DMERR("Cannot failover device %s because scsi_dh_%s was not "
- "loaded.", pgpath->path.pdev, m->hw_handler_name);
- DMERR("Could not failover the device: Handler scsi_dh_%s "
- "Error %d.", m->hw_handler_name, errors);
++ DMERR("Count not failover device %s: Handler scsi_dh_%s "
++ "was not loaded.", pgpath->path.pdev,
++ m->hw_handler_name);
/*
* Fail path for now, so we do not ping pong
*/
struct pgpath *pgpath =
container_of(work, struct pgpath, activate_path);
- scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
- pg_init_done, pgpath);
+ if (pgpath->path.dev)
+ scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
- pg_init_done, &pgpath->path);
++ pg_init_done, pgpath);
+}
+
+/*
+ * Evaluate scsi return code
+ */
+static int eval_scsi_error(int result, char *sense, int sense_len)
+{
+ struct scsi_sense_hdr sshdr;
+ int r = DM_ENDIO_REQUEUE;
+
+ if (host_byte(result) != DID_OK)
+ return r;
+
+ if (msg_byte(result) != COMMAND_COMPLETE)
+ return r;
+
+ if (status_byte(result) == RESERVATION_CONFLICT)
+ /* Do not retry here, possible data corruption */
+ return -EIO;
+
+#if defined(CONFIG_SCSI) || defined(CONFIG_SCSI_MODULE)
+ if (status_byte(result) == CHECK_CONDITION &&
+ !scsi_normalize_sense(sense, sense_len, &sshdr)) {
+
+ switch (sshdr.sense_key) {
+ case MEDIUM_ERROR:
+ case DATA_PROTECT:
+ case BLANK_CHECK:
+ case COPY_ABORTED:
+ case VOLUME_OVERFLOW:
+ case MISCOMPARE:
+ r = -EIO;
+ break;
+ }
+ }
+#endif
+
+ return r;
}
/*
--- /dev/null
+/*
+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
+ *
+ * This file is released under the GPL.
+ *
+ *
+ * Linux 2.6 Device Mapper RAID4 and RAID5 target.
+ *
+ * Supports:
+ * o RAID4 with dedicated and selectable parity device
+ * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
+ * o run time optimization of xor algorithm used to calculate parity
+ *
+ *
+ * Thanks to MD for:
+ * o the raid address calculation algorithm
+ * o the base of the biovec <-> page list copier.
+ *
+ *
+ * Uses region hash to keep track of how many writes are in flight to
+ * regions in order to use dirty log to keep state of regions to recover:
+ *
+ * o clean regions (those which are synchronized
+ * and don't have write io in flight)
+ * o dirty regions (those with write io in flight)
+ *
+ *
+ * On startup, any dirty regions are migrated to the 'nosync' state
+ * and are subject to recovery by the daemon.
+ *
+ * See raid_ctr() for table definition.
+ *
+ *
+ * FIXME:
+ * o add virtual interface for locking
+ * o remove instrumentation (REMOVEME:)
+ *
+ */
+
+static const char *version = "v0.2431";
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include "dm-message.h"
+#include "dm-raid45.h"
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+/* # of parallel recovered regions */
+/* FIXME: cope with multiple recovery stripes in raid_set struct. */
+#define MAX_RECOVER 1 /* needs to be 1! */
+
+/*
+ * Configurable parameters
+ */
+#define INLINE
+
+/* Default # of stripes if not set in constructor. */
+#define STRIPES 64
+
+/* Minimum/maximum # of selectable stripes. */
+#define STRIPES_MIN 8
+#define STRIPES_MAX 16384
+
+/* Default chunk size in sectors if not set in constructor. */
+#define CHUNK_SIZE 64
+
+/* Default io size in sectors if not set in constructor. */
+#define IO_SIZE_MIN SECTORS_PER_PAGE
+#define IO_SIZE IO_SIZE_MIN
+
+/* Maximum settable chunk size in sectors. */
+#define CHUNK_SIZE_MAX 16384
+
+/* Recover io size default in sectors. */
+#define RECOVER_IO_SIZE_MIN 64
+#define RECOVER_IO_SIZE 256
+
+/* Default percentage recover io bandwidth. */
+#define BANDWIDTH 10
+#define BANDWIDTH_MIN 1
+#define BANDWIDTH_MAX 100
+/*
+ * END Configurable parameters
+ */
+
+#define TARGET "dm-raid45"
+#define DAEMON "kraid45d"
+#define DM_MSG_PREFIX TARGET
+
+#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
+
+/* Amount/size for __xor(). */
+#define SECTORS_PER_XOR SECTORS_PER_PAGE
+#define XOR_SIZE PAGE_SIZE
+
+/* Derive raid_set from stripe_cache pointer. */
+#define RS(x) container_of(x, struct raid_set, sc)
+
+/* Check value in range. */
+#define range_ok(i, min, max) ((i) >= (min) && (i) <= (max))
+
+/* Page reference. */
+#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
+
+/* Bio list reference. */
+#define BL(stripe, p, rw) (stripe->ss[p].bl + rw)
+
+/* Page list reference. */
+#define PL(stripe, p) (stripe->obj[p].pl)
+
+/* Check argument is power of 2. */
+#define POWER_OF_2(a) (!((a) & ((a) - 1)))
+
+/* Factor out to dm-bio-list.h */
+static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
+{
+ bio->bi_next = bl->head;
+ bl->head = bio;
+
+ if (!bl->tail)
+ bl->tail = bio;
+}
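/*
 * A minimal usage sketch, assuming a bio that must be retried ahead of
 * anything already queued; unlike bio_list_add(), which appends to the
 * tail, bio_list_push() prepends to the head:
 *
 *	bio_list_push(&rs->io.in, bio);
 */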
+
+/* Factor out to dm.h */
+#define TI_ERR_RET(str, ret) \
+ do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0);
+#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
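/*
 * A minimal usage sketch for the constructor error macros, assuming a
 * target constructor with "ti" in scope; the checks themselves are made up:
 *
 *	if (!range_ok(chunk_size, 1, CHUNK_SIZE_MAX))
 *		TI_ERR("Invalid chunk size");
 *	if (!(rs = kzalloc(sizeof(*rs), GFP_KERNEL)))
 *		TI_ERR_RET("Cannot allocate raid set", -ENOMEM);
 */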
+
+/*-----------------------------------------------------------------
+ * Stripe cache
+ *
+ * Cache for all reads and writes to raid sets (operational or degraded)
+ *
+ * We need to run all data to and from a RAID set through this cache,
+ * because parity chunks need to get calculated from data chunks
+ * or, in the degraded/resynchronization case, missing chunks need
+ * to be reconstructed using the other chunks of the stripe.
+ *---------------------------------------------------------------*/
+/* Protect kmem cache # counter. */
+static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
+
+/* A stripe set (holds bios hanging off). */
+struct stripe_set {
+ struct stripe *stripe; /* Backpointer to stripe for endio(). */
+ struct bio_list bl[3]; /* Reads, writes, and writes merged. */
+#define WRITE_MERGED 2
+};
+
+#if READ != 0 || WRITE != 1
+#error dm-raid45: READ/WRITE != 0/1 used as index!!!
+#endif
+
+/*
+ * Stripe linked list indexes. Keep order, because the stripe
+ * and the stripe cache rely on the first 3!
+ */
+enum list_types {
+ LIST_IO = 0, /* Stripes with io pending. */
+ LIST_ENDIO, /* Stripes to endio. */
+ LIST_LRU, /* Least recently used stripes. */
+ LIST_HASH, /* Hashed stripes. */
+ LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
+ NR_LISTS, /* To size array in struct stripe. */
+};
+
+enum lock_types {
+ LOCK_ENDIO = 0, /* Protect endio list. */
+ LOCK_LRU, /* Protect lru list. */
+ NR_LOCKS, /* To size array in struct stripe_cache. */
+};
+
+/* A stripe: the io object to handle all reads and writes to a RAID set. */
+struct stripe {
+ struct stripe_cache *sc; /* Backpointer to stripe cache. */
+
+ sector_t key; /* Hash key. */
+ region_t region; /* Region stripe is mapped to. */
+
+ /* Reference count. */
+ atomic_t cnt;
+
+ struct {
+ unsigned long flags; /* flags (see below). */
+
+ /*
+ * Pending ios in flight:
+ *
+ * used as a 'lock' to control move of stripe to endio list
+ */
+ atomic_t pending; /* Pending ios in flight. */
+
+ /* Sectors to read and write for multi page stripe sets. */
+ unsigned size;
+ } io;
+
+ /* Lock on stripe (for clustering). */
+ void *lock;
+
+ /*
+ * 4 linked lists:
+ * o io list to flush io
+ * o endio list
+ * o LRU list to put stripes w/o reference count on
+ * o stripe cache hash
+ */
+ struct list_head lists[NR_LISTS];
+
+ struct {
+ unsigned short parity; /* Parity chunk index. */
+ short recover; /* Recovery chunk index. */
+ } idx;
+
+ /* This sets memory cache object (dm-mem-cache). */
+ struct dm_mem_cache_object *obj;
+
+ /* Array of stripe sets (dynamically allocated). */
+ struct stripe_set ss[0];
+};
+
+/* States stripes can be in (flags field). */
+enum stripe_states {
+ STRIPE_ACTIVE, /* Active io on stripe. */
+ STRIPE_ERROR, /* io error on stripe. */
+ STRIPE_MERGED, /* Writes got merged. */
+ STRIPE_READ, /* Read. */
+ STRIPE_RBW, /* Read-before-write. */
+ STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */
+ STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
+};
+
+/* ... and macros to access them. */
+#define BITOPS(name, what, var, flag) \
+static inline int TestClear ## name ## what(struct var *v) \
+{ return test_and_clear_bit(flag, &v->io.flags); } \
+static inline int TestSet ## name ## what(struct var *v) \
+{ return test_and_set_bit(flag, &v->io.flags); } \
+static inline void Clear ## name ## what(struct var *v) \
+{ clear_bit(flag, &v->io.flags); } \
+static inline void Set ## name ## what(struct var *v) \
+{ set_bit(flag, &v->io.flags); } \
+static inline int name ## what(struct var *v) \
+{ return test_bit(flag, &v->io.flags); }
+
+
+BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
+BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
+BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
+BITOPS(Stripe, Read, stripe, STRIPE_READ)
+BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
+BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
+BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
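/*
 * For reference, the instantiation BITOPS(Stripe, Active, stripe,
 * STRIPE_ACTIVE) above generates:
 *
 *	int  TestClearStripeActive(struct stripe *v);
 *	int  TestSetStripeActive(struct stripe *v);
 *	void ClearStripeActive(struct stripe *v);
 *	void SetStripeActive(struct stripe *v);
 *	int  StripeActive(struct stripe *v);
 *
 * all operating atomically on bit STRIPE_ACTIVE of v->io.flags, so a
 * caller can claim a stripe with e.g. if (!TestSetStripeActive(stripe)).
 */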
+
+/* A stripe hash. */
+struct stripe_hash {
+ struct list_head *hash;
+ unsigned buckets;
+ unsigned mask;
+ unsigned prime;
+ unsigned shift;
+};
+
+/* A stripe cache. */
+struct stripe_cache {
+ /* Stripe hash. */
+ struct stripe_hash hash;
+
+ /* Stripes with io to flush, stripes to endio and LRU lists. */
+ struct list_head lists[3];
+
+ /* Locks to protect endio and lru lists. */
+ spinlock_t locks[NR_LOCKS];
+
+ /* Slab cache to allocate stripes from. */
+ struct {
+ struct kmem_cache *cache; /* Cache itself. */
+ char name[32]; /* Unique name. */
+ } kc;
+
+ struct dm_io_client *dm_io_client; /* dm-io client resource context. */
+
+ /* dm-mem-cache client resource context. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ int stripes_parm; /* # stripes parameter from constructor. */
+ atomic_t stripes; /* actual # of stripes in cache. */
+ atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
+ atomic_t stripes_last; /* last # of stripes in cache. */
+ atomic_t active_stripes; /* actual # of active stripes in cache. */
+
+ /* REMOVEME: */
+	atomic_t max_active_stripes; /* maximum # of active stripes observed. */
+};
+
+/* Flag specs for raid_dev. */
+enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
+
+/* The raid device in a set. */
+struct raid_dev {
+ struct dm_dev *dev;
+ unsigned long flags; /* raid_dev_flags. */
+ sector_t start; /* offset to map to. */
+};
+
+/* Flags spec for raid_set. */
+enum raid_set_flags {
+ RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
+	RS_DEAD,		/* RAID set inoperative. */
+ RS_DEVEL_STATS, /* REMOVEME: display status information. */
+ RS_IO_ERROR, /* io error on set. */
+ RS_RECOVER, /* Do recovery. */
+ RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
+ RS_REGION_GET, /* get a region to recover. */
+ RS_SC_BUSY, /* stripe cache busy -> send an event. */
+	RS_SUSPENDED,		/* RAID set suspended. */
+};
+
+/* REMOVEME: devel stats counters. */
+enum stats_types {
+ S_BIOS_READ,
+ S_BIOS_ADDED_READ,
+ S_BIOS_ENDIO_READ,
+ S_BIOS_WRITE,
+ S_BIOS_ADDED_WRITE,
+ S_BIOS_ENDIO_WRITE,
+ S_CAN_MERGE,
+ S_CANT_MERGE,
+ S_CONGESTED,
+ S_DM_IO_READ,
+ S_DM_IO_WRITE,
+ S_ACTIVE_READS,
+ S_BANDWIDTH,
+ S_BARRIER,
+ S_BIO_COPY_PL_NEXT,
+ S_DEGRADED,
+ S_DELAYED_BIOS,
+ S_EVICT,
+ S_FLUSHS,
+ S_HITS_1ST,
+ S_IOS_POST,
+ S_INSCACHE,
+ S_MAX_LOOKUP,
+ S_MERGE_PAGE_LOCKED,
+ S_NO_BANDWIDTH,
+ S_NOT_CONGESTED,
+ S_NO_RW,
+ S_NOSYNC,
+ S_PROHIBITPAGEIO,
+ S_RECONSTRUCT_EI,
+ S_RECONSTRUCT_DEV,
+ S_REDO,
+ S_REQUEUE,
+ S_STRIPE_ERROR,
+ S_SUM_DELAYED_BIOS,
+ S_XORS,
+ S_NR_STATS, /* # of stats counters. */
+};
+
+/* Status type -> string mappings. */
+struct stats_map {
+ const enum stats_types type;
+ const char *str;
+};
+
+static struct stats_map stats_map[] = {
+ { S_BIOS_READ, "r=" },
+ { S_BIOS_ADDED_READ, "/" },
+ { S_BIOS_ENDIO_READ, "/" },
+ { S_BIOS_WRITE, " w=" },
+ { S_BIOS_ADDED_WRITE, "/" },
+ { S_BIOS_ENDIO_WRITE, "/" },
+ { S_DM_IO_READ, " rc=" },
+ { S_DM_IO_WRITE, " wc=" },
+ { S_ACTIVE_READS, " active_reads=" },
+ { S_BANDWIDTH, " bandwidth=" },
+ { S_NO_BANDWIDTH, " no_bandwidth=" },
+ { S_BARRIER, " barrier=" },
+ { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
+ { S_CAN_MERGE, " can_merge=" },
+ { S_MERGE_PAGE_LOCKED, "/page_locked=" },
+ { S_CANT_MERGE, "/cant_merge=" },
+ { S_CONGESTED, " congested=" },
+ { S_NOT_CONGESTED, "/not_congested=" },
+ { S_DEGRADED, " degraded=" },
+ { S_DELAYED_BIOS, " delayed_bios=" },
+ { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
+ { S_EVICT, " evict=" },
+ { S_FLUSHS, " flushs=" },
+ { S_HITS_1ST, " hits_1st=" },
+ { S_IOS_POST, " ios_post=" },
+ { S_INSCACHE, " inscache=" },
+ { S_MAX_LOOKUP, " max_lookup=" },
+ { S_NO_RW, " no_rw=" },
+ { S_NOSYNC, " nosync=" },
+ { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
+ { S_RECONSTRUCT_EI, " reconstruct_ei=" },
+ { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
+ { S_REDO, " redo=" },
+ { S_REQUEUE, " requeue=" },
+ { S_STRIPE_ERROR, " stripe_error=" },
+ { S_XORS, " xors=" },
+};
+
+/*
+ * A RAID set.
+ */
+typedef void (*xor_function_t)(unsigned count, unsigned long **data);
+struct raid_set {
+ struct dm_target *ti; /* Target pointer. */
+
+ struct {
+ unsigned long flags; /* State flags. */
+ spinlock_t in_lock; /* Protects central input list below. */
+ struct bio_list in; /* Pending ios (central input list). */
+ struct bio_list work; /* ios work set. */
+ wait_queue_head_t suspendq; /* suspend synchronization. */
+ atomic_t in_process; /* counter of queued bios (suspendq). */
+ atomic_t in_process_max;/* counter of queued bios max. */
+
+ /* io work. */
+ struct workqueue_struct *wq;
+ struct delayed_work dws;
+ } io;
+
+ /* External locking. */
+ struct dm_raid45_locking_type *locking;
+
+ struct stripe_cache sc; /* Stripe cache for this set. */
+
+ /* Xor optimization. */
+ struct {
+ struct xor_func *f;
+ unsigned chunks;
+ unsigned speed;
+ } xor;
+
+ /* Recovery parameters. */
+ struct recover {
+ struct dm_dirty_log *dl; /* Dirty log. */
+ struct dm_region_hash *rh; /* Region hash. */
+
+ /* dm-mem-cache client resource context for recovery stripes. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ struct list_head stripes; /* List of recovery stripes. */
+
+ region_t nr_regions;
+ region_t nr_regions_to_recover;
+ region_t nr_regions_recovered;
+ unsigned long start_jiffies;
+ unsigned long end_jiffies;
+
+ unsigned bandwidth; /* Recovery bandwidth [%]. */
+ unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
+ unsigned bandwidth_parm; /* " constructor parm. */
+ unsigned io_size; /* io size <= chunk size. */
+ unsigned io_size_parm; /* io size ctr parameter. */
+
+ /* recovery io throttling. */
+ atomic_t io_count[2]; /* counter recover/regular io. */
+ unsigned long last_jiffies;
+
+ struct dm_region *reg; /* Actual region to recover. */
+ sector_t pos; /* Position within region to recover. */
+ sector_t end; /* End of region to recover. */
+ } recover;
+
+ /* RAID set parameters. */
+ struct {
+		struct raid_type *raid_type;	/* RAID type (e.g. RAID4). */
+ unsigned raid_parms; /* # variable raid parameters. */
+
+ unsigned chunk_size; /* Sectors per chunk. */
+ unsigned chunk_size_parm;
+		unsigned chunk_mask;	/* Mask for sector offset within a chunk. */
+ unsigned chunk_shift; /* rsector chunk size shift. */
+
+ unsigned io_size; /* Sectors per io. */
+ unsigned io_size_parm;
+		unsigned io_mask;	/* Mask for sector offset within an io chunk. */
+ unsigned io_shift_mask; /* Mask for raid_address(). */
+ unsigned io_shift; /* rsector io size shift. */
+ unsigned pages_per_io; /* Pages per io. */
+
+ sector_t sectors_per_dev; /* Sectors per device. */
+
+		atomic_t failed_devs;	/* Number of failed devices. */
+
+ /* Index of device to initialize. */
+ int dev_to_init;
+ int dev_to_init_parm;
+
+ /* Raid devices dynamically allocated. */
+ unsigned raid_devs; /* # of RAID devices below. */
+ unsigned data_devs; /* # of RAID data devices. */
+
+ int ei; /* index of failed RAID device. */
+
+ /* index of dedicated parity device (i.e. RAID4). */
+ int pi;
+ int pi_parm; /* constructor parm for status output. */
+ } set;
+
+ /* REMOVEME: devel stats counters. */
+ atomic_t stats[S_NR_STATS];
+
+ /* Dynamically allocated temporary pointers for xor(). */
+ unsigned long **data;
+
+ /* Dynamically allocated RAID devices. Alignment? */
+ struct raid_dev dev[0];
+};
+
+
+BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
+BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
+BITOPS(RS, Dead, raid_set, RS_DEAD)
+BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
+BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
+BITOPS(RS, Recover, raid_set, RS_RECOVER)
+BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
+BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
+BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
+#undef BITOPS
+
+#define PageIO(page) PageChecked(page)
+#define AllowPageIO(page) SetPageChecked(page)
+#define ProhibitPageIO(page) ClearPageChecked(page)
+
+/*-----------------------------------------------------------------
+ * Raid-4/5 set structures.
+ *---------------------------------------------------------------*/
+/* RAID level definitions. */
+enum raid_level {
+ raid4,
+ raid5,
+};
+
+/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
+enum raid_algorithm {
+ none,
+ left_asym,
+ right_asym,
+ left_sym,
+ right_sym,
+};
+
+struct raid_type {
+ const char *name; /* RAID algorithm. */
+ const char *descr; /* Descriptor text for logging. */
+ const unsigned parity_devs; /* # of parity devices. */
+ const unsigned minimal_devs; /* minimal # of devices in set. */
+ const enum raid_level level; /* RAID level. */
+ const enum raid_algorithm algorithm; /* RAID algorithm. */
+};
+
+/* Supported raid types and properties. */
+static struct raid_type raid_types[] = {
+ {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
+ {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
+ {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
+ {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
+ {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
+};
+
+/* Address as calculated by raid_address(). */
+struct address {
+ sector_t key; /* Hash key (start address of stripe). */
+ unsigned di, pi; /* Data and parity disks index. */
+};
+
+/* REMOVEME: reset statistics counters. */
+static void stats_reset(struct raid_set *rs)
+{
+ unsigned s = S_NR_STATS;
+
+ while (s--)
+ atomic_set(rs->stats + s, 0);
+}
+
+/*----------------------------------------------------------------
+ * RAID set management routines.
+ *--------------------------------------------------------------*/
+/*
+ * Begin small helper functions.
+ */
+/* Queue (optionally delayed) io work. */
+static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
+{
+ struct delayed_work *dws = &rs->io.dws;
+
+ cancel_delayed_work(dws);
+ queue_delayed_work(rs->io.wq, dws, delay);
+}
+
+/* Queue io work immediately (called from region hash too). */
+static INLINE void wake_do_raid(void *context)
+{
+ wake_do_raid_delayed(context, 0);
+}
+
+/* Wait until all io has been processed. */
+static INLINE void wait_ios(struct raid_set *rs)
+{
+ wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
+}
+
+/* Declare io queued to device. */
+static INLINE void io_dev_queued(struct raid_dev *dev)
+{
+ set_bit(IO_QUEUED, &dev->flags);
+}
+
+/* Io queued to device? Test and reset. */
+static inline int io_dev_clear(struct raid_dev *dev)
+{
+ return test_and_clear_bit(IO_QUEUED, &dev->flags);
+}
+
+/* Get an io reference. */
+static INLINE void io_get(struct raid_set *rs)
+{
+ int p = atomic_inc_return(&rs->io.in_process);
+
+ if (p > atomic_read(&rs->io.in_process_max))
+ atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
+}
+
+/* Put the io reference and conditionally wake io waiters. */
+static INLINE void io_put(struct raid_set *rs)
+{
+ /* Intel: rebuild data corrupter? */
+ if (!atomic_read(&rs->io.in_process)) {
+ DMERR("%s would go negative!!!", __func__);
+ return;
+ }
+
+ if (atomic_dec_and_test(&rs->io.in_process))
+ wake_up(&rs->io.suspendq);
+}
+
+/* Calculate device sector offset. */
+static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
+{
+ sector_t sector = bio->bi_sector;
+
+ sector_div(sector, rs->set.data_devs);
+ return sector;
+}
+
+/* Test device operational. */
+static INLINE int dev_operational(struct raid_set *rs, unsigned p)
+{
+ return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
+}
+
+/* Return # of active stripes in stripe cache. */
+static INLINE int sc_active(struct stripe_cache *sc)
+{
+ return atomic_read(&sc->active_stripes);
+}
+
+/* Test io pending on stripe. */
+static INLINE int stripe_io(struct stripe *stripe)
+{
+ return atomic_read(&stripe->io.pending);
+}
+
+static INLINE void stripe_io_inc(struct stripe *stripe)
+{
+ atomic_inc(&stripe->io.pending);
+}
+
+static INLINE void stripe_io_dec(struct stripe *stripe)
+{
+ atomic_dec(&stripe->io.pending);
+}
+
+/* Wrapper needed by for_each_io_dev(). */
+static void _stripe_io_inc(struct stripe *stripe, unsigned p)
+{
+ stripe_io_inc(stripe);
+}
+
+/* Error a stripe. */
+static INLINE void stripe_error(struct stripe *stripe, struct page *page)
+{
+ SetStripeError(stripe);
+ SetPageError(page);
+ atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
+}
+
+/* Page IOed ok. */
+enum dirty_type { CLEAN, DIRTY };
+static INLINE void page_set(struct page *page, enum dirty_type type)
+{
+ switch (type) {
+ case DIRTY:
+ SetPageDirty(page);
+ AllowPageIO(page);
+ break;
+
+ case CLEAN:
+ ClearPageDirty(page);
+ break;
+
+ default:
+ BUG();
+ }
+
+ SetPageUptodate(page);
+ ClearPageError(page);
+}
+
+/* Return region state for a sector. */
+static INLINE int
+region_state(struct raid_set *rs, sector_t sector, unsigned long state)
+{
+ struct dm_region_hash *rh = rs->recover.rh;
+
+ return RSRecover(rs) ?
+ (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
+ state) : 0;
+}
+
+/* Check whether the RAID set is degraded (a member device error got flagged). */
+static inline int raid_set_degraded(struct raid_set *rs)
+{
+ return RSIoError(rs);
+}
+
+/* Check that no more devices failed than the RAID set can tolerate. */
+static INLINE int raid_set_operational(struct raid_set *rs)
+{
+ /* Too many failed devices -> BAD. */
+ return atomic_read(&rs->set.failed_devs) <=
+ rs->set.raid_type->parity_devs;
+}
+
+/*
+ * Return true in case a page_list should be read/written
+ *
+ * Conditions to read/write:
+ * o 1st page in list not uptodate
+ * o 1st page in list dirty
+ * o if we optimized io away, we flag it using the pages checked bit.
+ */
+static INLINE unsigned page_io(struct page *page)
+{
+ /* Optimization: page was flagged to need io during first run. */
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ return 1;
+ }
+
+ /* Avoid io if prohibited or a locked page. */
+ if (!PageIO(page) || PageLocked(page))
+ return 0;
+
+ if (!PageUptodate(page) || PageDirty(page)) {
+ /* Flag page needs io for second run optimization. */
+ SetPagePrivate(page);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Call a function on each page list needing io. */
+static INLINE unsigned
+for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
+ void (*f_io)(struct stripe *stripe, unsigned p))
+{
+ unsigned p = rs->set.raid_devs, r = 0;
+
+ while (p--) {
+ if (page_io(PAGE(stripe, p))) {
+ f_io(stripe, p);
+ r++;
+ }
+ }
+
+ return r;
+}
+
+/* Reconstruct a particular device? */
+static INLINE int dev_to_init(struct raid_set *rs)
+{
+ return rs->set.dev_to_init > -1;
+}
+
+/*
+ * Index of device to calculate parity on.
+ * Either the parity device index *or* the selected device to init
+ * after a spare replacement.
+ */
+static INLINE unsigned dev_for_parity(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+
+ return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
+}
+
+/* Return the index of the device to be recovered. */
+static int idx_get(struct raid_set *rs)
+{
+	/* Avoid reading in pages which will be reconstructed anyway. */
+ if (dev_to_init(rs))
+ return rs->set.dev_to_init;
+ else if (rs->set.raid_type->level == raid4)
+ return rs->set.pi;
+
+ return -1;
+}
+
+/* RAID set congested function. */
+static int raid_set_congested(void *congested_data, int bdi_bits)
+{
+ struct raid_set *rs = congested_data;
+ int r = 0; /* Assume uncongested. */
+ unsigned p = rs->set.raid_devs;
+
+ /* If any of our component devices are overloaded. */
+ while (p--) {
+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
+
+ r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+ }
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
+ return r;
+}
+
+/* Display RAID set dead message once. */
+static void raid_set_dead(struct raid_set *rs)
+{
+ if (!TestSetRSDead(rs)) {
+ unsigned p;
+ char buf[BDEVNAME_SIZE];
+
+ DMERR("FATAL: too many devices failed -> RAID set dead");
+
+ for (p = 0; p < rs->set.raid_devs; p++) {
+ if (!dev_operational(rs, p))
+ DMERR("device /dev/%s failed",
+ bdevname(rs->dev[p].dev->bdev, buf));
+ }
+ }
+}
+
+/* RAID set degrade check. */
+static INLINE int
+raid_set_check_and_degrade(struct raid_set *rs,
+ struct stripe *stripe, unsigned p)
+{
+ if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
+ return -EPERM;
+
+	/* Throw an event in case of member device errors. */
+ dm_table_event(rs->ti->table);
+ atomic_inc(&rs->set.failed_devs);
+
+ /* Only log the first member error. */
+ if (!TestSetRSIoError(rs)) {
+ char buf[BDEVNAME_SIZE];
+
+ /* Store index for recovery. */
+ mb();
+ rs->set.ei = p;
+ mb();
+
+ DMERR("CRITICAL: %sio error on device /dev/%s "
+ "in region=%llu; DEGRADING RAID set",
+ stripe ? "" : "FAKED ",
+ bdevname(rs->dev[p].dev->bdev, buf),
+ (unsigned long long) (stripe ? stripe->key : 0));
+ DMERR("further device error messages suppressed");
+ }
+
+ return 0;
+}
+
+static void
+raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
+{
+ unsigned p = rs->set.raid_devs;
+
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ if (PageError(page)) {
+ ClearPageError(page);
+ raid_set_check_and_degrade(rs, stripe, p);
+ }
+ }
+}
+
+/* RAID set upgrade check. */
+static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
+{
+ if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
+ return -EPERM;
+
+ if (atomic_dec_and_test(&rs->set.failed_devs)) {
+ ClearRSIoError(rs);
+ rs->set.ei = -1;
+ }
+
+ return 0;
+}
+
+/* Lookup a RAID device by name or by major:minor number. */
+union dev_lookup {
+ const char *dev_name;
+ struct raid_dev *dev;
+};
+enum lookup_type { byname, bymajmin, bynumber };
+static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
+ union dev_lookup *dl)
+{
+ unsigned p;
+
+ /*
+ * Must be an incremental loop, because the device array
+ * can have empty slots still on calls from raid_ctr()
+ */
+ for (p = 0; p < rs->set.raid_devs; p++) {
+ char buf[BDEVNAME_SIZE];
+ struct raid_dev *dev = rs->dev + p;
+
+ if (!dev->dev)
+ break;
+
+ /* Format dev string appropriately if necessary. */
+ if (by == byname)
+ bdevname(dev->dev->bdev, buf);
+ else if (by == bymajmin)
+ format_dev_t(buf, dev->dev->bdev->bd_dev);
+
+ /* Do the actual check. */
+ if (by == bynumber) {
+ if (dl->dev->dev->bdev->bd_dev ==
+ dev->dev->bdev->bd_dev)
+ return p;
+ } else if (!strcmp(dl->dev_name, buf))
+ return p;
+ }
+
+ return -ENODEV;
+}
+
+/* End io wrapper. */
+static INLINE void
+_bio_endio(struct raid_set *rs, struct bio *bio, int error)
+{
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
+ S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
+ bio_endio(bio, error);
+ io_put(rs); /* Wake any suspend waiters. */
+}
+
+/*
+ * End small helper functions.
+ */
+
+
+/*
+ * Stripe hash functions
+ */
+/* Initialize/destroy stripe hash. */
+static int hash_init(struct stripe_hash *hash, unsigned stripes)
+{
+ unsigned buckets = 2, max_buckets = stripes / 4;
+ unsigned hash_primes[] = {
+ /* Table of primes for hash_fn/table size optimization. */
+ 3, 7, 13, 27, 53, 97, 193, 389, 769,
+ 1543, 3079, 6151, 12289, 24593,
+ };
+
+	/* Calculate number of buckets (2^n <= stripes / 4). */
+ while (buckets < max_buckets)
+ buckets <<= 1;
+
+ /* Allocate stripe hash. */
+ hash->hash = vmalloc(buckets * sizeof(*hash->hash));
+ if (!hash->hash)
+ return -ENOMEM;
+
+ hash->buckets = buckets;
+ hash->mask = buckets - 1;
+ hash->shift = ffs(buckets);
+ if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
+ hash->shift = ARRAY_SIZE(hash_primes) + 1;
+
+ BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
+ hash->prime = hash_primes[hash->shift - 2];
+
+ /* Initialize buckets. */
+ while (buckets--)
+ INIT_LIST_HEAD(hash->hash + buckets);
+
+ return 0;
+}
+
+static INLINE void hash_exit(struct stripe_hash *hash)
+{
+ if (hash->hash) {
+ vfree(hash->hash);
+ hash->hash = NULL;
+ }
+}
+
+/* List add (head/tail/locked/unlocked) inlines. */
+enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
+#define LIST_DEL(name, list) \
+static void stripe_ ## name ## _del(struct stripe *stripe, \
+ enum list_lock_type lock) { \
+ struct list_head *lh = stripe->lists + (list); \
+ spinlock_t *l = NULL; \
+\
+ if (lock == LIST_LOCKED) { \
+ l = stripe->sc->locks + LOCK_LRU; \
+ spin_lock_irq(l); \
+ } \
+\
+\
+ if (!list_empty(lh)) \
+ list_del_init(lh); \
+\
+ if (lock == LIST_LOCKED) \
+ spin_unlock_irq(l); \
+}
+
+LIST_DEL(hash, LIST_HASH)
+LIST_DEL(lru, LIST_LRU)
+#undef LIST_DEL
+
+enum list_pos_type { POS_HEAD, POS_TAIL };
+#define LIST_ADD(name, list) \
+static void stripe_ ## name ## _add(struct stripe *stripe, \
+ enum list_pos_type pos, \
+ enum list_lock_type lock) { \
+ struct list_head *lh = stripe->lists + (list); \
+ struct stripe_cache *sc = stripe->sc; \
+ spinlock_t *l = NULL; \
+\
+ if (lock == LIST_LOCKED) { \
+ l = sc->locks + LOCK_LRU; \
+ spin_lock_irq(l); \
+ } \
+\
+ if (list_empty(lh)) { \
+ if (pos == POS_HEAD) \
+ list_add(lh, sc->lists + (list)); \
+ else \
+ list_add_tail(lh, sc->lists + (list)); \
+ } \
+\
+ if (lock == LIST_LOCKED) \
+ spin_unlock_irq(l); \
+}
+
+LIST_ADD(endio, LIST_ENDIO)
+LIST_ADD(io, LIST_IO)
+LIST_ADD(lru, LIST_LRU)
+#undef LIST_ADD
+
+#define POP(list) \
+ do { \
+ if (list_empty(sc->lists + list)) \
+ stripe = NULL; \
+ else { \
+ stripe = list_first_entry(&sc->lists[list], \
+ struct stripe, \
+ lists[list]); \
+ list_del_init(&stripe->lists[list]); \
+ } \
+ } while (0);
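+/* Note: POP() relies on the caller holding the respective list's lock. */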
+
+/* Pop an available stripe off the lru list. */
+static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
+{
+ struct stripe *stripe;
+ spinlock_t *lock = sc->locks + LOCK_LRU;
+
+ spin_lock_irq(lock);
+ POP(LIST_LRU);
+ spin_unlock_irq(lock);
+
+ if (stripe)
+ /* Remove from hash before reuse. */
+ stripe_hash_del(stripe, LIST_UNLOCKED);
+
+ return stripe;
+}
+
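+/* Multiplicative hash: fold key * prime into the bucket mask. */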
+static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
+{
+ return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
+}
+
+static inline struct list_head *
+hash_bucket(struct stripe_hash *hash, sector_t key)
+{
+ return hash->hash + hash_fn(hash, key);
+}
+
+/* Insert an entry into a hash. */
+static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
+{
+ list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
+}
+
+/* Insert an entry into the stripe hash. */
+static inline void
+sc_insert(struct stripe_cache *sc, struct stripe *stripe)
+{
+ hash_insert(&sc->hash, stripe);
+}
+
+/* Lookup an entry in the stripe hash. */
+static inline struct stripe *
+stripe_lookup(struct stripe_cache *sc, sector_t key)
+{
+ unsigned c = 0;
+ struct stripe *stripe;
+ struct list_head *bucket = hash_bucket(&sc->hash, key);
+
+ list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
+		/* REMOVEME: statistics. */
+ if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
+ atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
+
+ if (stripe->key == key)
+ return stripe;
+ }
+
+ return NULL;
+}
+
+/* Resize the stripe cache hash on size changes. */
+static int hash_resize(struct stripe_cache *sc)
+{
+ /* Resize threshold reached? */
+ if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
+ || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
+ int r;
+ struct stripe_hash hash, hash_tmp;
+ spinlock_t *lock;
+
+ r = hash_init(&hash, atomic_read(&sc->stripes));
+ if (r)
+ return r;
+
+ lock = sc->locks + LOCK_LRU;
+ spin_lock_irq(lock);
+ if (sc->hash.hash) {
+ unsigned b = sc->hash.buckets;
+ struct list_head *pos, *tmp;
+
+ /* Walk old buckets and insert into new. */
+ while (b--) {
+ list_for_each_safe(pos, tmp, sc->hash.hash + b)
+ hash_insert(&hash,
+ list_entry(pos, struct stripe,
+ lists[LIST_HASH]));
+ }
+
+ }
+
+ memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
+ memcpy(&sc->hash, &hash, sizeof(sc->hash));
+ atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
+ spin_unlock_irq(lock);
+
+ hash_exit(&hash_tmp);
+ }
+
+ return 0;
+}
+
+/*
+ * Stripe cache locking functions
+ */
+/* Dummy lock function for local RAID4+5. */
+static void *no_lock(sector_t key, enum dm_lock_type type)
+{
+ return &no_lock;
+}
+
+/* Dummy unlock function for local RAID4+5. */
+static void no_unlock(void *lock_handle)
+{
+}
+
+/* No locking (for local RAID 4+5). */
+static struct dm_raid45_locking_type locking_none = {
+ .lock = no_lock,
+ .unlock = no_unlock,
+};
+
+/* Clustered RAID 4+5. */
+/* FIXME: code this. */
+static struct dm_raid45_locking_type locking_cluster = {
+ .lock = no_lock,
+ .unlock = no_unlock,
+};
+
+/* Lock a stripe (for clustering). */
+static int
+stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
+{
+ stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
+ DM_RAID45_EX);
+ return stripe->lock ? 0 : -EPERM;
+}
+
+/* Unlock a stripe (for clustering). */
+static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
+{
+ rs->locking->unlock(stripe->lock);
+ stripe->lock = NULL;
+}
+
+/*
+ * Stripe cache functions.
+ */
+/*
+ * Invalidate all page lists pages of a stripe.
+ *
+ * I only keep state for the whole list in the first page.
+ */
+static INLINE void
+stripe_pages_invalidate(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ ProhibitPageIO(page);
+ ClearPageChecked(page);
+ ClearPageDirty(page);
+ ClearPageError(page);
+ __clear_page_locked(page);
+ ClearPagePrivate(page);
+ ClearPageUptodate(page);
+ }
+}
+
+/* Prepare stripe for (re)use. */
+static INLINE void stripe_invalidate(struct stripe *stripe)
+{
+ stripe->io.flags = 0;
+ stripe_pages_invalidate(stripe);
+}
+
+/* Allow io on all chunks of a stripe. */
+static INLINE void stripe_allow_io(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--)
+ AllowPageIO(PAGE(stripe, p));
+}
+
+/* Initialize a stripe. */
+static void
+stripe_init(struct stripe_cache *sc, struct stripe *stripe)
+{
+ unsigned p = RS(sc)->set.raid_devs;
+ unsigned i;
+
+ /* Work all io chunks. */
+ while (p--) {
+ struct stripe_set *ss = stripe->ss + p;
+
+ stripe->obj[p].private = ss;
+ ss->stripe = stripe;
+
+ i = ARRAY_SIZE(ss->bl);
+ while (i--)
+ bio_list_init(ss->bl + i);
+ }
+
+ stripe->sc = sc;
+
+ i = ARRAY_SIZE(stripe->lists);
+ while (i--)
+ INIT_LIST_HEAD(stripe->lists + i);
+
+ atomic_set(&stripe->cnt, 0);
+ atomic_set(&stripe->io.pending, 0);
+
+ stripe_invalidate(stripe);
+}
+
+/* Number of pages per chunk. */
+static inline unsigned chunk_pages(unsigned io_size)
+{
+ return dm_div_up(io_size, SECTORS_PER_PAGE);
+}
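+/* E.g. an io_size of 8 sectors with 4 KiB pages (SECTORS_PER_PAGE == 8) needs 1 page. */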
+
+/* Number of pages per stripe. */
+static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
+{
+ return chunk_pages(io_size) * rs->set.raid_devs;
+}
+
+/* Initialize part of page_list (recovery). */
+static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
+ unsigned start, unsigned count)
+{
+ unsigned pages = chunk_pages(count);
+ /* Get offset into the page_list. */
+ struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
+
+ BUG_ON(!pl);
+ while (pl && pages--) {
+ BUG_ON(!pl->page);
+ memset(page_address(pl->page), 0, PAGE_SIZE);
+ pl = pl->next;
+ }
+}
+
+/* Initialize parity chunk of stripe. */
+static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
+{
+ stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
+}
+
+/* Return dynamic stripe structure size. */
+static INLINE size_t stripe_size(struct raid_set *rs)
+{
+ return sizeof(struct stripe) +
+ rs->set.raid_devs * sizeof(struct stripe_set);
+}
+
+/* Allocate a stripe and its memory object. */
+/* XXX adjust to cope with stripe cache and recovery stripe caches. */
+enum grow { SC_GROW, SC_KEEP };
+static struct stripe *stripe_alloc(struct stripe_cache *sc,
+ struct dm_mem_cache_client *mc,
+ enum grow grow)
+{
+ int r;
+ struct stripe *stripe;
+
+ stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
+ if (stripe) {
+ /* Grow the dm-mem-cache by one object. */
+ if (grow == SC_GROW) {
+ r = dm_mem_cache_grow(mc, 1);
+ if (r)
+ goto err_free;
+ }
+
+ stripe->obj = dm_mem_cache_alloc(mc);
+ if (!stripe->obj)
+ goto err_shrink;
+
+ stripe_init(sc, stripe);
+ }
+
+ return stripe;
+
+err_shrink:
+ if (grow == SC_GROW)
+ dm_mem_cache_shrink(mc, 1);
+err_free:
+ kmem_cache_free(sc->kc.cache, stripe);
+ return NULL;
+}
+
+/*
+ * Free a stripe's memory object, shrink the
+ * memory cache and free the stripe itself.
+ */
+static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
+{
+ dm_mem_cache_free(mc, stripe->obj);
+ dm_mem_cache_shrink(mc, 1);
+ kmem_cache_free(stripe->sc->kc.cache, stripe);
+}
+
+/* Free the recovery stripes. */
+static void stripe_recover_free(struct raid_set *rs)
+{
+ struct recover *rec = &rs->recover;
+ struct list_head *stripes = &rec->stripes;
+
+ while (!list_empty(stripes)) {
+ struct stripe *stripe = list_first_entry(stripes, struct stripe,
+ lists[LIST_RECOVER]);
+ list_del(stripe->lists + LIST_RECOVER);
+ stripe_free(stripe, rec->mem_cache_client);
+ }
+}
+
+/* Push a stripe safely onto the endio list to be handled by do_endios(). */
+static INLINE void stripe_endio_push(struct stripe *stripe)
+{
+ int wake;
+ unsigned long flags;
+ struct stripe_cache *sc = stripe->sc;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ spin_lock_irqsave(lock, flags);
+ wake = list_empty(sc->lists + LIST_ENDIO);
+ stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
+ spin_unlock_irqrestore(lock, flags);
+
+ if (wake)
+ wake_do_raid(RS(sc));
+}
+
+/* Protected check for stripe cache endio list empty. */
+static INLINE int stripe_endio_empty(struct stripe_cache *sc)
+{
+ int r;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ spin_lock_irq(lock);
+ r = list_empty(sc->lists + LIST_ENDIO);
+ spin_unlock_irq(lock);
+
+ return r;
+}
+
+/* Safely pop a stripe off the endio list. */
+static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
+{
+ struct stripe *stripe;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ /* This runs in parallel with endio(). */
+ spin_lock_irq(lock);
+ POP(LIST_ENDIO)
+ spin_unlock_irq(lock);
+ return stripe;
+}
+
+#undef POP
+
+/* Evict stripe from cache. */
+static void stripe_evict(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
+
+ if (list_empty(stripe->lists + LIST_LRU)) {
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
+ }
+}
+
+/* Grow stripe cache. */
+static int
+sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
+{
+ int r = 0;
+ struct raid_set *rs = RS(sc);
+
+ /* Try to allocate this many (additional) stripes. */
+ while (stripes--) {
+ struct stripe *stripe =
+ stripe_alloc(sc, sc->mem_cache_client, grow);
+
+ if (likely(stripe)) {
+ stripe->io.size = rs->set.io_size;
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ atomic_inc(&sc->stripes);
+ } else {
+ r = -ENOMEM;
+ break;
+ }
+ }
+
+ ClearRSScBusy(rs);
+ return r ? r : hash_resize(sc);
+}
+
+/* Shrink stripe cache. */
+static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
+{
+ int r = 0;
+
+ /* Try to get unused stripe from LRU list. */
+ while (stripes--) {
+ struct stripe *stripe;
+
+ stripe = stripe_lru_pop(sc);
+ if (stripe) {
+ /* An lru stripe may never have ios pending! */
+ BUG_ON(stripe_io(stripe));
+ stripe_free(stripe, sc->mem_cache_client);
+ atomic_dec(&sc->stripes);
+ } else {
+ r = -ENOENT;
+ break;
+ }
+ }
+
+ /* Check if stats are still sane. */
+ if (atomic_read(&sc->max_active_stripes) >
+ atomic_read(&sc->stripes))
+ atomic_set(&sc->max_active_stripes, 0);
+
+ if (r)
+ return r;
+
+ ClearRSScBusy(RS(sc));
+ return hash_resize(sc);
+}
+
+/* Create stripe cache. */
+static int sc_init(struct raid_set *rs, unsigned stripes)
+{
+ unsigned i, nr;
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+ struct recover *rec = &rs->recover;
+
+ /* Initialize lists and locks. */
+ i = ARRAY_SIZE(sc->lists);
+ while (i--)
+ INIT_LIST_HEAD(sc->lists + i);
+
+ i = NR_LOCKS;
+ while (i--)
+ spin_lock_init(sc->locks + i);
+
+ /* Initialize atomic variables. */
+ atomic_set(&sc->stripes, 0);
+ atomic_set(&sc->stripes_last, 0);
+ atomic_set(&sc->stripes_to_shrink, 0);
+ atomic_set(&sc->active_stripes, 0);
+ atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
+
+ /*
+ * We need a runtime unique # to suffix the kmem cache name
+ * because we'll have one for each active RAID set.
+ */
+ nr = atomic_inc_return(&_stripe_sc_nr);
+ sprintf(sc->kc.name, "%s_%d", TARGET, nr);
+ sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
+ 0, 0, NULL);
+ if (!sc->kc.cache)
+ return -ENOMEM;
+
+ /* Create memory cache client context for RAID stripe cache. */
+ sc->mem_cache_client =
+ dm_mem_cache_client_create(stripes, rs->set.raid_devs,
+ chunk_pages(rs->set.io_size));
+ if (IS_ERR(sc->mem_cache_client))
+ return PTR_ERR(sc->mem_cache_client);
+
+ /* Create memory cache client context for RAID recovery stripe(s). */
+ rec->mem_cache_client =
+ dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
+ chunk_pages(rec->io_size));
+ if (IS_ERR(rec->mem_cache_client))
+ return PTR_ERR(rec->mem_cache_client);
+
+ /* Allocate stripe for set recovery. */
+ /* XXX: cope with MAX_RECOVERY. */
+ INIT_LIST_HEAD(&rec->stripes);
+ for (i = 0; i < MAX_RECOVER; i++) {
+ stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
+ if (!stripe)
+ return -ENOMEM;
+
+ SetStripeRecover(stripe);
+ stripe->io.size = rec->io_size;
+ list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
+ }
+
+ /*
+	 * Allocate the stripe objects from the
+ * cache and add them to the LRU list.
+ */
+ return sc_grow(sc, stripes, SC_KEEP);
+}
+
+/* Destroy the stripe cache. */
+static void sc_exit(struct stripe_cache *sc)
+{
+ if (sc->kc.cache) {
+ BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
+ kmem_cache_destroy(sc->kc.cache);
+ }
+
+ if (sc->mem_cache_client)
+ dm_mem_cache_client_destroy(sc->mem_cache_client);
+
+ ClearRSRecover(RS(sc));
+ stripe_recover_free(RS(sc));
+ if (RS(sc)->recover.mem_cache_client)
+ dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
+
+ hash_exit(&sc->hash);
+}
+
+/*
+ * Calculate RAID address
+ *
+ * Delivers tuple with the index of the data disk holding the chunk
+ * in the set, the parity disks index and the start of the stripe
+ * within the address space of the set (used as the stripe cache hash key).
+ */
+/* thx MD. */
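+/*
+ * Illustration (values assumed for the example): 4 devices (3 data),
+ * raid5 left symmetric, chunk_shift = 3 (8 sector chunks), sector 50:
+ * chunk_number = 6, stripe = 2, di = 6 % 3 = 0;
+ * pi = 2 % 4 = 2 -> pi = 3 - 2 = 1; di = (1 + 0 + 1) % 4 = 2.
+ */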
+static struct address *
+raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
+{
+ unsigned data_devs = rs->set.data_devs, di, pi,
+ raid_devs = rs->set.raid_devs;
+ sector_t stripe, tmp;
+
+ /*
+ * chunk_number = sector / chunk_size
+ * stripe = chunk_number / data_devs
+ * di = stripe % data_devs;
+ */
+ stripe = sector >> rs->set.chunk_shift;
+ di = sector_div(stripe, data_devs);
+
+ switch (rs->set.raid_type->level) {
+ case raid5:
+ tmp = stripe;
+ pi = sector_div(tmp, raid_devs);
+
+ switch (rs->set.raid_type->algorithm) {
+ case left_asym: /* Left asymmetric. */
+ pi = data_devs - pi;
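+			/* Fall through to the common data index adjustment. */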
+ case right_asym: /* Right asymmetric. */
+ if (di >= pi)
+ di++;
+ break;
+
+ case left_sym: /* Left symmetric. */
+ pi = data_devs - pi;
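+			/* Fall through. */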
+ case right_sym: /* Right symmetric. */
+ di = (pi + di + 1) % raid_devs;
+ break;
+
+ default:
+ DMERR("Unknown RAID algorithm %d",
+ rs->set.raid_type->algorithm);
+ goto out;
+ }
+
+ break;
+
+ case raid4:
+ pi = rs->set.pi;
+ if (di >= pi)
+ di++;
+ break;
+
+ default:
+ DMERR("Unknown RAID level %d", rs->set.raid_type->level);
+ goto out;
+ }
+
+ /*
+ * Hash key = start offset on any single device of the RAID set;
+ * adjusted in case io size differs from chunk size.
+ */
+ addr->key = (stripe << rs->set.chunk_shift) +
+ (sector & rs->set.io_shift_mask);
+ addr->di = di;
+ addr->pi = pi;
+
+out:
+ return addr;
+}
+
+/*
+ * Copy data across between stripe pages and bio vectors.
+ *
+ * Pay attention to data alignment in stripe and bio pages.
+ */
+static void
+bio_copy_page_list(int rw, struct stripe *stripe,
+ struct page_list *pl, struct bio *bio)
+{
+ unsigned i, page_offset;
+ void *page_addr;
+ struct raid_set *rs = RS(stripe->sc);
+ struct bio_vec *bv;
+
+ /* Get start page in page list for this sector. */
+ i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
+ pl = pl_elem(pl, i);
+
+ page_addr = page_address(pl->page);
+ page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
+
+ /* Walk all segments and copy data across between bio_vecs and pages. */
+ bio_for_each_segment(bv, bio, i) {
+ int len = bv->bv_len, size;
+ unsigned bio_offset = 0;
+ void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
+redo:
+ size = (page_offset + len > PAGE_SIZE) ?
+ PAGE_SIZE - page_offset : len;
+
+ if (rw == READ)
+ memcpy(bio_addr + bio_offset,
+ page_addr + page_offset, size);
+ else
+ memcpy(page_addr + page_offset,
+ bio_addr + bio_offset, size);
+
+ page_offset += size;
+ if (page_offset == PAGE_SIZE) {
+ /*
+ * We reached the end of the chunk page ->
+			 * need to refer to the next one to copy more data.
+ */
+ len -= size;
+ if (len) {
+ /* Get next page. */
+ pl = pl->next;
+ BUG_ON(!pl);
+ page_addr = page_address(pl->page);
+ page_offset = 0;
+ bio_offset += size;
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
+ goto redo;
+ }
+ }
+
+ __bio_kunmap_atomic(bio_addr, KM_USER0);
+ }
+}
+
+/*
+ * Xor optimization macros.
+ */
+/* Xor data pointer declaration and initialization macros. */
+#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
+#define DECLARE_3 DECLARE_2, *d2 = data[2]
+#define DECLARE_4 DECLARE_3, *d3 = data[3]
+#define DECLARE_5 DECLARE_4, *d4 = data[4]
+#define DECLARE_6 DECLARE_5, *d5 = data[5]
+#define DECLARE_7 DECLARE_6, *d6 = data[6]
+#define DECLARE_8 DECLARE_7, *d7 = data[7]
+
+/* Xor unroll macros. */
+#define D2(n) d0[n] = d0[n] ^ d1[n]
+#define D3(n) D2(n) ^ d2[n]
+#define D4(n) D3(n) ^ d3[n]
+#define D5(n) D4(n) ^ d4[n]
+#define D6(n) D5(n) ^ d5[n]
+#define D7(n) D6(n) ^ d6[n]
+#define D8(n) D7(n) ^ d7[n]
+
+#define X_2(macro, offset) macro(offset); macro(offset + 1);
+#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
+#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
+#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
+#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
+#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
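+/* E.g. X_8(D3, 0) expands to eight D3(n) statements for n = 0..7. */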
+
+/* Define a _xor_#chunks_#xors_per_run() function. */
+#define _XOR(chunks, xors_per_run) \
+static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
+{ \
+ unsigned end = XOR_SIZE / sizeof(data[0]), i; \
+ DECLARE_ ## chunks; \
+\
+ for (i = 0; i < end; i += xors_per_run) { \
+ X_ ## xors_per_run(D ## chunks, i); \
+ } \
+}
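+/*
+ * E.g. _XOR(3, 16) defines _xor3_16(), which xors data[1] and data[2]
+ * into data[0], unrolled 16 longs per loop iteration across XOR_SIZE bytes.
+ */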
+
+/* Define xor functions for 2 - 8 chunks. */
+#define MAKE_XOR_PER_RUN(xors_per_run) \
+ _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
+ _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
+ _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
+ _XOR(8, xors_per_run);
+
+MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
+MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
+MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
+MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
+
+#define MAKE_XOR(xors_per_run) \
+struct { \
+ void (*f)(unsigned long **); \
+} static xor_funcs ## xors_per_run[] = { \
+ { NULL }, \
+ { NULL }, \
+ { _xor2_ ## xors_per_run }, \
+ { _xor3_ ## xors_per_run }, \
+ { _xor4_ ## xors_per_run }, \
+ { _xor5_ ## xors_per_run }, \
+ { _xor6_ ## xors_per_run }, \
+ { _xor7_ ## xors_per_run }, \
+ { _xor8_ ## xors_per_run }, \
+}; \
+\
+static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
+{ \
+ /* Call respective function for amount of chunks. */ \
+ xor_funcs ## xors_per_run[n].f(data); \
+}
+
+/* Define xor_8() - xor_64 functions. */
+MAKE_XOR(8)
+MAKE_XOR(16)
+MAKE_XOR(32)
+MAKE_XOR(64)
+
+/* Maximum number of chunks, which can be xor'ed in one go. */
+#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
+
+struct xor_func {
+ xor_function_t f;
+ const char *name;
+} static xor_funcs[] = {
+ {xor_8, "xor_8"},
+ {xor_16, "xor_16"},
+ {xor_32, "xor_32"},
+ {xor_64, "xor_64"},
+};
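+/* The function actually used is referenced via rs->xor.f (see xor() below). */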
+
+/*
+ * Calculate parity (xor of all data chunks).
+ *
+ * This indexes into the page list of the stripe.
+ *
+ * All chunks will be xored into the parity chunk
+ * in maximum groups of xor.chunks.
+ *
+ * FIXME: try mapping the pages on discontiguous memory.
+ */
+static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned max_chunks = rs->xor.chunks, n, p;
+ unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
+ unsigned long **d = rs->data;
+ xor_function_t xor_f = rs->xor.f->f;
+
+ /* Address of parity page to xor into. */
+ d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
+
+ /* Preset pointers to data pages. */
+ for (n = 1, p = rs->set.raid_devs; p--; ) {
+ if (p != pi && PageIO(PAGE(stripe, p)))
+ d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
+
+		/* If max chunks -> xor. */
+ if (n == max_chunks) {
+ xor_f(n, d);
+ n = 1;
+ }
+ }
+
+ /* If chunks -> xor. */
+ if (n > 1)
+ xor_f(n, d);
+
+ /* Set parity page uptodate and clean. */
+ page_set(PAGE(stripe, pi), CLEAN);
+}
+
+/* Common xor loop through all stripe page lists. */
+static void common_xor(struct stripe *stripe, sector_t count,
+ unsigned off, unsigned p)
+{
+ unsigned sector;
+
+ for (sector = off; sector < count; sector += SECTORS_PER_XOR)
+ xor(stripe, p, sector);
+
+ atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
+}
+
+/*
+ * Calculate parity sectors on intact stripes.
+ *
+ * Need to calculate raid address for recover stripe, because its
+ * chunk size differs and is typically larger than the io chunk size.
+ */
+static void parity_xor(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned chunk_size = rs->set.chunk_size,
+ io_size = stripe->io.size,
+ xor_size = chunk_size > io_size ? io_size : chunk_size;
+ sector_t off;
+
+ /* This can be the recover stripe with a larger io size. */
+ for (off = 0; off < io_size; off += xor_size) {
+ unsigned pi;
+
+ /*
+ * Recover stripe likely is bigger than regular io
+ * ones and has no precalculated parity disk index ->
+ * need to calculate RAID address.
+ */
+ if (unlikely(StripeRecover(stripe))) {
+ struct address addr;
+
+ raid_address(rs,
+ (stripe->key + off) * rs->set.data_devs,
+ &addr);
+ pi = addr.pi;
+ stripe_zero_pl_part(stripe, pi, off,
+ rs->set.chunk_size);
+ } else
+ pi = stripe->idx.parity;
+
+ common_xor(stripe, xor_size, off, pi);
+ page_set(PAGE(stripe, pi), DIRTY);
+ }
+}
+
+/* Reconstruct missing chunk. */
+static void reconstruct_xor(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int p = stripe->idx.recover;
+
+ BUG_ON(p < 0);
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (raid_set_degraded(rs) ?
+ S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
+
+ /* Zero chunk to be reconstructed. */
+ stripe_zero_chunk(stripe, p);
+ common_xor(stripe, stripe->io.size, 0, p);
+}
+
+/*
+ * Try getting a stripe either from the hash or from the lru list
+ */
+static inline void _stripe_get(struct stripe *stripe)
+{
+ atomic_inc(&stripe->cnt);
+}
+
+static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
+{
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+
+ stripe = stripe_lookup(sc, addr->key);
+ if (stripe) {
+ _stripe_get(stripe);
+ /* Remove from the lru list if on. */
+ stripe_lru_del(stripe, LIST_LOCKED);
+ atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
+ } else {
+ /* Second try to get an LRU stripe. */
+ stripe = stripe_lru_pop(sc);
+ if (stripe) {
+ _stripe_get(stripe);
+ /* Invalidate before reinserting with changed key. */
+ stripe_invalidate(stripe);
+ stripe->key = addr->key;
+ stripe->region = dm_rh_sector_to_region(rs->recover.rh,
+ addr->key);
+ stripe->idx.parity = addr->pi;
+ sc_insert(sc, stripe);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_INSCACHE);
+ }
+ }
+
+ return stripe;
+}
+
+/*
+ * Decrement reference count on a stripe.
+ *
+ * Move it to list of LRU stripes if zero.
+ */
+static void stripe_put(struct stripe *stripe)
+{
+ if (atomic_dec_and_test(&stripe->cnt)) {
+ if (TestClearStripeActive(stripe))
+ atomic_dec(&stripe->sc->active_stripes);
+
+ /* Put stripe onto the LRU list. */
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ }
+
+ BUG_ON(atomic_read(&stripe->cnt) < 0);
+}
+
+/*
+ * Process end io
+ *
+ * I need to do it here, because I can't copy the bio pages in interrupt context.
+ *
+ * Read and write functions are split in order to avoid
+ * conditionals in the main loop for performance reasons.
+ */
+
+/* Helper read bios on a page list. */
+static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
+ struct bio *bio)
+{
+ bio_copy_page_list(READ, stripe, pl, bio);
+}
+
+/* Helper write bios on a page list. */
+static void _rh_dec(struct stripe *stripe, struct page_list *pl,
+ struct bio *bio)
+{
+ dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
+}
+
+/* End io all bios on a page list. */
+static inline int
+page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
+{
+ int r = 0;
+ struct bio_list *bl = BL(stripe, p, rw);
+
+ if (!bio_list_empty(bl)) {
+ struct page_list *pl = PL(stripe, p);
+ struct page *page = pl->page;
+
+ if (PageLocked(page))
+ r = -EBUSY;
+ /*
+ * FIXME: PageUptodate() not cleared
+ * properly for missing chunks ?
+ */
+ else if (PageUptodate(page)) {
+ struct bio *bio;
+ struct raid_set *rs = RS(stripe->sc);
+ void (*h_f)(struct stripe *, struct page_list *,
+ struct bio *) =
+ (rw == READ) ? _bio_copy_page_list : _rh_dec;
+
+ while ((bio = bio_list_pop(bl))) {
+ h_f(stripe, pl, bio);
+ _bio_endio(rs, bio, 0);
+ stripe_put(stripe);
+ if (count)
+ (*count)++;
+ }
+ } else
+ r = -EAGAIN;
+ }
+
+ return r;
+}
+
+/*
+ * End io all reads/writes on a stripe copying
+ * read data across from stripe to bios.
+ */
+static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
+{
+ int r = 0;
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--) {
+ int rr = page_list_endio(rw, stripe, p, count);
+
+ if (rr && r != -EIO)
+ r = rr;
+ }
+
+ return r;
+}
+
+/* Fail all ios on a bio list and return # of bios. */
+static unsigned
+bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
+{
+ unsigned r;
+ struct bio *bio;
+
+ raid_set_dead(rs);
+
+ /* Update region counters. */
+ if (stripe) {
+ struct dm_region_hash *rh = rs->recover.rh;
+
+ bio_list_for_each(bio, bl) {
+ if (bio_data_dir(bio) == WRITE)
+ dm_rh_dec(rh, stripe->region);
+ }
+ }
+
+ /* Error end io all bios. */
+ for (r = 0; (bio = bio_list_pop(bl)); r++)
+ _bio_endio(rs, bio, -EIO);
+
+ return r;
+}
+
+/* Fail all ios of a bio list of a stripe and drop io pending count. */
+static void
+stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
+ struct bio_list *bl)
+{
+ unsigned put = bio_list_fail(rs, stripe, bl);
+
+ while (put--)
+ stripe_put(stripe);
+}
+
+/* Fail all ios hanging off all bio lists of a stripe. */
+static void stripe_fail_io(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned p = rs->set.raid_devs;
+
+ stripe_evict(stripe);
+
+ while (p--) {
+ struct stripe_set *ss = stripe->ss + p;
+ int i = ARRAY_SIZE(ss->bl);
+
+ while (i--)
+ stripe_bio_list_fail(rs, stripe, ss->bl + i);
+ }
+}
+
+/*
+ * Handle all stripes by handing them to the daemon, because we can't
+ * map their pages to copy the data in interrupt context.
+ *
+ * We don't want to handle them here either, while interrupts are disabled.
+ */
+
+/* Read/write endio function for dm-io (interrupt context). */
+static void endio(unsigned long error, void *context)
+{
+ struct dm_mem_cache_object *obj = context;
+ struct stripe_set *ss = obj->private;
+ struct stripe *stripe = ss->stripe;
+ struct page *page = obj->pl->page;
+
+ if (unlikely(error))
+ stripe_error(stripe, page);
+ else
+ page_set(page, CLEAN);
+
+ __clear_page_locked(page);
+ stripe_io_dec(stripe);
+
+ /* Add stripe to endio list and wake daemon. */
+ stripe_endio_push(stripe);
+}
+
+/*
+ * Recovery io throttling
+ */
+/* Conditionally reset io counters. */
+enum count_type { IO_WORK = 0, IO_RECOVER };
+static int recover_io_reset(struct raid_set *rs)
+{
+ unsigned long j = jiffies;
+
+ /* Pay attention to jiffies overflows. */
+ if (j > rs->recover.last_jiffies + HZ
+ || j < rs->recover.last_jiffies) {
+ rs->recover.last_jiffies = j;
+ atomic_set(rs->recover.io_count + IO_WORK, 0);
+ atomic_set(rs->recover.io_count + IO_RECOVER, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Count ios. */
+static INLINE void
+recover_io_count(struct raid_set *rs, struct stripe *stripe)
+{
+ if (RSRecover(rs)) {
+ recover_io_reset(rs);
+ atomic_inc(rs->recover.io_count +
+ (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
+ }
+}
+
+/* Read/Write a page_list asynchronously. */
+static void page_list_rw(struct stripe *stripe, unsigned p)
+{
+ struct stripe_cache *sc = stripe->sc;
+ struct raid_set *rs = RS(sc);
+ struct dm_mem_cache_object *obj = stripe->obj + p;
+ struct page_list *pl = obj->pl;
+ struct page *page = pl->page;
+ struct raid_dev *dev = rs->dev + p;
+ struct dm_io_region io = {
+ .bdev = dev->dev->bdev,
+ .sector = stripe->key,
+ .count = stripe->io.size,
+ };
+ struct dm_io_request control = {
+ .bi_rw = PageDirty(page) ? WRITE : READ,
+ .mem.type = DM_IO_PAGE_LIST,
+ .mem.ptr.pl = pl,
+ .mem.offset = 0,
+ .notify.fn = endio,
+ .notify.context = obj,
+ .client = sc->dm_io_client,
+ };
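+	/* The io is asynchronous; endio() above is called on completion. */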
+
+ BUG_ON(PageLocked(page));
+
+ /*
+ * Don't rw past end of device, which can happen, because
+	 * typically sectors_per_dev isn't divisible by io_size.
+ */
+ if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
+ io.count = rs->set.sectors_per_dev - io.sector;
+
+ io.sector += dev->start; /* Add <offset>. */
+ recover_io_count(rs, stripe); /* Recovery io accounting. */
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats +
+ (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
+
+ ClearPageError(page);
+ __set_page_locked(page);
+ io_dev_queued(dev);
+ BUG_ON(dm_io(&control, 1, &io, NULL));
+}
+
+/*
+ * Write dirty / read not uptodate page lists of a stripe.
+ */
+static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
+{
+ unsigned r;
+
+ /*
+ * Increment the pending count on the stripe
+ * first, so that we don't race in endio().
+ *
+ * An inc (IO) is needed for any page:
+ *
+ * o not uptodate
+ * o dirtied by writes merged
+ * o dirtied by parity calculations
+ */
+ r = for_each_io_dev(rs, stripe, _stripe_io_inc);
+ if (r) {
+ /* io needed: chunks are not uptodate/dirty. */
+ int max; /* REMOVEME: */
+ struct stripe_cache *sc = &rs->sc;
+
+ if (!TestSetStripeActive(stripe))
+ atomic_inc(&sc->active_stripes);
+
+ /* Take off the lru list in case it got added there. */
+ stripe_lru_del(stripe, LIST_LOCKED);
+
+ /* Submit actual io. */
+ for_each_io_dev(rs, stripe, page_list_rw);
+
+ /* REMOVEME: statistics */
+ max = sc_active(sc);
+ if (atomic_read(&sc->max_active_stripes) < max)
+ atomic_set(&sc->max_active_stripes, max);
+
+ atomic_inc(rs->stats + S_FLUSHS);
+ /* END REMOVEME: statistics */
+ }
+
+ return r;
+}
+
+/* Work in all pending writes. */
+static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
+{
+ struct bio_list *write = BL(stripe, p, WRITE);
+
+ if (!bio_list_empty(write)) {
+ struct page_list *pl = stripe->obj[p].pl;
+ struct bio *bio;
+ struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
+
+ /*
+ * We can play with the lists without holding a lock,
+ * because it is just us accessing them anyway.
+ */
+ bio_list_for_each(bio, write)
+ bio_copy_page_list(WRITE, stripe, pl, bio);
+
+ bio_list_merge(write_merged, write);
+ bio_list_init(write);
+ page_set(pl->page, DIRTY);
+ }
+}
+
+/* Merge in all writes hence dirtying respective pages. */
+static INLINE void writes_merge(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--)
+ _writes_merge(stripe, p);
+}
+
+/* Check, if a chunk gets completely overwritten. */
+static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
+{
+ unsigned sectors = 0;
+ struct bio *bio;
+ struct bio_list *bl = BL(stripe, p, WRITE);
+
+ bio_list_for_each(bio, bl)
+ sectors += bio_sectors(bio);
+
+ return sectors == RS(stripe->sc)->set.io_size;
+}
+
+/*
+ * Prepare stripe to avoid io on broken/reconstructed
+ * drive in order to reconstruct data on endio.
+ */
+enum prepare_type { IO_ALLOW, IO_PROHIBIT };
+static void stripe_prepare(struct stripe *stripe, unsigned p,
+ enum prepare_type type)
+{
+ struct page *page = PAGE(stripe, p);
+
+ switch (type) {
+ case IO_PROHIBIT:
+ /*
+		 * In case we prohibit io, we have to make sure that
+		 * io on all chunks other than the one which failed
+		 * or is being reconstructed is allowed and that the
+		 * latter doesn't have state uptodate.
+ */
+ stripe_allow_io(stripe);
+ ClearPageUptodate(page);
+ ProhibitPageIO(page);
+
+ /* REMOVEME: statistics. */
+ atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
+ stripe->idx.recover = p;
+ SetStripeReconstruct(stripe);
+ break;
+
+ case IO_ALLOW:
+ AllowPageIO(page);
+ stripe->idx.recover = -1;
+ ClearStripeReconstruct(stripe);
+ break;
+
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Degraded/reconstruction mode.
+ *
+ * Check stripe state to figure which chunks don't need IO.
+ */
+static INLINE void stripe_check_reconstruct(struct stripe *stripe,
+ int prohibited)
+{
+ struct raid_set *rs = RS(stripe->sc);
+
+ /*
+ * Degraded mode (device(s) failed) ->
+ * avoid io on the failed device.
+ */
+ if (unlikely(raid_set_degraded(rs))) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_DEGRADED);
+ stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
+ return;
+ } else {
+ /*
+ * Reconstruction mode (ie. a particular device or
+ * some (rotating) parity chunk is being resynchronized) ->
+ * o make sure all needed pages are read in
+ * o writes are allowed to go through
+ */
+ int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
+
+ if (r) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_NOSYNC);
+ stripe_prepare(stripe, dev_for_parity(stripe),
+ IO_PROHIBIT);
+ return;
+ }
+ }
+
+ /*
+ * All disks good. Avoid reading parity chunk and reconstruct it
+ * unless we have prohibited io to chunk(s).
+ */
+ if (!prohibited) {
+ if (StripeMerged(stripe))
+ stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
+ else {
+ stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
+
+ /*
+ * Overrule stripe_prepare to reconstruct the
+ * parity chunk, because it'll be created new anyway.
+ */
+ ClearStripeReconstruct(stripe);
+ }
+ }
+}
+
+/* Check, if stripe is ready to merge writes. */
+static INLINE int stripe_check_merge(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int prohibited = 0;
+ unsigned chunks = 0, p = rs->set.raid_devs;
+
+ /* Walk all chunks. */
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ /* Can't merge active chunks. */
+ if (PageLocked(page)) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
+ break;
+ }
+
+ /* Can merge uptodate chunks and have to count parity chunk. */
+ if (PageUptodate(page) || p == stripe->idx.parity) {
+ chunks++;
+ continue;
+ }
+
+ /* Read before write ordering. */
+ if (RSCheckOverwrite(rs) &&
+ bio_list_empty(BL(stripe, p, READ))) {
+ int r = stripe_check_overwrite(stripe, p);
+
+ if (r) {
+ chunks++;
+ /* REMOVEME: statistics. */
+ atomic_inc(RS(stripe->sc)->stats +
+ S_PROHIBITPAGEIO);
+ ProhibitPageIO(page);
+ prohibited = 1;
+ }
+ }
+ }
+
+ if (chunks == rs->set.raid_devs) {
+ /* All pages are uptodate or get written over or mixture. */
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_CAN_MERGE);
+ return 0;
+ } else
+ /* REMOVEME: statistics.*/
+ atomic_inc(rs->stats + S_CANT_MERGE);
+
+ return prohibited ? 1 : -EPERM;
+}
+
+/* Check chunks and prohibit io on those without queued reads. */
+static INLINE int stripe_check_read(struct stripe *stripe)
+{
+ int r = 0;
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ /* Walk all chunks. */
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ if (!PageLocked(page) &&
+ bio_list_empty(BL(stripe, p, READ))) {
+ ProhibitPageIO(page);
+ r = 1;
+ }
+ }
+
+ return r;
+}
+
+/*
+ * Read/write a stripe.
+ *
+ * All stripe read/write activity goes through this function.
+ *
+ * States to cover:
+ * o stripe to read and/or write
+ * o stripe with error to reconstruct
+ */
+static int stripe_rw(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int prohibited = 0, r;
+
+ /*
+ * Check the state of the RAID set and if degraded (or
+ * resynchronizing for reads), read in all other chunks but
+ * the one on the dead/resynchronizing device in order to be
+ * able to reconstruct the missing one.
+ *
+ * Merge all writes hanging off uptodate pages of the stripe.
+ */
+
+ /* Initially allow io on all chunks and prohibit below, if necessary. */
+ stripe_allow_io(stripe);
+
+ if (StripeRBW(stripe)) {
+ r = stripe_check_merge(stripe);
+ if (!r) {
+ /*
+ * If I could rely on valid parity (which would only
+ * be sure in case of a full synchronization),
+ * I could xor a fraction of chunks out of
+ * parity and back in.
+ *
+ * For the time being, I got to redo parity...
+ */
+ /* parity_xor(stripe); */ /* Xor chunks out. */
+ stripe_zero_chunk(stripe, stripe->idx.parity);
+ writes_merge(stripe); /* Merge writes in. */
+ parity_xor(stripe); /* Update parity. */
+ ClearStripeRBW(stripe); /* Disable RBW. */
+ SetStripeMerged(stripe); /* Writes merged. */
+ }
+
+ if (r > 0)
+ prohibited = 1;
+ } else if (!raid_set_degraded(rs))
+ /* Only allow for read avoidance if not degraded. */
+ prohibited = stripe_check_read(stripe);
+
+ /*
+ * Check, if io needs to be allowed/prohibited on certain chunks
+ * because of a degraded set or reconstruction on a region.
+ */
+ stripe_check_reconstruct(stripe, prohibited);
+
+ /* Now submit any reads/writes. */
+ r = stripe_page_lists_rw(rs, stripe);
+ if (!r) {
+ /*
+ * No io submitted because of chunk io prohibited or
+ * locked pages -> push to end io list for processing.
+ */
+ atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
+ stripe_endio_push(stripe);
+ wake_do_raid(rs); /* Wake myself. */
+ }
+
+ return 0;
+}
+
+/* Flush stripe either via flush list or immediately. */
+enum flush_type { FLUSH_DELAY, FLUSH_NOW };
+static int stripe_flush(struct stripe *stripe, enum flush_type type)
+{
+ int r = 0;
+
+ stripe_lru_del(stripe, LIST_LOCKED);
+
+ /* Immediately flush. */
+ if (type == FLUSH_NOW) {
+ if (likely(raid_set_operational(RS(stripe->sc))))
+ r = stripe_rw(stripe); /* Read/write stripe. */
+ else
+ /* Optimization: Fail early on failed sets. */
+ stripe_fail_io(stripe);
+ /* Delay flush by putting it on io list for later processing. */
+ } else if (type == FLUSH_DELAY)
+ stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
+ else
+ BUG();
+
+ return r;
+}
+
+/*
+ * Queue reads and writes to a stripe by hanging
+ * their bios off the stripe set's read/write lists.
+ *
+ * Endio reads on uptodate chunks.
+ */
+static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
+ struct bio_list *reject)
+{
+ int r = 0;
+ struct address addr;
+ struct stripe *stripe =
+ stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
+
+ if (stripe) {
+ int rr, rw = bio_data_dir(bio);
+
+ rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
+ if (rr) {
+ stripe_put(stripe);
+ goto out;
+ }
+
+ /* Distinguish read and write cases. */
+ bio_list_add(BL(stripe, addr.di, rw), bio);
+
+ /* REMOVEME: statistics */
+ atomic_inc(rs->stats + (rw == WRITE ?
+ S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
+
+ if (rw == READ)
+ SetStripeRead(stripe);
+ else {
+ SetStripeRBW(stripe);
+
+ /* Increment pending write count on region. */
+ dm_rh_inc(rs->recover.rh, stripe->region);
+ r = 1; /* Region hash needs a flush. */
+ }
+
+ /*
+ * Optimize stripe flushing:
+ *
+ * o directly start io for read stripes.
+ *
+ * o put stripe onto stripe caches io_list for RBW,
+ * so that do_flush() can belabour it after we put
+ * more bios to the stripe for overwrite optimization.
+ */
+ stripe_flush(stripe,
+ StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
+
+ /* Got no stripe from cache -> reject bio. */
+ } else {
+out:
+ bio_list_add(reject, bio);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_IOS_POST);
+ }
+
+ return r;
+}
+
+/*
+ * Recovery functions
+ */
+/* Read a stripe off a raid set for recovery. */
+static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
+{
+ /* Invalidate all pages so that they get read in. */
+ stripe_pages_invalidate(stripe);
+
+ /* Allow io on all recovery chunks. */
+ stripe_allow_io(stripe);
+
+ if (idx > -1)
+ ProhibitPageIO(PAGE(stripe, idx));
+
+ stripe->key = rs->recover.pos;
+ return stripe_page_lists_rw(rs, stripe);
+}
+
+/* Write a stripe to a raid set for recovery. */
+static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
+{
+ /*
+ * If this is a reconstruct of a particular device, then
+ * reconstruct the respective page(s), else create parity page(s).
+ */
+ if (idx > -1) {
+ struct page *page = PAGE(stripe, idx);
+
+ AllowPageIO(page);
+ stripe_zero_chunk(stripe, idx);
+ common_xor(stripe, stripe->io.size, 0, idx);
+ page_set(page, DIRTY);
+ } else
+ parity_xor(stripe);
+
+ return stripe_page_lists_rw(rs, stripe);
+}
+
+/* Is recovery bandwidth available? */
+static int recover_bandwidth(struct raid_set *rs)
+{
+ int r, work;
+
+ /* On reset -> allow recovery. */
+ r = recover_io_reset(rs);
+ if (r || RSBandwidth(rs))
+ goto out;
+
+ work = atomic_read(rs->recover.io_count + IO_WORK);
+ if (work) {
+ /* Pay attention to larger recover stripe size. */
+ int recover =
+ atomic_read(rs->recover.io_count + IO_RECOVER) *
+ rs->recover.io_size /
+ rs->set.io_size;
+
+ /*
+ * Don't use more than given bandwidth of
+ * the work io for recovery.
+ */
+ if (recover > work / rs->recover.bandwidth_work) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_NO_BANDWIDTH);
+ return 0;
+ }
+ }
+
+out:
+ atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
+ return 1;
+}
+
+/* Try to get a region to recover. */
+static int recover_get_region(struct raid_set *rs)
+{
+ struct recover *rec = &rs->recover;
+ struct dm_region_hash *rh = rec->rh;
+
+ /* Start quiescing some regions. */
+ if (!RSRegionGet(rs)) {
+ int r = recover_bandwidth(rs); /* Enough bandwidth ?. */
+
+ if (r) {
+ r = dm_rh_recovery_prepare(rh);
+ if (r < 0) {
+ DMINFO("No %sregions to recover",
+ rec->nr_regions_to_recover ?
+ "more " : "");
+ return -ENOENT;
+ }
+ } else
+ return -EAGAIN;
+
+ SetRSRegionGet(rs);
+ }
+
+ if (!rec->reg) {
+ rec->reg = dm_rh_recovery_start(rh);
+ if (rec->reg) {
+ /*
+ * A reference for the region I'll
+ * keep till I've completely synced it.
+ */
+ io_get(rs);
+ rec->pos = dm_rh_region_to_sector(rh,
+ dm_rh_get_region_key(rec->reg));
+ rec->end = rec->pos + dm_rh_get_region_size(rh);
+ return 1;
+ } else
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/* Read/write a recovery stripe. */
+static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
+{
+ /* Read/write flip-flop. */
+ if (TestClearStripeRBW(stripe)) {
+ SetStripeRead(stripe);
+ return recover_read(rs, stripe, idx_get(rs));
+ } else if (TestClearStripeRead(stripe))
+ return recover_write(rs, stripe, idx_get(rs));
+
+ return 0;
+}
+
+/* Reset recovery variables. */
+static void recovery_region_reset(struct raid_set *rs)
+{
+ rs->recover.reg = NULL;
+ ClearRSRegionGet(rs);
+}
+
+/* Update region hash state. */
+static void recover_rh_update(struct raid_set *rs, int error)
+{
+ struct recover *rec = &rs->recover;
+ struct dm_region *reg = rec->reg;
+
+ if (reg) {
+ dm_rh_recovery_end(reg, error);
+ if (!error)
+ rec->nr_regions_recovered++;
+
+ recovery_region_reset(rs);
+ }
+
+ /* Use rec->rh: reg may already be NULL at this point. */
+ dm_rh_update_states(rec->rh, 1);
+ dm_rh_flush(rec->rh);
+ io_put(rs); /* Release the io reference for the region. */
+}
+
+/* Called by main io daemon to recover regions. */
+/* FIXME: cope with MAX_RECOVER > 1. */
+static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
+{
+ int r;
+ struct recover *rec = &rs->recover;
+
+ /* If recovery is active -> return. */
+ if (StripeActive(stripe))
+ return;
+
+ /* io error is fatal for recovery -> stop it. */
+ if (unlikely(StripeError(stripe)))
+ goto err;
+
+ /* Get a region to recover. */
+ r = recover_get_region(rs);
+ switch (r) {
+ case 1: /* Got a new region. */
+ /* Flag read before write. */
+ ClearStripeRead(stripe);
+ SetStripeRBW(stripe);
+ break;
+
+ case 0:
+ /* Got a region in the works. */
+ r = recover_bandwidth(rs);
+ if (r) /* Got enough bandwidth. */
+ break;
+
+ case -EAGAIN:
+ /* No bandwidth/quiesced region yet, try later. */
+ wake_do_raid_delayed(rs, HZ / 10);
+ return;
+
+ case -ENOENT: /* No more regions. */
+ dm_table_event(rs->ti->table);
+ goto free;
+ }
+
+ /* Read/write a recover stripe. */
+ r = recover_stripe_rw(rs, stripe);
+ if (r) {
+ /* IO initiated, get another reference for the IO. */
+ io_get(rs);
+ return;
+ }
+
+ /* Update recovery position within region. */
+ rec->pos += stripe->io.size;
+
+ /* If we're at end of region, update region hash. */
+ if (rec->pos >= rec->end ||
+ rec->pos >= rs->set.sectors_per_dev)
+ recover_rh_update(rs, 0);
+ else
+ SetStripeRBW(stripe);
+
+ /* Schedule myself for another round... */
+ wake_do_raid(rs);
+ return;
+
+err:
+ raid_set_check_degrade(rs, stripe);
+
+ {
+ char buf[BDEVNAME_SIZE];
+
+ DMERR("stopping recovery due to "
+ "ERROR on /dev/%s, stripe at offset %llu",
+ bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
+ (unsigned long long) stripe->key);
+
+ }
+
+ /* Make sure, that all quiesced regions get released. */
+ do {
+ if (rec->reg)
+ dm_rh_recovery_end(rec->reg, -EIO);
+
+ rec->reg = dm_rh_recovery_start(rec->rh);
+ } while (rec->reg);
+
+ recover_rh_update(rs, -EIO);
+free:
+ rs->set.dev_to_init = -1;
+
+ /* Check for jiffies overrun. */
+ rs->recover.end_jiffies = jiffies;
+ if (rs->recover.end_jiffies < rs->recover.start_jiffies)
+ rs->recover.end_jiffies = ~0;
+
+ ClearRSRecover(rs);
+}
+
+static INLINE void do_recovery(struct raid_set *rs)
+{
+ struct stripe *stripe;
+
+ list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
+ _do_recovery(rs, stripe);
+
+ if (!RSRecover(rs))
+ stripe_recover_free(rs);
+}
+
+/*
+ * END recovery functions
+ */
+
+/* End io process all stripes handed in by endio() callback. */
+static void do_endios(struct raid_set *rs)
+{
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+
+ while ((stripe = stripe_endio_pop(sc))) {
+ unsigned count;
+
+ /* Recovery stripe special case. */
+ if (unlikely(StripeRecover(stripe))) {
+ if (stripe_io(stripe))
+ continue;
+
+ io_put(rs); /* Release region io reference. */
+ ClearStripeActive(stripe);
+
+ /* REMOVEME: statistics*/
+ atomic_dec(&sc->active_stripes);
+ continue;
+ }
+
+ /* Early end io all reads on any uptodate chunks. */
+ stripe_endio(READ, stripe, (count = 0, &count));
+ if (stripe_io(stripe)) {
+ if (count) /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_ACTIVE_READS);
+
+ continue;
+ }
+
+ /* Set stripe inactive after all io got processed. */
+ if (TestClearStripeActive(stripe))
+ atomic_dec(&sc->active_stripes);
+
+ /* Unlock stripe (for clustering). */
+ stripe_unlock(rs, stripe);
+
+ /*
+	 * If an io error on a stripe occurred and the RAID set
+ * is still operational, requeue the stripe for io.
+ */
+ if (TestClearStripeError(stripe)) {
+ raid_set_check_degrade(rs, stripe);
+ ClearStripeReconstruct(stripe);
+
+ if (!StripeMerged(stripe) &&
+ raid_set_operational(rs)) {
+ stripe_pages_invalidate(stripe);
+ stripe_flush(stripe, FLUSH_DELAY);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_REQUEUE);
+ continue;
+ }
+ }
+
+ /* Check if the RAID set is inoperational to error ios. */
+ if (!raid_set_operational(rs)) {
+ ClearStripeReconstruct(stripe);
+ stripe_fail_io(stripe);
+ BUG_ON(atomic_read(&stripe->cnt));
+ continue;
+ }
+
+ /* Got to reconstruct a missing chunk. */
+ if (TestClearStripeReconstruct(stripe))
+ reconstruct_xor(stripe);
+
+ /*
+ * Now that we've got a complete stripe, we can
+ * process the rest of the end ios on reads.
+ */
+ BUG_ON(stripe_endio(READ, stripe, NULL));
+ ClearStripeRead(stripe);
+
+ /*
+ * Read-before-write stripes need to be flushed again in
+ * order to work the write data into the pages *after*
+ * they were read in.
+ */
+ if (TestClearStripeMerged(stripe))
+ /* End io all bios which got merged already. */
+ BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
+
+ /* Got to put on flush list because of new writes. */
+ if (StripeRBW(stripe))
+ stripe_flush(stripe, FLUSH_DELAY);
+ }
+}
+
+/*
+ * Stripe cache shrinking.
+ */
+static INLINE void do_sc_shrink(struct raid_set *rs)
+{
+ unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
+
+ if (shrink) {
+ unsigned cur = atomic_read(&rs->sc.stripes);
+
+ sc_shrink(&rs->sc, shrink);
+ shrink -= cur - atomic_read(&rs->sc.stripes);
+ atomic_set(&rs->sc.stripes_to_shrink, shrink);
+
+ /*
+ * Wake myself up in case we failed to shrink the
+ * requested amount in order to try again later.
+ */
+ if (shrink)
+ wake_do_raid(rs);
+ }
+}
+
+
+/*
+ * Process all ios
+ *
+ * We do different things with the io depending on the
+ * state of the region that it's in:
+ *
+ * o reads: hang off stripe cache or postpone if full
+ *
+ * o writes:
+ *
+ * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
+ * In case stripe cache is full or busy, postpone the io.
+ *
+ * RECOVERING: delay the io until recovery of the region completes.
+ *
+ */
+static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
+{
+ int r;
+ unsigned flush = 0;
+ struct dm_region_hash *rh = rs->recover.rh;
+ struct bio *bio;
+ struct bio_list delay, reject;
+
+ bio_list_init(&delay);
+ bio_list_init(&reject);
+
+ /*
+ * Classify each io:
+ * o delay to recovering regions
+ * o queue to all other regions
+ */
+ while ((bio = bio_list_pop(ios))) {
+ /*
+ * In case we get a barrier bio, push it back onto
+ * the input queue unless all work queues are empty
+ * and the stripe cache is inactive.
+ */
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BARRIER);
+ if (!list_empty(rs->sc.lists + LIST_IO) ||
+ !bio_list_empty(&delay) ||
+ !bio_list_empty(&reject) ||
+ sc_active(&rs->sc)) {
+ bio_list_push(ios, bio);
+ break;
+ }
+ }
+
+ r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
+ if (unlikely(r)) {
+ /* Got to wait for recovering regions. */
+ bio_list_add(&delay, bio);
+ SetRSBandwidth(rs);
+ } else {
+ /*
+ * Process ios to non-recovering regions by queueing
+			 * them to stripes (does rh_inc() for writes).
+ */
+ flush += stripe_queue_bio(rs, bio, &reject);
+ }
+ }
+
+ if (flush) {
+ r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
+ if (r)
+ DMERR("dirty log flush");
+ }
+
+ /* Delay ios to regions which are recovering. */
+ while ((bio = bio_list_pop(&delay))) {
+ /* REMOVEME: statistics.*/
+ atomic_inc(rs->stats + S_DELAYED_BIOS);
+ atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
+ dm_rh_delay(rh, bio);
+
+ }
+
+ /* Merge any rejected bios back to the head of the input list. */
+ bio_list_merge_head(ios, &reject);
+}
+
+/* Flush any stripes on the io list. */
+static INLINE void do_flush(struct raid_set *rs)
+{
+ struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
+
+ list_for_each_safe(pos, tmp, list) {
+ int r = stripe_flush(list_entry(pos, struct stripe,
+ lists[LIST_IO]), FLUSH_NOW);
+
+ /* Remove from the list only if the stripe got processed. */
+ if (!r)
+ list_del_init(pos);
+ }
+}
+
+/*
+ * Send an event in case we're getting too busy, i.e. when more than
+ * 4/5 (80%) of the stripe cache's stripes are active.
+ */
+static INLINE void do_busy_event(struct raid_set *rs)
+{
+ if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
+ if (!TestSetRSScBusy(rs))
+ dm_table_event(rs->ti->table);
+ } else
+ ClearRSScBusy(rs);
+}
+
+/* Unplug: let the io roll on the set's devices. */
+static INLINE void do_unplug(struct raid_set *rs)
+{
+ struct raid_dev *dev = rs->dev + rs->set.raid_devs;
+
+ while (dev-- > rs->dev) {
+		/* Only call the device's unplug function if io got queued. */
+ if (io_dev_clear(dev))
+ blk_unplug(bdev_get_queue(dev->dev->bdev));
+ }
+}
+
+/*-----------------------------------------------------------------
+ * RAID daemon
+ *---------------------------------------------------------------*/
+/*
+ * o belabour all end ios
+ * o optionally shrink the stripe cache
+ * o update the region hash states
+ * o optionally do recovery
+ * o grab the input queue
+ * o work on all requeued or new ios and perform stripe cache flushes
+ * unless the RAID set is inoperational (when we error ios)
+ * o check, if the stripe cache gets too busy and throw an event if so
+ * o unplug any component raid devices with queued bios
+ */
+static void do_raid(struct work_struct *ws)
+{
+ struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
+ struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
+ spinlock_t *lock = &rs->io.in_lock;
+
+ /*
+ * We always need to end io, so that ios
+ * can get errored in case the set failed
+ * and the region counters get decremented
+ * before we update the region hash states.
+ */
+redo:
+ do_endios(rs);
+
+ /*
+ * Now that we've end io'd, which may have put stripes on
+ * the LRU list, we shrink the stripe cache if requested.
+ */
+ do_sc_shrink(rs);
+
+ /* Update region hash states before we go any further. */
+ dm_rh_update_states(rs->recover.rh, 1);
+
+ /* Try to recover regions. */
+ if (RSRecover(rs))
+ do_recovery(rs);
+
+ /* More endios -> process. */
+ if (!stripe_endio_empty(&rs->sc)) {
+ atomic_inc(rs->stats + S_REDO);
+ goto redo;
+ }
+
+ /* Quickly grab all new ios queued and add them to the work list. */
+ spin_lock_irq(lock);
+ bio_list_merge(ios, ios_in);
+ bio_list_init(ios_in);
+ spin_unlock_irq(lock);
+
+ /* Let's assume we're operational most of the time ;-). */
+ if (likely(raid_set_operational(rs))) {
+ /* If we got ios, work them into the cache. */
+ if (!bio_list_empty(ios)) {
+ do_ios(rs, ios);
+			do_unplug(rs);	/* Unplug the set's device queues. */
+ }
+
+ do_flush(rs); /* Flush any stripes on io list. */
+		do_unplug(rs);		/* Unplug the set's device queues. */
+ do_busy_event(rs); /* Check if we got too busy. */
+
+ /* More endios -> process. */
+ if (!stripe_endio_empty(&rs->sc)) {
+ atomic_inc(rs->stats + S_REDO);
+ goto redo;
+ }
+ } else
+ /* No way to reconstruct data with too many devices failed. */
+ bio_list_fail(rs, NULL, ios);
+}
+
+/*
+ * Callback for region hash to dispatch
+ * delayed bios queued to recovered regions
+ * (Gets called via rh_update_states()).
+ */
+static void dispatch_delayed_bios(void *context, struct bio_list *bl)
+{
+ struct raid_set *rs = context;
+ struct bio *bio;
+
+ /* REMOVEME: decrement pending delayed bios counter. */
+ bio_list_for_each(bio, bl)
+ atomic_dec(rs->stats + S_DELAYED_BIOS);
+
+ /* Merge region hash private list to work list. */
+ bio_list_merge_head(&rs->io.work, bl);
+ bio_list_init(bl);
+ ClearRSBandwidth(rs);
+}
+
+/*************************************************************
+ * Constructor helpers
+ *************************************************************/
+/* Calculate MB/sec. */
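+/*
+ * Note: speed is the xor count per jiffy measured by xor_speed() on a
+ * recovery stripe; scaling by HZ, the per-device recovery io size (in
+ * sectors) and the number of data devices, then converting to bytes and
+ * shifting down twice by 10 yields MB/s.
+ */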
+static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
+{
+ return to_bytes(speed * rs->set.data_devs *
+ rs->recover.io_size * HZ >> 10) >> 10;
+}
+
+/*
+ * Discover fastest xor algorithm and # of chunks combination.
+ */
+/* Calculate speed for algorithm and # of chunks. */
+static INLINE unsigned xor_speed(struct stripe *stripe)
+{
+ unsigned r = 0;
+ unsigned long j;
+
+ /* Wait for next tick. */
+ for (j = jiffies; j == jiffies;)
+ ;
+
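+	/*
+	 * The busy-wait above aligns to a tick boundary, so the loop below
+	 * counts complete xor runs over exactly one jiffy; the mb() barriers
+	 * keep reordering from skewing the count. The result is only a
+	 * relative speed used to pick the fastest xor function.
+	 */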
+ /* Do xors for a full tick. */
+ for (j = jiffies; j == jiffies;) {
+ mb();
+ common_xor(stripe, stripe->io.size, 0, 0);
+ mb();
+ r++;
+ mb();
+ }
+
+ return r;
+}
+
+/* Optimize xor algorithm for this RAID set. */
+static unsigned xor_optimize(struct raid_set *rs)
+{
+ unsigned chunks_max = 2, speed_max = 0;
+ struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
+ struct stripe *stripe;
+
+ BUG_ON(list_empty(&rs->recover.stripes));
+ stripe = list_first_entry(&rs->recover.stripes, struct stripe,
+ lists[LIST_RECOVER]);
+
+ /*
+ * Got to allow io on all chunks, so that
+ * xor() will actually work on them.
+ */
+ stripe_allow_io(stripe);
+
+ /* Try all xor functions. */
+ while (f-- > xor_funcs) {
+ unsigned speed;
+
+ /* Set actual xor function for common_xor(). */
+ rs->xor.f = f;
+ rs->xor.chunks = XOR_CHUNKS_MAX + 1;
+
+ while (rs->xor.chunks-- > 2) {
+ speed = xor_speed(stripe);
+ if (speed > speed_max) {
+ speed_max = speed;
+ chunks_max = rs->xor.chunks;
+ f_max = f;
+ }
+ }
+ }
+
+ /* Memorize optimum parameters. */
+ rs->xor.f = f_max;
+ rs->xor.chunks = chunks_max;
+ return speed_max;
+}
+
+static inline int array_too_big(unsigned long fixed, unsigned long obj,
+ unsigned long num)
+{
+ return (num > (ULONG_MAX - fixed) / obj);
+}
+
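+/*
+ * Required wakeup callback for dm_region_hash_create() below; intentionally
+ * a no-op, since this target does not use the recovery-waiter mechanism.
+ */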
+static void wakeup_all_recovery_waiters(void *context)
+{
+}
+
+/*
+ * Allocate a RAID context (a RAID set)
+ */
+static int
+context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
+ unsigned stripes, unsigned chunk_size, unsigned io_size,
+ unsigned recover_io_size, unsigned raid_devs,
+ sector_t sectors_per_dev,
+ struct dm_target *ti, unsigned dl_parms, char **argv)
+{
+ int r;
+ unsigned p;
+ size_t len;
+ sector_t region_size, ti_len;
+ struct raid_set *rs = NULL;
+ struct dm_dirty_log *dl;
+ struct recover *rec;
+
+ /*
+ * Create the dirty log
+ *
+ * We need to change length for the dirty log constructor,
+ * because we want an amount of regions for all stripes derived
+ * from the single device size, so that we can keep region
+	 * size = 2^^n independent of the number of devices
+ */
+ ti_len = ti->len;
+ ti->len = sectors_per_dev;
+ dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
+ ti->len = ti_len;
+ if (!dl)
+ goto bad_dirty_log;
+
+ /* Chunk size *must* be smaller than region size. */
+ region_size = dl->type->get_region_size(dl);
+ if (chunk_size > region_size)
+ goto bad_chunk_size;
+
+ /* Recover io size *must* be smaller than region size as well. */
+ if (recover_io_size > region_size)
+ goto bad_recover_io_size;
+
+ /* Size and allocate the RAID set structure. */
+ len = sizeof(*rs->data) + sizeof(*rs->dev);
+ if (array_too_big(sizeof(*rs), len, raid_devs))
+ goto bad_array;
+
+ len = sizeof(*rs) + raid_devs * len;
+ rs = kzalloc(len, GFP_KERNEL);
+ if (!rs)
+ goto bad_alloc;
+
+ rec = &rs->recover;
+ atomic_set(&rs->io.in_process, 0);
+ atomic_set(&rs->io.in_process_max, 0);
+ rec->io_size = recover_io_size;
+
+ /* Pointer to data array. */
+ rs->data = (unsigned long **)
+ ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
+ rec->dl = dl;
+ rs->set.raid_devs = p = raid_devs;
+ rs->set.data_devs = raid_devs - raid_type->parity_devs;
+ rs->set.raid_type = raid_type;
+
+ /*
+ * Set chunk and io size and respective shifts
+ * (used to avoid divisions)
+ */
+ rs->set.chunk_size = chunk_size;
+ rs->set.chunk_mask = chunk_size - 1;
+ rs->set.chunk_shift = ffs(chunk_size) - 1;
+
+ rs->set.io_size = io_size;
+ rs->set.io_mask = io_size - 1;
+ rs->set.io_shift = ffs(io_size) - 1;
+ rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
+
+ rs->set.pages_per_io = chunk_pages(io_size);
+ rs->set.sectors_per_dev = sectors_per_dev;
+
+ rs->set.ei = -1; /* Indicate no failed device. */
+ atomic_set(&rs->set.failed_devs, 0);
+
+ rs->ti = ti;
+
+ atomic_set(rec->io_count + IO_WORK, 0);
+ atomic_set(rec->io_count + IO_RECOVER, 0);
+
+ /* Initialize io lock and queues. */
+ spin_lock_init(&rs->io.in_lock);
+ bio_list_init(&rs->io.in);
+ bio_list_init(&rs->io.work);
+
+ init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
+
+ rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
+
+ rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios, wake_do_raid,
+ wakeup_all_recovery_waiters,
+ rs->ti->begin, MAX_RECOVER, dl,
+ region_size, rs->recover.nr_regions);
+ if (IS_ERR(rec->rh))
+ goto bad_rh;
+
+ /* Initialize stripe cache. */
+ r = sc_init(rs, stripes);
+ if (r)
+ goto bad_sc;
+
+ /* Create dm-io client context. */
+ rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
+ rs->set.pages_per_io);
+ if (IS_ERR(rs->sc.dm_io_client))
+ goto bad_dm_io_client;
+
+ /* REMOVEME: statistics. */
+ stats_reset(rs);
+	ClearRSDevelStats(rs);	/* Disable development statistics. */
+
+ *raid_set = rs;
+ return 0;
+
+bad_dirty_log:
+ TI_ERR_RET("Error creating dirty log", -ENOMEM);
+
+
+bad_chunk_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR("Chunk size larger than region size");
+
+bad_recover_io_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR("Recover stripe io size larger than region size");
+
+bad_array:
+ dm_dirty_log_destroy(dl);
+	TI_ERR("Array too big");
+
+bad_alloc:
+ dm_dirty_log_destroy(dl);
+ TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
+
+bad_rh:
+ dm_dirty_log_destroy(dl);
+ ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
+ goto free_rs;
+
+bad_sc:
+ ti->error = DM_MSG_PREFIX "Error creating stripe cache";
+ goto free;
+
+bad_dm_io_client:
+ ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
+free:
+ sc_exit(&rs->sc);
+ dm_region_hash_destroy(rec->rh); /* Destroys dirty log as well. */
+free_rs:
+ kfree(rs);
+ return -ENOMEM;
+}
+
+/* Free a RAID context (a RAID set). */
+static void
+context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
+{
+ while (r--)
+ dm_put_device(ti, rs->dev[r].dev);
+
+ dm_io_client_destroy(rs->sc.dm_io_client);
+ sc_exit(&rs->sc);
+ dm_region_hash_destroy(rs->recover.rh);
+ dm_dirty_log_destroy(rs->recover.dl);
+ kfree(rs);
+}
+
+/* Create work queue and initialize work. */
+static int rs_workqueue_init(struct raid_set *rs)
+{
+ struct dm_target *ti = rs->ti;
+
+ rs->io.wq = create_singlethread_workqueue(DAEMON);
+ if (!rs->io.wq)
+ TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
+
+ INIT_DELAYED_WORK(&rs->io.dws, do_raid);
+ return 0;
+}
+
+/* Return pointer to raid_type structure for raid name. */
+static struct raid_type *get_raid_type(char *name)
+{
+ struct raid_type *r = ARRAY_END(raid_types);
+
+ while (r-- > raid_types) {
+ if (!strnicmp(STR_LEN(r->name, name)))
+ return r;
+ }
+
+ return NULL;
+}
+
+/* FIXME: factor out to dm core. */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+ sector_t r = a;
+
+ sector_div(r, b);
+ *n = r;
+ return a == r * b;
+}
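+
+/*
+ * Example: multiple(1024, 256, &n) returns 1 with n = 4;
+ * multiple(1000, 256, &n) returns 0 with n = 3.
+ */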
+
+/* Log RAID set information to kernel log. */
+static void raid_set_log(struct raid_set *rs, unsigned speed)
+{
+ unsigned p;
+ char buf[BDEVNAME_SIZE];
+
+ for (p = 0; p < rs->set.raid_devs; p++)
+ DMINFO("/dev/%s is raid disk %u",
+ bdevname(rs->dev[p].dev->bdev, buf), p);
+
+ DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
+ rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
+ atomic_read(&rs->sc.stripes));
+ DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
+ rs->xor.chunks, mbpers(rs, speed));
+ DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
+ rs->set.data_devs, rs->set.raid_devs);
+}
+
+/* Get all devices and offsets. */
+static int
+dev_parms(struct dm_target *ti, struct raid_set *rs,
+ char **argv, int *p)
+{
+ for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
+ int r;
+ unsigned long long tmp;
+ struct raid_dev *dev = rs->dev + *p;
+ union dev_lookup dl = {.dev = dev };
+
+ /* Get offset and device. */
+ r = sscanf(argv[1], "%llu", &tmp);
+ if (r != 1)
+ TI_ERR("Invalid RAID device offset parameter");
+
+ dev->start = tmp;
- r = dm_get_device(ti, argv[0], dev->start,
- rs->set.sectors_per_dev,
- dm_table_get_mode(ti->table), &dev->dev);
++ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
++ &dev->dev);
+ if (r)
+ TI_ERR_RET("RAID device lookup failure", r);
+
+ r = raid_dev_lookup(rs, bynumber, &dl);
+ if (r != -ENODEV && r < *p) {
+ (*p)++; /* Ensure dm_put_device() on actual device. */
+ TI_ERR_RET("Duplicate RAID device", -ENXIO);
+ }
+ }
+
+ return 0;
+}
+
+/* Set recovery bandwidth. */
+static INLINE void
+recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
+{
+ rs->recover.bandwidth = bandwidth;
+ rs->recover.bandwidth_work = 100 / bandwidth;
+}
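+
+/*
+ * Example: bandwidth = 20 yields bandwidth_work = 100 / 20 = 5, so
+ * recover_bandwidth() only allows recovery io while it stays below
+ * work / 5, i.e. roughly 20% of the application io in the same interval.
+ */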
+
+/* Handle variable number of RAID parameters. */
+static int
+raid_variable_parms(struct dm_target *ti, char **argv,
+ unsigned i, int *raid_parms,
+ int *chunk_size, int *chunk_size_parm,
+ int *stripes, int *stripes_parm,
+ int *io_size, int *io_size_parm,
+ int *recover_io_size, int *recover_io_size_parm,
+ int *bandwidth, int *bandwidth_parm)
+{
+ /* Fetch # of variable raid parameters. */
+ if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
+ !range_ok(*raid_parms, 0, 5))
+ TI_ERR("Bad variable raid parameters number");
+
+ if (*raid_parms) {
+ /*
+ * If we've got variable RAID parameters,
+ * chunk size is the first one
+ */
+ if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
+ (*chunk_size != -1 &&
+ (!POWER_OF_2(*chunk_size) ||
+ !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
+ TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
+
+ *chunk_size_parm = *chunk_size;
+ if (*chunk_size == -1)
+ *chunk_size = CHUNK_SIZE;
+
+ /*
+ * In case we've got 2 or more variable raid
+ * parameters, the number of stripes is the second one
+ */
+ if (*raid_parms > 1) {
+ if (sscanf(argv[i++], "%d", stripes) != 1 ||
+ (*stripes != -1 &&
+ !range_ok(*stripes, STRIPES_MIN,
+ STRIPES_MAX)))
+ TI_ERR("Invalid number of stripes: must "
+ "be >= 8 and <= 8192");
+ }
+
+ *stripes_parm = *stripes;
+ if (*stripes == -1)
+ *stripes = STRIPES;
+
+ /*
+ * In case we've got 3 or more variable raid
+ * parameters, the io size is the third one.
+ */
+ if (*raid_parms > 2) {
+ if (sscanf(argv[i++], "%d", io_size) != 1 ||
+ (*io_size != -1 &&
+ (!POWER_OF_2(*io_size) ||
+ !range_ok(*io_size, IO_SIZE_MIN,
+ min(BIO_MAX_SECTORS / 2,
+ *chunk_size)))))
+ TI_ERR("Invalid io size; must "
+				       "be 2^^n and less or equal "
+ "min(BIO_MAX_SECTORS/2, chunk size)");
+ } else
+ *io_size = *chunk_size;
+
+ *io_size_parm = *io_size;
+ if (*io_size == -1)
+ *io_size = *chunk_size;
+
+ /*
+ * In case we've got 4 variable raid parameters,
+ * the recovery stripe io_size is the fourth one
+ */
+ if (*raid_parms > 3) {
+ if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
+ (*recover_io_size != -1 &&
+ (!POWER_OF_2(*recover_io_size) ||
+ !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
+ BIO_MAX_SECTORS / 2))))
+ TI_ERR("Invalid recovery io size; must be "
+				       "2^^n and less or equal BIO_MAX_SECTORS/2");
+ }
+
+ *recover_io_size_parm = *recover_io_size;
+ if (*recover_io_size == -1)
+ *recover_io_size = RECOVER_IO_SIZE;
+
+ /*
+ * In case we've got 5 variable raid parameters,
+ * the recovery io bandwidth is the fifth one
+ */
+ if (*raid_parms > 4) {
+ if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
+ (*bandwidth != -1 &&
+ !range_ok(*bandwidth, BANDWIDTH_MIN,
+ BANDWIDTH_MAX)))
+ TI_ERR("Invalid recovery bandwidth "
+ "percentage; must be > 0 and <= 100");
+ }
+
+ *bandwidth_parm = *bandwidth;
+ if (*bandwidth == -1)
+ *bandwidth = BANDWIDTH;
+ }
+
+ return 0;
+}
+
+/* Parse optional locking parameters. */
+static int
+raid_locking_parms(struct dm_target *ti, char **argv,
+ unsigned i, int *locking_parms,
+ struct dm_raid45_locking_type **locking_type)
+{
+ *locking_parms = 0;
+ *locking_type = &locking_none;
+
+ if (!strnicmp(argv[i], "none", strlen(argv[i])))
+ *locking_parms = 1;
+ else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
+ *locking_type = &locking_none;
+ *locking_parms = 2;
+ } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
+ *locking_type = &locking_cluster;
+ /* FIXME: namespace. */
+ *locking_parms = 3;
+ }
+
+ return *locking_parms == 1 ? -EINVAL : 0;
+}
+
+/* Set backing device information properties of RAID set. */
+static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
+{
+ unsigned p, ra_pages;
+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
+ struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
+
+ /* Set read-ahead for the RAID set and the component devices. */
+ bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
+ ra_pages = chunks * chunk_pages(rs->set.io_size);
+ for (p = rs->set.raid_devs; p--; ) {
+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
+
+ q->backing_dev_info.ra_pages = ra_pages;
+ }
+
+ /* Set congested function and data. */
+ bdi->congested_fn = raid_set_congested;
+ bdi->congested_data = rs;
+
+ dm_put(md);
+}
+
+/* Get read-ahead properties of the RAID set and its component devices. */
+static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
+{
+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
+
+ *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
+ / stripe_pages(rs, rs->set.io_size);
+ *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
+ / chunk_pages(rs->set.io_size);
+
+ dm_put(md);
+}
+
+/*
+ * Construct a RAID4/5 mapping:
+ *
+ * log_type #log_params <log_params> \
+ * raid_type [#parity_dev] #raid_variable_params <raid_params> \
+ * [locking "none"/"cluster"]
+ * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
+ *
+ * log_type = "core"/"disk",
+ * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
+ * log_params = [dirty_log_path] region_size [[no]sync])
+ *
+ * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
+ *
+ * #parity_dev = N if raid_type = "raid4"
+ * o N = -1: pick default = last device
+ * o N >= 0 and < #raid_devs: parity device index
+ *
+ * #raid_variable_params = 0-5; raid_params (-1 = default):
+ * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
+ * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
+ * and <= CHUNK_SIZE_MAX)
+ * o #stripes is number of stripes allocated to stripe cache
+ * (must be > 1 and < STRIPES_MAX)
+ * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
+ * o recover_io_size (io unit size per device for recovery in sectors;
+ *   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
+ * o %recovery_bandwidth is the maximum amount spent on recovery during
+ * application io (1-100%)
+ * If raid_variable_params = 0, defaults will be used.
+ * Any raid_variable_param can be set to -1 to apply a default
+ *
+ * #raid_devs = N (N >= 3)
+ *
+ * #dev_to_initialize = N
+ * -1: initialize parity on all devices
+ * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
+ * of a failed devices content after replacement
+ *
+ * <dev_path> = device_path (eg, /dev/sdd1)
+ * <offset> = begin at offset on <dev_path>
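+ *
+ * Example table line (illustrative only; device names, offsets and the
+ * target length below are assumptions, not requirements):
+ *   0 2097152 raid45 core 2 8192 nosync raid5_la 0 3 -1 \
+ *             /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0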
+ *
+ */
+#define MIN_PARMS 13
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int bandwidth = BANDWIDTH, bandwidth_parm = -1,
+ chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
+ dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
+ i, io_size = IO_SIZE, io_size_parm = -1,
+ r, raid_devs, raid_parms,
+ recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
+ stripes = STRIPES, stripes_parm = -1;
+ unsigned speed;
+ sector_t tmp, sectors_per_dev;
+ struct dm_raid45_locking_type *locking;
+ struct raid_set *rs;
+ struct raid_type *raid_type;
+
+ /* Ensure minimum number of parameters. */
+ if (argc < MIN_PARMS)
+ TI_ERR("Not enough parameters");
+
+ /* Fetch # of dirty log parameters. */
+ if (sscanf(argv[1], "%d", &dl_parms) != 1
+ || !range_ok(dl_parms, 1, 4711))
+ TI_ERR("Bad dirty log parameters number");
+
+ /* Check raid_type. */
+ raid_type = get_raid_type(argv[dl_parms + 2]);
+ if (!raid_type)
+ TI_ERR("Bad raid type");
+
+ /* In case of RAID4, parity drive is selectable. */
+ parity_parm = !!(raid_type->level == raid4);
+
+ /* Handle variable number of RAID parameters. */
+ r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
+ &raid_parms,
+ &chunk_size, &chunk_size_parm,
+ &stripes, &stripes_parm,
+ &io_size, &io_size_parm,
+ &recover_io_size, &recover_io_size_parm,
+ &bandwidth, &bandwidth_parm);
+ if (r)
+ return r;
+
+ r = raid_locking_parms(ti, argv,
+ dl_parms + parity_parm + raid_parms + 4,
+ &locking_parms, &locking);
+ if (r)
+ return r;
+
+ /* # of raid devices. */
+ i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
+ if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
+ raid_devs < raid_type->minimal_devs)
+ TI_ERR("Invalid number of raid devices");
+
+ /* In case of RAID4, check parity drive index is in limits. */
+ if (raid_type->level == raid4) {
+ /* Fetch index of parity device. */
+ if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
+ !range_ok(pi, 0, raid_devs - 1))
+ TI_ERR("Invalid RAID4 parity device index");
+ }
+
+ /*
+ * Index of device to initialize starts at 0
+ *
+ * o -1 -> don't initialize a particular device,
+ * o 0..raid_devs-1 -> initialize respective device
+ * (used for reconstruction of a replaced device)
+ */
+ if (sscanf
+ (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
+ "%d", &dev_to_init) != 1
+ || !range_ok(dev_to_init, -1, raid_devs - 1))
+ TI_ERR("Invalid number for raid device to initialize");
+
+ /* Check # of raid device arguments. */
+ if (argc - dl_parms - parity_parm - raid_parms - 6 !=
+ 2 * raid_devs)
+ TI_ERR("Wrong number of raid device/offset arguments");
+
+ /*
+	 * Check that the table length is divisible without
+	 * remainder by (raid_devs - parity_devs)
+ */
+ if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
+		      &sectors_per_dev))
+ TI_ERR
+	    ("Target length not divisible by number of data devices");
+
+ /*
+ * Check that the device size is
+	 * divisible without remainder by the chunk size
+ */
+ if (!multiple(sectors_per_dev, chunk_size, &tmp))
+		TI_ERR("Device length not divisible by chunk_size");
+
+ /****************************************************************
+ * Now that we checked the constructor arguments ->
+ * let's allocate the RAID set
+ ****************************************************************/
+ r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
+ recover_io_size, raid_devs, sectors_per_dev,
+ ti, dl_parms, argv);
+ if (r)
+ return r;
+
+ /*
+ * Set these here in order to avoid passing
+ * too many arguments to context_alloc()
+ */
+ rs->set.dev_to_init_parm = dev_to_init;
+ rs->set.dev_to_init = dev_to_init;
+ rs->set.pi_parm = pi;
+ rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
+ rs->set.raid_parms = raid_parms;
+ rs->set.chunk_size_parm = chunk_size_parm;
+ rs->set.io_size_parm = io_size_parm;
+ rs->sc.stripes_parm = stripes_parm;
+ rs->recover.io_size_parm = recover_io_size_parm;
+ rs->recover.bandwidth_parm = bandwidth_parm;
+ recover_set_bandwidth(rs, bandwidth);
+
+ /* Use locking type to lock stripe access. */
+ rs->locking = locking;
+
+	/* Get the device/offset tuples. */
+ argv += dl_parms + 6 + parity_parm + raid_parms;
+ r = dev_parms(ti, rs, argv, &i);
+ if (r)
+ goto err;
+
+ /* Initialize recovery. */
+ rs->recover.start_jiffies = jiffies;
+ rs->recover.end_jiffies = 0;
+ recovery_region_reset(rs);
+
+ /* Allow for recovery of any nosync regions. */
+ SetRSRecover(rs);
+
+ /* Set backing device information (eg. read ahead). */
+ rs_set_bdi(rs, chunk_size * 2, io_size * 4);
+ SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
+
+ speed = xor_optimize(rs); /* Select best xor algorithm. */
+
+ /* Initialize work queue to handle this RAID set's io. */
+ r = rs_workqueue_init(rs);
+ if (r)
+ goto err;
+
+ raid_set_log(rs, speed); /* Log information about RAID set. */
+
+ /*
+ * Make sure that dm core only hands maximum io size
+ * length down and pays attention to io boundaries.
+ */
+ ti->split_io = rs->set.io_size;
+ ti->private = rs;
+ return 0;
+
+err:
+ context_free(rs, ti, i);
+ return r;
+}
+
+/*
+ * Destruct a raid mapping
+ */
+static void raid_dtr(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ /* Indicate recovery end so that ios in flight drain. */
+ ClearRSRecover(rs);
+
+ wake_do_raid(rs); /* Wake daemon. */
+ wait_ios(rs); /* Wait for any io still being processed. */
+ destroy_workqueue(rs->io.wq);
+ context_free(rs, ti, rs->set.raid_devs);
+}
+
+/* Queues ios to RAID sets. */
+static inline void queue_bio(struct raid_set *rs, struct bio *bio)
+{
+ int wake;
+ struct bio_list *in = &rs->io.in;
+ spinlock_t *in_lock = &rs->io.in_lock;
+
+ spin_lock_irq(in_lock);
+ wake = bio_list_empty(in);
+ bio_list_add(in, bio);
+ spin_unlock_irq(in_lock);
+
+ /* Wake daemon if input list was empty. */
+ if (wake)
+ wake_do_raid(rs);
+}
+
+/* Raid mapping function. */
+static int raid_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ /* I don't want to waste stripe cache capacity. */
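+	/* Failing readahead is harmless; a normal read follows later if needed. */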
+ if (bio_rw(bio) == READA)
+ return -EIO;
+ else {
+ struct raid_set *rs = ti->private;
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats +
+ (bio_data_dir(bio) == WRITE ?
+ S_BIOS_WRITE : S_BIOS_READ));
+
+ /*
+ * Get io reference to be waiting for to drop
+ * to zero on device suspension/destruction.
+ */
+ io_get(rs);
+ bio->bi_sector -= ti->begin; /* Remap sector. */
+ queue_bio(rs, bio); /* Queue to the daemon. */
+ return DM_MAPIO_SUBMITTED; /* Handle later. */
+ }
+}
+
+/* Device suspend. */
+static void raid_postsuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct dm_dirty_log *dl = rs->recover.dl;
+
+ SetRSSuspended(rs);
+
+ if (RSRecover(rs))
+ dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
+ else
+ wake_do_raid(rs);
+
+ wait_ios(rs); /* Wait for completion of all ios being processed. */
+ if (dl->type->postsuspend && dl->type->postsuspend(dl))
+ /* Suspend dirty log. */
+ /* FIXME: need better error handling. */
+ DMWARN("log suspend failed");
+}
+
+/* Device resume. */
+static void raid_resume(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct recover *rec = &rs->recover;
+ struct dm_dirty_log *dl = rec->dl;
+
+ if (dl->type->resume && dl->type->resume(dl))
+ /* Resume dirty log. */
+ /* FIXME: need better error handling. */
+ DMWARN("log resume failed");
+
+ rec->nr_regions_to_recover =
+ rec->nr_regions - dl->type->get_sync_count(dl);
+
+ ClearRSSuspended(rs);
+
+ /* Reset any unfinished recovery. */
+ if (RSRecover(rs)) {
+ recovery_region_reset(rs);
+ dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
+ } else
+ wake_do_raid(rs);
+}
+
+static INLINE unsigned sc_size(struct raid_set *rs)
+{
+	return to_sector(atomic_read(&rs->sc.stripes) *
+			 (sizeof(struct stripe) +
+			  (sizeof(struct stripe_set) +
+			   (sizeof(struct page_list) +
+			    to_bytes(rs->set.io_size) * rs->set.raid_devs)) +
+			  (rs->recover.end_jiffies ? 0 :
+			   to_bytes(rs->set.raid_devs * rs->recover.io_size))));
+}
+
+/* REMOVEME: status output for development. */
+static void
+raid_devel_stats(struct dm_target *ti, char *result,
+ unsigned *size, unsigned maxlen)
+{
+ unsigned chunks, stripes, sz = *size;
+ unsigned long j;
+ char buf[BDEVNAME_SIZE], *p;
+ struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
+ struct raid_set *rs = ti->private;
+ struct recover *rec = &rs->recover;
+ struct timespec ts;
+
+ DMEMIT("%s ", version);
+ DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
+ DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
+
+ for (sm = stats_map; sm < sm_end; sm++)
+ DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
+
+ DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
+ DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
+ atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
+ sc_size(rs));
+
+ j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
+ rec->start_jiffies;
+ jiffies_to_timespec(j, &ts);
+ sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
+ p = strchr(buf, '.');
+ p[3] = 0;
+
+ DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
+ (unsigned long long) rec->nr_regions_recovered,
+ RSRegionGet(rs) ? "+" : "",
+ (unsigned long long) rec->nr_regions_to_recover,
+ (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
+
+ rs_get_ra(rs, &stripes, &chunks);
+ DMEMIT("ra=%u/%u ", stripes, chunks);
+
+ *size = sz;
+}
+
+static int
+raid_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ unsigned i, sz = 0;
+ char buf[BDEVNAME_SIZE];
+ struct raid_set *rs = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ /* REMOVEME: statistics. */
+ if (RSDevelStats(rs))
+ raid_devel_stats(ti, result, &sz, maxlen);
+
+ DMEMIT("%u ", rs->set.raid_devs);
+
+ for (i = 0; i < rs->set.raid_devs; i++)
+ DMEMIT("%s ",
+ format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
+
+ DMEMIT("1 ");
+ for (i = 0; i < rs->set.raid_devs; i++) {
+ DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
+
+ if (rs->set.raid_type->level == raid4 &&
+ i == rs->set.pi)
+ DMEMIT("p");
+
+ if (rs->set.dev_to_init == i)
+ DMEMIT("i");
+ }
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ sz = rs->recover.dl->type->status(rs->recover.dl, type,
+ result, maxlen);
+ DMEMIT("%s %u ", rs->set.raid_type->name,
+ rs->set.raid_parms);
+
+ if (rs->set.raid_type->level == raid4)
+ DMEMIT("%d ", rs->set.pi_parm);
+
+ if (rs->set.raid_parms)
+ DMEMIT("%d ", rs->set.chunk_size_parm);
+
+ if (rs->set.raid_parms > 1)
+ DMEMIT("%d ", rs->sc.stripes_parm);
+
+ if (rs->set.raid_parms > 2)
+ DMEMIT("%d ", rs->set.io_size_parm);
+
+ if (rs->set.raid_parms > 3)
+ DMEMIT("%d ", rs->recover.io_size_parm);
+
+ if (rs->set.raid_parms > 4)
+ DMEMIT("%d ", rs->recover.bandwidth_parm);
+
+ DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
+
+ for (i = 0; i < rs->set.raid_devs; i++)
+ DMEMIT("%s %llu ",
+ format_dev_t(buf,
+ rs->dev[i].dev->bdev->bd_dev),
+ (unsigned long long) rs->dev[i].start);
+ }
+
+ return 0;
+}
+
+/*
+ * Message interface
+ */
+enum raid_msg_actions {
+ act_bw, /* Recovery bandwidth switch. */
+ act_dev, /* Device failure switch. */
+ act_overwrite, /* Stripe overwrite check. */
+ act_read_ahead, /* Set read ahead. */
+ act_stats, /* Development statistics switch. */
+ act_sc, /* Stripe cache switch. */
+
+ act_on, /* Set entity on. */
+ act_off, /* Set entity off. */
+ act_reset, /* Reset entity. */
+
+ act_set = act_on, /* Set # absolute. */
+ act_grow = act_off, /* Grow # by an amount. */
+ act_shrink = act_reset, /* Shrink # by an amount. */
+};
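+
+/*
+ * Note: act_set/act_grow/act_shrink deliberately alias act_on/act_off/
+ * act_reset, so the same parsed action bits serve both the switch-style
+ * and the numeric messages.
+ */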
+
+/* Turn a delta to absolute. */
+static int _absolute(unsigned long action, int act, int r)
+{
+ /* Make delta absolute. */
+ if (test_bit(act_set, &action))
+ ;
+ else if (test_bit(act_grow, &action))
+ r += act;
+ else if (test_bit(act_shrink, &action))
+ r = act - r;
+ else
+ r = -EINVAL;
+
+ return r;
+}
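+
+/*
+ * Example: with act = 50 and an incoming value of 10, act_set keeps 10,
+ * act_grow yields 60 and act_shrink yields 40 (act - value).
+ */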
+
+ /* Change recovery io bandwidth. */
+static int bandwidth_change(struct dm_msg *msg, void *context)
+{
+ struct raid_set *rs = context;
+ int act = rs->recover.bandwidth;
+ int bandwidth = DM_MSG_INT_ARG(msg);
+
+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+ /* Make delta bandwidth absolute. */
+ bandwidth = _absolute(msg->action, act, bandwidth);
+
+ /* Check range. */
+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+ recover_set_bandwidth(rs, bandwidth);
+ return 0;
+ }
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Change state of a device (running/offline). */
+/* FIXME: this only works while recovering! */
+static int device_state(struct dm_msg *msg, void *context)
+{
+ int r;
+ const char *str = "is already ";
+ union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
+ struct raid_set *rs = context;
+
+ r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
+ bymajmin : byname, &dl);
+ if (r == -ENODEV) {
+ DMERR("device %s is no member of this set", dl.dev_name);
+ return r;
+ }
+
+ if (test_bit(act_off, &msg->action)) {
+ if (dev_operational(rs, r))
+ str = "";
+ } else if (!dev_operational(rs, r))
+ str = "";
+
+ DMINFO("/dev/%s %s%s", dl.dev_name, str,
+ test_bit(act_off, &msg->action) ? "offline" : "running");
+
+ return test_bit(act_off, &msg->action) ?
+ raid_set_check_and_degrade(rs, NULL, r) :
+ raid_set_check_and_upgrade(rs, r);
+}
+
+/* Set/reset development feature flags. */
+static int devel_flags(struct dm_msg *msg, void *context)
+{
+ struct raid_set *rs = context;
+
+ if (test_bit(act_on, &msg->action))
+ return test_and_set_bit(msg->spec->parm,
+ &rs->io.flags) ? -EPERM : 0;
+ else if (test_bit(act_off, &msg->action))
+ return test_and_clear_bit(msg->spec->parm,
+ &rs->io.flags) ? 0 : -EPERM;
+ else if (test_bit(act_reset, &msg->action)) {
+ if (test_bit(act_stats, &msg->action)) {
+ stats_reset(rs);
+ goto on;
+ } else if (test_bit(act_overwrite, &msg->action)) {
+on:
+ set_bit(msg->spec->parm, &rs->io.flags);
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+ /* Set stripe and chunk read ahead pages. */
+static int read_ahead_set(struct dm_msg *msg, void *context)
+{
+ int stripes = DM_MSG_INT_ARGS(msg, 0);
+ int chunks = DM_MSG_INT_ARGS(msg, 1);
+
+ if (range_ok(stripes, 1, 512) &&
+ range_ok(chunks, 1, 512)) {
+ rs_set_bdi(context, stripes, chunks);
+ return 0;
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Resize the stripe cache. */
+static int stripecache_resize(struct dm_msg *msg, void *context)
+{
+ int act, stripes;
+ struct raid_set *rs = context;
+
+	/* Deny permission in case the daemon is still shrinking! */
+ if (atomic_read(&rs->sc.stripes_to_shrink))
+ return -EPERM;
+
+ stripes = DM_MSG_INT_ARG(msg);
+ if (stripes > 0) {
+ act = atomic_read(&rs->sc.stripes);
+
+ /* Make delta stripes absolute. */
+ stripes = _absolute(msg->action, act, stripes);
+
+ /*
+ * Check range and that the # of stripes changes.
+		 * We can grow from here but need to leave any
+ * shrinking to the worker for synchronization.
+ */
+ if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
+ if (stripes > act)
+ return sc_grow(&rs->sc, stripes - act, SC_GROW);
+ else if (stripes < act) {
+ atomic_set(&rs->sc.stripes_to_shrink,
+ act - stripes);
+ wake_do_raid(rs);
+ }
+
+ return 0;
+ }
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Parse the RAID message action. */
+/*
+ * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'		# e.g. 'ba se 50'
+ * 'de[vice] o[ffline]/r[unning] DevName/maj:min'	# e.g. 'device o /dev/sda'
+ * 'o[verwrite] {on,of[f],r[eset]}'			# e.g. 'o of'
+ * 'r[ead_ahead] set #stripes #chunks'			# e.g. 'r se 3 2'
+ * 'sta[tistics] {on,of[f],r[eset]}'			# e.g. 'stat of'
+ * 'str[ipecache] {se[t],g[row],sh[rink]} #'		# e.g. 'stripe set 1024'
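+ *
+ * Example (illustrative; "my_raid5" is a hypothetical mapped device name):
+ *   dmsetup message my_raid5 0 bandwidth set 25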
+ *
+ */
+static int
+raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	/* Variables to store the parsed parameters in. */
+ static int i[2];
+ static unsigned long *i_arg[] = {
+ (unsigned long *) i + 0,
+ (unsigned long *) i + 1,
+ };
+ static char *p;
+ static unsigned long *p_arg[] = { (unsigned long *) &p };
+
+ /* Declare all message option strings. */
+ static char *str_sgs[] = { "set", "grow", "shrink" };
+ static char *str_dev[] = { "running", "offline" };
+ static char *str_oor[] = { "on", "off", "reset" };
+
+ /* Declare all actions. */
+ static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
+ static unsigned long act_oor[] = { act_on, act_off, act_reset };
+
+ /* Bandwidth option. */
+ static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
+ static struct dm_message_argument bw_args = {
+ 1, i_arg, { dm_msg_int_t }
+ };
+
+ /* Device option. */
+ static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
+ static struct dm_message_argument dev_args = {
+ 1, p_arg, { dm_msg_base_t }
+ };
+
+ /* Read ahead option. */
+ static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
+ static struct dm_message_argument ra_args = {
+ 2, i_arg, { dm_msg_int_t, dm_msg_int_t }
+ };
+
+ static struct dm_message_argument null_args = {
+ 0, NULL, { dm_msg_int_t }
+ };
+
+ /* Overwrite and statistics option. */
+ static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
+
+	/* Stripecache option. */
+ static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
+
+ /* Declare messages. */
+ static struct dm_msg_spec specs[] = {
+ { "bandwidth", act_bw, &bw_opt, &bw_args,
+ 0, bandwidth_change },
+ { "device", act_dev, &dev_opt, &dev_args,
+ 0, device_state },
+ { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
+ RS_CHECK_OVERWRITE, devel_flags },
+ { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
+ 0, read_ahead_set },
+ { "statistics", act_stats, &ovr_stats_opt, &null_args,
+ RS_DEVEL_STATS, devel_flags },
+ { "stripecache", act_sc, &stripe_opt, &bw_args,
+ 0, stripecache_resize },
+ };
+
+ /* The message for the parser. */
+ struct dm_msg msg = {
+ .num_specs = ARRAY_SIZE(specs),
+ .specs = specs,
+ };
+
+ return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
+}
+/*
+ * END message interface
+ */
+
+static struct target_type raid_target = {
+ .name = "raid45",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = raid_ctr,
+ .dtr = raid_dtr,
+ .map = raid_map,
+ .postsuspend = raid_postsuspend,
+ .resume = raid_resume,
+ .status = raid_status,
+ .message = raid_message,
+};
+
+static void init_exit(const char *bad_msg, const char *good_msg, int r)
+{
+ if (r)
+ DMERR("Failed to %sregister target [%d]", bad_msg, r);
+ else
+ DMINFO("%s %s", good_msg, version);
+}
+
+static int __init dm_raid_init(void)
+{
+ int r;
+
+ r = dm_register_target(&raid_target);
+ init_exit("", "initialized", r);
+ return r;
+}
+
+static void __exit dm_raid_exit(void)
+{
+ dm_unregister_target(&raid_target);
+ init_exit("un", "exit", 0);
+}
+
+/* Module hooks. */
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
+MODULE_LICENSE("GPL");
struct uvc_control_mapping *mapping;
struct uvc_menu_info *menu;
unsigned int i;
- __u8 *data;
int ret;
+ if ((chain->dev->quirks & UVC_QUIRK_HUE_EPIPE) &&
+ (v4l2_ctrl->id == V4L2_CID_HUE))
+ return -EINVAL;
+
ctrl = uvc_find_control(chain, v4l2_ctrl->id, &mapping);
if (ctrl == NULL)
return -EINVAL;
[board_82575] = &e1000_82575_info,
};
+static int entropy = 0;
+module_param(entropy, int, 0);
+MODULE_PARM_DESC(entropy, "Allow igb to populate the /dev/random entropy pool");
+
- static struct pci_device_id igb_pci_tbl[] = {
+ static DEFINE_PCI_DEVICE_TABLE(igb_pci_tbl) = {
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_COPPER), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_FIBER), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_SERDES), board_82575 },
skb = re->skb;
sky2_rx_unmap_skb(sky2->hw->pdev, re);
-
prefetch(skb->data);
- re->skb = nskb;
- if (sky2_rx_map_skb(sky2->hw->pdev, re, hdr_space)) {
- dev_kfree_skb(nskb);
- re->skb = skb;
- return NULL;
- }
+ *re = nre;
if (skb_shinfo(skb)->nr_frags)
- skb_put_frags(skb, hdr_space, length);
+ skb_put_frags(sky2, skb, hdr_space, length);
else
skb_put(skb, length);
return skb;
#include "tg3.h"
+static int entropy = 0;
+module_param(entropy, int, 0);
+MODULE_PARM_DESC(entropy, "Allow tg3 to populate the /dev/random entropy pool");
+
#define DRV_MODULE_NAME "tg3"
- #define PFX DRV_MODULE_NAME ": "
- #define DRV_MODULE_VERSION "3.106"
- #define DRV_MODULE_RELDATE "January 12, 2010"
+ #define DRV_MODULE_VERSION "3.108"
+ #define DRV_MODULE_RELDATE "February 17, 2010"
#define TG3_DEF_MAC_MODE 0
#define TG3_DEF_RX_MODE 0
depends on PCMCIA && (BROKEN || !M32R)
select WIRELESS_EXT
select WEXT_SPY
+ select WEXT_PRIV
select CRYPTO
- select CRYPTO_AES
---help---
This is the standard Linux driver to support Cisco/Aironet PCMCIA
802.11 wireless cards. This driver is the same as the Aironet
default_int_mode:
#endif /* CONFIG_PCI_MSI */
/* if we get here we're going to use the default interrupt mode */
- h->intr[SIMPLE_MODE_INT] = pdev->irq;
- return;
+ h->intr[PERF_MODE_INT] = pdev->irq;
}
-static int hpsa_pci_init(struct ctlr_info *h, struct pci_dev *pdev)
+static int __devinit hpsa_pci_init(struct ctlr_info *h, struct pci_dev *pdev)
{
ushort subsystem_vendor_id, subsystem_device_id, command;
- __u32 board_id, scratchpad = 0;
- __u64 cfg_offset;
- __u32 cfg_base_addr;
- __u64 cfg_base_addr_index;
+ u32 board_id, scratchpad = 0;
+ u64 cfg_offset;
+ u32 cfg_base_addr;
+ u64 cfg_base_addr_index;
+ u32 trans_offset;
int i, prod_index, err;
subsystem_vendor_id = pdev->subsystem_vendor;
#include <linux/platform_device.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
+ #include <linux/pm_runtime.h>
+#ifdef CONFIG_KDB_USB
+#include <linux/kdb.h>
+#endif
#include <linux/usb.h>
*/
int (*update_hub_device)(struct usb_hcd *, struct usb_device *hdev,
struct usb_tt *tt, gfp_t mem_flags);
+ int (*reset_device)(struct usb_hcd *, struct usb_device *);
+
+#ifdef CONFIG_KDB_USB
+ /* KDB poll function for this HC */
+ int (*kdb_poll_char)(struct urb *urb);
+ void (*kdb_completion)(struct urb *urb);
+ kdb_hc_keyboard_attach_t kdb_hc_keyboard_attach;
+ kdb_hc_keyboard_detach_t kdb_hc_keyboard_detach;
+#endif /* CONFIG_KDB_USB */
};
extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb);
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
+ #include <linux/quotaops.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
+#include "namei.h"
#include "xattr.h"
#include "acl.h"
+#include "nfs4acl.h"
/*
* Called when an inode is released. Note that this is different
sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
ret = inode;
- if (vfs_dq_alloc_inode(inode)) {
- err = -EDQUOT;
+ dquot_initialize(inode);
+ err = dquot_alloc_inode(inode);
+ if (err)
goto fail_drop;
- }
- err = ext3_init_acl(handle, inode, dir);
+ if (test_opt(sb, NFS4ACL))
+ err = ext3_nfs4acl_init(handle, inode, dir);
+ else
+ err = ext3_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
+#include <linux/nfs4acl.h>
#include <linux/log2.h>
- #include <linux/precache.h>
#include <asm/uaccess.h>
static void ext3_clear_inode(struct inode *inode)
{
struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
+
+#ifdef CONFIG_EXT3_FS_NFS4ACL
+ if (EXT3_I(inode)->i_nfs4acl &&
+ EXT3_I(inode)->i_nfs4acl != EXT3_NFS4ACL_NOT_CACHED) {
+ nfs4acl_put(EXT3_I(inode)->i_nfs4acl);
+ EXT3_I(inode)->i_nfs4acl = EXT3_NFS4ACL_NOT_CACHED;
+ }
+#endif
+ dquot_drop(inode);
ext3_discard_reservation(inode);
EXT3_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
NULL, 0))
goto failed_mount;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
- if (sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL)
++ if (test_opt(sb, POSIX_ACL))
+ sb->s_flags |= MS_POSIXACL;
- if (sbi->s_mount_opt & EXT3_MOUNT_NFS4ACL)
++ if (test_opt(sb, NFS4ACL))
+ sb->s_flags |= MS_POSIXACL | MS_WITHAPPEND;
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
goto restore_opts;
}
- if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+ if (test_opt(sb, ABORT))
ext3_abort(sb, __func__, "Abort forced by user");
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
- if (sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL)
++ if (test_opt(sb, POSIX_ACL))
+ sb->s_flags |= MS_POSIXACL;
- if (sbi->s_mount_opt & EXT3_MOUNT_NFS4ACL)
++ if (test_opt(sb, NFS4ACL))
+ sb->s_flags |= MS_POSIXACL;
+
es = sbi->s_es;
return -ENOENT;
BUG_ON(victim->d_parent->d_inode != dir);
- audit_inode_child(victim->d_name.name, victim, dir);
+ audit_inode_child(victim, dir);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (dir->i_op->may_delete) {
+ if (IS_RDONLY(dir))
+ return -EROFS;
+ if (IS_IMMUTABLE(dir))
+ return -EACCES;
+ error = dir->i_op->may_delete(dir, victim->d_inode);
+ if (!error)
+ error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ } else {
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (!error && check_sticky(dir, victim->d_inode))
+ error = -EPERM;
+ }
if (error)
return error;
if (IS_APPEND(dir))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+
+ if (dir->i_op->may_create) {
+ if (IS_RDONLY(dir))
+ return -EROFS;
+ if (IS_IMMUTABLE(dir))
+ return -EACCES;
+ error = dir->i_op->may_create(dir, isdir);
+ if (!error)
+ error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ } else
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+
+ return error;
}
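The may_create()/may_delete() changes above call an optional per-filesystem hook when one is provided and fall back to the generic permission check otherwise. A minimal user-space sketch of that "optional callback with a default" dispatch is below; all type and function names (struct ops, generic_check, fs_specific_check) are invented for illustration and are not part of the patch.

#include <stdio.h>

/* Invented operations table with an optional hook, in the spirit of
 * inode_operations gaining ->may_create / ->may_delete above. */
struct ops {
	int (*may_create)(void);	/* may be NULL */
};

static int generic_check(void)
{
	return 0;			/* pretend the generic permission check passed */
}

static int fs_specific_check(void)
{
	return -1;			/* pretend this filesystem refuses the operation */
}

/* Use the hook when the filesystem provides one, otherwise fall back. */
static int may_create(const struct ops *op)
{
	if (op->may_create)
		return op->may_create();
	return generic_check();
}

int main(void)
{
	struct ops plain = { .may_create = NULL };
	struct ops special = { .may_create = fs_specific_check };

	printf("plain fs: %d, special fs: %d\n",
	       may_create(&plain), may_create(&special));
	return 0;
}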
- /*
- * O_DIRECTORY translates into forcing a directory lookup.
- */
- static inline int lookup_flags(unsigned int f)
- {
- unsigned long retval = LOOKUP_FOLLOW;
-
- if (f & O_NOFOLLOW)
- retval &= ~LOOKUP_FOLLOW;
-
- if (f & O_DIRECTORY)
- retval |= LOOKUP_DIRECTORY;
-
- return retval;
- }
-
/*
* p1 and p2 should be directories on the same fs.
*/
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
NFS_PAGE_TAG_COMMIT);
+ nfsi->ncommit++;
spin_unlock(&inode->i_lock);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+ inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ BDI_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
put_write_access(inode);
goto out_nfserr;
}
- vfs_dq_init(inode);
+
+ /*
+ * Tell a Hierarchical Storage Manager (e.g. via DMAPI) to
+ * return EAGAIN when an action would take minutes instead of
+ * milliseconds so that NFS can reply to the client with
+ * NFSERR_JUKEBOX instead of blocking an nfsd thread.
+ */
+ if (rqstp->rq_vers >= 3)
+ iap->ia_valid |= ATTR_NO_BLOCK;
}
/* sanitize the mode change */
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
is the same as ~0 - unlimited */
+
+ struct ocfs2_alloc_reservation *ac_resv;
};
+ void ocfs2_init_steal_slots(struct ocfs2_super *osb);
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
{
return err;
}
-/**
- * do_remount_sb - asks filesystem to change mount options.
- * @sb: superblock in question
- * @flags: numeric part of options
- * @data: the rest of options
- * @force: whether or not to force the change
- *
- * Alters the mount options of a mounted file system.
- */
-int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+#define REMOUNT_FORCE 1
+#define REMOUNT_SHRINK_DCACHE 2
+
+static int __do_remount_sb(struct super_block *sb, int flags, void *data, int rflags)
{
int retval;
- int remount_rw;
+ int remount_rw, remount_ro;
if (sb->s_frozen != SB_UNFROZEN)
return -EBUSY;
if (flags & MS_RDONLY)
acct_auto_close(sb);
- shrink_dcache_sb(sb);
+ if (rflags & REMOUNT_SHRINK_DCACHE)
+ shrink_dcache_sb(sb);
sync_filesystem(sb);
+ remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+ remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
+
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
- if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+ if (remount_ro) {
- if (force)
+ if (rflags & REMOUNT_FORCE)
mark_files_ro(sb);
else if (!fs_may_remount_ro(sb))
return -EBUSY;
xfs_globals.o \
xfs_ioctl.o \
xfs_iops.o \
- xfs_lrw.o \
xfs_super.o \
xfs_sync.o \
- xfs_xattr.o)
+ xfs_xattr.o \
+ xfs_ksyms.o)
# Objects in support/
xfs-y += $(addprefix support/, \
--- /dev/null
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_alloc.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rw.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_inode_item.h"
+#include "xfs_vnodeops.h"
+#include <dmapi.h>
+#include <dmapi_kern.h>
+#include "xfs_dm.h"
+
+#include <linux/mount.h>
+
+#define MAXNAMLEN MAXNAMELEN
+
+#define MIN_DIO_SIZE(mp) ((mp)->m_sb.sb_sectsize)
+#define MAX_DIO_SIZE(mp) (INT_MAX & ~(MIN_DIO_SIZE(mp) - 1))
+
+static void up_rw_sems(struct inode *ip, int flags)
+{
+ if (flags & DM_FLAGS_IALLOCSEM_WR)
+ up_write(&ip->i_alloc_sem);
+ if (flags & DM_FLAGS_IMUX)
+ mutex_unlock(&ip->i_mutex);
+}
+
+static void down_rw_sems(struct inode *ip, int flags)
+{
+ if (flags & DM_FLAGS_IMUX)
+ mutex_lock(&ip->i_mutex);
+ if (flags & DM_FLAGS_IALLOCSEM_WR)
+ down_write(&ip->i_alloc_sem);
+}
+
+
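up_rw_sems() and down_rw_sems() above are mirror images: locks are taken in one fixed order and dropped in the reverse order. A hedged pthread sketch of the same discipline follows; the lock names are illustrative only and do not correspond to real kernel locks.

#include <pthread.h>
#include <stdio.h>

/* Two illustrative locks standing in for i_mutex and i_alloc_sem. */
static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;

/* Acquire in a fixed order (outer, then inner)... */
static void lock_both(void)
{
	pthread_mutex_lock(&outer);
	pthread_mutex_lock(&inner);
}

/* ...and release in the reverse order, mirroring how down_rw_sems()
 * and up_rw_sems() undo each other. */
static void unlock_both(void)
{
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
}

int main(void)
{
	lock_both();
	printf("critical section\n");
	unlock_both();
	return 0;
}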
+/* Structure used to hold the on-disk version of a dm_attrname_t. All
+ on-disk attribute names start with the 8-byte string "SGI_DMI_".
+*/
+
+typedef struct {
+ char dan_chars[DMATTR_PREFIXLEN + DM_ATTR_NAME_SIZE + 1];
+} dm_dkattrname_t;
+
+/* Structure used by xfs_dm_get_bulkall(), used as the "private_data"
+ * that we want xfs_bulkstat to send to our formatter.
+ */
+typedef struct {
+ dm_fsid_t fsid;
+ void __user *laststruct;
+ dm_dkattrname_t attrname;
+} dm_bulkstat_one_t;
+
+/* In the on-disk inode, DMAPI attribute names consist of the user-provided
+ name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be
+ changed!
+*/
+
+static const char dmattr_prefix[DMATTR_PREFIXLEN + 1] = DMATTR_PREFIXSTRING;
+
+static dm_size_t dm_min_dio_xfer = 0; /* direct I/O disabled for now */
+
+
+/* See xfs_dm_get_dmattr() for a description of why this is needed. */
+
+#define XFS_BUG_KLUDGE 256 /* max size of an in-inode attribute value */
+
+#define DM_MAX_ATTR_BYTES_ON_DESTROY 256
+
+#define DM_STAT_SIZE(dmtype,namelen) \
+ (sizeof(dmtype) + sizeof(dm_handle_t) + namelen)
+
+#define DM_STAT_ALIGN (sizeof(__uint64_t))
+
+/* DMAPI's E2BIG == EA's ERANGE */
+#define DM_EA_XLATE_ERR(err) { if (err == ERANGE) err = E2BIG; }
+
+static inline size_t dm_stat_align(size_t size)
+{
+ return (size + (DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+}
+
+static inline size_t dm_stat_size(size_t namelen)
+{
+ return dm_stat_align(sizeof(dm_stat_t) + sizeof(dm_handle_t) + namelen);
+}
+
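dm_stat_align() and DM_STAT_ALIGN round record sizes up to an 8-byte boundary with the usual "add (align-1), mask off the low bits" trick. A small user-space sketch of that arithmetic is below; the sample sizes are made up, and only the mask trick itself mirrors the code above.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same power-of-two round-up as dm_stat_align(): add (align-1), then
 * clear the low bits.  'align' must be a power of two for this to work. */
#define ALIGN_UP(x, align)  (((x) + ((align) - 1)) & ~((size_t)(align) - 1))

int main(void)
{
	size_t align = sizeof(uint64_t);	/* DM_STAT_ALIGN is sizeof(__uint64_t) */

	/* 13 rounds up to 16, 16 stays 16, 17 rounds up to 24. */
	assert(ALIGN_UP(13, align) == 16);
	assert(ALIGN_UP(16, align) == 16);
	assert(ALIGN_UP(17, align) == 24);

	printf("round-up of 13/16/17 to %zu: %zu %zu %zu\n", align,
	       ALIGN_UP(13, align), ALIGN_UP(16, align), ALIGN_UP(17, align));
	return 0;
}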
+/*
+ * xfs_dm_send_data_event()
+ *
+ * Send data event to DMAPI. Drop IO lock (if specified) before
+ * the dm_send_data_event() call and reacquire it afterwards.
+ */
+int
+xfs_dm_send_data_event(
+ dm_eventtype_t event,
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ size_t length,
+ int flags,
+ int *lock_flags)
+{
+ struct inode *inode = &ip->i_vnode;
+ int error;
+ uint16_t dmstate;
+
+ /* Returns positive errors to XFS */
+
+ do {
+ dmstate = ip->i_d.di_dmstate;
+ if (lock_flags)
+ xfs_iunlock(ip, *lock_flags);
+
+ up_rw_sems(inode, flags);
+
+ error = dm_send_data_event(event, inode, DM_RIGHT_NULL,
+ offset, length, flags);
+ error = -error; /* DMAPI returns negative errors */
+
+ down_rw_sems(inode, flags);
+
+ if (lock_flags)
+ xfs_ilock(ip, *lock_flags);
+ } while (!error && (ip->i_d.di_dmstate != dmstate));
+
+ return error;
+}
+
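xfs_dm_send_data_event() drops the inode lock, makes the slow dm_send_data_event() call, retakes the lock, and retries if di_dmstate moved underneath it. A hedged stand-alone sketch of that retry-on-change pattern follows; do_slow_work() and the version counter are hypothetical stand-ins, not kernel APIs.

#include <stdio.h>

/* Hypothetical shared state protected by some lock; 'version' is bumped
 * whenever the protected data changes (like di_dmstate above). */
static unsigned int version;

static int do_slow_work(void)
{
	/* Stand-in for the slow call made with the lock dropped. */
	return 0;
}

static int send_with_retry(void)
{
	unsigned int seen;
	int error;

	do {
		seen = version;		/* snapshot while "locked" */
		/* ...drop the lock here... */
		error = do_slow_work();	/* slow call with the lock free */
		/* ...retake the lock here... */
	} while (!error && version != seen);	/* retry if the state moved */

	return error;
}

int main(void)
{
	printf("send_with_retry() -> %d\n", send_with_retry());
	return 0;
}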
+/* prohibited_mr_events
+ *
+ * Return event bits representing any events which cannot have managed
+ * region events set due to memory mapping of the file. If the maximum
+ * protection allowed in any pregion includes PROT_WRITE, and the region
+ * is shared and not text, then neither READ nor WRITE events can be set.
+ * Otherwise if the file is memory mapped, no READ event can be set.
+ *
+ */
+STATIC int
+prohibited_mr_events(
+ struct address_space *mapping)
+{
+ int prohibited = (1 << DM_EVENT_READ);
+
+ if (!mapping_mapped(mapping))
+ return 0;
+
+ spin_lock(&mapping->i_mmap_lock);
+ if (mapping_writably_mapped(mapping))
+ prohibited |= (1 << DM_EVENT_WRITE);
+ spin_unlock(&mapping->i_mmap_lock);
+
+ return prohibited;
+}
+
+#ifdef DEBUG_RIGHTS
+STATIC int
+xfs_vp_to_hexhandle(
+ struct inode *inode,
+ u_int type,
+ char *buffer)
+{
+ dm_handle_t handle;
+ u_char *ip;
+ int length;
+ int error;
+ int i;
+
+ /*
+ * XXX: dm_vp_to_handle doesn't exist.
+ * Looks like this debug code is rather dead.
+ */
+ if ((error = dm_vp_to_handle(inode, &handle)))
+ return(error);
+
+ if (type == DM_FSYS_OBJ) { /* a filesystem handle */
+ length = DM_FSHSIZE;
+ } else {
+ length = DM_HSIZE(handle);
+ }
+ for (ip = (u_char *)&handle, i = 0; i < length; i++) {
+ *buffer++ = "0123456789abcdef"[ip[i] >> 4];
+ *buffer++ = "0123456789abcdef"[ip[i] & 0xf];
+ }
+ *buffer = '\0';
+ return(0);
+}
+#endif /* DEBUG_RIGHTS */
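The DEBUG_RIGHTS helper above dumps a binary handle as hex using a nibble lookup table. A self-contained version of that encoding is below; the buffer contents are invented for the example.

#include <stdio.h>
#include <string.h>

/* Encode 'len' bytes of 'in' as lowercase hex into 'out' (NUL terminated).
 * Same nibble-lookup technique as the handle dump above; 'out' must hold
 * at least 2 * len + 1 bytes. */
static void hex_encode(const unsigned char *in, size_t len, char *out)
{
	size_t i;

	for (i = 0; i < len; i++) {
		*out++ = "0123456789abcdef"[in[i] >> 4];
		*out++ = "0123456789abcdef"[in[i] & 0xf];
	}
	*out = '\0';
}

int main(void)
{
	unsigned char handle[4] = { 0xde, 0xad, 0xbe, 0xef };	/* made-up handle bytes */
	char buf[2 * sizeof(handle) + 1];

	hex_encode(handle, sizeof(handle), buf);
	printf("%s\n", buf);		/* prints "deadbeef" */
	return 0;
}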
+
+
+
+
+/* Copy in and validate an attribute name from user space. It should be a
+ string of at least one and at most DM_ATTR_NAME_SIZE characters. Because
+ the dm_attrname_t structure doesn't provide room for the trailing NULL
+ byte, we just copy in one extra character and then zero it if it
+ happens to be non-NULL.
+*/
+
+STATIC int
+xfs_copyin_attrname(
+ dm_attrname_t __user *from, /* dm_attrname_t in user space */
+ dm_dkattrname_t *to) /* name buffer in kernel space */
+{
+ int error = 0;
+ size_t len;
+
+ strcpy(to->dan_chars, dmattr_prefix);
+
+ len = strnlen_user((char __user *)from, DM_ATTR_NAME_SIZE);
+ if (len == 0)
+ error = EFAULT;
+ else {
+ if (copy_from_user(&to->dan_chars[DMATTR_PREFIXLEN], from, len))
+ to->dan_chars[sizeof(to->dan_chars) - 1] = '\0';
+ else if (to->dan_chars[DMATTR_PREFIXLEN] == '\0')
+ error = EINVAL;
+ else
+ to->dan_chars[DMATTR_PREFIXLEN + len - 1] = '\0';
+ }
+
+ return error;
+}
+
+
+/*
+ * Convert the XFS flags into their DMAPI flag equivalent for export
+ */
+STATIC uint
+_xfs_dic2dmflags(
+ __uint16_t di_flags)
+{
+ uint flags = 0;
+
+ if (di_flags & XFS_DIFLAG_ANY) {
+ if (di_flags & XFS_DIFLAG_REALTIME)
+ flags |= DM_XFLAG_REALTIME;
+ if (di_flags & XFS_DIFLAG_PREALLOC)
+ flags |= DM_XFLAG_PREALLOC;
+ if (di_flags & XFS_DIFLAG_IMMUTABLE)
+ flags |= DM_XFLAG_IMMUTABLE;
+ if (di_flags & XFS_DIFLAG_APPEND)
+ flags |= DM_XFLAG_APPEND;
+ if (di_flags & XFS_DIFLAG_SYNC)
+ flags |= DM_XFLAG_SYNC;
+ if (di_flags & XFS_DIFLAG_NOATIME)
+ flags |= DM_XFLAG_NOATIME;
+ if (di_flags & XFS_DIFLAG_NODUMP)
+ flags |= DM_XFLAG_NODUMP;
+ }
+ return flags;
+}
+
+STATIC uint
+xfs_ip2dmflags(
+ xfs_inode_t *ip)
+{
+ return _xfs_dic2dmflags(ip->i_d.di_flags) |
+ (XFS_IFORK_Q(ip) ? DM_XFLAG_HASATTR : 0);
+}
+
+STATIC uint
+xfs_dic2dmflags(
+ xfs_dinode_t *dip)
+{
+ return _xfs_dic2dmflags(be16_to_cpu(dip->di_flags)) |
+ (XFS_DFORK_Q(dip) ? DM_XFLAG_HASATTR : 0);
+}
+
+/*
+ * This copies selected fields in an inode into a dm_stat structure. Because
+ * these fields must return the same values as they would in stat(), the
+ * majority of this code was copied directly from xfs_getattr(). Any future
+ * changes to xfs_getattr() must also be reflected here.
+ */
+STATIC void
+xfs_dip_to_stat(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_stat_t *buf)
+{
+ xfs_dinode_t *dic = dip;
+
+ /*
+ * The inode format changed when we moved the link count and
+ * made it 32 bits long. If this is an old format inode,
+ * convert it in memory to look like a new one. If it gets
+ * flushed to disk we will convert back before flushing or
+ * logging it. We zero out the new projid field and the old link
+ * count field. We'll handle clearing the pad field (the remains
+ * of the old uuid field) when we actually convert the inode to
+ * the new format. We don't change the version number so that we
+ * can distinguish this from a real new format inode.
+ */
+ if (dic->di_version == 1) {
+ buf->dt_nlink = be16_to_cpu(dic->di_onlink);
+ /*buf->dt_xfs_projid = 0;*/
+ } else {
+ buf->dt_nlink = be32_to_cpu(dic->di_nlink);
+ /*buf->dt_xfs_projid = be16_to_cpu(dic->di_projid);*/
+ }
+ buf->dt_ino = ino;
+ buf->dt_dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ buf->dt_mode = be16_to_cpu(dic->di_mode);
+ buf->dt_uid = be32_to_cpu(dic->di_uid);
+ buf->dt_gid = be32_to_cpu(dic->di_gid);
+ buf->dt_size = be64_to_cpu(dic->di_size);
+ buf->dt_atime = be32_to_cpu(dic->di_atime.t_sec);
+ buf->dt_mtime = be32_to_cpu(dic->di_mtime.t_sec);
+ buf->dt_ctime = be32_to_cpu(dic->di_ctime.t_sec);
+ buf->dt_xfs_xflags = xfs_dic2dmflags(dip);
+ buf->dt_xfs_extsize =
+ be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
+ buf->dt_xfs_extents = be32_to_cpu(dic->di_nextents);
+ buf->dt_xfs_aextents = be16_to_cpu(dic->di_anextents);
+ buf->dt_xfs_igen = be32_to_cpu(dic->di_gen);
+ buf->dt_xfs_dmstate = be16_to_cpu(dic->di_dmstate);
+
+ switch (dic->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ buf->dt_rdev = xfs_dinode_get_rdev(dic);
+ buf->dt_blksize = BLKDEV_IOSIZE;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks =
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dic->di_nblocks));
+ break;
+ }
+
+ memset(&buf->dt_pad1, 0, sizeof(buf->dt_pad1));
+ memset(&buf->dt_pad2, 0, sizeof(buf->dt_pad2));
+ memset(&buf->dt_pad3, 0, sizeof(buf->dt_pad3));
+
+ /* Finally fill in the DMAPI specific fields */
+ buf->dt_pers = 0;
+ buf->dt_change = 0;
+ buf->dt_nevents = DM_EVENT_MAX;
+ buf->dt_emask = be32_to_cpu(dic->di_dmevmask);
+ buf->dt_dtime = be32_to_cpu(dic->di_ctime.t_sec);
+ /* Set if one of READ, WRITE or TRUNCATE bits is set in emask */
+ buf->dt_pmanreg = (DMEV_ISSET(DM_EVENT_READ, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_WRITE, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_TRUNCATE, buf->dt_emask)) ? 1 : 0;
+}
+
+/*
+ * Pull out both ondisk and incore fields, incore has preference.
+ * The inode must be kept locked SHARED by the caller.
+ */
+STATIC void
+xfs_ip_to_stat(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_inode_t *ip,
+ dm_stat_t *buf)
+{
+ xfs_icdinode_t *dic = &ip->i_d;
+
+ buf->dt_ino = ino;
+ buf->dt_nlink = dic->di_nlink;
+ /*buf->dt_xfs_projid = dic->di_projid;*/
+ buf->dt_mode = dic->di_mode;
+ buf->dt_uid = dic->di_uid;
+ buf->dt_gid = dic->di_gid;
+ buf->dt_size = XFS_ISIZE(ip);
+ buf->dt_dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ buf->dt_atime = VFS_I(ip)->i_atime.tv_sec;
+ buf->dt_mtime = dic->di_mtime.t_sec;
+ buf->dt_ctime = dic->di_ctime.t_sec;
+ buf->dt_xfs_xflags = xfs_ip2dmflags(ip);
+ buf->dt_xfs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
+ buf->dt_xfs_extents = dic->di_nextents;
+ buf->dt_xfs_aextents = dic->di_anextents;
+ buf->dt_xfs_igen = dic->di_gen;
+ buf->dt_xfs_dmstate = dic->di_dmstate;
+
+ switch (dic->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ buf->dt_rdev = ip->i_df.if_u2.if_rdev;
+ buf->dt_blksize = BLKDEV_IOSIZE;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ buf->dt_rdev = 0;
+ buf->dt_blksize = mp->m_sb.sb_blocksize;
+ buf->dt_blocks = XFS_FSB_TO_BB(mp,
+ (dic->di_nblocks + ip->i_delayed_blks));
+ break;
+ }
+
+ memset(&buf->dt_pad1, 0, sizeof(buf->dt_pad1));
+ memset(&buf->dt_pad2, 0, sizeof(buf->dt_pad2));
+ memset(&buf->dt_pad3, 0, sizeof(buf->dt_pad3));
+
+ /* Finally fill in the DMAPI specific fields */
+ buf->dt_pers = 0;
+ buf->dt_change = 0;
+ buf->dt_nevents = DM_EVENT_MAX;
+ buf->dt_emask = dic->di_dmevmask;
+ buf->dt_dtime = dic->di_ctime.t_sec;
+ /* Set if one of READ, WRITE or TRUNCATE bits is set in emask */
+ buf->dt_pmanreg = (DMEV_ISSET(DM_EVENT_READ, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_WRITE, buf->dt_emask) ||
+ DMEV_ISSET(DM_EVENT_TRUNCATE, buf->dt_emask)) ? 1 : 0;
+}
+
+/*
+ * Take the handle and put it at the end of a dm_xstat buffer.
+ * dt_compname is unused in bulkstat - so we zero it out.
+ * Finally, update link in dm_xstat_t to point to next struct.
+ */
+STATIC void
+xfs_dm_handle_to_xstat(
+ dm_xstat_t *xbuf,
+ size_t xstat_sz,
+ dm_handle_t *handle,
+ size_t handle_sz)
+{
+ dm_stat_t *sbuf = &xbuf->dx_statinfo;
+
+ memcpy(xbuf + 1, handle, handle_sz);
+ sbuf->dt_handle.vd_offset = (ssize_t) sizeof(dm_xstat_t);
+ sbuf->dt_handle.vd_length = (size_t) DM_HSIZE(*handle);
+ memset(&sbuf->dt_compname, 0, sizeof(dm_vardata_t));
+ sbuf->_link = xstat_sz;
+}
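xfs_dm_handle_to_xstat() places the handle immediately behind the fixed dm_xstat_t and records where it lives through an offset/length descriptor. A hedged user-space sketch of that "fixed header plus trailing variable data" layout follows; struct vardata and struct record are invented names, not DMAPI types.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative descriptor, in the spirit of dm_vardata_t. */
struct vardata {
	size_t offset;		/* where the payload starts, relative to the record */
	size_t length;		/* payload size in bytes */
};

/* Illustrative fixed-size record header, in the spirit of dm_xstat_t. */
struct record {
	struct vardata handle;	/* describes the bytes that follow the header */
};

int main(void)
{
	const char payload[] = "HANDLE";	/* stand-in for a dm_handle_t */
	size_t total = sizeof(struct record) + sizeof(payload);
	struct record *rec = malloc(total);

	if (!rec)
		return 1;

	/* Copy the payload directly behind the header (rec + 1), then record
	 * its position and size in the descriptor, as the code above does. */
	memcpy(rec + 1, payload, sizeof(payload));
	rec->handle.offset = sizeof(struct record);
	rec->handle.length = sizeof(payload);

	/* A reader recovers the payload from the descriptor alone. */
	assert(memcmp((char *)rec + rec->handle.offset, "HANDLE",
		      rec->handle.length) == 0);
	printf("payload at offset %zu, %zu bytes\n",
	       rec->handle.offset, rec->handle.length);

	free(rec);
	return 0;
}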
+
+STATIC int
+xfs_dm_bulkall_iget_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_daddr_t bno,
+ int *value_lenp,
+ dm_xstat_t *xbuf,
+ u_int *xstat_szp,
+ char *attr_name,
+ caddr_t attr_buf)
+{
+ xfs_inode_t *ip;
+ dm_handle_t handle;
+ u_int xstat_sz = *xstat_szp;
+ int value_len = *value_lenp;
+ int error;
+
+ error = xfs_iget(mp, NULL, ino,
+ XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+ if (error)
+ return error;
+
+ xfs_ip_to_stat(mp, ino, ip, &xbuf->dx_statinfo);
+ dm_ip_to_handle(&ip->i_vnode, &handle);
+ xfs_dm_handle_to_xstat(xbuf, xstat_sz, &handle, sizeof(handle));
+
+ /* Drop ILOCK_SHARED for call to xfs_attr_get */
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ memset(&xbuf->dx_attrdata, 0, sizeof(dm_vardata_t));
+ error = xfs_attr_get(ip, attr_name, attr_buf, &value_len, ATTR_ROOT);
+ iput(&ip->i_vnode);
+
+ DM_EA_XLATE_ERR(error);
+ if (error && (error != ENOATTR)) {
+ if (error == E2BIG)
+ error = ENOMEM;
+ return error;
+ }
+
+ /* How much space was in the attr? */
+ if (error != ENOATTR) {
+ xbuf->dx_attrdata.vd_offset = xstat_sz;
+ xbuf->dx_attrdata.vd_length = value_len;
+ xstat_sz += (value_len+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ }
+ *xstat_szp = xbuf->dx_statinfo._link = xstat_sz;
+ *value_lenp = value_len;
+ return 0;
+}
+
+
+STATIC int
+xfs_dm_inline_attr(
+ xfs_mount_t *mp,
+ xfs_dinode_t *dip,
+ char *attr_name,
+ caddr_t attr_buf,
+ int *value_lenp)
+{
+ if (dip->di_aformat == XFS_DINODE_FMT_LOCAL) {
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ unsigned int namelen = strlen(attr_name);
+ unsigned int valuelen = *value_lenp;
+ int i;
+
+ sf = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ if (sfe->namelen != namelen)
+ continue;
+ if (!(sfe->flags & XFS_ATTR_ROOT))
+ continue;
+ if (memcmp(attr_name, sfe->nameval, namelen) != 0)
+ continue;
+ if (valuelen < sfe->valuelen)
+ return ERANGE;
+ valuelen = sfe->valuelen;
+ memcpy(attr_buf, &sfe->nameval[namelen], valuelen);
+ *value_lenp = valuelen;
+ return 0;
+ }
+ }
+ *value_lenp = 0;
+ return ENOATTR;
+}
+
+STATIC void
+dm_dip_to_handle(
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_fsid_t *fsid,
+ dm_handle_t *handlep)
+{
+ dm_fid_t fid;
+ int hsize;
+
+ fid.dm_fid_len = sizeof(struct dm_fid) - sizeof(fid.dm_fid_len);
+ fid.dm_fid_pad = 0;
+ fid.dm_fid_ino = ino;
+ fid.dm_fid_gen = be32_to_cpu(dip->di_gen);
+
+ memcpy(&handlep->ha_fsid, fsid, sizeof(*fsid));
+ memcpy(&handlep->ha_fid, &fid, fid.dm_fid_len + sizeof(fid.dm_fid_len));
+ hsize = DM_HSIZE(*handlep);
+ memset((char *)handlep + hsize, 0, sizeof(*handlep) - hsize);
+}
+
+STATIC int
+xfs_dm_bulkall_inline_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_fsid_t *fsid,
+ int *value_lenp,
+ dm_xstat_t *xbuf,
+ u_int *xstat_szp,
+ char *attr_name,
+ caddr_t attr_buf)
+{
+ dm_handle_t handle;
+ u_int xstat_sz = *xstat_szp;
+ int value_len = *value_lenp;
+ int error;
+
+ if (dip->di_mode == 0)
+ return ENOENT;
+
+ xfs_dip_to_stat(mp, ino, dip, &xbuf->dx_statinfo);
+ dm_dip_to_handle(ino, dip, fsid, &handle);
+ xfs_dm_handle_to_xstat(xbuf, xstat_sz, &handle, sizeof(handle));
+
+ memset(&xbuf->dx_attrdata, 0, sizeof(dm_vardata_t));
+ error = xfs_dm_inline_attr(mp, dip, attr_name, attr_buf, &value_len);
+ DM_EA_XLATE_ERR(error);
+ if (error && (error != ENOATTR)) {
+ if (error == E2BIG)
+ error = ENOMEM;
+ return error;
+ }
+
+ /* How much space was in the attr? */
+ if (error != ENOATTR) {
+ xbuf->dx_attrdata.vd_offset = xstat_sz;
+ xbuf->dx_attrdata.vd_length = value_len;
+ xstat_sz += (value_len+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ }
+ *xstat_szp = xbuf->dx_statinfo._link = xstat_sz;
+ *value_lenp = value_len;
+ return 0;
+}
+
+/*
+ * This is used by dm_get_bulkall().
+ * Given an inumber, it igets the inode and fills the given buffer
+ * with the dm_xstat structure for the file.
+ */
+STATIC int
+xfs_dm_bulkall_one(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ void *private_data, /* my private data */
+ xfs_daddr_t bno, /* starting block of inode cluster */
+ int *ubused, /* amount of buffer we used */
+ void *dibuff, /* on-disk inode buffer */
+ int *res) /* bulkstat result code */
+{
+ dm_xstat_t *xbuf;
+ u_int xstat_sz;
+ int error;
+ int value_len;
+ int kern_buf_sz;
+ int attr_buf_sz;
+ caddr_t attr_buf;
+ void __user *attr_user_buf;
+ dm_bulkstat_one_t *dmb = (dm_bulkstat_one_t*)private_data;
+
+ /* Returns positive errors to XFS */
+
+ *res = BULKSTAT_RV_NOTHING;
+
+ if (!buffer || xfs_internal_inum(mp, ino))
+ return EINVAL;
+
+ xstat_sz = DM_STAT_SIZE(*xbuf, 0);
+ xstat_sz = (xstat_sz + (DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ if (xstat_sz > ubsize)
+ return ENOMEM;
+
+ kern_buf_sz = xstat_sz;
+ xbuf = kmem_alloc(kern_buf_sz, KM_SLEEP);
+
+ /* Determine place to drop attr value, and available space. */
+ value_len = ubsize - xstat_sz;
+ if (value_len > ATTR_MAX_VALUELEN)
+ value_len = ATTR_MAX_VALUELEN;
+
+ attr_user_buf = buffer + xstat_sz;
+ attr_buf_sz = value_len;
+ attr_buf = kmem_alloc(attr_buf_sz, KM_SLEEP);
+
+ if (!dibuff)
+ error = xfs_dm_bulkall_iget_one(mp, ino, bno,
+ &value_len, xbuf, &xstat_sz,
+ dmb->attrname.dan_chars,
+ attr_buf);
+ else
+ error = xfs_dm_bulkall_inline_one(mp, ino,
+ (xfs_dinode_t *)dibuff,
+ &dmb->fsid,
+ &value_len, xbuf, &xstat_sz,
+ dmb->attrname.dan_chars,
+ attr_buf);
+ if (error)
+ goto out_free_buffers;
+
+ if (copy_to_user(buffer, xbuf, kern_buf_sz)) {
+ error = EFAULT;
+ goto out_free_buffers;
+ }
+ if (copy_to_user(attr_user_buf, attr_buf, value_len)) {
+ error = EFAULT;
+ goto out_free_buffers;
+ }
+
+ kmem_free(attr_buf);
+ kmem_free(xbuf);
+
+ *res = BULKSTAT_RV_DIDONE;
+ if (ubused)
+ *ubused = xstat_sz;
+ dmb->laststruct = buffer;
+ return 0;
+
+ out_free_buffers:
+ kmem_free(attr_buf);
+ kmem_free(xbuf);
+ return error;
+}
+
+/*
+ * Take the handle and put it at the end of a dm_stat buffer.
+ * dt_compname is unused in bulkstat - so we zero it out.
+ * Finally, update link in dm_stat_t to point to next struct.
+ */
+STATIC void
+xfs_dm_handle_to_stat(
+ dm_stat_t *sbuf,
+ size_t stat_sz,
+ dm_handle_t *handle,
+ size_t handle_sz)
+{
+ memcpy(sbuf + 1, handle, handle_sz);
+ sbuf->dt_handle.vd_offset = (ssize_t) sizeof(dm_stat_t);
+ sbuf->dt_handle.vd_length = (size_t) DM_HSIZE(*handle);
+ memset(&sbuf->dt_compname, 0, sizeof(dm_vardata_t));
+ sbuf->_link = stat_sz;
+}
+
+STATIC int
+xfs_dm_bulkattr_iget_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_daddr_t bno,
+ dm_stat_t *sbuf,
+ u_int stat_sz)
+{
+ xfs_inode_t *ip;
+ dm_handle_t handle;
+ int error;
+
+ error = xfs_iget(mp, NULL, ino,
+ XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+ if (error)
+ return error;
+
+ xfs_ip_to_stat(mp, ino, ip, sbuf);
+ dm_ip_to_handle(&ip->i_vnode, &handle);
+ xfs_dm_handle_to_stat(sbuf, stat_sz, &handle, sizeof(handle));
+
+ xfs_iput(ip, XFS_ILOCK_SHARED);
+ return 0;
+}
+
+STATIC int
+xfs_dm_bulkattr_inline_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ xfs_dinode_t *dip,
+ dm_fsid_t *fsid,
+ dm_stat_t *sbuf,
+ u_int stat_sz)
+{
+ dm_handle_t handle;
+
+ if (dip->di_mode == 0)
+ return ENOENT;
+ xfs_dip_to_stat(mp, ino, dip, sbuf);
+ dm_dip_to_handle(ino, dip, fsid, &handle);
+ xfs_dm_handle_to_stat(sbuf, stat_sz, &handle, sizeof(handle));
+ return 0;
+}
+
+/*
+ * This is used by dm_get_bulkattr().
+ * Given an inumber, it igets the inode and fills the given buffer
+ * with the dm_stat structure for the file.
+ */
+STATIC int
+xfs_dm_bulkattr_one(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ void *private_data, /* my private data */
+ xfs_daddr_t bno, /* starting block of inode cluster */
+ int *ubused, /* amount of buffer we used */
+ void *dibuff, /* on-disk inode buffer */
+ int *res) /* bulkstat result code */
+{
+ dm_stat_t *sbuf;
+ u_int stat_sz;
+ int error;
+ dm_bulkstat_one_t *dmb = (dm_bulkstat_one_t*)private_data;
+
+ /* Returns positive errors to XFS */
+
+ *res = BULKSTAT_RV_NOTHING;
+
+ if (!buffer || xfs_internal_inum(mp, ino))
+ return EINVAL;
+
+ stat_sz = DM_STAT_SIZE(*sbuf, 0);
+ stat_sz = (stat_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+ if (stat_sz > ubsize)
+ return ENOMEM;
+
+ sbuf = kmem_alloc(stat_sz, KM_SLEEP);
+
+ if (!dibuff)
+ error = xfs_dm_bulkattr_iget_one(mp, ino, bno, sbuf, stat_sz);
+ else
+ error = xfs_dm_bulkattr_inline_one(mp, ino,
+ (xfs_dinode_t *)dibuff,
+ &dmb->fsid, sbuf, stat_sz);
+ if (error)
+ goto out_free_buffer;
+
+ if (copy_to_user(buffer, sbuf, stat_sz)) {
+ error = EFAULT;
+ goto out_free_buffer;
+ }
+
+ kmem_free(sbuf);
+ *res = BULKSTAT_RV_DIDONE;
+ if (ubused)
+ *ubused = stat_sz;
+ dmb->laststruct = buffer;
+ return 0;
+
+ out_free_buffer:
+ kmem_free(sbuf);
+ return error;
+}
+
+/* xfs_dm_f_get_eventlist - return the dm_eventset_t mask for inode ip. */
+
+STATIC int
+xfs_dm_f_get_eventlist(
+ xfs_inode_t *ip,
+ dm_right_t right,
+ u_int nelem,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int *nelemp) /* in kernel space! */
+{
+ dm_eventset_t eventset;
+
+ if (right < DM_RIGHT_SHARED)
+ return(EACCES);
+
+ /* Note that we MUST return a regular file's managed region bits as
+ part of the mask because dm_get_eventlist is supposed to return the
+ union of all managed region flags in those bits. Since we only
+ support one region, we can just return the bits as they are. For
+ all other object types, the bits will already be zero. Handy, huh?
+ */
+
+ eventset = ip->i_d.di_dmevmask;
+
+ /* Now copy the event mask and event count back to the caller. We
+ return the lesser of nelem and DM_EVENT_MAX.
+ */
+
+ if (nelem > DM_EVENT_MAX)
+ nelem = DM_EVENT_MAX;
+ eventset &= (1 << nelem) - 1;
+
+ *eventsetp = eventset;
+ *nelemp = nelem;
+ return(0);
+}
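Both get_eventlist paths clamp nelem to DM_EVENT_MAX and then mask the event set down to that many low bits. A tiny sketch of that clamping follows; EVENT_MAX and the sample masks are chosen arbitrarily for the example.

#include <assert.h>
#include <stdio.h>

#define EVENT_MAX 21u	/* stand-in for DM_EVENT_MAX */

/* Keep only the low 'nelem' events of 'mask', after clamping nelem,
 * exactly as the eventlist code above does. */
static unsigned int truncate_events(unsigned int mask, unsigned int nelem)
{
	if (nelem > EVENT_MAX)
		nelem = EVENT_MAX;
	return mask & ((1u << nelem) - 1);
}

int main(void)
{
	/* 0xff has bits 0..7 set; asking for 4 events keeps bits 0..3. */
	assert(truncate_events(0xffu, 4) == 0x0fu);
	/* Asking for more than EVENT_MAX behaves as if EVENT_MAX was passed. */
	assert(truncate_events(0xffffffffu, 64) == (1u << EVENT_MAX) - 1);

	printf("truncate_events(0xff, 4) = 0x%x\n", truncate_events(0xffu, 4));
	return 0;
}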
+
+
+/* xfs_dm_f_set_eventlist - update the dm_eventset_t mask in the inode vp. Only the
+ bits from zero to maxevent-1 are being replaced; higher bits are preserved.
+*/
+
+STATIC int
+xfs_dm_f_set_eventlist(
+ xfs_inode_t *ip,
+ dm_right_t right,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int maxevent)
+{
+ dm_eventset_t eventset;
+ dm_eventset_t max_mask;
+ dm_eventset_t valid_events;
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ int error;
+
+ if (right < DM_RIGHT_EXCL)
+ return(EACCES);
+
+ eventset = *eventsetp;
+ if (maxevent >= sizeof(ip->i_d.di_dmevmask) * NBBY)
+ return(EINVAL);
+ max_mask = (1 << maxevent) - 1;
+
+ if (S_ISDIR(ip->i_d.di_mode)) {
+ valid_events = DM_XFS_VALID_DIRECTORY_EVENTS;
+ } else { /* file or symlink */
+ valid_events = DM_XFS_VALID_FILE_EVENTS;
+ }
+ if ((eventset & max_mask) & ~valid_events)
+ return(EINVAL);
+
+ /* Adjust the event mask so that the managed region bits will not
+ be altered.
+ */
+
+ max_mask &= ~(1 <<DM_EVENT_READ); /* preserve current MR bits */
+ max_mask &= ~(1 <<DM_EVENT_WRITE);
+ max_mask &= ~(1 <<DM_EVENT_TRUNCATE);
+
+ mp = ip->i_mount;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+ error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return(error);
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ ip->i_d.di_dmevmask = (eventset & max_mask) | (ip->i_d.di_dmevmask & ~max_mask);
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ igrab(&ip->i_vnode);
+ xfs_trans_commit(tp, 0);
+
+ return(0);
+}
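xfs_dm_f_set_eventlist() replaces only the bits selected by max_mask and removes the managed-region bits from that mask so they survive the update. A small sketch of that "merge under a mask" step is below; the bit positions and values are invented.

#include <assert.h>
#include <stdio.h>

/* Replace only the bits of 'oldbits' selected by 'mask' with bits from
 * 'newbits', leaving everything outside the mask untouched -- the same
 * merge as (eventset & max_mask) | (di_dmevmask & ~max_mask) above. */
static unsigned int merge_under_mask(unsigned int oldbits, unsigned int newbits,
				     unsigned int mask)
{
	return (newbits & mask) | (oldbits & ~mask);
}

int main(void)
{
	unsigned int protected_bit = 1u << 1;		/* pretend bit 1 is a managed-region bit */
	unsigned int mask = 0x0fu & ~protected_bit;	/* caller may touch bits 0, 2 and 3 only */
	unsigned int out;

	/* The protected bit stays set even though 'newbits' clears it. */
	out = merge_under_mask(0x0au /* old */, 0x05u /* new */, mask);
	assert(out & protected_bit);

	printf("old=0x0a new=0x05 mask=0x%x -> 0x%x\n", mask, out);
	return 0;
}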
+
+
+/* xfs_dm_fs_get_eventlist - return the dm_eventset_t mask for filesystem vfsp. */
+
+STATIC int
+xfs_dm_fs_get_eventlist(
+ xfs_mount_t *mp,
+ dm_right_t right,
+ u_int nelem,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int *nelemp) /* in kernel space! */
+{
+ dm_eventset_t eventset;
+
+ if (right < DM_RIGHT_SHARED)
+ return(EACCES);
+
+ eventset = mp->m_dmevmask;
+
+ /* Now copy the event mask and event count back to the caller. We
+ return the lesser of nelem and DM_EVENT_MAX.
+ */
+
+ if (nelem > DM_EVENT_MAX)
+ nelem = DM_EVENT_MAX;
+ eventset &= (1 << nelem) - 1;
+
+ *eventsetp = eventset;
+ *nelemp = nelem;
+ return(0);
+}
+
+
+/* xfs_dm_fs_set_eventlist - update the dm_eventset_t mask in the mount structure for
+ filesystem vfsp. Only the bits from zero to maxevent-1 are being replaced;
+ higher bits are preserved.
+*/
+
+STATIC int
+xfs_dm_fs_set_eventlist(
+ xfs_mount_t *mp,
+ dm_right_t right,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int maxevent)
+{
+ dm_eventset_t eventset;
+ dm_eventset_t max_mask;
+
+ if (right < DM_RIGHT_EXCL)
+ return(EACCES);
+
+ eventset = *eventsetp;
+
+ if (maxevent >= sizeof(mp->m_dmevmask) * NBBY)
+ return(EINVAL);
+ max_mask = (1 << maxevent) - 1;
+
+ if ((eventset & max_mask) & ~DM_XFS_VALID_FS_EVENTS)
+ return(EINVAL);
+
+ mp->m_dmevmask = (eventset & max_mask) | (mp->m_dmevmask & ~max_mask);
+ return(0);
+}
+
+
+/* Code in this routine must exactly match the logic in xfs_diordwr() in
+ order for this to work!
+*/
+
+STATIC int
+xfs_dm_direct_ok(
+ xfs_inode_t *ip,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp)
+{
+ xfs_mount_t *mp;
+
+ mp = ip->i_mount;
+
+ /* Realtime files can ONLY do direct I/O. */
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ return(1);
+
+ /* If direct I/O is disabled, or if the request is too small, use
+ buffered I/O.
+ */
+
+ if (!dm_min_dio_xfer || len < dm_min_dio_xfer)
+ return(0);
+
+#if 0
+ /* If the request is not well-formed or is too large, use
+ buffered I/O.
+ */
+
+ if ((__psint_t)bufp & scache_linemask) /* if buffer not aligned */
+ return(0);
+ if (off & mp->m_blockmask) /* if file offset not aligned */
+ return(0);
+ if (len & mp->m_blockmask) /* if xfer length not aligned */
+ return(0);
+ if (len > ctooff(v.v_maxdmasz - 1)) /* if transfer too large */
+ return(0);
+
+ /* A valid direct I/O candidate. */
+
+ return(1);
+#else
+ return(0);
+#endif
+}
+
+
+/* We need to be able to select various combinations of O_NONBLOCK,
+ O_DIRECT, and O_SYNC, yet we don't have a file descriptor and we don't have
+ the file's pathname. All we have is a handle.
+*/
+
+STATIC int
+xfs_dm_rdwr(
+ struct inode *inode,
+ uint fflag,
+ mode_t fmode,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp,
+ int *rvp)
+{
+ const struct cred *cred = current_cred();
+ xfs_inode_t *ip = XFS_I(inode);
+ int error;
+ int oflags;
+ ssize_t xfer;
+ struct file *file;
+ struct dentry *dentry;
+
+ if ((off < 0) || (off > i_size_read(inode)) || !S_ISREG(inode->i_mode))
+ return EINVAL;
+
+ if (fmode & FMODE_READ) {
+ oflags = O_RDONLY;
+ } else {
+ oflags = O_WRONLY;
+ }
+
+ /*
+ * Build file descriptor flags and I/O flags. O_NONBLOCK is needed so
+ * that we don't block on mandatory file locks. This is an invisible IO,
+ * don't change the atime.
+ */
+
+ oflags |= O_LARGEFILE | O_NONBLOCK | O_NOATIME;
+ if (xfs_dm_direct_ok(ip, off, len, bufp))
+ oflags |= O_DIRECT;
+
+ if (fflag & O_SYNC)
+ oflags |= O_SYNC;
+
+ if (inode->i_fop == NULL) {
+ /* no iput; caller did get, and will do put */
+ return EINVAL;
+ }
+
+ igrab(inode);
+
+ dentry = d_obtain_alias(inode);
+ if (dentry == NULL) {
+ iput(inode);
+ return ENOMEM;
+ }
+
+ file = dentry_open(dentry, mntget(ip->i_mount->m_vfsmount), oflags,
+ cred);
+ if (IS_ERR(file)) {
+ return -PTR_ERR(file);
+ }
+ file->f_mode |= FMODE_NOCMTIME;
+
+ if (fmode & FMODE_READ) {
+ xfer = file->f_op->read(file, bufp, len, (loff_t*)&off);
+ } else {
+ xfer = file->f_op->write(file, bufp, len, (loff_t*)&off);
+ }
+
+ if (xfer >= 0) {
+ *rvp = xfer;
+ error = 0;
+ } else {
+ /* xfs_read/xfs_write return negative error--flip it */
+ error = -(int)xfer;
+ }
+
+ fput(file);
+ return error;
+}
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_clear_inherit(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_create_by_handle(
+ struct inode *inode,
+ dm_right_t right,
+ void __user *hanp,
+ size_t hlen,
+ char __user *cname)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_downgrade_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type) /* DM_FSYS_OBJ or zero */
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_downgrade_right: old %d new %d type %d handle %s\n",
+ right, DM_RIGHT_SHARED, type, buffer);
+ } else {
+ printf("dm_downgrade_right: old %d new %d type %d handle "
+ "<INVALID>\n", right, DM_RIGHT_SHARED, type);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+/* Note: xfs_dm_get_allocinfo() makes no attempt to coalesce two adjacent
+ extents when both are of type DM_EXTENT_RES; this is left to the caller.
+ XFS guarantees that there will never be two adjacent DM_EXTENT_HOLE extents.
+
+ In order to provide the caller with all extents in a file including
+ those beyond the file's last byte offset, we have to use the xfs_bmapi()
+ interface.
+*/
+
+STATIC int
+xfs_dm_get_allocinfo_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t __user *offp,
+ u_int nelem,
+ dm_extent_t __user *extentp,
+ u_int __user *nelemp,
+ int *rvp)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp; /* file system mount point */
+ xfs_fileoff_t fsb_offset;
+ xfs_filblks_t fsb_length;
+ dm_off_t startoff;
+ int elem;
+ xfs_bmbt_irec_t *bmp = NULL;
+ u_int bmpcnt = 50;
+ u_int bmpsz = sizeof(xfs_bmbt_irec_t) * bmpcnt;
+ int error = 0;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if ((inode->i_mode & S_IFMT) != S_IFREG)
+ return(-EINVAL);
+
+ if (copy_from_user( &startoff, offp, sizeof(startoff)))
+ return(-EFAULT);
+
+ mp = ip->i_mount;
+ ASSERT(mp);
+
+ if (startoff > XFS_MAXIOFFSET(mp))
+ return(-EINVAL);
+
+ if (nelem == 0)
+ return(-EINVAL);
+
+ /* Convert the caller's starting offset into filesystem allocation
+ units as required by xfs_bmapi(). Round the offset down so that
+ it is sure to be included in the reply.
+ */
+
+ fsb_offset = XFS_B_TO_FSBT(mp, startoff);
+ fsb_length = XFS_B_TO_FSB(mp, XFS_MAXIOFFSET(mp)) - fsb_offset;
+ elem = 0;
+
+ if (fsb_length)
+ bmp = kmem_alloc(bmpsz, KM_SLEEP);
+
+ while (fsb_length && elem < nelem) {
+ dm_extent_t extent;
+ xfs_filblks_t fsb_bias;
+ dm_size_t bias;
+ int lock;
+ int num;
+ int i;
+
+ /* Compute how many getbmap structures to use on the xfs_bmapi
+ call.
+ */
+
+ num = MIN((u_int)(nelem - elem), bmpcnt);
+
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ lock = xfs_ilock_map_shared(ip);
+
+ error = xfs_bmapi(NULL, ip, fsb_offset, fsb_length,
+ XFS_BMAPI_ENTIRE, NULL, 0, bmp, &num, NULL, NULL);
+
+ xfs_iunlock_map_shared(ip, lock);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ if (error) {
+ error = -error; /* Return negative error to DMAPI */
+ goto finish_out;
+ }
+
+ /* Fill in the caller's extents, adjusting the bias in the
+ first entry if necessary.
+ */
+
+ for (i = 0; i < num; i++, extentp++) {
+ bias = startoff - XFS_FSB_TO_B(mp, bmp[i].br_startoff);
+ extent.ex_offset = startoff;
+ extent.ex_length =
+ XFS_FSB_TO_B(mp, bmp[i].br_blockcount) - bias;
+ if (bmp[i].br_startblock == HOLESTARTBLOCK) {
+ extent.ex_type = DM_EXTENT_HOLE;
+ } else {
+ extent.ex_type = DM_EXTENT_RES;
+ }
+ startoff = extent.ex_offset + extent.ex_length;
+
+ if (copy_to_user( extentp, &extent, sizeof(extent))) {
+ error = -EFAULT;
+ goto finish_out;
+ }
+
+ fsb_bias = fsb_offset - bmp[i].br_startoff;
+ fsb_offset += bmp[i].br_blockcount - fsb_bias;
+ fsb_length -= bmp[i].br_blockcount - fsb_bias;
+ elem++;
+ }
+ }
+
+ if (fsb_length == 0) {
+ startoff = 0;
+ }
+ if (copy_to_user( offp, &startoff, sizeof(startoff))) {
+ error = -EFAULT;
+ goto finish_out;
+ }
+
+ if (copy_to_user( nelemp, &elem, sizeof(elem))) {
+ error = -EFAULT;
+ goto finish_out;
+ }
+
+ *rvp = (fsb_length == 0 ? 0 : 1);
+
+finish_out:
+ if (bmp)
+ kmem_free(bmp);
+ return(error);
+}
+
+
+STATIC int
+xfs_dm_zero_xstatinfo_link(
+ dm_xstat_t __user *dxs)
+{
+ dm_xstat_t *ldxs;
+ int error = 0;
+
+ if (!dxs)
+ return 0;
+ ldxs = kmalloc(sizeof(*ldxs), GFP_KERNEL);
+ if (!ldxs)
+ return -ENOMEM;
+ if (copy_from_user(ldxs, dxs, sizeof(*dxs))) {
+ error = -EFAULT;
+ } else {
+ ldxs->dx_statinfo._link = 0;
+ if (copy_to_user(dxs, ldxs, sizeof(*dxs)))
+ error = -EFAULT;
+ }
+ kfree(ldxs);
+ return error;
+}
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_bulkall_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_attrname_t __user *attrnamep,
+ dm_attrloc_t __user *locp,
+ size_t buflen,
+ void __user *bufp, /* address of buffer in user space */
+ size_t __user *rlenp, /* user space address */
+ int *rvalp)
+{
+ int error, done;
+ int nelems;
+ u_int statstruct_sz;
+ dm_attrloc_t loc;
+ xfs_mount_t *mp = XFS_I(inode)->i_mount;
+ dm_attrname_t attrname;
+ dm_bulkstat_one_t dmb;
+
+ /* Returns negative errors to DMAPI */
+
+ if (copy_from_user(&attrname, attrnamep, sizeof(attrname)) ||
+ copy_from_user(&loc, locp, sizeof(loc)))
+ return -EFAULT;
+
+ if (attrname.an_chars[0] == '\0')
+ return(-EINVAL);
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ /* Because we will write directly to the user's buffer, make sure that
+ the buffer is properly aligned.
+ */
+
+ if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
+ return(-EFAULT);
+
+ /* Size of the handle is constant for this function.
+ * If there are no files with attributes, then this will be the
+ * maximum number of inodes we can get.
+ */
+
+ statstruct_sz = DM_STAT_SIZE(dm_xstat_t, 0);
+ statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+
+ nelems = buflen / statstruct_sz;
+ if (nelems < 1) {
+ if (put_user( statstruct_sz, rlenp ))
+ return(-EFAULT);
+ return(-E2BIG);
+ }
+
+ /* Build the on-disk version of the attribute name. */
+ strcpy(dmb.attrname.dan_chars, dmattr_prefix);
+ strncpy(&dmb.attrname.dan_chars[DMATTR_PREFIXLEN],
+ attrname.an_chars, DM_ATTR_NAME_SIZE + 1);
+ dmb.attrname.dan_chars[sizeof(dmb.attrname.dan_chars) - 1] = '\0';
+
+ /*
+ * fill the buffer with dm_xstat_t's
+ */
+
+ dmb.laststruct = NULL;
+ memcpy(&dmb.fsid, mp->m_fixedfsid, sizeof(dm_fsid_t));
+ error = xfs_bulkstat(mp, (xfs_ino_t *)&loc, &nelems,
+ xfs_dm_bulkall_one, (void*)&dmb, statstruct_sz,
+ bufp, BULKSTAT_FG_INLINE, &done);
+ if (error)
+ return(-error); /* Return negative error to DMAPI */
+
+ *rvalp = !done ? 1 : 0;
+
+ if (put_user( statstruct_sz * nelems, rlenp ))
+ return(-EFAULT);
+
+ if (copy_to_user( locp, &loc, sizeof(loc)))
+ return(-EFAULT);
+ /*
+ * If we didn't do any, we must not have any more to do.
+ */
+ if (nelems < 1)
+ return(0);
+ /*
+ * Set _link in the last struct to zero
+ */
+ return xfs_dm_zero_xstatinfo_link((dm_xstat_t __user *)dmb.laststruct);
+}
+
+
+STATIC int
+xfs_dm_zero_statinfo_link(
+ dm_stat_t __user *dxs)
+{
+ dm_stat_t *ldxs;
+ int error = 0;
+
+ if (!dxs)
+ return 0;
+ ldxs = kmalloc(sizeof(*ldxs), GFP_KERNEL);
+ if (!ldxs)
+ return -ENOMEM;
+ if (copy_from_user(ldxs, dxs, sizeof(*dxs))) {
+ error = -EFAULT;
+ } else {
+ ldxs->_link = 0;
+ if (copy_to_user(dxs, ldxs, sizeof(*dxs)))
+ error = -EFAULT;
+ }
+ kfree(ldxs);
+ return error;
+}
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_bulkattr_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_attrloc_t __user *locp,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp,
+ int *rvalp)
+{
+ int error, done;
+ int nelems;
+ u_int statstruct_sz;
+ dm_attrloc_t loc;
+ xfs_mount_t *mp = XFS_I(inode)->i_mount;
+ dm_bulkstat_one_t dmb;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if (copy_from_user( &loc, locp, sizeof(loc)))
+ return(-EFAULT);
+
+ /* Because we will write directly to the user's buffer, make sure that
+ the buffer is properly aligned.
+ */
+
+ if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
+ return(-EFAULT);
+
+ /* size of the handle is constant for this function */
+
+ statstruct_sz = DM_STAT_SIZE(dm_stat_t, 0);
+ statstruct_sz = (statstruct_sz+(DM_STAT_ALIGN-1)) & ~(DM_STAT_ALIGN-1);
+
+ nelems = buflen / statstruct_sz;
+ if (nelems < 1) {
+ if (put_user( statstruct_sz, rlenp ))
+ return(-EFAULT);
+ return(-E2BIG);
+ }
+
+ dmb.laststruct = NULL;
+ memcpy(&dmb.fsid, mp->m_fixedfsid, sizeof(dm_fsid_t));
+ error = xfs_bulkstat(mp, (xfs_ino_t *)&loc, &nelems,
+ xfs_dm_bulkattr_one, (void*)&dmb,
+ statstruct_sz, bufp, BULKSTAT_FG_INLINE, &done);
+ if (error)
+ return(-error); /* Return negative error to DMAPI */
+
+ *rvalp = !done ? 1 : 0;
+
+ if (put_user( statstruct_sz * nelems, rlenp ))
+ return(-EFAULT);
+
+ if (copy_to_user( locp, &loc, sizeof(loc)))
+ return(-EFAULT);
+
+ /*
+ * If we didn't do any, we must not have any more to do.
+ */
+ if (nelems < 1)
+ return(0);
+ /*
+ * Set _link in the last struct to zero
+ */
+ return xfs_dm_zero_statinfo_link((dm_stat_t __user *)dmb.laststruct);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_config(
+ struct inode *inode,
+ dm_right_t right,
+ dm_config_t flagname,
+ dm_size_t __user *retvalp)
+{
+ dm_size_t retval;
+
+ /* Returns negative errors to DMAPI */
+
+ switch (flagname) {
+ case DM_CONFIG_DTIME_OVERLOAD:
+ case DM_CONFIG_PERS_ATTRIBUTES:
+ case DM_CONFIG_PERS_EVENTS:
+ case DM_CONFIG_PERS_MANAGED_REGIONS:
+ case DM_CONFIG_PUNCH_HOLE:
+ case DM_CONFIG_WILL_RETRY:
+ retval = DM_TRUE;
+ break;
+
+ case DM_CONFIG_CREATE_BY_HANDLE: /* these will never be done */
+ case DM_CONFIG_LOCK_UPGRADE:
+ case DM_CONFIG_PERS_INHERIT_ATTRIBS:
+ retval = DM_FALSE;
+ break;
+
+ case DM_CONFIG_BULKALL:
+ retval = DM_TRUE;
+ break;
+ case DM_CONFIG_MAX_ATTR_ON_DESTROY:
+ retval = DM_MAX_ATTR_BYTES_ON_DESTROY;
+ break;
+
+ case DM_CONFIG_MAX_ATTRIBUTE_SIZE:
+ retval = ATTR_MAX_VALUELEN;
+ break;
+
+ case DM_CONFIG_MAX_HANDLE_SIZE:
+ retval = DM_MAX_HANDLE_SIZE;
+ break;
+
+ case DM_CONFIG_MAX_MANAGED_REGIONS:
+ retval = 1;
+ break;
+
+ case DM_CONFIG_TOTAL_ATTRIBUTE_SPACE:
+ retval = 0x7fffffff; /* actually it's unlimited */
+ break;
+
+ default:
+ return(-EINVAL);
+ }
+
+ /* Copy the results back to the user. */
+
+ if (copy_to_user( retvalp, &retval, sizeof(retval)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_config_events(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_eventset_t __user *eventsetp,
+ u_int __user *nelemp)
+{
+ dm_eventset_t eventset;
+
+ /* Returns negative errors to DMAPI */
+
+ if (nelem == 0)
+ return(-EINVAL);
+
+ eventset = DM_XFS_SUPPORTED_EVENTS;
+
+ /* Now copy the event mask and event count back to the caller. We
+ return the lesser of nelem and DM_EVENT_MAX.
+ */
+
+ if (nelem > DM_EVENT_MAX)
+ nelem = DM_EVENT_MAX;
+ eventset &= (1 << nelem) - 1;
+
+ if (copy_to_user( eventsetp, &eventset, sizeof(eventset)))
+ return(-EFAULT);
+
+ if (put_user(nelem, nelemp))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_destroy_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t *attrnamep,
+ char **valuepp,
+ int *vlenp)
+{
+ dm_dkattrname_t dkattrname;
+ int alloc_size;
+ int value_len;
+ char *value;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ *vlenp = -1; /* assume failure by default */
+
+ if (attrnamep->an_chars[0] == '\0')
+ return(-EINVAL);
+
+ /* Build the on-disk version of the attribute name. */
+
+ strcpy(dkattrname.dan_chars, dmattr_prefix);
+ strncpy(&dkattrname.dan_chars[DMATTR_PREFIXLEN],
+ (char *)attrnamep->an_chars, DM_ATTR_NAME_SIZE + 1);
+ dkattrname.dan_chars[sizeof(dkattrname.dan_chars) - 1] = '\0';
+
+ /* xfs_attr_get will not return anything if the buffer is too small,
+ and we don't know how big to make the buffer, so this may take
+ two tries to get it right. The initial try must use a buffer of
+ at least XFS_BUG_KLUDGE bytes to prevent buffer overflow because
+ of a bug in XFS.
+ */
+
+ alloc_size = XFS_BUG_KLUDGE;
+ value = kmalloc(alloc_size, GFP_KERNEL);
+ if (value == NULL)
+ return(-ENOMEM);
+
+ error = xfs_attr_get(XFS_I(inode), dkattrname.dan_chars, value,
+ &value_len, ATTR_ROOT);
+ if (error == ERANGE) {
+ kfree(value);
+ alloc_size = value_len;
+ value = kmalloc(alloc_size, GFP_KERNEL);
+ if (value == NULL)
+ return(-ENOMEM);
+
+ error = xfs_attr_get(XFS_I(inode), dkattrname.dan_chars, value,
+ &value_len, ATTR_ROOT);
+ }
+ if (error) {
+ kfree(value);
+ DM_EA_XLATE_ERR(error);
+ return(-error); /* Return negative error to DMAPI */
+ }
+
+ /* The attribute exists and has a value. Note that a value_len of
+ zero is valid!
+ */
+
+ if (value_len == 0) {
+ kfree(value);
+ *vlenp = 0;
+ return(0);
+ } else if (value_len > DM_MAX_ATTR_BYTES_ON_DESTROY) {
+ char *value2;
+
+ value2 = kmalloc(DM_MAX_ATTR_BYTES_ON_DESTROY, GFP_KERNEL);
+ if (value2 == NULL) {
+ kfree(value);
+ return(-ENOMEM);
+ }
+ memcpy(value2, value, DM_MAX_ATTR_BYTES_ON_DESTROY);
+ kfree(value);
+ value = value2;
+ value_len = DM_MAX_ATTR_BYTES_ON_DESTROY;
+ }
+ *vlenp = value_len;
+ *valuepp = value;
+ return(0);
+}
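xfs_dm_get_destroy_dmattr() first tries a fixed-size buffer and, when the getter reports ERANGE together with the required size, reallocates and retries once. A hedged user-space sketch of that two-pass pattern follows; get_value() is a hypothetical stand-in for xfs_attr_get(), and its value is made up.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical getter: copies a fixed "attribute value" into buf if it
 * fits, otherwise reports the required size via *len and fails with
 * ERANGE -- the same contract the code above relies on. */
static int get_value(char *buf, int *len)
{
	static const char value[] = "a value larger than the first guess";

	if (*len < (int)sizeof(value)) {
		*len = sizeof(value);
		return ERANGE;
	}
	memcpy(buf, value, sizeof(value));
	*len = sizeof(value);
	return 0;
}

int main(void)
{
	int len = 16;				/* deliberately too small first guess */
	char *buf = malloc(len);
	int error;

	if (!buf)
		return 1;

	error = get_value(buf, &len);
	if (error == ERANGE) {			/* retry once with the reported size */
		free(buf);
		buf = malloc(len);
		if (!buf)
			return 1;
		error = get_value(buf, &len);
	}

	if (!error)
		printf("got %d bytes: %s\n", len, buf);
	free(buf);
	return 0;
}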
+
+/* This code was taken from xfs_fcntl(F_DIOINFO) and modified slightly because
+ we don't have a flags parameter (no open file).
+ Taken from xfs_ioctl(XFS_IOC_DIOINFO) on Linux.
+*/
+
+STATIC int
+xfs_dm_get_dioinfo(
+ struct inode *inode,
+ dm_right_t right,
+ dm_dioinfo_t __user *diop)
+{
+ dm_dioinfo_t dio;
+ xfs_mount_t *mp;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ mp = ip->i_mount;
+
+ dio.d_miniosz = dio.d_mem = MIN_DIO_SIZE(mp);
+ dio.d_maxiosz = MAX_DIO_SIZE(mp);
+ dio.d_dio_only = DM_FALSE;
+
+ if (copy_to_user(diop, &dio, sizeof(dio)))
+ return(-EFAULT);
+ return(0);
+}
+
+typedef struct dm_readdir_cb {
+ xfs_mount_t *mp;
+ char __user *ubuf;
+ dm_stat_t __user *lastbuf;
+ size_t spaceleft;
+ size_t nwritten;
+ int error;
+ dm_stat_t kstat;
+} dm_readdir_cb_t;
+
+STATIC int
+dm_filldir(void *__buf, const char *name, int namelen, loff_t offset,
+ u64 ino, unsigned int d_type)
+{
+ dm_readdir_cb_t *cb = __buf;
+ dm_stat_t *statp = &cb->kstat;
+ size_t len;
+ int error;
+ int needed;
+
+ /*
+ * Make sure we have enough space.
+ */
+ needed = dm_stat_size(namelen + 1);
+ if (cb->spaceleft < needed) {
+ cb->spaceleft = 0;
+ return -ENOSPC;
+ }
+
+ error = -EINVAL;
+ if (xfs_internal_inum(cb->mp, ino))
+ goto out_err;
+
+ memset(statp, 0, dm_stat_size(MAXNAMLEN));
+ error = -xfs_dm_bulkattr_iget_one(cb->mp, ino, 0,
+ statp, needed);
+ if (error)
+ goto out_err;
+
+ /*
+ * On return from bulkstat_one(), statp->_link points
+ * at the end of the handle in the stat structure.
+ */
+ statp->dt_compname.vd_offset = statp->_link;
+ statp->dt_compname.vd_length = namelen + 1;
+
+ len = statp->_link;
+
+ /* Word-align the record */
+ statp->_link = dm_stat_align(len + namelen + 1);
+
+ error = -EFAULT;
+ if (copy_to_user(cb->ubuf, statp, len))
+ goto out_err;
+ if (copy_to_user(cb->ubuf + len, name, namelen))
+ goto out_err;
+ if (put_user(0, cb->ubuf + len + namelen))
+ goto out_err;
+
+ cb->lastbuf = (dm_stat_t __user *)cb->ubuf;
+ cb->spaceleft -= statp->_link;
+ cb->nwritten += statp->_link;
+ cb->ubuf += statp->_link;
+
+ return 0;
+
+ out_err:
+ cb->error = error;
+ return error;
+}
+
+/* Returns negative errors to DMAPI */
+STATIC int
+xfs_dm_get_dirattrs_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_attrloc_t __user *locp,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp,
+ int *rvp)
+{
+ xfs_inode_t *dp = XFS_I(inode);
+ xfs_mount_t *mp = dp->i_mount;
+ dm_readdir_cb_t *cb;
+ dm_attrloc_t loc;
+ int error;
+
+ if (right < DM_RIGHT_SHARED)
+ return -EACCES;
+
+ /*
+ * Make sure that the buffer is properly aligned.
+ */
+ if (((unsigned long)bufp & (DM_STAT_ALIGN - 1)) != 0)
+ return -EFAULT;
+
+ if (mask & ~(DM_AT_HANDLE|DM_AT_EMASK|DM_AT_PMANR|DM_AT_PATTR|
+ DM_AT_DTIME|DM_AT_CFLAG|DM_AT_STAT))
+ return -EINVAL;
+
+ if (!S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ /*
+ * bufp should be able to fit at least one dm_stat entry including
+ * dt_handle and full size MAXNAMLEN dt_compname.
+ */
+ if (buflen < dm_stat_size(MAXNAMLEN))
+ return -ENOMEM;
+
+ if (copy_from_user(&loc, locp, sizeof(loc)))
+ return -EFAULT;
+
+ cb = kzalloc(sizeof(*cb) + dm_stat_size(MAXNAMLEN), GFP_KERNEL);
+ if (!cb)
+ return -ENOMEM;
+
+ cb->mp = mp;
+ cb->spaceleft = buflen;
+ cb->ubuf = bufp;
+
+ mutex_lock(&inode->i_mutex);
+ error = -ENOENT;
+ if (!IS_DEADDIR(inode)) {
+ error = -xfs_readdir(dp, cb, dp->i_size,
+ (xfs_off_t *)&loc, dm_filldir);
+ }
+ mutex_unlock(&inode->i_mutex);
+
+ if (error)
+ goto out_kfree;
+ if (cb->error) {
+ error = cb->error;
+ goto out_kfree;
+ }
+
+ error = -EFAULT;
+ if (cb->lastbuf && put_user(0, &cb->lastbuf->_link))
+ goto out_kfree;
+ if (put_user(cb->nwritten, rlenp))
+ goto out_kfree;
+ if (copy_to_user(locp, &loc, sizeof(loc)))
+ goto out_kfree;
+
+ if (cb->nwritten)
+ *rvp = 1;
+ else
+ *rvp = 0;
+ error = 0;
+
+ out_kfree:
+ kfree(cb);
+ return error;
+}
+
+STATIC int
+xfs_dm_get_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp)
+{
+ dm_dkattrname_t name;
+ char *value;
+ int value_len;
+ int alloc_size;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+ return(-error); /* Return negative error to DMAPI */
+
+ /* Allocate a buffer to receive the attribute's value. We allocate
+ at least one byte even if the caller specified a buflen of zero.
+ (A buflen of zero is considered valid.)
+
+ Allocating a minimum of XFS_BUG_KLUDGE bytes temporarily works
+ around a bug within XFS in which in-inode attribute values are not
+ checked to see if they will fit in the buffer before they are
+ copied. Since no in-core attribute value can be larger than 256
+ bytes (an 8-bit size field), we allocate that minimum size here to
+ prevent buffer overrun in both the kernel's and user's buffers.
+ */
+
+ alloc_size = buflen;
+ if (alloc_size < XFS_BUG_KLUDGE)
+ alloc_size = XFS_BUG_KLUDGE;
+ if (alloc_size > ATTR_MAX_VALUELEN)
+ alloc_size = ATTR_MAX_VALUELEN;
- value = kmem_alloc(alloc_size, KM_SLEEP | KM_LARGE);
++ value = kmem_zalloc_large(alloc_size);
+
+ /* Get the attribute's value. */
+
+ value_len = alloc_size; /* in/out parameter */
+
+ error = xfs_attr_get(XFS_I(inode), name.dan_chars, value, &value_len,
+ ATTR_ROOT);
+ DM_EA_XLATE_ERR(error);
+
+ /* DMAPI requires an errno of ENOENT if an attribute does not exist,
+ so remap ENOATTR here.
+ */
+
+ if (error == ENOATTR)
+ error = ENOENT;
+ if (!error && value_len > buflen)
+ error = E2BIG;
+ if (!error && copy_to_user(bufp, value, value_len))
+ error = EFAULT;
+ if (!error || error == E2BIG) {
+ if (put_user(value_len, rlenp))
+ error = EFAULT;
+ }
+
+ kmem_free(value);
+ return(-error); /* Return negative error to DMAPI */
+}
+
+STATIC int
+xfs_dm_get_eventlist(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type,
+ u_int nelem,
+ dm_eventset_t *eventsetp,
+ u_int *nelemp)
+{
+ int error;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (type == DM_FSYS_OBJ) {
+ error = xfs_dm_fs_get_eventlist(ip->i_mount, right, nelem,
+ eventsetp, nelemp);
+ } else {
+ error = xfs_dm_f_get_eventlist(ip, right, nelem,
+ eventsetp, nelemp);
+ }
+ return(-error); /* Returns negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_get_fileattr(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask, /* not used; always return everything */
+ dm_stat_t __user *statp)
+{
+ dm_stat_t stat;
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ /* Find the mount point. */
+
+ mp = ip->i_mount;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_ip_to_stat(mp, ip->i_ino, ip, &stat);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (copy_to_user( statp, &stat, sizeof(stat)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* We currently only support a maximum of one managed region per file, and
+ use the DM_EVENT_READ, DM_EVENT_WRITE, and DM_EVENT_TRUNCATE events in
+ the file's dm_eventset_t event mask to implement the DM_REGION_READ,
+ DM_REGION_WRITE, and DM_REGION_TRUNCATE flags for that single region.
+*/
+
+STATIC int
+xfs_dm_get_region(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_region_t __user *regbufp,
+ u_int __user *nelemp)
+{
+ dm_eventset_t evmask;
+ dm_region_t region;
+ xfs_inode_t *ip = XFS_I(inode);
+ u_int elem;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ evmask = ip->i_d.di_dmevmask; /* read the mask "atomically" */
+
+ /* Get the file's current managed region flags out of the
+ dm_eventset_t mask and use them to build a managed region that
+ covers the entire file, i.e. set rg_offset and rg_size to zero.
+ */
+
+ memset((char *)&region, 0, sizeof(region));
+
+ if (evmask & (1 << DM_EVENT_READ))
+ region.rg_flags |= DM_REGION_READ;
+ if (evmask & (1 << DM_EVENT_WRITE))
+ region.rg_flags |= DM_REGION_WRITE;
+ if (evmask & (1 << DM_EVENT_TRUNCATE))
+ region.rg_flags |= DM_REGION_TRUNCATE;
+
+ elem = (region.rg_flags ? 1 : 0);
+
+ if (copy_to_user( nelemp, &elem, sizeof(elem)))
+ return(-EFAULT);
+ if (elem > nelem)
+ return(-E2BIG);
+ if (elem && copy_to_user(regbufp, &region, sizeof(region)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_getall_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ size_t buflen,
+ void __user *bufp,
+ size_t __user *rlenp)
+{
+ attrlist_cursor_kern_t cursor;
+ attrlist_t *attrlist;
+ dm_attrlist_t __user *ulist;
+ int *last_link;
+ int alignment;
+ int total_size;
+ int list_size = 8192; /* should be big enough */
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ /* Verify that the user gave us a buffer that is 4-byte aligned, lock
+ it down, and work directly within that buffer. As a side-effect,
+ values of buflen < sizeof(int) return EINVAL.
+ */
+
+ alignment = sizeof(int) - 1;
+ if ((((__psint_t)bufp & alignment) != 0) ||
+ !access_ok(VERIFY_WRITE, bufp, buflen)) {
+ return(-EFAULT);
+ }
+ buflen &= ~alignment; /* round down the alignment */
+
+ /* Initialize all the structures and variables for the main loop. */
+
+ memset(&cursor, 0, sizeof(cursor));
+ attrlist = (attrlist_t *)kmem_alloc(list_size, KM_SLEEP);
+ total_size = 0;
+ ulist = (dm_attrlist_t *)bufp;
+ last_link = NULL;
+
+ /* Use vop_attr_list to get the names of DMAPI attributes, and use
+ vop_attr_get to get their values. There is a risk here that the
+ DMAPI attributes could change between the vop_attr_list and
+ vop_attr_get calls. If we can detect it, we return EIO to notify
+ the user.
+ */
+
+ do {
+ int i;
+
+ /* Get a buffer full of attribute names. If there aren't any
+ more or if we encounter an error, then finish up.
+ */
+
+ error = xfs_attr_list(XFS_I(inode), (char *)attrlist, list_size,
+ ATTR_ROOT, &cursor);
+ DM_EA_XLATE_ERR(error);
+
+ if (error || attrlist->al_count == 0)
+ break;
+
+ for (i = 0; i < attrlist->al_count; i++) {
+ attrlist_ent_t *entry;
+ char *user_name;
+ int size_needed;
+ int value_len;
+
+ /* Skip over all non-DMAPI attributes. If the
+ attribute name is too long, we assume it is
+ non-DMAPI even if it starts with the correct
+ prefix.
+ */
+
+ entry = ATTR_ENTRY(attrlist, i);
+ if (strncmp(entry->a_name, dmattr_prefix, DMATTR_PREFIXLEN))
+ continue;
+ user_name = &entry->a_name[DMATTR_PREFIXLEN];
+ if (strlen(user_name) > DM_ATTR_NAME_SIZE)
+ continue;
+
+ /* We have a valid DMAPI attribute to return. If it
+ won't fit in the user's buffer, we still need to
+ keep track of the number of bytes for the user's
+ next call.
+ */
+
+
+ size_needed = sizeof(*ulist) + entry->a_valuelen;
+ size_needed = (size_needed + alignment) & ~alignment;
+
+ total_size += size_needed;
+ if (total_size > buflen)
+ continue;
+
+ /* Start by filling in all the fields in the
+ dm_attrlist_t structure.
+ */
+
+ strncpy((char *)ulist->al_name.an_chars, user_name,
+ DM_ATTR_NAME_SIZE);
+ ulist->al_data.vd_offset = sizeof(*ulist);
+ ulist->al_data.vd_length = entry->a_valuelen;
+ ulist->_link = size_needed;
+ last_link = &ulist->_link;
+
+ /* Next read the attribute's value into its correct
+ location after the dm_attrlist structure. Any sort
+ of error indicates that the data is moving under us,
+ so we return EIO to let the user know.
+ */
+
+ value_len = entry->a_valuelen;
+
+ error = xfs_attr_get(XFS_I(inode), entry->a_name,
+ (void *)(ulist + 1), &value_len,
+ ATTR_ROOT);
+ DM_EA_XLATE_ERR(error);
+
+ if (error || value_len != entry->a_valuelen) {
+ error = EIO;
+ break;
+ }
+
+ ulist = (dm_attrlist_t *)((char *)ulist + ulist->_link);
+ }
+ } while (!error && attrlist->al_more);
+ if (last_link)
+ *last_link = 0;
+
+ if (!error && total_size > buflen)
+ error = E2BIG;
+ if (!error || error == E2BIG) {
+ if (put_user(total_size, rlenp))
+ error = EFAULT;
+ }
+
+ kmem_free(attrlist);
+ return(-error); /* Return negative error to DMAPI */
+}
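
The buffer filled in above is a packed list: each dm_attrlist_t header is followed by its value at al_data.vd_offset, and _link holds the byte offset of the next entry, with zero terminating the list. A minimal user-space sketch of how a DMAPI application might walk such a buffer, assuming the usual <dmapi.h> definitions and treating the print statement as purely illustrative:

	#include <stdio.h>
	#include <dmapi.h>	/* assumed to provide dm_attrlist_t, DM_ATTR_NAME_SIZE */

	/* buf must hold at least one entry; callers check the returned length first */
	static void walk_dmattr_list(void *buf)
	{
		dm_attrlist_t *entry = buf;

		for (;;) {
			/* the value bytes live al_data.vd_offset bytes past the header */
			void *value = (char *)entry + entry->al_data.vd_offset;

			printf("attr %.*s: %llu value bytes at %p\n",
			       DM_ATTR_NAME_SIZE, entry->al_name.an_chars,
			       (unsigned long long)entry->al_data.vd_length, value);

			if (entry->_link == 0)	/* last entry has _link cleared */
				break;
			entry = (dm_attrlist_t *)((char *)entry + entry->_link);
		}
	}
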
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_getall_inherit(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_inherit_t __user *inheritbufp,
+ u_int __user *nelemp)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/* Initialize location pointer for subsequent dm_get_dirattrs,
+ dm_get_bulkattr, and dm_get_bulkall calls. The same initialization must
+ work for inode-based routines (dm_get_dirattrs) and filesystem-based
+ routines (dm_get_bulkattr and dm_get_bulkall). Filesystem-based functions
+ call this routine using the filesystem's root inode.
+*/
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_init_attrloc(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrloc_t __user *locp)
+{
+ dm_attrloc_t loc = 0;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ if (copy_to_user( locp, &loc, sizeof(loc)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_mkdir_by_handle(
+ struct inode *inode,
+ dm_right_t right,
+ void __user *hanp,
+ size_t hlen,
+ char __user *cname)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+/*
+ * Probe and Punch
+ *
+ * Hole punching alignment is based on the underlying device base
+ * allocation size. Because it is not defined in the DMAPI spec, we
+ * can align how we choose here. Round inwards (offset up and length
+ * down) to the block, extent or page size, whichever is bigger. Our
+ * DMAPI implementation rounds the hole geometry strictly inwards. If
+ * this is not possible, return EINVAL from both xfs_dm_probe_hole
+ * and xfs_dm_punch_hole, which differs from the DMAPI spec. Note that
+ * length = 0 is special - it means "punch to EOF", and at that point
+ * we treat the punch as removing everything past offset (including
+ * preallocation past EOF).
+ */
+
+STATIC int
+xfs_dm_round_hole(
+ dm_off_t offset,
+ dm_size_t length,
+ dm_size_t align,
+ xfs_fsize_t filesize,
+ dm_off_t *roff,
+ dm_size_t *rlen)
+{
+
+ dm_off_t off = offset;
+ dm_size_t len = length;
+
+ /* Try to round offset up to the nearest boundary */
+ *roff = roundup_64(off, align);
+ if ((*roff >= filesize) || (len && (len < align)))
+ return -EINVAL;
+
+ if ((len == 0) || ((off + len) == filesize)) {
+ /* punch to EOF */
+ *rlen = 0;
+ } else {
+ /* Round length down to the nearest boundary. */
+ ASSERT(len >= align);
+ ASSERT(align > (*roff - off));
+ len -= *roff - off;
+ *rlen = len - do_mod(len, align);
+ if (*rlen == 0)
+ return -EINVAL; /* requested length is too small */
+ }
+#ifdef CONFIG_DMAPI_DEBUG
+ printk("xfs_dm_round_hole: off %lu, len %ld, align %lu, "
+ "filesize %llu, roff %ld, rlen %ld\n",
+ offset, length, align, filesize, *roff, *rlen);
+#endif
+ return 0; /* hole geometry successfully rounded */
+}
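
As a quick sanity check of the inward rounding above, here is a minimal stand-alone sketch (user space, illustrative values only) that reproduces the same arithmetic with plain integer operations in place of roundup_64() and do_mod():

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t off = 5000, len = 20000, align = 4096;	/* example request */

		/* round the start up to the next alignment boundary: 8192 */
		uint64_t roff = ((off + align - 1) / align) * align;
		/* shrink the length by the bytes skipped, then round down: 16384 */
		uint64_t rlen = ((len - (roff - off)) / align) * align;

		/* the rounded range [8192, 24576) lies strictly inside [5000, 25000) */
		printf("requested [%llu,+%llu) -> rounded [%llu,+%llu)\n",
		       (unsigned long long)off, (unsigned long long)len,
		       (unsigned long long)roff, (unsigned long long)rlen);
		return 0;
	}
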
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_probe_hole(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t off,
+ dm_size_t len,
+ dm_off_t __user *roffp,
+ dm_size_t __user *rlenp)
+{
+ dm_off_t roff;
+ dm_size_t rlen;
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp;
+ uint lock_flags;
+ xfs_fsize_t realsize;
+ dm_size_t align;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return -EACCES;
+
+ if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
+ return -EINVAL;
+
+ mp = ip->i_mount;
+ lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, lock_flags);
+ realsize = ip->i_size;
+ xfs_iunlock(ip, lock_flags);
+
+ if ((off + len) > realsize)
+ return -E2BIG;
+
+ align = 1 << mp->m_sb.sb_blocklog;
+
+ error = xfs_dm_round_hole(off, len, align, realsize, &roff, &rlen);
+ if (error)
+ return error;
+
+ if (copy_to_user( roffp, &roff, sizeof(roff)))
+ return -EFAULT;
+ if (copy_to_user( rlenp, &rlen, sizeof(rlen)))
+ return -EFAULT;
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_punch_hole(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t off,
+ dm_size_t len)
+{
+ xfs_flock64_t bf;
+ int error = 0;
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_mount_t *mp;
+ dm_size_t align;
+ xfs_fsize_t realsize;
+ dm_off_t roff;
+ dm_size_t rlen;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return -EACCES;
+
+ /* Make sure there are no leases. */
+ error = break_lease(inode, FMODE_WRITE);
+ if (error)
+ return -EBUSY;
+
+ error = get_write_access(inode);
+ if (error)
+ return -EBUSY;
+
+ mp = ip->i_mount;
+
+ down_rw_sems(inode, DM_SEM_FLAG_WR);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ realsize = ip->i_size;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ align = xfs_get_extsz_hint(ip);
+ if (align == 0)
+ align = 1;
+
+ align <<= mp->m_sb.sb_blocklog;
+
+ if ((off + len) > realsize) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ error = -E2BIG;
+ goto up_and_out;
+ }
+
+ if ((off + len) == realsize)
+ len = 0;
+
+ error = xfs_dm_round_hole(off, len, align, realsize, &roff, &rlen);
+ if (error || (off != roff) || (len != rlen)) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ error = -EINVAL;
+ goto up_and_out;
+ }
+
+ bf.l_type = 0;
+ bf.l_whence = 0;
+ bf.l_start = (xfs_off_t)off;
+ if (len) {
+ bf.l_len = len;
+ }
+ else {
+ /*
+ * When we are punching to EOF, we have to make sure we punch
+ * the last partial block that contains EOF. Round up
+ * the length to make sure we punch the block and not just
+ * zero it.
+ */
+ bf.l_len = roundup_64((realsize - off), mp->m_sb.sb_blocksize);
+ }
+
+#ifdef CONFIG_DMAPI_DEBUG
+ printk("xfs_dm_punch_hole: off %lu, len %ld, align %lu\n",
+ off, len, align);
+#endif
+
+ error = xfs_change_file_space(ip, XFS_IOC_UNRESVSP, &bf,
+ (xfs_off_t)off, XFS_ATTR_DMI|XFS_ATTR_NOLOCK);
+
+ /*
+ * if punching to end of file, kill any blocks past EOF that
+ * may have been (speculatively) preallocated. No point in
+ * leaving them around if we are migrating the file....
+ */
+ if (!error && (len == 0)) {
+ error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_HASLOCK);
+ }
+
+ /*
+ * negate the error for return here as core XFS functions return
+ * positive error numbers
+ */
+ if (error)
+ error = -error;
+
+ /* Let threads in send_data_event know we punched the file. */
+ ip->i_d.di_dmstate++;
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+up_and_out:
+ up_rw_sems(inode, DM_SEM_FLAG_WR);
+ put_write_access(inode);
+
+ return error;
+}
+
+
+STATIC int
+xfs_dm_read_invis_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp,
+ int *rvp)
+{
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_SHARED)
+ return(-EACCES);
+
+ return(-xfs_dm_rdwr(inode, 0, FMODE_READ, off, len, bufp, rvp));
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_release_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type) /* DM_FSYS_OBJ or zero */
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_release_right: old %d type %d handle %s\n",
+ right, type, buffer);
+ } else {
+ printf("dm_release_right: old %d type %d handle "
+ " <INVALID>\n", right, type);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_remove_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ int setdtime,
+ dm_attrname_t __user *attrnamep)
+{
+ dm_dkattrname_t name;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+ return(-error); /* Return negative error to DMAPI */
+
+ /* Remove the attribute from the object. */
+
+ error = xfs_attr_remove(XFS_I(inode), name.dan_chars, setdtime ?
+ ATTR_ROOT : (ATTR_ROOT|ATTR_KERNOTIME));
+ DM_EA_XLATE_ERR(error);
+
+ if (error == ENOATTR)
+ error = ENOENT;
+ return(-error); /* Return negative error to DMAPI */
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_request_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type, /* DM_FSYS_OBJ or zero */
+ u_int flags,
+ dm_right_t newright)
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_request_right: old %d new %d type %d flags 0x%x "
+ "handle %s\n", right, newright, type, flags, buffer);
+ } else {
+ printf("dm_request_right: old %d new %d type %d flags 0x%x "
+ "handle <INVALID>\n", right, newright, type, flags);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_set_dmattr(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep,
+ int setdtime,
+ size_t buflen,
+ void __user *bufp)
+{
+ dm_dkattrname_t name;
+ char *value;
+ int alloc_size;
+ int error;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if ((error = xfs_copyin_attrname(attrnamep, &name)) != 0)
+ return(-error); /* Return negative error to DMAPI */
+ if (buflen > ATTR_MAX_VALUELEN)
+ return(-E2BIG);
+
+ /* Copy in the attribute's value and store the <name,value> pair in
+ the object. We allocate a buffer of at least one byte even if the
+ caller specified a buflen of zero. (A buflen of zero is considered
+ valid.)
+ */
+
+ alloc_size = (buflen == 0) ? 1 : buflen;
+ value = kmem_alloc(alloc_size, KM_SLEEP);
+ if (copy_from_user( value, bufp, buflen)) {
+ error = EFAULT;
+ } else {
+ error = xfs_attr_set(XFS_I(inode), name.dan_chars, value, buflen,
+ setdtime ? ATTR_ROOT :
+ (ATTR_ROOT|ATTR_KERNOTIME));
+ DM_EA_XLATE_ERR(error);
+ }
+ kmem_free(value);
+ return(-error); /* Return negative error to DMAPI */
+}
+
+STATIC int
+xfs_dm_set_eventlist(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type,
+ dm_eventset_t *eventsetp, /* in kernel space! */
+ u_int maxevent)
+{
+ int error;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (type == DM_FSYS_OBJ) {
+ error = xfs_dm_fs_set_eventlist(ip->i_mount, right, eventsetp, maxevent);
+ } else {
+ error = xfs_dm_f_set_eventlist(ip, right, eventsetp, maxevent);
+ }
+ return(-error); /* Return negative error to DMAPI */
+}
+
+
+/*
+ * This turned out not XFS-specific, but leave it here with get_fileattr.
+ */
+
+STATIC int
+xfs_dm_set_fileattr(
+ struct inode *inode,
+ dm_right_t right,
+ u_int mask,
+ dm_fileattr_t __user *statp)
+{
+ dm_fileattr_t stat;
+ struct iattr iattr;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if (copy_from_user( &stat, statp, sizeof(stat)))
+ return(-EFAULT);
+
+ iattr.ia_valid = 0;
+
+ if (mask & DM_AT_MODE) {
+ iattr.ia_valid |= ATTR_MODE;
+ iattr.ia_mode = stat.fa_mode;
+ }
+ if (mask & DM_AT_UID) {
+ iattr.ia_valid |= ATTR_UID;
+ iattr.ia_uid = stat.fa_uid;
+ }
+ if (mask & DM_AT_GID) {
+ iattr.ia_valid |= ATTR_GID;
+ iattr.ia_gid = stat.fa_gid;
+ }
+ if (mask & DM_AT_ATIME) {
+ iattr.ia_valid |= ATTR_ATIME;
+ iattr.ia_atime.tv_sec = stat.fa_atime;
+ iattr.ia_atime.tv_nsec = 0;
+ inode->i_atime.tv_sec = stat.fa_atime;
+ }
+ if (mask & DM_AT_MTIME) {
+ iattr.ia_valid |= ATTR_MTIME;
+ iattr.ia_mtime.tv_sec = stat.fa_mtime;
+ iattr.ia_mtime.tv_nsec = 0;
+ }
+ if (mask & DM_AT_CTIME) {
+ iattr.ia_valid |= ATTR_CTIME;
+ iattr.ia_ctime.tv_sec = stat.fa_ctime;
+ iattr.ia_ctime.tv_nsec = 0;
+ }
+
+ /*
+ * DM_AT_DTIME only takes effect if DM_AT_CTIME is not specified. We
+ * overload ctime to also act as dtime, i.e. DM_CONFIG_DTIME_OVERLOAD.
+ */
+ if ((mask & DM_AT_DTIME) && !(mask & DM_AT_CTIME)) {
+ iattr.ia_valid |= ATTR_CTIME;
+ iattr.ia_ctime.tv_sec = stat.fa_dtime;
+ iattr.ia_ctime.tv_nsec = 0;
+ }
+ if (mask & DM_AT_SIZE) {
+ iattr.ia_valid |= ATTR_SIZE;
+ iattr.ia_size = stat.fa_size;
+ }
+
+ return -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_DMI);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_set_inherit(
+ struct inode *inode,
+ dm_right_t right,
+ dm_attrname_t __user *attrnamep,
+ mode_t mode)
+{
+ return(-ENOSYS); /* Return negative error to DMAPI */
+}
+
+
+STATIC int
+xfs_dm_set_region(
+ struct inode *inode,
+ dm_right_t right,
+ u_int nelem,
+ dm_region_t __user *regbufp,
+ dm_boolean_t __user *exactflagp)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ dm_region_t region;
+ dm_eventset_t new_mask;
+ dm_eventset_t mr_mask;
+ int error;
+ u_int exactflag;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ /* If the caller gave us more than one dm_region_t structure, complain.
+ (He has to call dm_get_config() to find out what our limit is.)
+ */
+
+ if (nelem > 1)
+ return(-E2BIG);
+
+ /* If the user provided a dm_region_t structure, then copy it in,
+ validate it, and convert its flags to the corresponding bits in a
+ dm_set_eventlist() event mask. A call with zero regions is
+ equivalent to clearing all region flags.
+ */
+
+ new_mask = 0;
+ if (nelem == 1) {
+		if (copy_from_user( &region, regbufp, sizeof(region)))
+ return(-EFAULT);
+
+ if (region.rg_flags & ~(DM_REGION_READ|DM_REGION_WRITE|DM_REGION_TRUNCATE))
+ return(-EINVAL);
+ if (region.rg_flags & DM_REGION_READ)
+ new_mask |= 1 << DM_EVENT_READ;
+ if (region.rg_flags & DM_REGION_WRITE)
+ new_mask |= 1 << DM_EVENT_WRITE;
+ if (region.rg_flags & DM_REGION_TRUNCATE)
+ new_mask |= 1 << DM_EVENT_TRUNCATE;
+ }
+ mr_mask = (1 << DM_EVENT_READ) | (1 << DM_EVENT_WRITE) | (1 << DM_EVENT_TRUNCATE);
+
+ /* Get the file's existing event mask, clear the old managed region
+ bits, add in the new ones, and update the file's mask.
+ */
+
+ if (new_mask & prohibited_mr_events(inode->i_mapping)) {
+ /* If the change is simply to remove the READ
+ * bit, then that's always okay. Otherwise, it's busy.
+ */
+ dm_eventset_t m1;
+ m1 = ip->i_d.di_dmevmask & ((1 << DM_EVENT_WRITE) | (1 << DM_EVENT_TRUNCATE));
+ if (m1 != new_mask) {
+ return -EBUSY;
+ }
+ }
+
+ mp = ip->i_mount;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+ error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return(-error); /* Return negative error to DMAPI */
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ ip->i_d.di_dmevmask = (ip->i_d.di_dmevmask & ~mr_mask) | new_mask;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ igrab(inode);
+ xfs_trans_commit(tp, 0);
+
+ /* Return the proper value for *exactflagp depending upon whether or not
+ we "changed" the user's managed region. In other words, if the user
+ specified a non-zero value for either rg_offset or rg_size, we
+ round each of those values back to zero.
+ */
+
+ if (nelem && (region.rg_offset || region.rg_size)) {
+ exactflag = DM_FALSE; /* user region was changed */
+ } else {
+ exactflag = DM_TRUE; /* user region was unchanged */
+ }
+ if (copy_to_user( exactflagp, &exactflag, sizeof(exactflag)))
+ return(-EFAULT);
+ return(0);
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_symlink_by_handle(
+ struct inode *inode,
+ dm_right_t right,
+ void __user *hanp,
+ size_t hlen,
+ char __user *cname,
+ char __user *path)
+{
+ return(-ENOSYS); /* Return negative errors to DMAPI */
+}
+
+
+/*
+ * xfs_dm_sync_by_handle needs to do the same thing as sys_fsync()
+ */
+STATIC int
+xfs_dm_sync_by_handle(
+ struct inode *inode,
+ dm_right_t right)
+{
+ int err, ret;
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ /* We need to protect against concurrent writers.. */
+ ret = filemap_fdatawrite(inode->i_mapping);
+ down_rw_sems(inode, DM_FLAGS_IMUX);
- err = -xfs_fsync(ip);
++ err = xfs_fsync(inode, 1);
+ if (!ret)
+ ret = err;
+ up_rw_sems(inode, DM_FLAGS_IMUX);
+ err = filemap_fdatawait(inode->i_mapping);
+ if (!ret)
+ ret = err;
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return ret;
+}
+
+
+/* ARGSUSED */
+STATIC int
+xfs_dm_upgrade_right(
+ struct inode *inode,
+ dm_right_t right,
+ u_int type) /* DM_FSYS_OBJ or zero */
+{
+#ifdef DEBUG_RIGHTS
+ char buffer[sizeof(dm_handle_t) * 2 + 1];
+
+ if (!xfs_vp_to_hexhandle(inode, type, buffer)) {
+ printf("dm_upgrade_right: old %d new %d type %d handle %s\n",
+ right, DM_RIGHT_EXCL, type, buffer);
+ } else {
+ printf("dm_upgrade_right: old %d new %d type %d handle "
+ "<INVALID>\n", right, DM_RIGHT_EXCL, type);
+ }
+#endif /* DEBUG_RIGHTS */
+ return(0);
+}
+
+
+STATIC int
+xfs_dm_write_invis_rvp(
+ struct inode *inode,
+ dm_right_t right,
+ int flags,
+ dm_off_t off,
+ dm_size_t len,
+ void __user *bufp,
+ int *rvp)
+{
+ int fflag = 0;
+
+ /* Returns negative errors to DMAPI */
+
+ if (right < DM_RIGHT_EXCL)
+ return(-EACCES);
+
+ if (flags & DM_WRITE_SYNC)
+ fflag |= O_SYNC;
+ return(-xfs_dm_rdwr(inode, fflag, FMODE_WRITE, off, len, bufp, rvp));
+}
+
+
+STATIC void
+xfs_dm_obj_ref_hold(
+ struct inode *inode)
+{
+ igrab(inode);
+}
+
+
+static fsys_function_vector_t xfs_fsys_vector[DM_FSYS_MAX];
+
+
+STATIC int
+xfs_dm_get_dmapiops(
+ struct super_block *sb,
+ void *addr)
+{
+ static int initialized = 0;
+ dm_fcntl_vector_t *vecrq;
+ fsys_function_vector_t *vecp;
+ int i = 0;
+
+ vecrq = (dm_fcntl_vector_t *)addr;
+ vecrq->count =
+ sizeof(xfs_fsys_vector) / sizeof(xfs_fsys_vector[0]);
+ vecrq->vecp = xfs_fsys_vector;
+ if (initialized)
+ return(0);
+ vecrq->code_level = DM_CLVL_XOPEN;
+ vecp = xfs_fsys_vector;
+
+ vecp[i].func_no = DM_FSYS_CLEAR_INHERIT;
+ vecp[i++].u_fc.clear_inherit = xfs_dm_clear_inherit;
+ vecp[i].func_no = DM_FSYS_CREATE_BY_HANDLE;
+ vecp[i++].u_fc.create_by_handle = xfs_dm_create_by_handle;
+ vecp[i].func_no = DM_FSYS_DOWNGRADE_RIGHT;
+ vecp[i++].u_fc.downgrade_right = xfs_dm_downgrade_right;
+ vecp[i].func_no = DM_FSYS_GET_ALLOCINFO_RVP;
+ vecp[i++].u_fc.get_allocinfo_rvp = xfs_dm_get_allocinfo_rvp;
+ vecp[i].func_no = DM_FSYS_GET_BULKALL_RVP;
+ vecp[i++].u_fc.get_bulkall_rvp = xfs_dm_get_bulkall_rvp;
+ vecp[i].func_no = DM_FSYS_GET_BULKATTR_RVP;
+ vecp[i++].u_fc.get_bulkattr_rvp = xfs_dm_get_bulkattr_rvp;
+ vecp[i].func_no = DM_FSYS_GET_CONFIG;
+ vecp[i++].u_fc.get_config = xfs_dm_get_config;
+ vecp[i].func_no = DM_FSYS_GET_CONFIG_EVENTS;
+ vecp[i++].u_fc.get_config_events = xfs_dm_get_config_events;
+ vecp[i].func_no = DM_FSYS_GET_DESTROY_DMATTR;
+ vecp[i++].u_fc.get_destroy_dmattr = xfs_dm_get_destroy_dmattr;
+ vecp[i].func_no = DM_FSYS_GET_DIOINFO;
+ vecp[i++].u_fc.get_dioinfo = xfs_dm_get_dioinfo;
+ vecp[i].func_no = DM_FSYS_GET_DIRATTRS_RVP;
+ vecp[i++].u_fc.get_dirattrs_rvp = xfs_dm_get_dirattrs_rvp;
+ vecp[i].func_no = DM_FSYS_GET_DMATTR;
+ vecp[i++].u_fc.get_dmattr = xfs_dm_get_dmattr;
+ vecp[i].func_no = DM_FSYS_GET_EVENTLIST;
+ vecp[i++].u_fc.get_eventlist = xfs_dm_get_eventlist;
+ vecp[i].func_no = DM_FSYS_GET_FILEATTR;
+ vecp[i++].u_fc.get_fileattr = xfs_dm_get_fileattr;
+ vecp[i].func_no = DM_FSYS_GET_REGION;
+ vecp[i++].u_fc.get_region = xfs_dm_get_region;
+ vecp[i].func_no = DM_FSYS_GETALL_DMATTR;
+ vecp[i++].u_fc.getall_dmattr = xfs_dm_getall_dmattr;
+ vecp[i].func_no = DM_FSYS_GETALL_INHERIT;
+ vecp[i++].u_fc.getall_inherit = xfs_dm_getall_inherit;
+ vecp[i].func_no = DM_FSYS_INIT_ATTRLOC;
+ vecp[i++].u_fc.init_attrloc = xfs_dm_init_attrloc;
+ vecp[i].func_no = DM_FSYS_MKDIR_BY_HANDLE;
+ vecp[i++].u_fc.mkdir_by_handle = xfs_dm_mkdir_by_handle;
+ vecp[i].func_no = DM_FSYS_PROBE_HOLE;
+ vecp[i++].u_fc.probe_hole = xfs_dm_probe_hole;
+ vecp[i].func_no = DM_FSYS_PUNCH_HOLE;
+ vecp[i++].u_fc.punch_hole = xfs_dm_punch_hole;
+ vecp[i].func_no = DM_FSYS_READ_INVIS_RVP;
+ vecp[i++].u_fc.read_invis_rvp = xfs_dm_read_invis_rvp;
+ vecp[i].func_no = DM_FSYS_RELEASE_RIGHT;
+ vecp[i++].u_fc.release_right = xfs_dm_release_right;
+ vecp[i].func_no = DM_FSYS_REMOVE_DMATTR;
+ vecp[i++].u_fc.remove_dmattr = xfs_dm_remove_dmattr;
+ vecp[i].func_no = DM_FSYS_REQUEST_RIGHT;
+ vecp[i++].u_fc.request_right = xfs_dm_request_right;
+ vecp[i].func_no = DM_FSYS_SET_DMATTR;
+ vecp[i++].u_fc.set_dmattr = xfs_dm_set_dmattr;
+ vecp[i].func_no = DM_FSYS_SET_EVENTLIST;
+ vecp[i++].u_fc.set_eventlist = xfs_dm_set_eventlist;
+ vecp[i].func_no = DM_FSYS_SET_FILEATTR;
+ vecp[i++].u_fc.set_fileattr = xfs_dm_set_fileattr;
+ vecp[i].func_no = DM_FSYS_SET_INHERIT;
+ vecp[i++].u_fc.set_inherit = xfs_dm_set_inherit;
+ vecp[i].func_no = DM_FSYS_SET_REGION;
+ vecp[i++].u_fc.set_region = xfs_dm_set_region;
+ vecp[i].func_no = DM_FSYS_SYMLINK_BY_HANDLE;
+ vecp[i++].u_fc.symlink_by_handle = xfs_dm_symlink_by_handle;
+ vecp[i].func_no = DM_FSYS_SYNC_BY_HANDLE;
+ vecp[i++].u_fc.sync_by_handle = xfs_dm_sync_by_handle;
+ vecp[i].func_no = DM_FSYS_UPGRADE_RIGHT;
+ vecp[i++].u_fc.upgrade_right = xfs_dm_upgrade_right;
+ vecp[i].func_no = DM_FSYS_WRITE_INVIS_RVP;
+ vecp[i++].u_fc.write_invis_rvp = xfs_dm_write_invis_rvp;
+ vecp[i].func_no = DM_FSYS_OBJ_REF_HOLD;
+ vecp[i++].u_fc.obj_ref_hold = xfs_dm_obj_ref_hold;
+
+ return(0);
+}
+
+
+/* xfs_dm_send_mmap_event - send events needed for memory mapping a file.
+ *
+ * This is a workaround called for files that are about to be
+ * mapped. DMAPI events are not being generated at a low enough level
+ * in the kernel for page reads/writes to generate the correct events.
+ * So for memory-mapped files we generate read or write events for the
+ * whole byte range being mapped. If the mmap call can never cause a
+ * write to the file, then only a read event is sent.
+ *
+ * Code elsewhere prevents adding managed regions to a file while it
+ * is still mapped.
+ */
+
+STATIC int
+xfs_dm_send_mmap_event(
+ struct vm_area_struct *vma,
+ unsigned int wantflag)
+{
+ xfs_inode_t *ip;
+ int error = 0;
+ dm_eventtype_t max_event = DM_EVENT_READ;
+ xfs_fsize_t filesize;
+ xfs_off_t length, end_of_area, evsize, offset;
+ int iolock;
+
+ if (!vma->vm_file)
+ return 0;
+
+ ip = XFS_I(vma->vm_file->f_dentry->d_inode);
+
+ if (!S_ISREG(vma->vm_file->f_dentry->d_inode->i_mode) ||
+ !(ip->i_mount->m_flags & XFS_MOUNT_DMAPI))
+ return 0;
+
+ /* If they specifically asked for 'read', then give it to them.
+ * Otherwise, see if it's possible to give them 'write'.
+ */
+ if( wantflag & VM_READ ){
+ max_event = DM_EVENT_READ;
+ }
+ else if( ! (vma->vm_flags & VM_DENYWRITE) ) {
+ if((wantflag & VM_WRITE) || (vma->vm_flags & VM_WRITE))
+ max_event = DM_EVENT_WRITE;
+ }
+
+ if( (wantflag & VM_WRITE) && (max_event != DM_EVENT_WRITE) ){
+ return -EACCES;
+ }
+
+ /* Figure out how much of the file is being requested by the user. */
+ offset = 0; /* beginning of file, for now */
+ length = 0; /* whole file, for now */
+
+ filesize = ip->i_new_size;
+ if (filesize < ip->i_size) {
+ filesize = ip->i_size;
+ }
+
+ /* Set first byte number beyond the map area. */
+
+ if (length) {
+ end_of_area = offset + length;
+ if (end_of_area > filesize)
+ end_of_area = filesize;
+ } else {
+ end_of_area = filesize;
+ }
+
+ /* Set the real amount being mapped. */
+ evsize = end_of_area - offset;
+ if (evsize < 0)
+ evsize = 0;
+
+ if (max_event == DM_EVENT_READ)
+ iolock = XFS_IOLOCK_SHARED;
+ else
+ iolock = XFS_IOLOCK_EXCL;
+
+ xfs_ilock(ip, iolock);
+ /* If write possible, try a DMAPI write event */
+ if (max_event == DM_EVENT_WRITE && DM_EVENT_ENABLED(ip, max_event)) {
+ error = xfs_dm_send_data_event(max_event, ip, offset,
+ evsize, 0, &iolock);
+ goto out_unlock;
+ }
+
+ /* Try a read event if max_event was != DM_EVENT_WRITE or if it
+ * was DM_EVENT_WRITE but the WRITE event was not enabled.
+ */
+ if (DM_EVENT_ENABLED(ip, DM_EVENT_READ)) {
+ error = xfs_dm_send_data_event(DM_EVENT_READ, ip, offset,
+ evsize, 0, &iolock);
+ }
+out_unlock:
+ xfs_iunlock(ip, iolock);
+ return -error;
+}
+
+
+STATIC int
+xfs_dm_send_destroy_event(
+ xfs_inode_t *ip,
+ dm_right_t vp_right) /* always DM_RIGHT_NULL */
+{
+ /* Returns positive errors to XFS */
+ return -dm_send_destroy_event(&ip->i_vnode, vp_right);
+}
+
+
+STATIC int
+xfs_dm_send_namesp_event(
+ dm_eventtype_t event,
+ struct xfs_mount *mp,
+ xfs_inode_t *ip1,
+ dm_right_t vp1_right,
+ xfs_inode_t *ip2,
+ dm_right_t vp2_right,
+ const char *name1,
+ const char *name2,
+ mode_t mode,
+ int retcode,
+ int flags)
+{
+ /* Returns positive errors to XFS */
+ return -dm_send_namesp_event(event, mp ? mp->m_super : NULL,
+ &ip1->i_vnode, vp1_right,
+ ip2 ? &ip2->i_vnode : NULL, vp2_right,
+ name1, name2,
+ mode, retcode, flags);
+}
+
+STATIC int
+xfs_dm_send_mount_event(
+ struct xfs_mount *mp,
+ dm_right_t root_right,
+ char *mtpt,
+ char *fsname)
+{
+ return dm_send_mount_event(mp->m_super, root_right,
+ NULL, DM_RIGHT_NULL,
+ mp->m_rootip ? VFS_I(mp->m_rootip) : NULL,
+ DM_RIGHT_NULL, mtpt, fsname);
+}
+
+STATIC void
+xfs_dm_send_unmount_event(
+ struct xfs_mount *mp,
+ xfs_inode_t *ip, /* NULL if unmount successful */
+ dm_right_t vfsp_right,
+ mode_t mode,
+ int retcode, /* errno, if unmount failed */
+ int flags)
+{
+ dm_send_unmount_event(mp->m_super, ip ? &ip->i_vnode : NULL,
+ vfsp_right, mode, retcode, flags);
+}
+
+
+/*
+ * Data migration operations accessed by the rest of XFS.
+ * When DMAPI support is configured in, this vector is used.
+ */
+
+xfs_dmops_t xfs_dmcore_xfs = {
+ .xfs_send_data = xfs_dm_send_data_event,
+ .xfs_send_mmap = xfs_dm_send_mmap_event,
+ .xfs_send_destroy = xfs_dm_send_destroy_event,
+ .xfs_send_namesp = xfs_dm_send_namesp_event,
+ .xfs_send_mount = xfs_dm_send_mount_event,
+ .xfs_send_unmount = xfs_dm_send_unmount_event,
+};
+EXPORT_SYMBOL(xfs_dmcore_xfs);
+
+STATIC int
+xfs_dm_fh_to_inode(
+ struct super_block *sb,
+ struct inode **inode,
+ dm_fid_t *dmfid)
+{
+ xfs_mount_t *mp = XFS_M(sb);
+ xfs_inode_t *ip;
+ xfs_ino_t ino;
+ unsigned int igen;
+ int error;
+
+ *inode = NULL;
+
+ if (!dmfid->dm_fid_len) {
+ /* filesystem handle */
+ *inode = igrab(&mp->m_rootip->i_vnode);
+ if (!*inode)
+ return -ENOENT;
+ return 0;
+ }
+
+ if (dmfid->dm_fid_len != sizeof(*dmfid) - sizeof(dmfid->dm_fid_len))
+ return -EINVAL;
+
+ ino = dmfid->dm_fid_ino;
+ igen = dmfid->dm_fid_gen;
+
+ /* fail requests for ino 0 gracefully. */
+ if (ino == 0)
+ return -ESTALE;
+
+ error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
+ if (error)
+ return -error;
+ if (!ip)
+ return -EIO;
+
+ if (!ip->i_d.di_mode || ip->i_d.di_gen != igen) {
+ xfs_iput_new(ip, XFS_ILOCK_SHARED);
+ return -ENOENT;
+ }
+
+ *inode = &ip->i_vnode;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return 0;
+}
+
+STATIC int
+xfs_dm_inode_to_fh(
+ struct inode *inode,
+ dm_fid_t *dmfid,
+ dm_fsid_t *dmfsid)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+
+ /* Returns negative errors to DMAPI */
+
+ if (ip->i_mount->m_fixedfsid == NULL)
+ return -EINVAL;
+
+ dmfid->dm_fid_len = sizeof(dm_fid_t) - sizeof(dmfid->dm_fid_len);
+ dmfid->dm_fid_pad = 0;
+ /*
+ * use memcpy because the inode is a long long and there's no
+ * assurance that dmfid->dm_fid_ino is properly aligned.
+ */
+ memcpy(&dmfid->dm_fid_ino, &ip->i_ino, sizeof(dmfid->dm_fid_ino));
+ dmfid->dm_fid_gen = ip->i_d.di_gen;
+
+ memcpy(dmfsid, ip->i_mount->m_fixedfsid, sizeof(*dmfsid));
+ return 0;
+}
+
+STATIC void
+xfs_dm_get_fsid(
+ struct super_block *sb,
+ dm_fsid_t *fsid)
+{
+ memcpy(fsid, XFS_M(sb)->m_fixedfsid, sizeof(*fsid));
+}
+
+/*
+ * Filesystem operations accessed by the DMAPI core.
+ */
+static struct filesystem_dmapi_operations xfs_dmapiops = {
+ .get_fsys_vector = xfs_dm_get_dmapiops,
+ .fh_to_inode = xfs_dm_fh_to_inode,
+ .inode_to_fh = xfs_dm_inode_to_fh,
+ .get_fsid = xfs_dm_get_fsid,
+};
+
+static int __init
+xfs_dm_init(void)
+{
+ printk(KERN_INFO "SGI XFS Data Management API subsystem\n");
+
+ dmapi_register(&xfs_fs_type, &xfs_dmapiops);
+ return 0;
+}
+
+static void __exit
+xfs_dm_exit(void)
+{
+ dmapi_unregister(&xfs_fs_type);
+}
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("SGI XFS dmapi subsystem");
+MODULE_LICENSE("GPL");
+
+module_init(xfs_dm_init);
+module_exit(xfs_dm_exit);
#include <linux/dcache.h>
static const struct vm_operations_struct xfs_file_vm_ops;
+#ifdef HAVE_DMAPI
+static struct vm_operations_struct xfs_dmapi_file_vm_ops;
+#endif
- STATIC ssize_t
- xfs_file_aio_read(
- struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+ /*
+ * xfs_iozero
+ *
+ * xfs_iozero clears the specified range of buffer supplied,
+ * and marks all the affected blocks as valid and modified. If
+ * an affected block is not allocated, it will be allocated. If
+ * an affected block is not completely overwritten, and is not
+ * valid before the operation, it will be read from disk before
+ * being partially zeroed.
+ */
+ STATIC int
+ xfs_iozero(
+ struct xfs_inode *ip, /* inode */
+ loff_t pos, /* offset in file */
+ size_t count) /* size of data to zero */
{
- struct file *file = iocb->ki_filp;
- int ioflags = 0;
+ struct page *page;
+ struct address_space *mapping;
+ int status;
- BUG_ON(iocb->ki_pos != pos);
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
- return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
- nr_segs, &iocb->ki_pos, ioflags);
+ mapping = VFS_I(ip)->i_mapping;
+ do {
+ unsigned offset, bytes;
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
+
+ zero_user(page, offset, bytes);
+
+ status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+ page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ pos += bytes;
+ count -= bytes;
+ status = 0;
+ } while (count);
+
+ return (-status);
}
-STATIC int
-xfs_file_fsync(
- struct file *file,
- struct dentry *dentry,
- int datasync)
++int
++xfs_fsync(struct inode *inode, int datasync)
+ {
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
++ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_trans *tp;
+ int error = 0;
+ int log_flushed = 0;
+
+ xfs_itrace_entry(ip);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -XFS_ERROR(EIO);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ /*
+ * We always need to make sure that the required inode state is safe on
+ * disk. The inode might be clean but we still might need to force the
+ * log because of committed transactions that haven't hit the disk yet.
+ * Likewise, there could be unflushed non-transactional changes to the
+ * inode core that have to go to disk and this requires us to issue
+ * a synchronous transaction to capture these changes correctly.
+ *
+ * This code relies on the assumption that if the i_update_core field
+ * of the inode is clear and the inode is unpinned then it is clean
+ * and no action is required.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ /*
+ * First check if the VFS inode is marked dirty. All the dirtying
+	 * of non-transactional updates now goes through mark_inode_dirty*,
+	 * which allows us to distinguish between pure timestamp updates
+	 * and i_size updates which need to be caught for fdatasync.
+	 * After that also check for the dirty state in the XFS inode, which
+	 * might get cleared when the inode gets written out via the AIL
+ * or xfs_iflush_cluster.
+ */
- if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
- ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
++ if (((inode->i_state & I_DIRTY_DATASYNC) ||
++ ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+ ip->i_update_core) {
+ /*
+ * Kick off a transaction to log the inode core to get the
+ * updates. The sync transaction will also force the log.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0,
+ XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return -error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ /*
+ * Note - it's possible that we might have pushed ourselves out
+ * of the way during trans_reserve which would flush the inode.
+ * But there's no guarantee that the inode buffer has actually
+ * gone out yet (it's delwri). Plus the buffer could be pinned
+ * anyway if it's part of an inode in another recent
+ * transaction. So we play it safe and fire off the
+ * transaction anyway.
+ */
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(tp, ip);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_set_sync(tp);
+ error = _xfs_trans_commit(tp, 0, &log_flushed);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ } else {
+ /*
+ * Timestamps/size haven't changed since last inode flush or
+ * inode transaction commit. That means either nothing got
+ * written or a transaction committed which caught the updates.
+ * If the latter happened and the transaction hasn't hit the
+ * disk yet, the inode will be still be pinned. If it is,
+ * force the log.
+ */
+ if (xfs_ipincount(ip)) {
+ error = _xfs_log_force_lsn(ip->i_mount,
+ ip->i_itemp->ili_last_lsn,
+ XFS_LOG_SYNC, &log_flushed);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ }
+
+ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+ /*
+ * If the log write didn't issue an ordered tag we need
+ * to flush the disk cache for the data device now.
+ */
+ if (!log_flushed)
+ xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+
+ /*
+ * If this inode is on the RT dev we need to flush that
+ * cache as well.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+ }
+
+ return -error;
+ }
+
++STATIC int
++xfs_file_fsync(
++ struct file *file,
++ struct dentry *dentry,
++ int datasync)
++{
++ return xfs_fsync(dentry->d_inode, datasync);
++}
++
++
STATIC ssize_t
- xfs_file_aio_write(
+ xfs_file_aio_read(
struct kiocb *iocb,
- const struct iovec *iov,
+ const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos)
{
return -xfs_release(XFS_I(inode));
}
- /*
- * We ignore the datasync flag here because a datasync is effectively
- * identical to an fsync. That is, datasync implies that we need to write
- * only the metadata needed to be able to access the data that is written
- * if we crash after the call completes. Hence if we are writing beyond
- * EOF we have to log the inode size change as well, which makes it a
- * full fsync. If we don't write beyond EOF, the inode core will be
- * clean in memory and so we don't need to log the inode, just like
- * fsync.
- */
- STATIC int
- xfs_file_fsync(
- struct file *file,
- struct dentry *dentry,
- int datasync)
- {
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
-
- xfs_iflags_clear(ip, XFS_ITRUNCATED);
- return -xfs_fsync(ip);
- }
-
+#ifdef HAVE_DMAPI
+STATIC int
+xfs_vm_fault(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+
+ ASSERT_ALWAYS(mp->m_flags & XFS_MOUNT_DMAPI);
+
+ if (XFS_SEND_MMAP(mp, vma, 0))
+ return VM_FAULT_SIGBUS;
+ return filemap_fault(vma, vmf);
+}
+#endif /* HAVE_DMAPI */
+
STATIC int
xfs_file_readdir(
struct file *filp,
extern void xfs_setup_inode(struct xfs_inode *);
++extern int xfs_fsync(struct inode *, int);
++
#endif /* __XFS_IOPS_H__ */
++
}
/*
- * xfs_fsync
- *
- * This is called to sync the inode and its data out to disk. We need to hold
- * the I/O lock while flushing the data, and the inode lock while flushing the
- * inode. The inode lock CANNOT be held while flushing the data, so acquire
- * after we're done with that.
- */
- int
- xfs_fsync(
- xfs_inode_t *ip)
- {
- xfs_trans_t *tp;
- int error = 0;
- int log_flushed = 0, changed = 1;
-
- xfs_itrace_entry(ip);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return XFS_ERROR(EIO);
-
- /*
- * We always need to make sure that the required inode state is safe on
- * disk. The inode might be clean but we still might need to force the
- * log because of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional changes to the
- * inode core that have to go to disk and this requires us to issue
- * a synchronous transaction to capture these changes correctly.
- *
- * This code relies on the assumption that if the update_* fields
- * of the inode are clear and the inode is unpinned then it is clean
- * and no action is required.
- */
- xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- if (!ip->i_update_core) {
- /*
- * Timestamps/size haven't changed since last inode flush or
- * inode transaction commit. That means either nothing got
- * written or a transaction committed which caught the updates.
- * If the latter happened and the transaction hasn't hit the
- * disk yet, the inode will be still be pinned. If it is,
- * force the log.
- */
-
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (xfs_ipincount(ip)) {
- error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
- XFS_LOG_FORCE | XFS_LOG_SYNC,
- &log_flushed);
- } else {
- /*
- * If the inode is not pinned and nothing has changed
- * we don't need to flush the cache.
- */
- changed = 0;
- }
- } else {
- /*
- * Kick off a transaction to log the inode core to get the
- * updates. The sync transaction will also force the log.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Note - it's possible that we might have pushed ourselves out
- * of the way during trans_reserve which would flush the inode.
- * But there's no guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer could be pinned
- * anyway if it's part of an inode in another recent
- * transaction. So we play it safe and fire off the
- * transaction anyway.
- */
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = _xfs_trans_commit(tp, 0, &log_flushed);
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
- /*
- * If the log write didn't issue an ordered tag we need
- * to flush the disk cache for the data device now.
- */
- if (!log_flushed)
- xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
-
- /*
- * If this inode is on the RT dev we need to flush that
- * cache as well.
- */
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
- }
-
- return error;
- }
-
- /*
- * Flags for xfs_free_eofblocks
- */
-#define XFS_FREE_EOF_TRYLOCK (1<<0)
-
-/*
* This is called by xfs_inactive to free any blocks beyond eof
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
#define MAX_PHYS_SEGMENTS 128
#define MAX_HW_SEGMENTS 128
#define SAFE_MAX_SECTORS 255
+ #define MAX_SEGMENT_SIZE 65536
+
+ enum blk_default_limits {
+ BLK_MAX_SEGMENTS = 128,
+ BLK_SAFE_MAX_SECTORS = 255,
+#ifndef CONFIG_KERNEL_DESKTOP
- #define BLK_DEF_MAX_SECTORS 2048
++ BLK_DEF_MAX_SECTORS = 2048,
+#else
- #define BLK_DEF_MAX_SECTORS 1024
+ BLK_DEF_MAX_SECTORS = 1024,
+#endif
-
- #define MAX_SEGMENT_SIZE 65536
-
- #define BLK_SEG_BOUNDARY_MASK 0xFFFFFFFFUL
+ BLK_MAX_SEGMENT_SIZE = 65536,
+ BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
+ };
#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
+#ifdef CONFIG_NETVM
+ __u8 emergency:1;
+#endif
- #ifdef CONFIG_XEN
- __u8 proto_data_valid:1,
- proto_csum_blank:1;
- #endif
kmemcheck_bitfield_end(flags2);
- /* 0/9...15 bit hole */
+ /* 0/14 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
void **freelist; /* Pointer to first free per cpu object */
struct page *page; /* The slab from which we are allocating */
int node; /* The node of the page (or -1 for debug) */
- unsigned int offset; /* Freepointer offset (in word units) */
- unsigned int objsize; /* Size of an object (from kmem_cache) */
+ int reserve; /* Did the current page come from the reserve */
#ifdef CONFIG_SLUB_STATS
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
config HAVE_UNSTABLE_SCHED_CLOCK
bool
- config GROUP_SCHED
- bool "Group CPU scheduler"
- depends on EXPERIMENTAL
- default n if KERNEL_DESKTOP
- default y
- help
- This feature lets CPU scheduler recognize task groups and control CPU
- bandwidth allocation to such task groups.
- In order to create a group from arbitrary set of processes, use
- CONFIG_CGROUPS. (See Control Group support.)
-
- config FAIR_GROUP_SCHED
- bool "Group scheduling for SCHED_OTHER"
- depends on GROUP_SCHED
- default GROUP_SCHED
-
- config RT_GROUP_SCHED
- bool "Group scheduling for SCHED_RR/FIFO"
- depends on EXPERIMENTAL
- depends on GROUP_SCHED
- default n
- help
- This feature lets you explicitly allocate real CPU bandwidth
- to users or control groups (depending on the "Basis for grouping tasks"
- setting below. If enabled, it will also make it impossible to
- schedule realtime tasks for non-root users until you allocate
- realtime bandwidth for them.
- See Documentation/scheduler/sched-rt-group.txt for more information.
-
- choice
- depends on GROUP_SCHED
- prompt "Basis for grouping tasks"
- default USER_SCHED
-
- config USER_SCHED
- bool "user id"
- help
- This option will choose userid as the basis for grouping
- tasks, thus providing equal CPU bandwidth to each user.
-
- config CGROUP_SCHED
- bool "Control groups"
- depends on CGROUPS
- help
- This option allows you to create arbitrary task groups
- using the "cgroup" pseudo filesystem and control
- the cpu bandwidth allocated to each such task group.
- Refer to Documentation/cgroups/cgroups.txt for more
- information on "cgroup" pseudo filesystem.
-
- endchoice
-
menuconfig CGROUPS
boolean "Control Group support"
+ default n if KERNEL_DESKTOP
+ default y
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
size is 4096bytes, 512k per 1Gbytes of swap.
+ menuconfig CGROUP_SCHED
+ bool "Group CPU scheduler"
+ depends on EXPERIMENTAL && CGROUPS
- default n
++ default n if KERNEL_DESKTOP
++ default y
+ help
+ This feature lets CPU scheduler recognize task groups and control CPU
+ bandwidth allocation to such task groups. It uses cgroups to group
+ tasks.
+
+ if CGROUP_SCHED
+ config FAIR_GROUP_SCHED
+ bool "Group scheduling for SCHED_OTHER"
+ depends on CGROUP_SCHED
+ default CGROUP_SCHED
+
+ config RT_GROUP_SCHED
+ bool "Group scheduling for SCHED_RR/FIFO"
+ depends on EXPERIMENTAL
+ depends on CGROUP_SCHED
+ default n
+ help
+ This feature lets you explicitly allocate real CPU bandwidth
+ to users or control groups (depending on the "Basis for grouping tasks"
+	  setting below). If enabled, it will also make it impossible to
+ schedule realtime tasks for non-root users until you allocate
+ realtime bandwidth for them.
+ See Documentation/scheduler/sched-rt-group.txt for more information.
+
+ endif #CGROUP_SCHED
+
endif # CGROUPS
config MM_OWNER
See Documentation/nommu-mmap.txt for more information.
+config DEFAULT_VM_DIRTY_RATIO
+ int "Default VM dirty ratio (in %)"
+ default 20 if KERNEL_DESKTOP
+ default 40
+ help
+	  Allows the VM dirty ratio to be tuned to suit different workloads.
+	  An increased VM dirty ratio improves performance of most server
+	  workloads that dirty a lot of memory (e.g. simple databases not
+	  using direct IO, workloads doing heavy writes). Latency-sensitive
+	  workloads such as desktops and typical workstations perform better
+	  with a decreased VM dirty ratio.
+
+ Recommended value for desktop workload is 20.
+ Recommended value for server workload is 40.
+
+ Only use this if you really know what you are doing.
+
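
The Kconfig default above only selects the compiled-in value; the ratio can still be changed at runtime through the usual procfs knob. A minimal user-space sketch (illustrative only) that applies the desktop-oriented value of 20:

	#include <stdio.h>

	int main(void)
	{
		/* /proc/sys/vm/dirty_ratio is the runtime view of vm_dirty_ratio */
		FILE *f = fopen("/proc/sys/vm/dirty_ratio", "w");

		if (!f) {
			perror("dirty_ratio");
			return 1;
		}
		fprintf(f, "%d\n", 20);	/* e.g. the latency-sensitive default */
		fclose(f);
		return 0;
	}
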
config PROFILING
- bool "Profiling support (EXPERIMENTAL)"
+ bool "Profiling support"
help
Say Y here to enable the extended profiling support mechanisms used
by profilers such as OProfile.
spin_lock(&mq_lock);
if (u->mq_bytes + mq_bytes < u->mq_bytes ||
u->mq_bytes + mq_bytes >
- p->signal->rlim[RLIMIT_MSGQUEUE].rlim_cur) {
+ task_rlimit(p, RLIMIT_MSGQUEUE)) {
spin_unlock(&mq_lock);
+ kfree(info->messages);
goto out_inode;
}
u->mq_bytes += mq_bytes;
--- /dev/null
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/types.h>
+#include <linux/kdb.h>
+#include <linux/kdbprivate.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_host.h>
+#include <asm/pgtable.h>
+
+MODULE_AUTHOR("SGI");
+MODULE_DESCRIPTION("Debug VM information");
+MODULE_LICENSE("GPL");
+
+struct __vmflags {
+ unsigned long mask;
+ char *name;
+};
+
+static struct __vmflags vmflags[] = {
+ { VM_READ, "VM_READ " },
+ { VM_WRITE, "VM_WRITE " },
+ { VM_EXEC, "VM_EXEC " },
+ { VM_SHARED, "VM_SHARED " },
+ { VM_MAYREAD, "VM_MAYREAD " },
+ { VM_MAYWRITE, "VM_MAYWRITE " },
+ { VM_MAYEXEC, "VM_MAYEXEC " },
+ { VM_MAYSHARE, "VM_MAYSHARE " },
+ { VM_GROWSDOWN, "VM_GROWSDOWN " },
+ { VM_GROWSUP, "VM_GROWSUP " },
+ { VM_PFNMAP, "VM_PFNMAP " },
+ { VM_DENYWRITE, "VM_DENYWRITE " },
+ { VM_EXECUTABLE, "VM_EXECUTABLE " },
+ { VM_LOCKED, "VM_LOCKED " },
+ { VM_IO, "VM_IO " },
+ { VM_SEQ_READ, "VM_SEQ_READ " },
+ { VM_RAND_READ, "VM_RAND_READ " },
+ { VM_DONTCOPY, "VM_DONTCOPY " },
+ { VM_DONTEXPAND, "VM_DONTEXPAND " },
+ { VM_RESERVED, "VM_RESERVED " },
+ { VM_ACCOUNT, "VM_ACCOUNT " },
+ { VM_HUGETLB, "VM_HUGETLB " },
+ { VM_NONLINEAR, "VM_NONLINEAR " },
+ { VM_MAPPED_COPY, "VM_MAPPED_COPY " },
+ { VM_INSERTPAGE, "VM_INSERTPAGE " },
+ { 0, "" }
+};
+
+static int
+kdbm_print_vm(struct vm_area_struct *vp, unsigned long addr, int verbose_flg)
+{
+ struct __vmflags *tp;
+
+ kdb_printf("struct vm_area_struct at 0x%lx for %d bytes\n",
+ addr, (int) sizeof (struct vm_area_struct));
+
+ kdb_printf("vm_start = 0x%p vm_end = 0x%p\n", (void *) vp->vm_start,
+ (void *) vp->vm_end);
+ kdb_printf("vm_page_prot = 0x%llx\n",
+ (unsigned long long)pgprot_val(vp->vm_page_prot));
+
+ kdb_printf("vm_flags: ");
+ for (tp = vmflags; tp->mask; tp++) {
+ if (vp->vm_flags & tp->mask) {
+ kdb_printf(" %s", tp->name);
+ }
+ }
+ kdb_printf("\n");
+
+ if (!verbose_flg)
+ return 0;
+
+ kdb_printf("vm_mm = 0x%p\n", (void *) vp->vm_mm);
+ kdb_printf("vm_next = 0x%p\n", (void *) vp->vm_next);
+ kdb_printf("shared.vm_set.list.next = 0x%p\n", (void *) vp->shared.vm_set.list.next);
+ kdb_printf("shared.vm_set.list.prev = 0x%p\n", (void *) vp->shared.vm_set.list.prev);
+ kdb_printf("shared.vm_set.parent = 0x%p\n", (void *) vp->shared.vm_set.parent);
+ kdb_printf("shared.vm_set.head = 0x%p\n", (void *) vp->shared.vm_set.head);
- kdb_printf("anon_vma_node.next = 0x%p\n", (void *) vp->anon_vma_node.next);
- kdb_printf("anon_vma_node.prev = 0x%p\n", (void *) vp->anon_vma_node.prev);
++ kdb_printf("anon_vma_chain.next = 0x%p\n", (void *) vp->anon_vma_chain.next);
++ kdb_printf("anon_vma_chain.prev = 0x%p\n", (void *) vp->anon_vma_chain.prev);
+ kdb_printf("vm_ops = 0x%p\n", (void *) vp->vm_ops);
+ if (vp->vm_ops != NULL) {
+ kdb_printf("vm_ops->open = 0x%p\n", vp->vm_ops->open);
+ kdb_printf("vm_ops->close = 0x%p\n", vp->vm_ops->close);
+ kdb_printf("vm_ops->fault = 0x%p\n", vp->vm_ops->fault);
+#ifdef HAVE_VMOP_MPROTECT
+ kdb_printf("vm_ops->mprotect = 0x%p\n", vp->vm_ops->mprotect);
+#endif
+#ifdef CONFIG_NUMA
+ kdb_printf("vm_ops->set_policy = 0x%p\n", vp->vm_ops->set_policy);
+ kdb_printf("vm_ops->get_policy = 0x%p\n", vp->vm_ops->get_policy);
+#endif
+ }
+ kdb_printf("vm_pgoff = 0x%lx\n", vp->vm_pgoff);
+ kdb_printf("vm_file = 0x%p\n", (void *) vp->vm_file);
+ kdb_printf("vm_private_data = 0x%p\n", vp->vm_private_data);
+#ifdef CONFIG_NUMA
+ kdb_printf("vm_policy = 0x%p\n", vp->vm_policy);
+#endif
+
+ return 0;
+}
+
+static int
+kdbm_print_vmp(struct vm_area_struct *vp, int verbose_flg)
+{
+ struct __vmflags *tp;
+
+ if (verbose_flg) {
+ kdb_printf("0x%lx: ", (unsigned long) vp);
+ }
+
+ kdb_printf("0x%p 0x%p ", (void *) vp->vm_start, (void *) vp->vm_end);
+
+ for (tp = vmflags; tp->mask; tp++) {
+ if (vp->vm_flags & tp->mask) {
+ kdb_printf(" %s", tp->name);
+ }
+ }
+ kdb_printf("\n");
+
+ return 0;
+}
+
+
+#ifdef CONFIG_NUMA
+#include <linux/mempolicy.h>
+
+/*
+ * kdbm_mpol
+ *
+ * This function implements the 'mempolicy' command.
+ * Print a struct mempolicy.
+ *
+ * mempolicy <address> Print struct mempolicy at <address>
+ */
+static int
+kdbm_mpol(int argc, const char **argv)
+{
+ unsigned long addr;
+ long offset = 0;
+ int nextarg;
+ int err = 0;
+ struct mempolicy *mp = NULL;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((err = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL)) != 0)
+ return(err);
+
+ if (!(mp = kmalloc(sizeof(*mp), GFP_ATOMIC))) {
+ kdb_printf("%s: cannot kmalloc mp\n", __FUNCTION__);
+ goto out;
+ }
+
+ if ((err = kdb_getarea(*mp, addr))) {
+ kdb_printf("%s: invalid mempolicy address\n", __FUNCTION__);
+ goto out;
+ }
+
+ kdb_printf("struct mempolicy at 0x%p\n", (struct mempolicy *)addr);
+ kdb_printf(" refcnt %d\n", atomic_read(&mp->refcnt));
+
+ switch (mp->mode) {
+ case MPOL_DEFAULT:
+ kdb_printf(" mode %d (MPOL_DEFAULT)\n", mp->mode);
+ break;
+
+ case MPOL_PREFERRED:
+ kdb_printf(" mode %d (MPOL_PREFERRED)\n", mp->mode);
+ if (mp->flags & MPOL_F_LOCAL)
+ kdb_printf(" preferred_node local\n");
+ else
+ kdb_printf(" preferred_node %d\n", mp->v.preferred_node);
+ break;
+
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ {
+ int i, nlongs;
+ unsigned long *longp;
+
+ kdb_printf(" mode %d (%s)\n", mp->mode,
+ mp->mode == MPOL_INTERLEAVE
+ ? "MPOL_INTERLEAVE"
+ : "MPOL_BIND");
+ nlongs = (int)BITS_TO_LONGS(MAX_NUMNODES);
+ kdb_printf(" nodes:");
+ longp = mp->v.nodes.bits;
+ for (i = 0; i < nlongs; i++, longp++)
+ kdb_printf(" 0x%lx ", *longp);
+ kdb_printf("\n");
+ break;
+ }
+
+ default:
+ kdb_printf(" mode %d (unknown)\n", mp->mode);
+ break;
+ }
+out:
+ if (mp)
+ kfree(mp);
+ return err;
+}
+
+#endif /* CONFIG_NUMA */
+
+/*
+ * kdbm_pgdat
+ *
+ * This function implements the 'pgdat' command.
+ * Print a struct pglist_data (pg_data_t).
+ *
+ * pgdat <node_id> Print struct pglist_data for node <node_id>.
+ *
+ * Print pglist_data for node 0 if node_id not specified,
+ * or print the one pglist_data structure if !CONFIG_NUMA.
+ */
+static int
+kdbm_pgdat(int argc, const char **argv)
+{
+ int err = 0, node_id = 0, i;
+ pg_data_t *pgdatp = NULL;
+
+#ifdef CONFIG_NUMA
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+ if (argc == 1) {
+ int nextarg;
+ long offset = 0;
+ unsigned long node_id_ul;
+
+ nextarg = 1;
+ if ((err = kdbgetaddrarg(argc, argv, &nextarg, &node_id_ul,
+ &offset, NULL)) != 0) {
+ return(err);
+ }
+ node_id = (int)node_id_ul;
+ }
+#endif
+ for_each_online_pgdat(pgdatp) {
+ if (pgdatp->node_id == node_id)
+ break;
+ }
+ if (!pgdatp) {
+ kdb_printf("%s: specified node not found\n", __FUNCTION__);
+ return 0;
+ }
+ kdb_printf("struct pglist_data at 0x%p node_id = %d\n",
+ pgdatp, pgdatp->node_id);
+
+ for (i = 0; i < MAX_ZONELISTS; i++) {
+ int zr;
+ struct zoneref *zonerefp;
+ struct zone *zonep;
+
+ zonerefp = pgdatp->node_zonelists[i]._zonerefs;
+ kdb_printf(" _zonerefs[%d] at 0x%p\n", i, zonerefp);
+
+ for (zr = 0; zr <= MAX_ZONES_PER_ZONELIST; zr++, zonerefp++) {
+ int z;
+ pg_data_t *tmp_pgdatp;
+
+ zonep = zonelist_zone(zonerefp);
+ if (!zonep)
+ break;
+
+ kdb_printf(" 0x%p", zonep);
+
+ for_each_online_pgdat(tmp_pgdatp) {
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ if (zonep == &tmp_pgdatp->node_zones[z]) {
+ kdb_printf (" (node %d node_zones[%d])",
+ tmp_pgdatp->node_id, z);
+ break;
+ }
+ }
+ if (z != MAX_NR_ZONES)
+ break; /* found it */
+ }
+ kdb_printf("\n");
+ }
+ }
+
+ kdb_printf(" nr_zones = %d", pgdatp->nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ kdb_printf(" node_mem_map = 0x%p\n", pgdatp->node_mem_map);
+#endif
++#ifndef CONFIG_NO_BOOTMEM
+ kdb_printf(" bdata = 0x%p", pgdatp->bdata);
++#endif
+ kdb_printf(" node_start_pfn = 0x%lx\n", pgdatp->node_start_pfn);
+ kdb_printf(" node_present_pages = %ld (0x%lx)\n",
+ pgdatp->node_present_pages, pgdatp->node_present_pages);
+ kdb_printf(" node_spanned_pages = %ld (0x%lx)\n",
+ pgdatp->node_spanned_pages, pgdatp->node_spanned_pages);
+ kdb_printf(" kswapd = 0x%p\n", pgdatp->kswapd);
+
+ return err;
+}
+
+/*
+ * kdbm_vm
+ *
+ * This function implements the 'vm' command. Print a vm_area_struct.
+ *
+ * vm [-v] <address> Print vm_area_struct at <address>
+ * vmp [-v] <pid> Print all vm_area_structs for <pid>
+ */
+
+static int
+kdbm_vm(int argc, const char **argv)
+{
+ unsigned long addr;
+ long offset = 0;
+ int nextarg;
+ int diag;
+ int verbose_flg = 0;
+
+ if (argc == 2) {
+ if (strcmp(argv[1], "-v") != 0) {
+ return KDB_ARGCOUNT;
+ }
+ verbose_flg = 1;
+ } else if (argc != 1) {
+ return KDB_ARGCOUNT;
+ }
+
+ if (strcmp(argv[0], "vmp") == 0) {
+ struct task_struct *g, *tp;
+ struct vm_area_struct *vp;
+ pid_t pid;
+
+ if ((diag = kdbgetularg(argv[argc], (unsigned long *) &pid)))
+ return diag;
+
+ kdb_do_each_thread(g, tp) {
+ if (tp->pid == pid) {
+ if (tp->mm != NULL) {
+ if (verbose_flg)
+ kdb_printf
+ ("vm_area_struct ");
+ kdb_printf
+ ("vm_start vm_end vm_flags\n");
+ vp = tp->mm->mmap;
+ while (vp != NULL) {
+ kdbm_print_vmp(vp, verbose_flg);
+ vp = vp->vm_next;
+ }
+ }
+ return 0;
+ }
+ } kdb_while_each_thread(g, tp);
+
+ kdb_printf("No process with pid == %d found\n", pid);
+
+ } else {
+ struct vm_area_struct v;
+
+ nextarg = argc;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL))
+ || (diag = kdb_getarea(v, addr)))
+ return (diag);
+
+ kdbm_print_vm(&v, addr, verbose_flg);
+ }
+
+ return 0;
+}
+
+static int
+kdbm_print_pte(pte_t * pte)
+{
+ kdb_printf("0x%lx (", (unsigned long) pte_val(*pte));
+
+ if (pte_present(*pte)) {
+#ifdef pte_exec
+ if (pte_exec(*pte))
+ kdb_printf("X");
+#endif
+ if (pte_write(*pte))
+ kdb_printf("W");
+#ifdef pte_read
+ if (pte_read(*pte))
+ kdb_printf("R");
+#endif
+ if (pte_young(*pte))
+ kdb_printf("A");
+ if (pte_dirty(*pte))
+ kdb_printf("D");
+
+ } else {
+ kdb_printf("OFFSET=0x%lx ", swp_offset(pte_to_swp_entry(*pte)));
+ kdb_printf("TYPE=0x%ulx", swp_type(pte_to_swp_entry(*pte)));
+ }
+
+ kdb_printf(")");
+
+ /* final newline is output by caller of kdbm_print_pte() */
+
+ return 0;
+}
+
+/*
+ * kdbm_pte
+ *
+ * This function implements the 'pte' command. Print all pte_t structures
+ * that map to the given virtual address range (<address> through <address>
+ * plus <nbytes>) for the given process. The default value for nbytes is
+ * one.
+ *
+ * pte -m <mm> <address> [<nbytes>] Print all pte_t structures for
+ * virtual <address> in address space
+ * of <mm> which is a pointer to a
+ * mm_struct
+ * pte -p <pid> <address> [<nbytes>] Print all pte_t structures for
+ * virtual <address> in address space
+ * of <pid>
+ */
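+/*
+ * Example (the pid and address below are illustrative): show the pte
+ * mapping one page at user address 0x4000 in the address space of pid 42:
+ *
+ *	kdb> pte -p 42 0x4000
+ */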
+
+static int
+kdbm_pte(int argc, const char **argv)
+{
+ unsigned long addr;
+ long offset = 0;
+ int nextarg;
+ unsigned long nbytes = 1;
+ long npgs;
+ int diag;
+ int found;
+ pid_t pid;
+ struct task_struct *tp;
+ struct mm_struct *mm, copy_of_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (argc < 3 || argc > 4) {
+ return KDB_ARGCOUNT;
+ }
+
+ if (strcmp(argv[1], "-p") == 0) {
+ if ((diag = kdbgetularg(argv[2], (unsigned long *) &pid))) {
+ return diag;
+ }
+
+ found = 0;
+ for_each_process(tp) {
+ if (tp->pid == pid) {
+ if (tp->mm != NULL) {
+ found = 1;
+ break;
+ }
+ kdb_printf("task structure's mm field is NULL\n");
+ return 0;
+ }
+ }
+
+ if (!found) {
+ kdb_printf("No process with pid == %d found\n", pid);
+ return 0;
+ }
+ mm = tp->mm;
+ } else if (strcmp(argv[1], "-m") == 0) {
+
+
+ nextarg = 2;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL))
+ || (diag = kdb_getarea(copy_of_mm, addr)))
+ return (diag);
+ mm = &copy_of_mm;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ if ((diag = kdbgetularg(argv[3], &addr))) {
+ return diag;
+ }
+
+ if (argc == 4) {
+ if ((diag = kdbgetularg(argv[4], &nbytes))) {
+ return diag;
+ }
+ }
+
+ kdb_printf("vaddr pte\n");
+
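+ /* number of pages spanned by the range [addr, addr + nbytes) */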
+ npgs = ((((addr & ~PAGE_MASK) + nbytes) + ~PAGE_MASK) >> PAGE_SHIFT);
+ while (npgs-- > 0) {
+
+ kdb_printf("0x%p ", (void *) (addr & PAGE_MASK));
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, addr);
+ if (pmd_present(*pmd)) {
+ pte = pte_offset_map(pmd, addr);
+ if (pte_present(*pte)) {
+ kdbm_print_pte(pte);
+ }
+ }
+ }
+ }
+
+ kdb_printf("\n");
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/*
+ * kdbm_rpte
+ *
+ * This function implements the 'rpte' command. Print all pte_t structures
+ * that contain the given physical page range (<pfn> through <pfn>
+ * plus <npages>) for the given process. The default value for npages is
+ * one.
+ *
+ * rpte -m <mm> <pfn> [<npages>] Print all pte_t structures for
+ * physical page <pfn> in address space
+ * of <mm> which is a pointer to a
+ * mm_struct
+ * rpte -p <pid> <pfn> [<npages>] Print all pte_t structures for
+ * physical page <pfn> in address space
+ * of <pid>
+ */
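+/*
+ * Example (the pid and pfn below are illustrative): find every pte in
+ * pid 42's address space that maps physical page frame 0x1000:
+ *
+ *	kdb> rpte -p 42 0x1000
+ */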
+
+static int
+kdbm_rpte(int argc, const char **argv)
+{
+ unsigned long addr;
+ unsigned long pfn;
+ long offset = 0;
+ int nextarg;
+ unsigned long npages = 1;
+ int diag;
+ int found;
+ pid_t pid;
+ struct task_struct *tp;
+ struct mm_struct *mm, copy_of_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long g, u, m, t;
+
+ if (argc < 3 || argc > 4) {
+ return KDB_ARGCOUNT;
+ }
+
+ if (strcmp(argv[1], "-p") == 0) {
+ if ((diag = kdbgetularg(argv[2], (unsigned long *) &pid))) {
+ return diag;
+ }
+
+ found = 0;
+ for_each_process(tp) {
+ if (tp->pid == pid) {
+ if (tp->mm != NULL) {
+ found = 1;
+ break;
+ }
+ kdb_printf("task structure's mm field is NULL\n");
+ return 0;
+ }
+ }
+
+ if (!found) {
+ kdb_printf("No process with pid == %d found\n", pid);
+ return 0;
+ }
+ mm = tp->mm;
+ } else if (strcmp(argv[1], "-m") == 0) {
+
+
+ nextarg = 2;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset,
+ NULL))
+ || (diag = kdb_getarea(copy_of_mm, addr)))
+ return (diag);
+ mm = &copy_of_mm;
+ } else {
+ return KDB_ARGCOUNT;
+ }
+
+ if ((diag = kdbgetularg(argv[3], &pfn))) {
+ return diag;
+ }
+
+ if (argc == 4) {
+ if ((diag = kdbgetularg(argv[4], &npages))) {
+ return diag;
+ }
+ }
+
+ /* the spacing after "vaddr" depends on sizeof(unsigned long) */
+ kdb_printf("pfn vaddr%*s pte\n",
+ (int)(2*sizeof(unsigned long) + 2 - 5), " ");
+
+ for (g = 0, pgd = pgd_offset(mm, 0UL); g < PTRS_PER_PGD; ++g, ++pgd) {
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ continue;
+ for (u = 0, pud = pud_offset(pgd, 0UL); u < PTRS_PER_PUD; ++u, ++pud) {
+ if (pud_none(*pud) || pud_bad(*pud))
+ continue;
+ for (m = 0, pmd = pmd_offset(pud, 0UL); m < PTRS_PER_PMD; ++m, ++pmd) {
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ continue;
+ for (t = 0, pte = pte_offset_map(pmd, 0UL); t < PTRS_PER_PTE; ++t, ++pte) {
+ if (pte_none(*pte))
+ continue;
+ if (pte_pfn(*pte) < pfn || pte_pfn(*pte) >= (pfn + npages))
+ continue;
+ addr = g << PGDIR_SHIFT;
+#ifdef __ia64__
+ /* IA64 plays tricks with the pgd mapping to save space.
+ * This reverses pgd_index().
+ */
+ {
+ unsigned long region = g >> (PAGE_SHIFT - 6);
+ unsigned long l1index = g - (region << (PAGE_SHIFT - 6));
+ addr = (region << 61) + (l1index << PGDIR_SHIFT);
+ }
+#endif
+ addr += (m << PMD_SHIFT) + (t << PAGE_SHIFT);
+ kdb_printf("0x%-14lx " kdb_bfd_vma_fmt0 " ",
+ pte_pfn(*pte), addr);
+ kdbm_print_pte(pte);
+ kdb_printf("\n");
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+kdbm_print_dentry(unsigned long daddr)
+{
+ struct dentry d;
+ int diag;
+ char buf[256];
+
+ kdb_printf("Dentry at 0x%lx\n", daddr);
+ if ((diag = kdb_getarea(d, (unsigned long)daddr)))
+ return diag;
+
+ if ((d.d_name.len > sizeof(buf)) || (diag = kdb_getarea_size(buf, (unsigned long)(d.d_name.name), d.d_name.len)))
+ kdb_printf(" d_name.len = %d d_name.name = 0x%p\n",
+ d.d_name.len, d.d_name.name);
+ else
+ kdb_printf(" d_name.len = %d d_name.name = 0x%p <%.*s>\n",
+ d.d_name.len, d.d_name.name,
+ (int)(d.d_name.len), d.d_name.name);
+
+ kdb_printf(" d_count = %d d_flags = 0x%x d_inode = 0x%p\n",
+ atomic_read(&d.d_count), d.d_flags, d.d_inode);
+
+ kdb_printf(" d_parent = 0x%p\n", d.d_parent);
+
+ kdb_printf(" d_hash.nxt = 0x%p d_hash.prv = 0x%p\n",
+ d.d_hash.next, d.d_hash.pprev);
+
+ kdb_printf(" d_lru.nxt = 0x%p d_lru.prv = 0x%p\n",
+ d.d_lru.next, d.d_lru.prev);
+
+ kdb_printf(" d_child.nxt = 0x%p d_child.prv = 0x%p\n",
+ d.d_u.d_child.next, d.d_u.d_child.prev);
+
+ kdb_printf(" d_subdirs.nxt = 0x%p d_subdirs.prv = 0x%p\n",
+ d.d_subdirs.next, d.d_subdirs.prev);
+
+ kdb_printf(" d_alias.nxt = 0x%p d_alias.prv = 0x%p\n",
+ d.d_alias.next, d.d_alias.prev);
+
+ kdb_printf(" d_op = 0x%p d_sb = 0x%p d_fsdata = 0x%p\n",
+ d.d_op, d.d_sb, d.d_fsdata);
+
+ kdb_printf(" d_iname = %s\n",
+ d.d_iname);
+
+ if (d.d_inode) {
+ struct inode i;
+ kdb_printf("\nInode Entry at 0x%p\n", d.d_inode);
+ if ((diag = kdb_getarea(i, (unsigned long)d.d_inode)))
+ return diag;
+ kdb_printf(" i_mode = 0%o i_nlink = %d i_rdev = 0x%x\n",
+ i.i_mode, i.i_nlink, i.i_rdev);
+
+ kdb_printf(" i_ino = %ld i_count = %d\n",
+ i.i_ino, atomic_read(&i.i_count));
+
+ kdb_printf(" i_hash.nxt = 0x%p i_hash.prv = 0x%p\n",
+ i.i_hash.next, i.i_hash.pprev);
+
+ kdb_printf(" i_list.nxt = 0x%p i_list.prv = 0x%p\n",
+ i.i_list.next, i.i_list.prev);
+
+ kdb_printf(" i_dentry.nxt = 0x%p i_dentry.prv = 0x%p\n",
+ i.i_dentry.next, i.i_dentry.prev);
+
+ }
+ kdb_printf("\n");
+ return 0;
+}
+
+static int
+kdbm_filp(int argc, const char **argv)
+{
+ struct file f;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(f, addr)))
+ return diag;
+
+ kdb_printf("File Pointer at 0x%lx\n", addr);
+
+ kdb_printf(" fu_list.nxt = 0x%p fu_list.prv = 0x%p\n",
+ f.f_u.fu_list.next, f.f_u.fu_list.prev);
+
+ kdb_printf(" f_dentry = 0x%p f_vfsmnt = 0x%p f_op = 0x%p\n",
+ f.f_dentry, f.f_vfsmnt, f.f_op);
+
+ kdb_printf(" f_count = %ld f_flags = 0x%x f_mode = 0x%x\n",
- f.f_count, f.f_flags, f.f_mode);
++ atomic_long_read(&f.f_count), f.f_flags, f.f_mode);
+
+ kdb_printf(" f_pos = %Ld\n", f.f_pos);
+#ifdef CONFIG_SECURITY
+ kdb_printf(" security = 0x%p\n", f.f_security);
+#endif
+
+ kdb_printf(" private_data = 0x%p f_mapping = 0x%p\n\n",
+ f.private_data, f.f_mapping);
+
+ return kdbm_print_dentry((unsigned long)f.f_dentry);
+}
+
+static int
+kdbm_fl(int argc, const char **argv)
+{
+ struct file_lock fl;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(fl, addr)))
+ return diag;
+
+ kdb_printf("File_lock at 0x%lx\n", addr);
+
+ kdb_printf(" fl_next = 0x%p fl_link.nxt = 0x%p fl_link.prv = 0x%p\n",
+ fl.fl_next, fl.fl_link.next, fl.fl_link.prev);
+ kdb_printf(" fl_block.nxt = 0x%p fl_block.prv = 0x%p\n",
+ fl.fl_block.next, fl.fl_block.prev);
+ kdb_printf(" fl_owner = 0x%p fl_pid = %d fl_wait = 0x%p\n",
+ fl.fl_owner, fl.fl_pid, &fl.fl_wait);
+ kdb_printf(" fl_file = 0x%p fl_flags = 0x%x\n",
+ fl.fl_file, fl.fl_flags);
+ kdb_printf(" fl_type = %d fl_start = 0x%llx fl_end = 0x%llx\n",
+ fl.fl_type, fl.fl_start, fl.fl_end);
+
+ kdb_printf(" file_lock_operations");
+ if (fl.fl_ops)
+ kdb_printf("\n fl_copy_lock = 0x%p fl_release_private = 0x%p\n",
+ fl.fl_ops->fl_copy_lock, fl.fl_ops->fl_release_private);
+ else
+ kdb_printf(" empty\n");
+
+ kdb_printf(" lock_manager_operations");
+ if (fl.fl_lmops)
+ kdb_printf("\n fl_compare_owner = 0x%p fl_notify = 0x%p\n",
+ fl.fl_lmops->fl_compare_owner, fl.fl_lmops->fl_notify);
+ else
+ kdb_printf(" empty\n");
+
+ kdb_printf(" fl_fasync = 0x%p fl_break 0x%lx\n",
+ fl.fl_fasync, fl.fl_break_time);
+
+ return 0;
+}
+
+
+static int
+kdbm_dentry(int argc, const char **argv)
+{
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
+ return diag;
+
+ return kdbm_print_dentry(addr);
+}
+
+static int
+kdbm_kobject(int argc, const char **argv)
+{
+ struct kobject k;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+ int diag;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(k, addr)))
+ return diag;
+
+
+ kdb_printf("kobject at 0x%lx\n", addr);
+
+ if (k.name) {
+ char c;
+ kdb_printf(" name 0x%p", k.name);
+ if (kdb_getarea(c, (unsigned long)k.name) == 0)
+ kdb_printf(" '%s'", k.name);
+ kdb_printf("\n");
+ }
+
+ if (k.name != kobject_name((struct kobject *)addr))
+ kdb_printf(" name '%.20s'\n", k.name);
+
+ kdb_printf(" kref.refcount %d'\n", atomic_read(&k.kref.refcount));
+
+ kdb_printf(" entry.next = 0x%p entry.prev = 0x%p\n",
+ k.entry.next, k.entry.prev);
+
+ kdb_printf(" parent = 0x%p kset = 0x%p ktype = 0x%p sd = 0x%p\n",
+ k.parent, k.kset, k.ktype, k.sd);
+
+ return 0;
+}
+
+static int
+kdbm_sh(int argc, const char **argv)
+{
+ int diag;
+ int nextarg;
+ unsigned long addr;
+ long offset = 0L;
+ struct Scsi_Host sh;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)) ||
+ (diag = kdb_getarea(sh, addr)))
+ return diag;
+
+ kdb_printf("Scsi_Host at 0x%lx\n", addr);
+ kdb_printf("host_queue = 0x%p\n", sh.__devices.next);
+ kdb_printf("ehandler = 0x%p eh_action = 0x%p\n",
+ sh.ehandler, sh.eh_action);
+ kdb_printf("host_wait = 0x%p hostt = 0x%p\n",
+ &sh.host_wait, sh.hostt);
+ kdb_printf("host_failed = %d host_no = %d resetting = %d\n",
+ sh.host_failed, sh.host_no, sh.resetting);
+ kdb_printf("max id/lun/channel = [%d/%d/%d] this_id = %d\n",
+ sh.max_id, sh.max_lun, sh.max_channel, sh.this_id);
+ kdb_printf("can_queue = %d cmd_per_lun = %d sg_tablesize = %d u_isa_dma = %d\n",
+ sh.can_queue, sh.cmd_per_lun, sh.sg_tablesize, sh.unchecked_isa_dma);
+ kdb_printf("host_blocked = %d reverse_ordering = %d \n",
+ sh.host_blocked, sh.reverse_ordering);
+
+ return 0;
+}
+
+static int
+kdbm_sd(int argc, const char **argv)
+{
+ int diag;
+ int nextarg;
+ unsigned long addr;
+ long offset = 0L;
+ struct scsi_device *sd = NULL;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
+ goto out;
+ if (!(sd = kmalloc(sizeof(*sd), GFP_ATOMIC))) {
+ kdb_printf("kdbm_sd: cannot kmalloc sd\n");
+ goto out;
+ }
+ if ((diag = kdb_getarea(*sd, addr)))
+ goto out;
+
+ kdb_printf("scsi_device at 0x%lx\n", addr);
+ kdb_printf("next = 0x%p prev = 0x%p host = 0x%p\n",
+ sd->siblings.next, sd->siblings.prev, sd->host);
+ kdb_printf("device_busy = %d current_cmnd 0x%p\n",
+ sd->device_busy, sd->current_cmnd);
+ kdb_printf("id/lun/chan = [%d/%d/%d] single_lun = %d device_blocked = %d\n",
+ sd->id, sd->lun, sd->channel, sd->sdev_target->single_lun, sd->device_blocked);
+ kdb_printf("queue_depth = %d current_tag = %d scsi_level = %d\n",
+ sd->queue_depth, sd->current_tag, sd->scsi_level);
+ kdb_printf("%8.8s %16.16s %4.4s\n", sd->vendor, sd->model, sd->rev);
+out:
+ if (sd)
+ kfree(sd);
+ return diag;
+}
+
+static int
+kdbm_sc(int argc, const char **argv)
+{
+ int diag;
+ int nextarg;
+ unsigned long addr;
+ long offset = 0L;
+ struct scsi_cmnd *sc = NULL;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ if ((diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL)))
+ goto out;
+ if (!(sc = kmalloc(sizeof(*sc), GFP_ATOMIC))) {
+ kdb_printf("kdbm_sc: cannot kmalloc sc\n");
+ goto out;
+ }
+ if ((diag = kdb_getarea(*sc, addr)))
+ goto out;
+
+ kdb_printf("scsi_cmnd at 0x%lx\n", addr);
+ kdb_printf("device = 0x%p next = 0x%p\n",
+ sc->device, sc->list.next);
+ kdb_printf("serial_number = %ld retries = %d\n",
+ sc->serial_number, sc->retries);
+ kdb_printf("cmd_len = %d\n", sc->cmd_len);
+ kdb_printf("cmnd = [%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x]\n",
+ sc->cmnd[0], sc->cmnd[1], sc->cmnd[2], sc->cmnd[3], sc->cmnd[4],
+ sc->cmnd[5], sc->cmnd[6], sc->cmnd[7], sc->cmnd[8], sc->cmnd[9],
+ sc->cmnd[10], sc->cmnd[11]);
+ kdb_printf("request_buffer = 0x%p request_bufflen = %d\n",
+ scsi_sglist(sc), scsi_bufflen(sc));
+ kdb_printf("use_sg = %d\n", scsi_sg_count(sc));
+ kdb_printf("underflow = %d transfersize = %d\n",
+ sc->underflow, sc->transfersize);
+ kdb_printf("tag = %d\n", sc->tag);
+
+out:
+ if (sc)
+ kfree(sc);
+ return diag;
+}
+
+static int __init kdbm_vm_init(void)
+{
+ kdb_register("vm", kdbm_vm, "[-v] <vaddr>", "Display vm_area_struct", 0);
+ kdb_register("vmp", kdbm_vm, "[-v] <pid>", "Display all vm_area_struct for <pid>", 0);
+#ifdef CONFIG_NUMA
+ kdb_register("mempolicy", kdbm_mpol, "<vaddr>", "Display mempolicy structure", 0);
+ kdb_register("pgdat", kdbm_pgdat, "<node_id>", "Display pglist_data node structure", 0);
+#else
+ kdb_register("pgdat", kdbm_pgdat, "", "Display pglist_data node structure", 0);
+#endif
+ kdb_register("pte", kdbm_pte, "( -m <mm> | -p <pid> ) <vaddr> [<nbytes>]", "Display pte_t for mm_struct or pid", 0);
+ kdb_register("rpte", kdbm_rpte, "( -m <mm> | -p <pid> ) <pfn> [<npages>]", "Find pte_t containing pfn for mm_struct or pid", 0);
+ kdb_register("dentry", kdbm_dentry, "<dentry>", "Display interesting dentry stuff", 0);
+ kdb_register("kobject", kdbm_kobject, "<kobject>", "Display interesting kobject stuff", 0);
+ kdb_register("filp", kdbm_filp, "<filp>", "Display interesting filp stuff", 0);
+ kdb_register("fl", kdbm_fl, "<fl>", "Display interesting file_lock stuff", 0);
+ kdb_register("sh", kdbm_sh, "<vaddr>", "Show scsi_host", 0);
+ kdb_register("sd", kdbm_sd, "<vaddr>", "Show scsi_device", 0);
+ kdb_register("sc", kdbm_sc, "<vaddr>", "Show scsi_cmnd", 0);
+
+ return 0;
+}
+
+static void __exit kdbm_vm_exit(void)
+{
+ kdb_unregister("vm");
+ kdb_unregister("vmp");
+#ifdef CONFIG_NUMA
+ kdb_unregister("mempolicy");
+#endif
+ kdb_unregister("pgdat");
+ kdb_unregister("pte");
+ kdb_unregister("rpte");
+ kdb_unregister("dentry");
+ kdb_unregister("kobject");
+ kdb_unregister("filp");
+ kdb_unregister("fl");
+ kdb_unregister("sh");
+ kdb_unregister("sd");
+ kdb_unregister("sc");
+}
+
+module_init(kdbm_vm_init)
+module_exit(kdbm_vm_exit)
#include <asm/system.h>
#include <asm/sections.h>
+#ifdef CONFIG_KDB_KDUMP
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kdb.h>
+#endif
+
- #ifndef CONFIG_XEN
/* Per cpu memory for storing cpu states in case of system crash. */
- note_buf_t* crash_notes;
- #endif
+ note_buf_t __percpu *crash_notes;
/* vmcoreinfo stuff */
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
* We can't use the "normal" timers since we just panicked.
*/
printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
-
+#ifdef CONFIG_BOOTSPLASH
+ {
+ extern int splash_verbose(void);
+ (void)splash_verbose();
+ }
+#endif
- for (i = 0; i < panic_timeout*1000; ) {
+ for (i = 0; i < panic_timeout; i++) {
touch_nmi_watchdog();
- i += panic_blink(i);
- mdelay(1);
- i++;
+ panic_blink_one_second();
}
/*
* This will not be a clean reboot, with everything
}
#endif
local_irq_enable();
+#ifdef CONFIG_BOOTSPLASH
+ {
+ extern int splash_verbose(void);
+ (void)splash_verbose();
+ }
+#endif
- for (i = 0; ; ) {
+ while (1) {
touch_softlockup_watchdog();
- i += panic_blink(i);
- mdelay(1);
- i++;
+ panic_blink_one_second();
}
}
#include <linux/kexec.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
+ #include <linux/syslog.h>
+#include <linux/jhash.h>
+#include <linux/device.h>
#include <asm/uaccess.h>
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
- return do_syslog(type, buf, len);
+ return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
}
+#ifdef CONFIG_DEBUG_KERNEL
+/* It's very handy to be able to view the syslog buffer during debug.
+ * But do_syslog() uses locks so it cannot be used during debugging.
+ * Instead, provide the start and end of the physical and logical logs.
+ * This is equivalent to do_syslog(3).
+ */
+void debugger_syslog_data(char *syslog_data[4])
+{
+ syslog_data[0] = log_buf;
+ syslog_data[1] = log_buf + log_buf_len;
+ syslog_data[2] = log_buf + log_end - (logged_chars < log_buf_len ? logged_chars : log_buf_len);
+ syslog_data[3] = log_buf + log_end;
+}
+#endif /* CONFIG_DEBUG_KERNEL */
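+/*
+ * Sketch of how a debugger back end might consume the four pointers
+ * returned above (the variable name is illustrative, not a kernel API):
+ *
+ *	char *sd[4];
+ *	debugger_syslog_data(sd);
+ *	physical log buffer: sd[0] .. sd[1]
+ *	logged characters:   sd[2] .. sd[3]
+ */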
+
/*
* Call the console drivers on a range of log_buf
*/
.mode = 0644,
.proc_handler = scan_unevictable_handler,
},
+ {
+ .procname = "heap-stack-gap",
+ .data = &heap_stack_gap,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
- #ifdef CONFIG_PRESWAP
- {
- .procname = "preswap",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = preswap_sysctl_handler,
- .extra1 = (void *)&preswap_zero,
- .extra2 = (void *)&preswap_infinity,
- },
- #endif
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
unlock_page(page);
goto out;
}
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->swap_out(swap_file, page, wbc);
+ if (!ret)
+ count_vm_event(PSWPOUT);
+ return ret;
+ }
+
- if (preswap_put(page) == 1) {
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
- goto out;
- }
-
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->swap_in(swap_file, page);
+ if (!ret)
+ count_vm_event(PSWPIN);
+ return ret;
+ }
+
- if (preswap_get(page) == 1) {
- SetPageUptodate(page);
- unlock_page(page);
- goto out;
- }
-
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
object = c->page->freelist;
if (unlikely(!object))
goto another_slab;
- if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
- goto debug;
+ if (unlikely(PageSlubDebug(c->page) || c->reserve))
+ goto slow_path;
- c->freelist = object[c->offset];
+ c->freelist = get_freepointer(s, object);
c->page->inuse = c->page->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
local_irq_disable();
if (new) {
- c = get_cpu_slab(s, smp_processor_id());
+ c = __this_cpu_ptr(s->cpu_slab);
+ c->reserve = reserve;
- stat(c, ALLOC_SLAB);
+ stat(s, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
slab_lock(new);
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
return NULL;
-debug:
- if (!alloc_debug_processing(s, c->page, object, addr))
+
+slow_path:
+ if (PageSlubDebug(c->page) &&
+ !alloc_debug_processing(s, c->page, object, addr))
goto another_slab;
+ /*
+ * Avoid the slub fast path in slab_alloc() by not setting
+ * c->freelist and the fast path in slab_free() by making
+ * node_match() fail by setting c->node to -1.
+ *
+ * We use this for debug and reserve checks which need
+ * to be done for each allocation.
+ */
+
c->page->inuse++;
- c->page->freelist = object[c->offset];
+ c->page->freelist = get_freepointer(s, object);
c->node = -1;
goto unlock_out;
}
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
struct net_device *null_or_orig;
+ struct net_device *null_or_bond;
int ret = NET_RX_DROP;
__be16 type;
+ unsigned long pflags = current->flags;
if (!skb->tstamp.tv64)
net_timestamp(skb);
}
#endif
- #ifdef CONFIG_XEN
- switch (skb->ip_summed) {
- case CHECKSUM_UNNECESSARY:
- skb->proto_data_valid = 1;
- break;
- case CHECKSUM_PARTIAL:
- /* XXX Implement me. */
- default:
- skb->proto_data_valid = 0;
- break;
- }
- #endif
-
+ if (skb_emergency(skb))
+ goto skip_taps;
+
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
ncls:
#endif
+ if (!skb_emergency_protocol(skb))
+ goto drop;
+
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
- goto out;
+ goto unlock;
skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
if (!skb)
- goto out;
+ goto unlock;
+ /*
+ * Make sure frames received on VLAN interfaces stacked on
+ * bonding interfaces still make their way to any base bonding
+ * device that may have registered for a specific ptype. The
+ * handler may have to adjust skb->dev and orig_dev.
+ */
+ null_or_bond = NULL;
+ if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
+ (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
+ null_or_bond = vlan_dev_real_dev(skb->dev);
+ }
+
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
}
#endif
- static int ipv4_frags_init_net(struct net *net)
+ static int __net_init ipv4_frags_init_net(struct net *net)
{
+ int ret;
+
/*
* Fragment cache limits. We will commit 256K at one time. Should we
* cross that limit we will prune down to 192K. This should cope with
inet_frags_init_net(&net->ipv4.frags);
- return ip4_frags_ns_ctl_register(net);
+ ret = ip4_frags_ns_ctl_register(net);
+ if (ret)
+ goto out_reg;
+
+ mem_reserve_init(&net->ipv4.frags.reserve, "IPv4 fragment cache",
+ &net_skb_reserve);
+ ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
+ net->ipv4.frags.high_thresh);
+ if (ret)
+ goto out_reserve;
+
+ return 0;
+
+out_reserve:
+ mem_reserve_disconnect(&net->ipv4.frags.reserve);
+ ip4_frags_ns_ctl_unregister(net);
+out_reg:
+ inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+
+ return ret;
}
- static void ipv4_frags_exit_net(struct net *net)
+ static void __net_exit ipv4_frags_exit_net(struct net *net)
{
+ mem_reserve_disconnect(&net->ipv4.frags.reserve);
ip4_frags_ns_ctl_unregister(net);
inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}
read_lock_bh(&idev->lock);
if (ifp->dead)
goto out;
- spin_lock_bh(&ifp->lock);
+ spin_lock(&ifp->lock);
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+ !(dev->flags&IFF_MULTICAST) ||
idev->cnf.accept_dad < 1 ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
}
#endif
- static int ipv6_frags_init_net(struct net *net)
+ static int __net_init ipv6_frags_init_net(struct net *net)
{
+ int ret;
+
- net->ipv6.frags.high_thresh = 256 * 1024;
- net->ipv6.frags.low_thresh = 192 * 1024;
+ net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
inet_frags_init_net(&net->ipv6.frags);
- return ip6_frags_ns_sysctl_register(net);
+ ret = ip6_frags_ns_sysctl_register(net);
+ if (ret)
+ goto out_reg;
+
+ mem_reserve_init(&net->ipv6.frags.reserve, "IPv6 fragment cache",
+ &net_skb_reserve);
+ ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
+ net->ipv6.frags.high_thresh);
+ if (ret)
+ goto out_reserve;
+
+ return 0;
+
+out_reserve:
+ mem_reserve_disconnect(&net->ipv6.frags.reserve);
+ ip6_frags_ns_sysctl_unregister(net);
+out_reg:
+ inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+
+ return ret;
}
- static void ipv6_frags_exit_net(struct net *net)
+ static void __net_exit ipv6_frags_exit_net(struct net *net)
{
+ mem_reserve_disconnect(&net->ipv6.frags.reserve);
ip6_frags_ns_sysctl_unregister(net);
inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
}
--- /dev/null
+/*
+ * AppArmor security module
+ *
+ * This file contains AppArmor function for pathnames
+ *
+ * Copyright (C) 1998-2008 Novell/SUSE
+ * Copyright 2009 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs_struct.h>
+
+#include "include/apparmor.h"
+#include "include/path.h"
+
+int aa_get_name_to_buffer(struct path *path, int is_dir, char *buffer, int size,
+ char **name)
+{
+ int error = d_namespace_path(path, buffer, size - is_dir, name);
+
+ if (!error && is_dir && (*name)[1] != '\0')
+ /*
+ * Append "/" to the pathname. The root directory is a special
+ * case; it already ends in slash.
+ */
+ strcpy(&buffer[size - 2], "/");
+
+ return error;
+}
+
+/**
+ * aa_get_name - compute the pathname of a file
+ * @path: path of the file
+ * @is_dir: set if the file is a directory
+ * @buffer: on return, the buffer that aa_get_name() allocated
+ * @name: on return, a pointer into @buffer at the start of the pathname
+ *
+ * Returns an error code if there was a failure in obtaining the
+ * name.
+ *
+ * @name is a pointer to the beginning of the pathname (which usually differs
+ * from the beginning of the buffer), or NULL. If there is an error @name
+ * may contain a partial or invalid name (in the case of a deleted file), that
+ * can be used for audit purposes, but it can not be used for mediation.
+ *
+ * We need @is_dir to indicate whether the file is a directory or not because
+ * the file may not yet exist, and so we cannot check the inode's file type.
+ */
+int aa_get_name(struct path *path, int is_dir, char **buffer, char **name)
+{
+ char *buf, *str = NULL;
+ int size = 256;
+ int error;
+
+ *name = NULL;
+ *buffer = NULL;
+ for (;;) {
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ error = aa_get_name_to_buffer(path, is_dir, buf, size, &str);
+ if (!error || (error == -ENOENT) || (error == -ESTALE))
+ break;
+
+ kfree(buf);
+ size <<= 1;
+ if (size > g_apparmor_path_max)
+ return -ENAMETOOLONG;
+ }
+ *buffer = buf;
+ *name = str;
+
+ return error;
+}
+
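+/*
+ * Minimal calling sketch for aa_get_name() (error handling abbreviated;
+ * identifiers other than aa_get_name() and kfree() are illustrative):
+ *
+ *	char *buffer, *name;
+ *	int error = aa_get_name(&file->f_path, 0, &buffer, &name);
+ *	if (!error)
+ *		... use name for mediation ...
+ *	kfree(buffer);
+ */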
++/* Only needed until d_namespace_path is cleaned up and doesn't use
++ * vfsmount_lock anymore. -jeffm */
++extern spinlock_t vfsmount_lock;
++
+int d_namespace_path(struct path *path, char *buf, int buflen, char **name)
+{
+ struct path root, tmp, ns_root = { };
+ char *res;
+ int deleted;
+ int error = 0;
+
+ read_lock(&current->fs->lock);
+ root = current->fs->root;
+ path_get(&current->fs->root);
+ read_unlock(&current->fs->lock);
+ spin_lock(&vfsmount_lock);
+ if (root.mnt && root.mnt->mnt_ns)
+ ns_root.mnt = mntget(root.mnt->mnt_ns->root);
+ if (ns_root.mnt)
+ ns_root.dentry = dget(ns_root.mnt->mnt_root);
+ spin_unlock(&vfsmount_lock);
+ spin_lock(&dcache_lock);
+
+ do {
+ tmp = ns_root;
+ deleted = d_unlinked(path->dentry);
+ res = __d_path(path, &tmp, buf, buflen);
+ } while (deleted != d_unlinked(path->dentry));
+
+ *name = res;
+ /* handle error conditions - and still allow a partial path to
+ * be returned */
+ if (IS_ERR(res)) {
+ error = PTR_ERR(res);
+ *name = buf;
+ } else if (deleted) {
+ /* The stripping of (deleted) is a hack that could be removed
+ * with an updated __d_path
+ */
+
+ /* Currently 2 cases fall into here. Fixing the mediation
+ * of deleted files for things like trunc.
+ * And the newly allocated dentry case. The first case
+ * means we strip deleted for everything so the new
+ * dentry test case is commented out below.
+ */
+ buf[buflen - 11] = 0; /* - (len(" (deleted)") +\0) */
+
+ /* if (!path->dentry->d_inode) {
+ * On some filesystems, newly allocated dentries appear
+ * to the security_path hooks as a deleted
+ * dentry except without an inode allocated.
+ *
+ * Remove the appended deleted text and return as a
+ * string for normal mediation. The (deleted) string
+ * is guaranteed to be added in this case, so just
+ * strip it.
+ */
+ } else if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) {
+ error = -ENOENT;
+#if 0
+ } else if (tmp.dentry != ns_root.dentry && tmp.mnt != ns_root.mnt) {
+ /* don't return a pathname starting with '/' for a disconnected path */
+ error = -ESTALE;
+ if (*res == '/')
+ *name = res + 1;
+#endif
+ }
+
+ spin_unlock(&dcache_lock);
+ path_put(&root);
+ path_put(&ns_root);
+
+ return error;
+}
+
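+/*
+ * sysctl_pathname - build the "/sys/..." style path of a sysctl table entry.
+ * The name is assembled right to left in the tail of @buffer; the returned
+ * pointer points into @buffer (not necessarily at its start), or NULL is
+ * returned if @buflen is too small to hold the full path.
+ */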
+char *sysctl_pathname(struct ctl_table *table, char *buffer, int buflen)
+{
+ if (buflen < 1)
+ return NULL;
+ buffer += --buflen;
+ *buffer = '\0';
+
+ while (table) {
+ int namelen = strlen(table->procname);
+
+ if (buflen < namelen + 1)
+ return NULL;
+ buflen -= namelen + 1;
+ buffer -= namelen;
+ memcpy(buffer, table->procname, namelen);
+ *--buffer = '/';
+ table = table->parent;
+ }
+ if (buflen < 4)
+ return NULL;
+ buffer -= 4;
+ memcpy(buffer, "/sys", 4);
+
+ return buffer;
+}